From 04f08eb44b5011493d77b602fdec29ff0f5c6cd5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 8 Sep 2021 17:00:29 -0700
Subject: net/af_unix: fix a data-race in unix_dgram_poll

syzbot reported another data-race in af_unix [1]

Lets change __skb_insert() to use WRITE_ONCE() when changing
skb head qlen.

Also, change unix_dgram_poll() to use lockless version
of unix_recvq_full()

It is verry possible we can switch all/most unix_recvq_full()
to the lockless version, this will be done in a future kernel version.

[1] HEAD commit: 8596e589b787732c8346f0482919e83cc9362db1

BUG: KCSAN: data-race in skb_queue_tail / unix_dgram_poll

write to 0xffff88814eeb24e0 of 4 bytes by task 25815 on cpu 0:
 __skb_insert include/linux/skbuff.h:1938 [inline]
 __skb_queue_before include/linux/skbuff.h:2043 [inline]
 __skb_queue_tail include/linux/skbuff.h:2076 [inline]
 skb_queue_tail+0x80/0xa0 net/core/skbuff.c:3264
 unix_dgram_sendmsg+0xff2/0x1600 net/unix/af_unix.c:1850
 sock_sendmsg_nosec net/socket.c:703 [inline]
 sock_sendmsg net/socket.c:723 [inline]
 ____sys_sendmsg+0x360/0x4d0 net/socket.c:2392
 ___sys_sendmsg net/socket.c:2446 [inline]
 __sys_sendmmsg+0x315/0x4b0 net/socket.c:2532
 __do_sys_sendmmsg net/socket.c:2561 [inline]
 __se_sys_sendmmsg net/socket.c:2558 [inline]
 __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2558
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x3d/0x90 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x44/0xae

read to 0xffff88814eeb24e0 of 4 bytes by task 25834 on cpu 1:
 skb_queue_len include/linux/skbuff.h:1869 [inline]
 unix_recvq_full net/unix/af_unix.c:194 [inline]
 unix_dgram_poll+0x2bc/0x3e0 net/unix/af_unix.c:2777
 sock_poll+0x23e/0x260 net/socket.c:1288
 vfs_poll include/linux/poll.h:90 [inline]
 ep_item_poll fs/eventpoll.c:846 [inline]
 ep_send_events fs/eventpoll.c:1683 [inline]
 ep_poll fs/eventpoll.c:1798 [inline]
 do_epoll_wait+0x6ad/0xf00 fs/eventpoll.c:2226
 __do_sys_epoll_wait fs/eventpoll.c:2238 [inline]
 __se_sys_epoll_wait fs/eventpoll.c:2233 [inline]
 __x64_sys_epoll_wait+0xf6/0x120 fs/eventpoll.c:2233
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x3d/0x90 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x44/0xae

value changed: 0x0000001b -> 0x00000001

Reported by Kernel Concurrency Sanitizer on:
CPU: 1 PID: 25834 Comm: syz-executor.1 Tainted: G        W         5.14.0-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011

Fixes: 86b18aaa2b5b ("skbuff: fix a data race in skb_queue_len()")
Cc: Qian Cai <cai@lca.pw>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 6bdb0db3e825..841e2f0f5240 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1940,7 +1940,7 @@ static inline void __skb_insert(struct sk_buff *newsk,
 	WRITE_ONCE(newsk->prev, prev);
 	WRITE_ONCE(next->prev, newsk);
 	WRITE_ONCE(prev->next, newsk);
-	list->qlen++;
+	WRITE_ONCE(list->qlen, list->qlen + 1);
 }
 
 static inline void __skb_queue_splice(const struct sk_buff_head *list,
-- 
cgit v1.2.3-71-gd317


From 2f1aaf3ea666b737ad717b3d88667225aca23149 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 9 Sep 2021 08:49:59 -0700
Subject: bpf, mm: Fix lockdep warning triggered by
 stack_map_get_build_id_offset()

Currently the bpf selftest "get_stack_raw_tp" triggered the warning:

  [ 1411.304463] WARNING: CPU: 3 PID: 140 at include/linux/mmap_lock.h:164 find_vma+0x47/0xa0
  [ 1411.304469] Modules linked in: bpf_testmod(O) [last unloaded: bpf_testmod]
  [ 1411.304476] CPU: 3 PID: 140 Comm: systemd-journal Tainted: G        W  O      5.14.0+ #53
  [ 1411.304479] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
  [ 1411.304481] RIP: 0010:find_vma+0x47/0xa0
  [ 1411.304484] Code: de 48 89 ef e8 ba f5 fe ff 48 85 c0 74 2e 48 83 c4 08 5b 5d c3 48 8d bf 28 01 00 00 be ff ff ff ff e8 2d 9f d8 00 85 c0 75 d4 <0f> 0b 48 89 de 48 8
  [ 1411.304487] RSP: 0018:ffffabd440403db8 EFLAGS: 00010246
  [ 1411.304490] RAX: 0000000000000000 RBX: 00007f00ad80a0e0 RCX: 0000000000000000
  [ 1411.304492] RDX: 0000000000000001 RSI: ffffffff9776b144 RDI: ffffffff977e1b0e
  [ 1411.304494] RBP: ffff9cf5c2f50000 R08: ffff9cf5c3eb25d8 R09: 00000000fffffffe
  [ 1411.304496] R10: 0000000000000001 R11: 00000000ef974e19 R12: ffff9cf5c39ae0e0
  [ 1411.304498] R13: 0000000000000000 R14: 0000000000000000 R15: ffff9cf5c39ae0e0
  [ 1411.304501] FS:  00007f00ae754780(0000) GS:ffff9cf5fba00000(0000) knlGS:0000000000000000
  [ 1411.304504] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  [ 1411.304506] CR2: 000000003e34343c CR3: 0000000103a98005 CR4: 0000000000370ee0
  [ 1411.304508] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  [ 1411.304510] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  [ 1411.304512] Call Trace:
  [ 1411.304517]  stack_map_get_build_id_offset+0x17c/0x260
  [ 1411.304528]  __bpf_get_stack+0x18f/0x230
  [ 1411.304541]  bpf_get_stack_raw_tp+0x5a/0x70
  [ 1411.305752] RAX: 0000000000000000 RBX: 5541f689495641d7 RCX: 0000000000000000
  [ 1411.305756] RDX: 0000000000000001 RSI: ffffffff9776b144 RDI: ffffffff977e1b0e
  [ 1411.305758] RBP: ffff9cf5c02b2f40 R08: ffff9cf5ca7606c0 R09: ffffcbd43ee02c04
  [ 1411.306978]  bpf_prog_32007c34f7726d29_bpf_prog1+0xaf/0xd9c
  [ 1411.307861] R10: 0000000000000001 R11: 0000000000000044 R12: ffff9cf5c2ef60e0
  [ 1411.307865] R13: 0000000000000005 R14: 0000000000000000 R15: ffff9cf5c2ef6108
  [ 1411.309074]  bpf_trace_run2+0x8f/0x1a0
  [ 1411.309891] FS:  00007ff485141700(0000) GS:ffff9cf5fae00000(0000) knlGS:0000000000000000
  [ 1411.309896] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  [ 1411.311221]  syscall_trace_enter.isra.20+0x161/0x1f0
  [ 1411.311600] CR2: 00007ff48514d90e CR3: 0000000107114001 CR4: 0000000000370ef0
  [ 1411.312291]  do_syscall_64+0x15/0x80
  [ 1411.312941] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  [ 1411.313803]  entry_SYSCALL_64_after_hwframe+0x44/0xae
  [ 1411.314223] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  [ 1411.315082] RIP: 0033:0x7f00ad80a0e0
  [ 1411.315626] Call Trace:
  [ 1411.315632]  stack_map_get_build_id_offset+0x17c/0x260

To reproduce, first build `test_progs` binary:

  make -C tools/testing/selftests/bpf -j60

and then run the binary at tools/testing/selftests/bpf directory:

  ./test_progs -t get_stack_raw_tp

The warning is due to commit 5b78ed24e8ec ("mm/pagemap: add mmap_assert_locked()
annotations to find_vma*()") which added mmap_assert_locked() in find_vma()
function. The mmap_assert_locked() function asserts that mm->mmap_lock needs
to be held. But this is not the case for bpf_get_stack() or bpf_get_stackid()
helper (kernel/bpf/stackmap.c), which uses mmap_read_trylock_non_owner()
instead. Since mm->mmap_lock is not held in bpf_get_stack[id]() use case,
the above warning is emitted during test run.

This patch fixed the issue by (1). using mmap_read_trylock() instead of
mmap_read_trylock_non_owner() to satisfy lockdep checking in find_vma(), and
(2). droping lockdep for mmap_lock right before the irq_work_queue(). The
function mmap_read_trylock_non_owner() is also removed since after this
patch nobody calls it any more.

Fixes: 5b78ed24e8ec ("mm/pagemap: add mmap_assert_locked() annotations to find_vma*()")
Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Luigi Rizzo <lrizzo@google.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: linux-mm@kvack.org
Link: https://lore.kernel.org/bpf/20210909155000.1610299-1-yhs@fb.com
---
 include/linux/mmap_lock.h |  9 ---------
 kernel/bpf/stackmap.c     | 10 ++++++++--
 2 files changed, 8 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 0540f0156f58..3af8f7fb067d 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -144,15 +144,6 @@ static inline void mmap_read_unlock(struct mm_struct *mm)
 	__mmap_lock_trace_released(mm, false);
 }
 
-static inline bool mmap_read_trylock_non_owner(struct mm_struct *mm)
-{
-	if (mmap_read_trylock(mm)) {
-		rwsem_release(&mm->mmap_lock.dep_map, _RET_IP_);
-		return true;
-	}
-	return false;
-}
-
 static inline void mmap_read_unlock_non_owner(struct mm_struct *mm)
 {
 	up_read_non_owner(&mm->mmap_lock);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index e8eefdf8cf3e..09a3fd97d329 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -179,7 +179,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 	 * with build_id.
 	 */
 	if (!user || !current || !current->mm || irq_work_busy ||
-	    !mmap_read_trylock_non_owner(current->mm)) {
+	    !mmap_read_trylock(current->mm)) {
 		/* cannot access current->mm, fall back to ips */
 		for (i = 0; i < trace_nr; i++) {
 			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
@@ -204,9 +204,15 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 	}
 
 	if (!work) {
-		mmap_read_unlock_non_owner(current->mm);
+		mmap_read_unlock(current->mm);
 	} else {
 		work->mm = current->mm;
+
+		/* The lock will be released once we're out of interrupt
+		 * context. Tell lockdep that we've released it now so
+		 * it doesn't complain that we forgot to release it.
+		 */
+		rwsem_release(&current->mm->mmap_lock.dep_map, _RET_IP_);
 		irq_work_queue(&work->irq_work);
 	}
 }
-- 
cgit v1.2.3-71-gd317


From 4eb6bd55cfb22ffc20652732340c4962f3ac9a91 Mon Sep 17 00:00:00 2001
From: Nick Desaulniers <ndesaulniers@google.com>
Date: Fri, 10 Sep 2021 16:40:39 -0700
Subject: compiler.h: drop fallback overflow checkers

Once upgrading the minimum supported version of GCC to 5.1, we can drop
the fallback code for !COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW.

This is effectively a revert of commit f0907827a8a9 ("compiler.h: enable
builtin overflow checkers and add fallback code")

Link: https://github.com/ClangBuiltLinux/linux/issues/1438#issuecomment-916745801
Suggested-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>
Acked-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compiler-clang.h     |  13 ----
 include/linux/compiler-gcc.h       |   4 --
 include/linux/overflow.h           | 138 +-----------------------------------
 tools/include/linux/compiler-gcc.h |   4 --
 tools/include/linux/overflow.h     | 140 +------------------------------------
 5 files changed, 6 insertions(+), 293 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index 49b0ac8b6fd3..3c4de9b6c6e3 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -62,19 +62,6 @@
 #define __no_sanitize_coverage
 #endif
 
-/*
- * Not all versions of clang implement the type-generic versions
- * of the builtin overflow checkers. Fortunately, clang implements
- * __has_builtin allowing us to avoid awkward version
- * checks. Unfortunately, we don't know which version of gcc clang
- * pretends to be, so the macro may or may not be defined.
- */
-#if __has_builtin(__builtin_mul_overflow) && \
-    __has_builtin(__builtin_add_overflow) && \
-    __has_builtin(__builtin_sub_overflow)
-#define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1
-#endif
-
 #if __has_feature(shadow_call_stack)
 # define __noscs	__attribute__((__no_sanitize__("shadow-call-stack")))
 #endif
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index cb9217fc60af..3f7f6fa0e051 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -128,10 +128,6 @@
 #define __no_sanitize_coverage
 #endif
 
-#if GCC_VERSION >= 50100
-#define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1
-#endif
-
 /*
  * Turn individual warnings and errors on and off locally, depending
  * on version.
diff --git a/include/linux/overflow.h b/include/linux/overflow.h
index 0f12345c21fb..4669632bd72b 100644
--- a/include/linux/overflow.h
+++ b/include/linux/overflow.h
@@ -6,12 +6,9 @@
 #include <linux/limits.h>
 
 /*
- * In the fallback code below, we need to compute the minimum and
- * maximum values representable in a given type. These macros may also
- * be useful elsewhere, so we provide them outside the
- * COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW block.
- *
- * It would seem more obvious to do something like
+ * We need to compute the minimum and maximum values representable in a given
+ * type. These macros may also be useful elsewhere. It would seem more obvious
+ * to do something like:
  *
  * #define type_min(T) (T)(is_signed_type(T) ? (T)1 << (8*sizeof(T)-1) : 0)
  * #define type_max(T) (T)(is_signed_type(T) ? ((T)1 << (8*sizeof(T)-1)) - 1 : ~(T)0)
@@ -54,7 +51,6 @@ static inline bool __must_check __must_check_overflow(bool overflow)
 	return unlikely(overflow);
 }
 
-#ifdef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW
 /*
  * For simplicity and code hygiene, the fallback code below insists on
  * a, b and *d having the same type (similar to the min() and max()
@@ -90,134 +86,6 @@ static inline bool __must_check __must_check_overflow(bool overflow)
 	__builtin_mul_overflow(__a, __b, __d);	\
 }))
 
-#else
-
-
-/* Checking for unsigned overflow is relatively easy without causing UB. */
-#define __unsigned_add_overflow(a, b, d) ({	\
-	typeof(a) __a = (a);			\
-	typeof(b) __b = (b);			\
-	typeof(d) __d = (d);			\
-	(void) (&__a == &__b);			\
-	(void) (&__a == __d);			\
-	*__d = __a + __b;			\
-	*__d < __a;				\
-})
-#define __unsigned_sub_overflow(a, b, d) ({	\
-	typeof(a) __a = (a);			\
-	typeof(b) __b = (b);			\
-	typeof(d) __d = (d);			\
-	(void) (&__a == &__b);			\
-	(void) (&__a == __d);			\
-	*__d = __a - __b;			\
-	__a < __b;				\
-})
-/*
- * If one of a or b is a compile-time constant, this avoids a division.
- */
-#define __unsigned_mul_overflow(a, b, d) ({		\
-	typeof(a) __a = (a);				\
-	typeof(b) __b = (b);				\
-	typeof(d) __d = (d);				\
-	(void) (&__a == &__b);				\
-	(void) (&__a == __d);				\
-	*__d = __a * __b;				\
-	__builtin_constant_p(__b) ?			\
-	  __b > 0 && __a > type_max(typeof(__a)) / __b : \
-	  __a > 0 && __b > type_max(typeof(__b)) / __a;	 \
-})
-
-/*
- * For signed types, detecting overflow is much harder, especially if
- * we want to avoid UB. But the interface of these macros is such that
- * we must provide a result in *d, and in fact we must produce the
- * result promised by gcc's builtins, which is simply the possibly
- * wrapped-around value. Fortunately, we can just formally do the
- * operations in the widest relevant unsigned type (u64) and then
- * truncate the result - gcc is smart enough to generate the same code
- * with and without the (u64) casts.
- */
-
-/*
- * Adding two signed integers can overflow only if they have the same
- * sign, and overflow has happened iff the result has the opposite
- * sign.
- */
-#define __signed_add_overflow(a, b, d) ({	\
-	typeof(a) __a = (a);			\
-	typeof(b) __b = (b);			\
-	typeof(d) __d = (d);			\
-	(void) (&__a == &__b);			\
-	(void) (&__a == __d);			\
-	*__d = (u64)__a + (u64)__b;		\
-	(((~(__a ^ __b)) & (*__d ^ __a))	\
-		& type_min(typeof(__a))) != 0;	\
-})
-
-/*
- * Subtraction is similar, except that overflow can now happen only
- * when the signs are opposite. In this case, overflow has happened if
- * the result has the opposite sign of a.
- */
-#define __signed_sub_overflow(a, b, d) ({	\
-	typeof(a) __a = (a);			\
-	typeof(b) __b = (b);			\
-	typeof(d) __d = (d);			\
-	(void) (&__a == &__b);			\
-	(void) (&__a == __d);			\
-	*__d = (u64)__a - (u64)__b;		\
-	((((__a ^ __b)) & (*__d ^ __a))		\
-		& type_min(typeof(__a))) != 0;	\
-})
-
-/*
- * Signed multiplication is rather hard. gcc always follows C99, so
- * division is truncated towards 0. This means that we can write the
- * overflow check like this:
- *
- * (a > 0 && (b > MAX/a || b < MIN/a)) ||
- * (a < -1 && (b > MIN/a || b < MAX/a) ||
- * (a == -1 && b == MIN)
- *
- * The redundant casts of -1 are to silence an annoying -Wtype-limits
- * (included in -Wextra) warning: When the type is u8 or u16, the
- * __b_c_e in check_mul_overflow obviously selects
- * __unsigned_mul_overflow, but unfortunately gcc still parses this
- * code and warns about the limited range of __b.
- */
-
-#define __signed_mul_overflow(a, b, d) ({				\
-	typeof(a) __a = (a);						\
-	typeof(b) __b = (b);						\
-	typeof(d) __d = (d);						\
-	typeof(a) __tmax = type_max(typeof(a));				\
-	typeof(a) __tmin = type_min(typeof(a));				\
-	(void) (&__a == &__b);						\
-	(void) (&__a == __d);						\
-	*__d = (u64)__a * (u64)__b;					\
-	(__b > 0   && (__a > __tmax/__b || __a < __tmin/__b)) ||	\
-	(__b < (typeof(__b))-1  && (__a > __tmin/__b || __a < __tmax/__b)) || \
-	(__b == (typeof(__b))-1 && __a == __tmin);			\
-})
-
-
-#define check_add_overflow(a, b, d)	__must_check_overflow(		\
-	__builtin_choose_expr(is_signed_type(typeof(a)),		\
-			__signed_add_overflow(a, b, d),			\
-			__unsigned_add_overflow(a, b, d)))
-
-#define check_sub_overflow(a, b, d)	__must_check_overflow(		\
-	__builtin_choose_expr(is_signed_type(typeof(a)),		\
-			__signed_sub_overflow(a, b, d),			\
-			__unsigned_sub_overflow(a, b, d)))
-
-#define check_mul_overflow(a, b, d)	__must_check_overflow(		\
-	__builtin_choose_expr(is_signed_type(typeof(a)),		\
-			__signed_mul_overflow(a, b, d),			\
-			__unsigned_mul_overflow(a, b, d)))
-
-#endif /* COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW */
-
 /** check_shl_overflow() - Calculate a left-shifted value and check overflow
  *
  * @a: Value to be shifted
diff --git a/tools/include/linux/compiler-gcc.h b/tools/include/linux/compiler-gcc.h
index 95c072b70d0e..a590a1dfafd9 100644
--- a/tools/include/linux/compiler-gcc.h
+++ b/tools/include/linux/compiler-gcc.h
@@ -38,7 +38,3 @@
 #endif
 #define __printf(a, b)	__attribute__((format(printf, a, b)))
 #define __scanf(a, b)	__attribute__((format(scanf, a, b)))
-
-#if GCC_VERSION >= 50100
-#define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1
-#endif
diff --git a/tools/include/linux/overflow.h b/tools/include/linux/overflow.h
index 8712ff70995f..dcb0c1bf6866 100644
--- a/tools/include/linux/overflow.h
+++ b/tools/include/linux/overflow.h
@@ -5,12 +5,9 @@
 #include <linux/compiler.h>
 
 /*
- * In the fallback code below, we need to compute the minimum and
- * maximum values representable in a given type. These macros may also
- * be useful elsewhere, so we provide them outside the
- * COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW block.
- *
- * It would seem more obvious to do something like
+ * We need to compute the minimum and maximum values representable in a given
+ * type. These macros may also be useful elsewhere. It would seem more obvious
+ * to do something like:
  *
  * #define type_min(T) (T)(is_signed_type(T) ? (T)1 << (8*sizeof(T)-1) : 0)
  * #define type_max(T) (T)(is_signed_type(T) ? ((T)1 << (8*sizeof(T)-1)) - 1 : ~(T)0)
@@ -36,8 +33,6 @@
 #define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T)))
 #define type_min(T) ((T)((T)-type_max(T)-(T)1))
 
-
-#ifdef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW
 /*
  * For simplicity and code hygiene, the fallback code below insists on
  * a, b and *d having the same type (similar to the min() and max()
@@ -73,135 +68,6 @@
 	__builtin_mul_overflow(__a, __b, __d);	\
 })
 
-#else
-
-
-/* Checking for unsigned overflow is relatively easy without causing UB. */
-#define __unsigned_add_overflow(a, b, d) ({	\
-	typeof(a) __a = (a);			\
-	typeof(b) __b = (b);			\
-	typeof(d) __d = (d);			\
-	(void) (&__a == &__b);			\
-	(void) (&__a == __d);			\
-	*__d = __a + __b;			\
-	*__d < __a;				\
-})
-#define __unsigned_sub_overflow(a, b, d) ({	\
-	typeof(a) __a = (a);			\
-	typeof(b) __b = (b);			\
-	typeof(d) __d = (d);			\
-	(void) (&__a == &__b);			\
-	(void) (&__a == __d);			\
-	*__d = __a - __b;			\
-	__a < __b;				\
-})
-/*
- * If one of a or b is a compile-time constant, this avoids a division.
- */
-#define __unsigned_mul_overflow(a, b, d) ({		\
-	typeof(a) __a = (a);				\
-	typeof(b) __b = (b);				\
-	typeof(d) __d = (d);				\
-	(void) (&__a == &__b);				\
-	(void) (&__a == __d);				\
-	*__d = __a * __b;				\
-	__builtin_constant_p(__b) ?			\
-	  __b > 0 && __a > type_max(typeof(__a)) / __b : \
-	  __a > 0 && __b > type_max(typeof(__b)) / __a;	 \
-})
-
-/*
- * For signed types, detecting overflow is much harder, especially if
- * we want to avoid UB. But the interface of these macros is such that
- * we must provide a result in *d, and in fact we must produce the
- * result promised by gcc's builtins, which is simply the possibly
- * wrapped-around value. Fortunately, we can just formally do the
- * operations in the widest relevant unsigned type (u64) and then
- * truncate the result - gcc is smart enough to generate the same code
- * with and without the (u64) casts.
- */
-
-/*
- * Adding two signed integers can overflow only if they have the same
- * sign, and overflow has happened iff the result has the opposite
- * sign.
- */
-#define __signed_add_overflow(a, b, d) ({	\
-	typeof(a) __a = (a);			\
-	typeof(b) __b = (b);			\
-	typeof(d) __d = (d);			\
-	(void) (&__a == &__b);			\
-	(void) (&__a == __d);			\
-	*__d = (u64)__a + (u64)__b;		\
-	(((~(__a ^ __b)) & (*__d ^ __a))	\
-		& type_min(typeof(__a))) != 0;	\
-})
-
-/*
- * Subtraction is similar, except that overflow can now happen only
- * when the signs are opposite. In this case, overflow has happened if
- * the result has the opposite sign of a.
- */
-#define __signed_sub_overflow(a, b, d) ({	\
-	typeof(a) __a = (a);			\
-	typeof(b) __b = (b);			\
-	typeof(d) __d = (d);			\
-	(void) (&__a == &__b);			\
-	(void) (&__a == __d);			\
-	*__d = (u64)__a - (u64)__b;		\
-	((((__a ^ __b)) & (*__d ^ __a))		\
-		& type_min(typeof(__a))) != 0;	\
-})
-
-/*
- * Signed multiplication is rather hard. gcc always follows C99, so
- * division is truncated towards 0. This means that we can write the
- * overflow check like this:
- *
- * (a > 0 && (b > MAX/a || b < MIN/a)) ||
- * (a < -1 && (b > MIN/a || b < MAX/a) ||
- * (a == -1 && b == MIN)
- *
- * The redundant casts of -1 are to silence an annoying -Wtype-limits
- * (included in -Wextra) warning: When the type is u8 or u16, the
- * __b_c_e in check_mul_overflow obviously selects
- * __unsigned_mul_overflow, but unfortunately gcc still parses this
- * code and warns about the limited range of __b.
- */
-
-#define __signed_mul_overflow(a, b, d) ({				\
-	typeof(a) __a = (a);						\
-	typeof(b) __b = (b);						\
-	typeof(d) __d = (d);						\
-	typeof(a) __tmax = type_max(typeof(a));				\
-	typeof(a) __tmin = type_min(typeof(a));				\
-	(void) (&__a == &__b);						\
-	(void) (&__a == __d);						\
-	*__d = (u64)__a * (u64)__b;					\
-	(__b > 0   && (__a > __tmax/__b || __a < __tmin/__b)) ||	\
-	(__b < (typeof(__b))-1  && (__a > __tmin/__b || __a < __tmax/__b)) || \
-	(__b == (typeof(__b))-1 && __a == __tmin);			\
-})
-
-
-#define check_add_overflow(a, b, d)					\
-	__builtin_choose_expr(is_signed_type(typeof(a)),		\
-			__signed_add_overflow(a, b, d),			\
-			__unsigned_add_overflow(a, b, d))
-
-#define check_sub_overflow(a, b, d)					\
-	__builtin_choose_expr(is_signed_type(typeof(a)),		\
-			__signed_sub_overflow(a, b, d),			\
-			__unsigned_sub_overflow(a, b, d))
-
-#define check_mul_overflow(a, b, d)					\
-	__builtin_choose_expr(is_signed_type(typeof(a)),		\
-			__signed_mul_overflow(a, b, d),			\
-			__unsigned_mul_overflow(a, b, d))
-
-
-#endif /* COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW */
-
 /**
  * array_size() - Calculate size of 2-dimensional array.
  *
-- 
cgit v1.2.3-71-gd317


From 4e59869aa6550657cb148ad49835605660ec9b88 Mon Sep 17 00:00:00 2001
From: Nick Desaulniers <ndesaulniers@google.com>
Date: Fri, 10 Sep 2021 16:40:46 -0700
Subject: compiler-gcc.h: drop checks for older GCC versions

Now that GCC 5.1 is the minimally supported default, drop the values we
don't use.

Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compiler-gcc.h       | 4 +---
 tools/include/linux/compiler-gcc.h | 4 +---
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 3f7f6fa0e051..fd82ce169ce9 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -98,10 +98,8 @@
 
 #if GCC_VERSION >= 70000
 #define KASAN_ABI_VERSION 5
-#elif GCC_VERSION >= 50000
+#else
 #define KASAN_ABI_VERSION 4
-#elif GCC_VERSION >= 40902
-#define KASAN_ABI_VERSION 3
 #endif
 
 #if __has_attribute(__no_sanitize_address__)
diff --git a/tools/include/linux/compiler-gcc.h b/tools/include/linux/compiler-gcc.h
index a590a1dfafd9..43d9a46d36f0 100644
--- a/tools/include/linux/compiler-gcc.h
+++ b/tools/include/linux/compiler-gcc.h
@@ -16,9 +16,7 @@
 # define __fallthrough __attribute__ ((fallthrough))
 #endif
 
-#if GCC_VERSION >= 40300
-# define __compiletime_error(message) __attribute__((error(message)))
-#endif /* GCC_VERSION >= 40300 */
+#define __compiletime_error(message) __attribute__((error(message)))
 
 /* &a[0] degrades to a pointer: a different type from an array */
 #define __must_be_array(a)	BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0]))
-- 
cgit v1.2.3-71-gd317


From 6d2ef226f2f18d530e48ead0cb5704505628b797 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 13 Sep 2021 10:20:01 -0700
Subject: compiler_attributes.h: drop __has_attribute() support for gcc4

Now that GCC 5.1 is the minimally supported default, the manual
workaround for older gcc versions not having __has_attribute() are no
longer relevant and can be removed.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compiler_attributes.h | 20 --------------------
 1 file changed, 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h
index 2487be0e7199..ba417a5c80af 100644
--- a/include/linux/compiler_attributes.h
+++ b/include/linux/compiler_attributes.h
@@ -20,26 +20,6 @@
  * Provide links to the documentation of each supported compiler, if it exists.
  */
 
-/*
- * __has_attribute is supported on gcc >= 5, clang >= 2.9 and icc >= 17.
- * In the meantime, to support gcc < 5, we implement __has_attribute
- * by hand.
- */
-#ifndef __has_attribute
-# define __has_attribute(x) __GCC4_has_attribute_##x
-# define __GCC4_has_attribute___assume_aligned__      1
-# define __GCC4_has_attribute___copy__                0
-# define __GCC4_has_attribute___designated_init__     0
-# define __GCC4_has_attribute___externally_visible__  1
-# define __GCC4_has_attribute___no_caller_saved_registers__ 0
-# define __GCC4_has_attribute___noclone__             1
-# define __GCC4_has_attribute___no_profile_instrument_function__ 0
-# define __GCC4_has_attribute___nonstring__           0
-# define __GCC4_has_attribute___no_sanitize_address__ 1
-# define __GCC4_has_attribute___no_sanitize_undefined__ 1
-# define __GCC4_has_attribute___fallthrough__         0
-#endif
-
 /*
  *   gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-alias-function-attribute
  */
-- 
cgit v1.2.3-71-gd317


From df26327ea097eb78e7967c45df6b23010c43c28d Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 13 Sep 2021 10:29:44 -0700
Subject: Drop some straggling mentions of gcc-4.9 as being stale

Fix up the admin-guide README file to the new gcc-5.1 requirement, and
remove a stale comment about gcc support for the __assume_aligned__
attribute.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/admin-guide/README.rst                    | 2 +-
 Documentation/translations/zh_CN/admin-guide/README.rst | 2 +-
 Documentation/translations/zh_TW/admin-guide/README.rst | 2 +-
 include/linux/compiler_attributes.h                     | 1 -
 4 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/README.rst b/Documentation/admin-guide/README.rst
index 35314b63008c..caa3c09a5c3f 100644
--- a/Documentation/admin-guide/README.rst
+++ b/Documentation/admin-guide/README.rst
@@ -259,7 +259,7 @@ Configuring the kernel
 Compiling the kernel
 --------------------
 
- - Make sure you have at least gcc 4.9 available.
+ - Make sure you have at least gcc 5.1 available.
    For more information, refer to :ref:`Documentation/process/changes.rst <changes>`.
 
    Please note that you can still run a.out user programs with this kernel.
diff --git a/Documentation/translations/zh_CN/admin-guide/README.rst b/Documentation/translations/zh_CN/admin-guide/README.rst
index 669a022f6817..980eb20521cf 100644
--- a/Documentation/translations/zh_CN/admin-guide/README.rst
+++ b/Documentation/translations/zh_CN/admin-guide/README.rst
@@ -223,7 +223,7 @@ Linux内核5.x版本 <http://kernel.org/>
 编译内核
 ---------
 
- - 确保您至少有gcc 4.9可用。
+ - 确保您至少有gcc 5.1可用。
    有关更多信息，请参阅 :ref:`Documentation/process/changes.rst <changes>` 。
 
    请注意，您仍然可以使用此内核运行a.out用户程序。
diff --git a/Documentation/translations/zh_TW/admin-guide/README.rst b/Documentation/translations/zh_TW/admin-guide/README.rst
index b752e50359e6..6ce97edbab37 100644
--- a/Documentation/translations/zh_TW/admin-guide/README.rst
+++ b/Documentation/translations/zh_TW/admin-guide/README.rst
@@ -226,7 +226,7 @@ Linux內核5.x版本 <http://kernel.org/>
 編譯內核
 ---------
 
- - 確保您至少有gcc 4.9可用。
+ - 確保您至少有gcc 5.1可用。
    有關更多信息，請參閱 :ref:`Documentation/process/changes.rst <changes>` 。
 
    請注意，您仍然可以使用此內核運行a.out用戶程序。
diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h
index ba417a5c80af..ee19cebabcf5 100644
--- a/include/linux/compiler_attributes.h
+++ b/include/linux/compiler_attributes.h
@@ -54,7 +54,6 @@
  * compiler should see some alignment anyway, when the return value is
  * massaged by 'flags = ptr & 3; ptr &= ~3;').
  *
- * Optional: only supported since gcc >= 4.9
  * Optional: not supported by icc
  *
  *   gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-assume_005faligned-function-attribute
-- 
cgit v1.2.3-71-gd317


From 80f0a1f99983296be587325004acf72dd11eccd8 Mon Sep 17 00:00:00 2001
From: Rolf Eike Beer <eb@emlix.com>
Date: Mon, 13 Sep 2021 12:02:56 +0200
Subject: workqueue: annotate alloc_workqueue() as printf

This also enables checking of allows alloc_ordered_workqueue().

Signed-off-by: Rolf Eike Beer <eb@emlix.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 2ebef6b1a3d6..74d3c1efd9bb 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -399,9 +399,8 @@ extern struct workqueue_struct *system_freezable_power_efficient_wq;
  * RETURNS:
  * Pointer to the allocated workqueue on success, %NULL on failure.
  */
-struct workqueue_struct *alloc_workqueue(const char *fmt,
-					 unsigned int flags,
-					 int max_active, ...);
+__printf(1, 4) struct workqueue_struct *
+alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...);
 
 /**
  * alloc_ordered_workqueue - allocate an ordered workqueue
-- 
cgit v1.2.3-71-gd317


From 8520e224f547cd070c7c8f97b1fc6d58cff7ccaa Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 14 Sep 2021 01:07:57 +0200
Subject: bpf, cgroups: Fix cgroup v2 fallback on v1/v2 mixed mode

Fix cgroup v1 interference when non-root cgroup v2 BPF programs are used.
Back in the days, commit bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup")
embedded per-socket cgroup information into sock->sk_cgrp_data and in order
to save 8 bytes in struct sock made both mutually exclusive, that is, when
cgroup v1 socket tagging (e.g. net_cls/net_prio) is used, then cgroup v2
falls back to the root cgroup in sock_cgroup_ptr() (&cgrp_dfl_root.cgrp).

The assumption made was "there is no reason to mix the two and this is in line
with how legacy and v2 compatibility is handled" as stated in bd1060a1d671.
However, with Kubernetes more widely supporting cgroups v2 as well nowadays,
this assumption no longer holds, and the possibility of the v1/v2 mixed mode
with the v2 root fallback being hit becomes a real security issue.

Many of the cgroup v2 BPF programs are also used for policy enforcement, just
to pick _one_ example, that is, to programmatically deny socket related system
calls like connect(2) or bind(2). A v2 root fallback would implicitly cause
a policy bypass for the affected Pods.

In production environments, we have recently seen this case due to various
circumstances: i) a different 3rd party agent and/or ii) a container runtime
such as [0] in the user's environment configuring legacy cgroup v1 net_cls
tags, which triggered implicitly mentioned root fallback. Another case is
Kubernetes projects like kind [1] which create Kubernetes nodes in a container
and also add cgroup namespaces to the mix, meaning programs which are attached
to the cgroup v2 root of the cgroup namespace get attached to a non-root
cgroup v2 path from init namespace point of view. And the latter's root is
out of reach for agents on a kind Kubernetes node to configure. Meaning, any
entity on the node setting cgroup v1 net_cls tag will trigger the bypass
despite cgroup v2 BPF programs attached to the namespace root.

Generally, this mutual exclusiveness does not hold anymore in today's user
environments and makes cgroup v2 usage from BPF side fragile and unreliable.
This fix adds proper struct cgroup pointer for the cgroup v2 case to struct
sock_cgroup_data in order to address these issues; this implicitly also fixes
the tradeoffs being made back then with regards to races and refcount leaks
as stated in bd1060a1d671, and removes the fallback, so that cgroup v2 BPF
programs always operate as expected.

  [0] https://github.com/nestybox/sysbox/
  [1] https://kind.sigs.k8s.io/

Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Stanislav Fomichev <sdf@google.com>
Acked-by: Tejun Heo <tj@kernel.org>
Link: https://lore.kernel.org/bpf/20210913230759.2313-1-daniel@iogearbox.net
---
 include/linux/cgroup-defs.h  | 107 +++++++++++--------------------------------
 include/linux/cgroup.h       |  22 +--------
 kernel/cgroup/cgroup.c       |  50 ++++----------------
 net/core/netclassid_cgroup.c |   7 +--
 net/core/netprio_cgroup.c    |  10 +---
 5 files changed, 41 insertions(+), 155 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index e1c705fdfa7c..db2e147e069f 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -752,107 +752,54 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {}
  * sock_cgroup_data is embedded at sock->sk_cgrp_data and contains
  * per-socket cgroup information except for memcg association.
  *
- * On legacy hierarchies, net_prio and net_cls controllers directly set
- * attributes on each sock which can then be tested by the network layer.
- * On the default hierarchy, each sock is associated with the cgroup it was
- * created in and the networking layer can match the cgroup directly.
- *
- * To avoid carrying all three cgroup related fields separately in sock,
- * sock_cgroup_data overloads (prioidx, classid) and the cgroup pointer.
- * On boot, sock_cgroup_data records the cgroup that the sock was created
- * in so that cgroup2 matches can be made; however, once either net_prio or
- * net_cls starts being used, the area is overridden to carry prioidx and/or
- * classid.  The two modes are distinguished by whether the lowest bit is
- * set.  Clear bit indicates cgroup pointer while set bit prioidx and
- * classid.
- *
- * While userland may start using net_prio or net_cls at any time, once
- * either is used, cgroup2 matching no longer works.  There is no reason to
- * mix the two and this is in line with how legacy and v2 compatibility is
- * handled.  On mode switch, cgroup references which are already being
- * pointed to by socks may be leaked.  While this can be remedied by adding
- * synchronization around sock_cgroup_data, given that the number of leaked
- * cgroups is bound and highly unlikely to be high, this seems to be the
- * better trade-off.
+ * On legacy hierarchies, net_prio and net_cls controllers directly
+ * set attributes on each sock which can then be tested by the network
+ * layer. On the default hierarchy, each sock is associated with the
+ * cgroup it was created in and the networking layer can match the
+ * cgroup directly.
  */
 struct sock_cgroup_data {
-	union {
-#ifdef __LITTLE_ENDIAN
-		struct {
-			u8	is_data : 1;
-			u8	no_refcnt : 1;
-			u8	unused : 6;
-			u8	padding;
-			u16	prioidx;
-			u32	classid;
-		} __packed;
-#else
-		struct {
-			u32	classid;
-			u16	prioidx;
-			u8	padding;
-			u8	unused : 6;
-			u8	no_refcnt : 1;
-			u8	is_data : 1;
-		} __packed;
+	struct cgroup	*cgroup; /* v2 */
+#ifdef CONFIG_CGROUP_NET_CLASSID
+	u32		classid; /* v1 */
+#endif
+#ifdef CONFIG_CGROUP_NET_PRIO
+	u16		prioidx; /* v1 */
 #endif
-		u64		val;
-	};
 };
 
-/*
- * There's a theoretical window where the following accessors race with
- * updaters and return part of the previous pointer as the prioidx or
- * classid.  Such races are short-lived and the result isn't critical.
- */
 static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd)
 {
-	/* fallback to 1 which is always the ID of the root cgroup */
-	return (skcd->is_data & 1) ? skcd->prioidx : 1;
+#ifdef CONFIG_CGROUP_NET_PRIO
+	return READ_ONCE(skcd->prioidx);
+#else
+	return 1;
+#endif
 }
 
 static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd)
 {
-	/* fallback to 0 which is the unconfigured default classid */
-	return (skcd->is_data & 1) ? skcd->classid : 0;
+#ifdef CONFIG_CGROUP_NET_CLASSID
+	return READ_ONCE(skcd->classid);
+#else
+	return 0;
+#endif
 }
 
-/*
- * If invoked concurrently, the updaters may clobber each other.  The
- * caller is responsible for synchronization.
- */
 static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd,
 					   u16 prioidx)
 {
-	struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }};
-
-	if (sock_cgroup_prioidx(&skcd_buf) == prioidx)
-		return;
-
-	if (!(skcd_buf.is_data & 1)) {
-		skcd_buf.val = 0;
-		skcd_buf.is_data = 1;
-	}
-
-	skcd_buf.prioidx = prioidx;
-	WRITE_ONCE(skcd->val, skcd_buf.val);	/* see sock_cgroup_ptr() */
+#ifdef CONFIG_CGROUP_NET_PRIO
+	WRITE_ONCE(skcd->prioidx, prioidx);
+#endif
 }
 
 static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd,
 					   u32 classid)
 {
-	struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }};
-
-	if (sock_cgroup_classid(&skcd_buf) == classid)
-		return;
-
-	if (!(skcd_buf.is_data & 1)) {
-		skcd_buf.val = 0;
-		skcd_buf.is_data = 1;
-	}
-
-	skcd_buf.classid = classid;
-	WRITE_ONCE(skcd->val, skcd_buf.val);	/* see sock_cgroup_ptr() */
+#ifdef CONFIG_CGROUP_NET_CLASSID
+	WRITE_ONCE(skcd->classid, classid);
+#endif
 }
 
 #else	/* CONFIG_SOCK_CGROUP_DATA */
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 7bf60454a313..75c151413fda 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -829,33 +829,13 @@ static inline void cgroup_account_cputime_field(struct task_struct *task,
  */
 #ifdef CONFIG_SOCK_CGROUP_DATA
 
-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
-extern spinlock_t cgroup_sk_update_lock;
-#endif
-
-void cgroup_sk_alloc_disable(void);
 void cgroup_sk_alloc(struct sock_cgroup_data *skcd);
 void cgroup_sk_clone(struct sock_cgroup_data *skcd);
 void cgroup_sk_free(struct sock_cgroup_data *skcd);
 
 static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd)
 {
-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
-	unsigned long v;
-
-	/*
-	 * @skcd->val is 64bit but the following is safe on 32bit too as we
-	 * just need the lower ulong to be written and read atomically.
-	 */
-	v = READ_ONCE(skcd->val);
-
-	if (v & 3)
-		return &cgrp_dfl_root.cgrp;
-
-	return (struct cgroup *)(unsigned long)v ?: &cgrp_dfl_root.cgrp;
-#else
-	return (struct cgroup *)(unsigned long)skcd->val;
-#endif
+	return skcd->cgroup;
 }
 
 #else	/* CONFIG_CGROUP_DATA */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 881ce1470beb..8afa8690d288 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6572,74 +6572,44 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
  */
 #ifdef CONFIG_SOCK_CGROUP_DATA
 
-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
-
-DEFINE_SPINLOCK(cgroup_sk_update_lock);
-static bool cgroup_sk_alloc_disabled __read_mostly;
-
-void cgroup_sk_alloc_disable(void)
-{
-	if (cgroup_sk_alloc_disabled)
-		return;
-	pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
-	cgroup_sk_alloc_disabled = true;
-}
-
-#else
-
-#define cgroup_sk_alloc_disabled	false
-
-#endif
-
 void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
 {
-	if (cgroup_sk_alloc_disabled) {
-		skcd->no_refcnt = 1;
-		return;
-	}
-
 	/* Don't associate the sock with unrelated interrupted task's cgroup. */
 	if (in_interrupt())
 		return;
 
 	rcu_read_lock();
-
 	while (true) {
 		struct css_set *cset;
 
 		cset = task_css_set(current);
 		if (likely(cgroup_tryget(cset->dfl_cgrp))) {
-			skcd->val = (unsigned long)cset->dfl_cgrp;
+			skcd->cgroup = cset->dfl_cgrp;
 			cgroup_bpf_get(cset->dfl_cgrp);
 			break;
 		}
 		cpu_relax();
 	}
-
 	rcu_read_unlock();
 }
 
 void cgroup_sk_clone(struct sock_cgroup_data *skcd)
 {
-	if (skcd->val) {
-		if (skcd->no_refcnt)
-			return;
-		/*
-		 * We might be cloning a socket which is left in an empty
-		 * cgroup and the cgroup might have already been rmdir'd.
-		 * Don't use cgroup_get_live().
-		 */
-		cgroup_get(sock_cgroup_ptr(skcd));
-		cgroup_bpf_get(sock_cgroup_ptr(skcd));
-	}
+	struct cgroup *cgrp = sock_cgroup_ptr(skcd);
+
+	/*
+	 * We might be cloning a socket which is left in an empty
+	 * cgroup and the cgroup might have already been rmdir'd.
+	 * Don't use cgroup_get_live().
+	 */
+	cgroup_get(cgrp);
+	cgroup_bpf_get(cgrp);
 }
 
 void cgroup_sk_free(struct sock_cgroup_data *skcd)
 {
 	struct cgroup *cgrp = sock_cgroup_ptr(skcd);
 
-	if (skcd->no_refcnt)
-		return;
 	cgroup_bpf_put(cgrp);
 	cgroup_put(cgrp);
 }
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index b49c57d35a88..1a6a86693b74 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -71,11 +71,8 @@ static int update_classid_sock(const void *v, struct file *file, unsigned n)
 	struct update_classid_context *ctx = (void *)v;
 	struct socket *sock = sock_from_file(file);
 
-	if (sock) {
-		spin_lock(&cgroup_sk_update_lock);
+	if (sock)
 		sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, ctx->classid);
-		spin_unlock(&cgroup_sk_update_lock);
-	}
 	if (--ctx->batch == 0) {
 		ctx->batch = UPDATE_CLASSID_BATCH;
 		return n + 1;
@@ -121,8 +118,6 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
 	struct css_task_iter it;
 	struct task_struct *p;
 
-	cgroup_sk_alloc_disable();
-
 	cs->classid = (u32)value;
 
 	css_task_iter_start(css, 0, &it);
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 99a431c56f23..8456dfbe2eb4 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -207,8 +207,6 @@ static ssize_t write_priomap(struct kernfs_open_file *of,
 	if (!dev)
 		return -ENODEV;
 
-	cgroup_sk_alloc_disable();
-
 	rtnl_lock();
 
 	ret = netprio_set_prio(of_css(of), dev, prio);
@@ -221,12 +219,10 @@ static ssize_t write_priomap(struct kernfs_open_file *of,
 static int update_netprio(const void *v, struct file *file, unsigned n)
 {
 	struct socket *sock = sock_from_file(file);
-	if (sock) {
-		spin_lock(&cgroup_sk_update_lock);
+
+	if (sock)
 		sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data,
 					(unsigned long)v);
-		spin_unlock(&cgroup_sk_update_lock);
-	}
 	return 0;
 }
 
@@ -235,8 +231,6 @@ static void net_prio_attach(struct cgroup_taskset *tset)
 	struct task_struct *p;
 	struct cgroup_subsys_state *css;
 
-	cgroup_sk_alloc_disable();
-
 	cgroup_taskset_for_each(p, css, tset) {
 		void *v = (void *)(unsigned long)css->id;
 
-- 
cgit v1.2.3-71-gd317


From 7a8aa39d44564703620d937bb54cdea2d003657f Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Mon, 13 Sep 2021 17:05:51 +0100
Subject: nvmem: core: Add stubs for nvmem_cell_read_variable_le_u32/64 if
 !CONFIG_NVMEM

When I added nvmem_cell_read_variable_le_u32() and
nvmem_cell_read_variable_le_u64() I forgot to add the "static inline"
stub functions for when CONFIG_NVMEM wasn't defined. Add them
now. This was causing problems with randconfig builds that compiled
`drivers/soc/qcom/cpr.c`.

Fixes: 6feba6a62c57 ("PM: AVS: qcom-cpr: Use nvmem_cell_read_variable_le_u32()")
Fixes: a28e824fb827 ("nvmem: core: Add functions to make number reading easy")
Reported-by: kernel test robot <lkp@intel.com>
Reviewed-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: Douglas Anderson <dianders@chromium.org>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20210913160551.12907-1-srinivas.kandagatla@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/nvmem-consumer.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nvmem-consumer.h b/include/linux/nvmem-consumer.h
index 923dada24eb4..c0c0cefc3b92 100644
--- a/include/linux/nvmem-consumer.h
+++ b/include/linux/nvmem-consumer.h
@@ -150,6 +150,20 @@ static inline int nvmem_cell_read_u64(struct device *dev,
 	return -EOPNOTSUPP;
 }
 
+static inline int nvmem_cell_read_variable_le_u32(struct device *dev,
+						 const char *cell_id,
+						 u32 *val)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int nvmem_cell_read_variable_le_u64(struct device *dev,
+						  const char *cell_id,
+						  u64 *val)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline struct nvmem_device *nvmem_device_get(struct device *dev,
 						    const char *name)
 {
-- 
cgit v1.2.3-71-gd317


From 81065b35e2486c024c7aa86caed452e1f01a59d4 Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Mon, 13 Sep 2021 14:52:39 -0700
Subject: x86/mce: Avoid infinite loop for copy from user recovery

There are two cases for machine check recovery:

1) The machine check was triggered by ring3 (application) code.
   This is the simpler case. The machine check handler simply queues
   work to be executed on return to user. That code unmaps the page
   from all users and arranges to send a SIGBUS to the task that
   triggered the poison.

2) The machine check was triggered in kernel code that is covered by
   an exception table entry. In this case the machine check handler
   still queues a work entry to unmap the page, etc. but this will
   not be called right away because the #MC handler returns to the
   fix up code address in the exception table entry.

Problems occur if the kernel triggers another machine check before the
return to user processes the first queued work item.

Specifically, the work is queued using the ->mce_kill_me callback
structure in the task struct for the current thread. Attempting to queue
a second work item using this same callback results in a loop in the
linked list of work functions to call. So when the kernel does return to
user, it enters an infinite loop processing the same entry for ever.

There are some legitimate scenarios where the kernel may take a second
machine check before returning to the user.

1) Some code (e.g. futex) first tries a get_user() with page faults
   disabled. If this fails, the code retries with page faults enabled
   expecting that this will resolve the page fault.

2) Copy from user code retries a copy in byte-at-time mode to check
   whether any additional bytes can be copied.

On the other side of the fence are some bad drivers that do not check
the return value from individual get_user() calls and may access
multiple user addresses without noticing that some/all calls have
failed.

Fix by adding a counter (current->mce_count) to keep track of repeated
machine checks before task_work() is called. First machine check saves
the address information and calls task_work_add(). Subsequent machine
checks before that task_work call back is executed check that the address
is in the same page as the first machine check (since the callback will
offline exactly one page).

Expected worst case is four machine checks before moving on (e.g. one
user access with page faults disabled, then a repeat to the same address
with page faults enabled ... repeat in copy tail bytes). Just in case
there is some code that loops forever enforce a limit of 10.

 [ bp: Massage commit message, drop noinstr, fix typo, extend panic
   messages. ]

Fixes: 5567d11c21a1 ("x86/mce: Send #MC singal from task work")
Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: <stable@vger.kernel.org>
Link: https://lkml.kernel.org/r/YT/IJ9ziLqmtqEPu@agluck-desk2.amr.corp.intel.com
---
 arch/x86/kernel/cpu/mce/core.c | 43 +++++++++++++++++++++++++++++++-----------
 include/linux/sched.h          |  1 +
 2 files changed, 33 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 8cb7816d03b4..193204aee880 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1253,6 +1253,9 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin
 
 static void kill_me_now(struct callback_head *ch)
 {
+	struct task_struct *p = container_of(ch, struct task_struct, mce_kill_me);
+
+	p->mce_count = 0;
 	force_sig(SIGBUS);
 }
 
@@ -1262,6 +1265,7 @@ static void kill_me_maybe(struct callback_head *cb)
 	int flags = MF_ACTION_REQUIRED;
 	int ret;
 
+	p->mce_count = 0;
 	pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
 
 	if (!p->mce_ripv)
@@ -1290,17 +1294,34 @@ static void kill_me_maybe(struct callback_head *cb)
 	}
 }
 
-static void queue_task_work(struct mce *m, int kill_current_task)
+static void queue_task_work(struct mce *m, char *msg, int kill_current_task)
 {
-	current->mce_addr = m->addr;
-	current->mce_kflags = m->kflags;
-	current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
-	current->mce_whole_page = whole_page(m);
+	int count = ++current->mce_count;
 
-	if (kill_current_task)
-		current->mce_kill_me.func = kill_me_now;
-	else
-		current->mce_kill_me.func = kill_me_maybe;
+	/* First call, save all the details */
+	if (count == 1) {
+		current->mce_addr = m->addr;
+		current->mce_kflags = m->kflags;
+		current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
+		current->mce_whole_page = whole_page(m);
+
+		if (kill_current_task)
+			current->mce_kill_me.func = kill_me_now;
+		else
+			current->mce_kill_me.func = kill_me_maybe;
+	}
+
+	/* Ten is likely overkill. Don't expect more than two faults before task_work() */
+	if (count > 10)
+		mce_panic("Too many consecutive machine checks while accessing user data", m, msg);
+
+	/* Second or later call, make sure page address matches the one from first call */
+	if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT))
+		mce_panic("Consecutive machine checks to different user pages", m, msg);
+
+	/* Do not call task_work_add() more than once */
+	if (count > 1)
+		return;
 
 	task_work_add(current, &current->mce_kill_me, TWA_RESUME);
 }
@@ -1438,7 +1459,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
 		/* If this triggers there is no way to recover. Die hard. */
 		BUG_ON(!on_thread_stack() || !user_mode(regs));
 
-		queue_task_work(&m, kill_current_task);
+		queue_task_work(&m, msg, kill_current_task);
 
 	} else {
 		/*
@@ -1456,7 +1477,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
 		}
 
 		if (m.kflags & MCE_IN_KERNEL_COPYIN)
-			queue_task_work(&m, kill_current_task);
+			queue_task_work(&m, msg, kill_current_task);
 	}
 out:
 	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1780260f237b..361c7bc72cbb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1468,6 +1468,7 @@ struct task_struct {
 					mce_whole_page : 1,
 					__mce_reserved : 62;
 	struct callback_head		mce_kill_me;
+	int				mce_count;
 #endif
 
 #ifdef CONFIG_KRETPROBES
-- 
cgit v1.2.3-71-gd317


From 58877b0824da15698bd85a0a9dbfa8c354e6ecb7 Mon Sep 17 00:00:00 2001
From: Kishon Vijay Abraham I <kishon@ti.com>
Date: Thu, 9 Sep 2021 12:11:58 +0530
Subject: usb: core: hcd: Add support for deferring roothub registration

It has been observed with certain PCIe USB cards (like Inateck connected
to AM64 EVM or J7200 EVM) that as soon as the primary roothub is
registered, port status change is handled even before xHC is running
leading to cold plug USB devices not detected. For such cases, registering
both the root hubs along with the second HCD is required. Add support for
deferring roothub registration in usb_add_hcd(), so that both primary and
secondary roothubs are registered along with the second HCD.

CC: stable@vger.kernel.org # 5.4+
Suggested-by: Mathias Nyman <mathias.nyman@linux.intel.com>
Tested-by: Chris Chiu <chris.chiu@canonical.com>
Acked-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com>
Link: https://lore.kernel.org/r/20210909064200.16216-2-kishon@ti.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/hcd.c  | 29 +++++++++++++++++++++++------
 include/linux/usb/hcd.h |  2 ++
 2 files changed, 25 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index 0f8b7c93310e..99ff2d23be05 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -2775,6 +2775,7 @@ int usb_add_hcd(struct usb_hcd *hcd,
 {
 	int retval;
 	struct usb_device *rhdev;
+	struct usb_hcd *shared_hcd;
 
 	if (!hcd->skip_phy_initialization && usb_hcd_is_primary_hcd(hcd)) {
 		hcd->phy_roothub = usb_phy_roothub_alloc(hcd->self.sysdev);
@@ -2935,13 +2936,26 @@ int usb_add_hcd(struct usb_hcd *hcd,
 		goto err_hcd_driver_start;
 	}
 
+	/* starting here, usbcore will pay attention to the shared HCD roothub */
+	shared_hcd = hcd->shared_hcd;
+	if (!usb_hcd_is_primary_hcd(hcd) && shared_hcd && HCD_DEFER_RH_REGISTER(shared_hcd)) {
+		retval = register_root_hub(shared_hcd);
+		if (retval != 0)
+			goto err_register_root_hub;
+
+		if (shared_hcd->uses_new_polling && HCD_POLL_RH(shared_hcd))
+			usb_hcd_poll_rh_status(shared_hcd);
+	}
+
 	/* starting here, usbcore will pay attention to this root hub */
-	retval = register_root_hub(hcd);
-	if (retval != 0)
-		goto err_register_root_hub;
+	if (!HCD_DEFER_RH_REGISTER(hcd)) {
+		retval = register_root_hub(hcd);
+		if (retval != 0)
+			goto err_register_root_hub;
 
-	if (hcd->uses_new_polling && HCD_POLL_RH(hcd))
-		usb_hcd_poll_rh_status(hcd);
+		if (hcd->uses_new_polling && HCD_POLL_RH(hcd))
+			usb_hcd_poll_rh_status(hcd);
+	}
 
 	return retval;
 
@@ -2985,6 +2999,7 @@ EXPORT_SYMBOL_GPL(usb_add_hcd);
 void usb_remove_hcd(struct usb_hcd *hcd)
 {
 	struct usb_device *rhdev = hcd->self.root_hub;
+	bool rh_registered;
 
 	dev_info(hcd->self.controller, "remove, state %x\n", hcd->state);
 
@@ -2995,6 +3010,7 @@ void usb_remove_hcd(struct usb_hcd *hcd)
 
 	dev_dbg(hcd->self.controller, "roothub graceful disconnect\n");
 	spin_lock_irq (&hcd_root_hub_lock);
+	rh_registered = hcd->rh_registered;
 	hcd->rh_registered = 0;
 	spin_unlock_irq (&hcd_root_hub_lock);
 
@@ -3004,7 +3020,8 @@ void usb_remove_hcd(struct usb_hcd *hcd)
 	cancel_work_sync(&hcd->died_work);
 
 	mutex_lock(&usb_bus_idr_lock);
-	usb_disconnect(&rhdev);		/* Sets rhdev to NULL */
+	if (rh_registered)
+		usb_disconnect(&rhdev);		/* Sets rhdev to NULL */
 	mutex_unlock(&usb_bus_idr_lock);
 
 	/*
diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h
index 548a028f2dab..2c1fc9212cf2 100644
--- a/include/linux/usb/hcd.h
+++ b/include/linux/usb/hcd.h
@@ -124,6 +124,7 @@ struct usb_hcd {
 #define HCD_FLAG_RH_RUNNING		5	/* root hub is running? */
 #define HCD_FLAG_DEAD			6	/* controller has died? */
 #define HCD_FLAG_INTF_AUTHORIZED	7	/* authorize interfaces? */
+#define HCD_FLAG_DEFER_RH_REGISTER	8	/* Defer roothub registration */
 
 	/* The flags can be tested using these macros; they are likely to
 	 * be slightly faster than test_bit().
@@ -134,6 +135,7 @@ struct usb_hcd {
 #define HCD_WAKEUP_PENDING(hcd)	((hcd)->flags & (1U << HCD_FLAG_WAKEUP_PENDING))
 #define HCD_RH_RUNNING(hcd)	((hcd)->flags & (1U << HCD_FLAG_RH_RUNNING))
 #define HCD_DEAD(hcd)		((hcd)->flags & (1U << HCD_FLAG_DEAD))
+#define HCD_DEFER_RH_REGISTER(hcd) ((hcd)->flags & (1U << HCD_FLAG_DEFER_RH_REGISTER))
 
 	/*
 	 * Specifies if interfaces are authorized by default
-- 
cgit v1.2.3-71-gd317


From 8fb0f47a9d7acf620d0fd97831b69da9bc5e22ed Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 10 Sep 2021 11:18:36 -0600
Subject: iov_iter: add helper to save iov_iter state

In an ideal world, when someone is passed an iov_iter and returns X bytes,
then X bytes would have been consumed/advanced from the iov_iter. But we
have use cases that always consume the entire iterator, a few examples
of that are iomap and bdev O_DIRECT. This means we cannot rely on the
state of the iov_iter once we've called ->read_iter() or ->write_iter().

This would be easier if we didn't always have to deal with truncate of
the iov_iter, as rewinding would be trivial without that. We recently
added a commit to track the truncate state, but that grew the iov_iter
by 8 bytes and wasn't the best solution.

Implement a helper to save enough of the iov_iter state to sanely restore
it after we've called the read/write iterator helpers. This currently
only works for IOVEC/BVEC/KVEC as that's all we need, support for other
iterator types are left as an exercise for the reader.

Link: https://lore.kernel.org/linux-fsdevel/CAHk-=wiacKV4Gh-MYjteU0LwNBSGpWrK-Ov25HdqB1ewinrFPg@mail.gmail.com/
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/uio.h | 15 +++++++++++++++
 lib/iov_iter.c      | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 5265024e8b90..984c4ab74859 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -27,6 +27,12 @@ enum iter_type {
 	ITER_DISCARD,
 };
 
+struct iov_iter_state {
+	size_t iov_offset;
+	size_t count;
+	unsigned long nr_segs;
+};
+
 struct iov_iter {
 	u8 iter_type;
 	bool data_source;
@@ -55,6 +61,14 @@ static inline enum iter_type iov_iter_type(const struct iov_iter *i)
 	return i->iter_type;
 }
 
+static inline void iov_iter_save_state(struct iov_iter *iter,
+				       struct iov_iter_state *state)
+{
+	state->iov_offset = iter->iov_offset;
+	state->count = iter->count;
+	state->nr_segs = iter->nr_segs;
+}
+
 static inline bool iter_is_iovec(const struct iov_iter *i)
 {
 	return iov_iter_type(i) == ITER_IOVEC;
@@ -233,6 +247,7 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages,
 ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages,
 			size_t maxsize, size_t *start);
 int iov_iter_npages(const struct iov_iter *i, int maxpages);
+void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state);
 
 const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags);
 
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index f2d50d69a6c3..755c10c5138c 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1972,3 +1972,39 @@ int import_single_range(int rw, void __user *buf, size_t len,
 	return 0;
 }
 EXPORT_SYMBOL(import_single_range);
+
+/**
+ * iov_iter_restore() - Restore a &struct iov_iter to the same state as when
+ *     iov_iter_save_state() was called.
+ *
+ * @i: &struct iov_iter to restore
+ * @state: state to restore from
+ *
+ * Used after iov_iter_save_state() to bring restore @i, if operations may
+ * have advanced it.
+ *
+ * Note: only works on ITER_IOVEC, ITER_BVEC, and ITER_KVEC
+ */
+void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
+{
+	if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i)) &&
+			 !iov_iter_is_kvec(i))
+		return;
+	i->iov_offset = state->iov_offset;
+	i->count = state->count;
+	/*
+	 * For the *vec iters, nr_segs + iov is constant - if we increment
+	 * the vec, then we also decrement the nr_segs count. Hence we don't
+	 * need to track both of these, just one is enough and we can deduct
+	 * the other from that. ITER_KVEC and ITER_IOVEC are the same struct
+	 * size, so we can just increment the iov pointer as they are unionzed.
+	 * ITER_BVEC _may_ be the same size on some archs, but on others it is
+	 * not. Be safe and handle it separately.
+	 */
+	BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
+	if (iov_iter_is_bvec(i))
+		i->bvec -= state->nr_segs - i->nr_segs;
+	else
+		i->iov -= state->nr_segs - i->nr_segs;
+	i->nr_segs = state->nr_segs;
+}
-- 
cgit v1.2.3-71-gd317


From 356ed64991c6847a0c4f2e8fa3b1133f7a14f1fc Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Tue, 14 Sep 2021 10:33:51 +0800
Subject: bpf: Handle return value of BPF_PROG_TYPE_STRUCT_OPS prog

Currently if a function ptr in struct_ops has a return value, its
caller will get a random return value from it, because the return
value of related BPF_PROG_TYPE_STRUCT_OPS prog is just dropped.

So adding a new flag BPF_TRAMP_F_RET_FENTRY_RET to tell bpf trampoline
to save and return the return value of struct_ops prog if ret_size of
the function ptr is greater than 0. Also restricting the flag to be
used alone.

Fixes: 85d33df357b6 ("bpf: Introduce BPF_MAP_TYPE_STRUCT_OPS")
Signed-off-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20210914023351.3664499-1-houtao1@huawei.com
---
 arch/x86/net/bpf_jit_comp.c | 53 ++++++++++++++++++++++++++++++++++-----------
 include/linux/bpf.h         |  3 ++-
 kernel/bpf/bpf_struct_ops.c |  7 ++++--
 3 files changed, 47 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 0fe6aacef3db..d24a512fd6f3 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1744,7 +1744,7 @@ static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args,
 }
 
 static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
-			   struct bpf_prog *p, int stack_size, bool mod_ret)
+			   struct bpf_prog *p, int stack_size, bool save_ret)
 {
 	u8 *prog = *pprog;
 	u8 *jmp_insn;
@@ -1777,11 +1777,15 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
 	if (emit_call(&prog, p->bpf_func, prog))
 		return -EINVAL;
 
-	/* BPF_TRAMP_MODIFY_RETURN trampolines can modify the return
+	/*
+	 * BPF_TRAMP_MODIFY_RETURN trampolines can modify the return
 	 * of the previous call which is then passed on the stack to
 	 * the next BPF program.
+	 *
+	 * BPF_TRAMP_FENTRY trampoline may need to return the return
+	 * value of BPF_PROG_TYPE_STRUCT_OPS prog.
 	 */
-	if (mod_ret)
+	if (save_ret)
 		emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
 
 	/* replace 2 nops with JE insn, since jmp target is known */
@@ -1828,13 +1832,15 @@ static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond)
 }
 
 static int invoke_bpf(const struct btf_func_model *m, u8 **pprog,
-		      struct bpf_tramp_progs *tp, int stack_size)
+		      struct bpf_tramp_progs *tp, int stack_size,
+		      bool save_ret)
 {
 	int i;
 	u8 *prog = *pprog;
 
 	for (i = 0; i < tp->nr_progs; i++) {
-		if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size, false))
+		if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size,
+				    save_ret))
 			return -EINVAL;
 	}
 	*pprog = prog;
@@ -1877,6 +1883,23 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
 	return 0;
 }
 
+static bool is_valid_bpf_tramp_flags(unsigned int flags)
+{
+	if ((flags & BPF_TRAMP_F_RESTORE_REGS) &&
+	    (flags & BPF_TRAMP_F_SKIP_FRAME))
+		return false;
+
+	/*
+	 * BPF_TRAMP_F_RET_FENTRY_RET is only used by bpf_struct_ops,
+	 * and it must be used alone.
+	 */
+	if ((flags & BPF_TRAMP_F_RET_FENTRY_RET) &&
+	    (flags & ~BPF_TRAMP_F_RET_FENTRY_RET))
+		return false;
+
+	return true;
+}
+
 /* Example:
  * __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev);
  * its 'struct btf_func_model' will be nr_args=2
@@ -1949,17 +1972,19 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 	struct bpf_tramp_progs *fmod_ret = &tprogs[BPF_TRAMP_MODIFY_RETURN];
 	u8 **branches = NULL;
 	u8 *prog;
+	bool save_ret;
 
 	/* x86-64 supports up to 6 arguments. 7+ can be added in the future */
 	if (nr_args > 6)
 		return -ENOTSUPP;
 
-	if ((flags & BPF_TRAMP_F_RESTORE_REGS) &&
-	    (flags & BPF_TRAMP_F_SKIP_FRAME))
+	if (!is_valid_bpf_tramp_flags(flags))
 		return -EINVAL;
 
-	if (flags & BPF_TRAMP_F_CALL_ORIG)
-		stack_size += 8; /* room for return value of orig_call */
+	/* room for return value of orig_call or fentry prog */
+	save_ret = flags & (BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_RET_FENTRY_RET);
+	if (save_ret)
+		stack_size += 8;
 
 	if (flags & BPF_TRAMP_F_IP_ARG)
 		stack_size += 8; /* room for IP address argument */
@@ -2005,7 +2030,8 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 	}
 
 	if (fentry->nr_progs)
-		if (invoke_bpf(m, &prog, fentry, stack_size))
+		if (invoke_bpf(m, &prog, fentry, stack_size,
+			       flags & BPF_TRAMP_F_RET_FENTRY_RET))
 			return -EINVAL;
 
 	if (fmod_ret->nr_progs) {
@@ -2052,7 +2078,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 	}
 
 	if (fexit->nr_progs)
-		if (invoke_bpf(m, &prog, fexit, stack_size)) {
+		if (invoke_bpf(m, &prog, fexit, stack_size, false)) {
 			ret = -EINVAL;
 			goto cleanup;
 		}
@@ -2072,9 +2098,10 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 			ret = -EINVAL;
 			goto cleanup;
 		}
-		/* restore original return value back into RAX */
-		emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8);
 	}
+	/* restore return value of orig_call or fentry prog back into RAX */
+	if (save_ret)
+		emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8);
 
 	EMIT1(0x5B); /* pop rbx */
 	EMIT1(0xC9); /* leave */
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f4c16f19f83e..020a7d5bf470 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -578,11 +578,12 @@ struct btf_func_model {
  * programs only. Should not be used with normal calls and indirect calls.
  */
 #define BPF_TRAMP_F_SKIP_FRAME		BIT(2)
-
 /* Store IP address of the caller on the trampoline stack,
  * so it's available for trampoline's programs.
  */
 #define BPF_TRAMP_F_IP_ARG		BIT(3)
+/* Return the return value of fentry prog. Only used by bpf_struct_ops. */
+#define BPF_TRAMP_F_RET_FENTRY_RET	BIT(4)
 
 /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50
  * bytes on x86.  Pick a number to fit into BPF_IMAGE_SIZE / 2
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index d6731c32864e..9abcc33f02cf 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -368,6 +368,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 		const struct btf_type *mtype, *ptype;
 		struct bpf_prog *prog;
 		u32 moff;
+		u32 flags;
 
 		moff = btf_member_bit_offset(t, member) / 8;
 		ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL);
@@ -431,10 +432,12 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 
 		tprogs[BPF_TRAMP_FENTRY].progs[0] = prog;
 		tprogs[BPF_TRAMP_FENTRY].nr_progs = 1;
+		flags = st_ops->func_models[i].ret_size > 0 ?
+			BPF_TRAMP_F_RET_FENTRY_RET : 0;
 		err = arch_prepare_bpf_trampoline(NULL, image,
 						  st_map->image + PAGE_SIZE,
-						  &st_ops->func_models[i], 0,
-						  tprogs, NULL);
+						  &st_ops->func_models[i],
+						  flags, tprogs, NULL);
 		if (err < 0)
 			goto reset_unlock;
 
-- 
cgit v1.2.3-71-gd317


From 77e02cf57b6cff9919949defb7fd9b8ac16399a2 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 14 Sep 2021 13:23:22 -0700
Subject: memblock: introduce saner 'memblock_free_ptr()' interface

The boot-time allocation interface for memblock is a mess, with
'memblock_alloc()' returning a virtual pointer, but then you are
supposed to free it with 'memblock_free()' that takes a _physical_
address.

Not only is that all kinds of strange and illogical, but it actually
causes bugs, when people then use it like a normal allocation function,
and it fails spectacularly on a NULL pointer:

   https://lore.kernel.org/all/20210912140820.GD25450@xsang-OptiPlex-9020/

or just random memory corruption if the debug checks don't catch it:

   https://lore.kernel.org/all/61ab2d0c-3313-aaab-514c-e15b7aa054a0@suse.cz/

I really don't want to apply patches that treat the symptoms, when the
fundamental cause is this horribly confusing interface.

I started out looking at just automating a sane replacement sequence,
but because of this mix or virtual and physical addresses, and because
people have used the "__pa()" macro that can take either a regular
kernel pointer, or just the raw "unsigned long" address, it's all quite
messy.

So this just introduces a new saner interface for freeing a virtual
address that was allocated using 'memblock_alloc()', and that was kept
as a regular kernel pointer.  And then it converts a couple of users
that are obvious and easy to test, including the 'xbc_nodes' case in
lib/bootconfig.c that caused problems.

Reported-by: kernel test robot <oliver.sang@intel.com>
Fixes: 40caa127f3c7 ("init: bootconfig: Remove all bootconfig data when the init memory is removed")
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/setup_percpu.c |  2 +-
 arch/x86/mm/kasan_init_64.c    |  6 ++----
 arch/x86/mm/numa.c             |  2 +-
 arch/x86/mm/numa_emulation.c   |  3 +--
 drivers/base/arch_numa.c       |  2 +-
 drivers/macintosh/smu.c        |  2 +-
 include/linux/memblock.h       |  1 +
 init/main.c                    |  2 +-
 kernel/printk/printk.c         |  4 ++--
 lib/bootconfig.c               |  2 +-
 mm/memblock.c                  | 16 +++++++++++++++-
 11 files changed, 27 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 78a32b956e81..5afd98559193 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -135,7 +135,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
 
 static void __init pcpu_fc_free(void *ptr, size_t size)
 {
-	memblock_free(__pa(ptr), size);
+	memblock_free_ptr(ptr, size);
 }
 
 static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 1a50434c8a4d..ef885370719a 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -49,8 +49,7 @@ static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr,
 			p = early_alloc(PMD_SIZE, nid, false);
 			if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL))
 				return;
-			else if (p)
-				memblock_free(__pa(p), PMD_SIZE);
+			memblock_free_ptr(p, PMD_SIZE);
 		}
 
 		p = early_alloc(PAGE_SIZE, nid, true);
@@ -86,8 +85,7 @@ static void __init kasan_populate_pud(pud_t *pud, unsigned long addr,
 			p = early_alloc(PUD_SIZE, nid, false);
 			if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL))
 				return;
-			else if (p)
-				memblock_free(__pa(p), PUD_SIZE);
+			memblock_free_ptr(p, PUD_SIZE);
 		}
 
 		p = early_alloc(PAGE_SIZE, nid, true);
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index a1b5c71099e6..1e9b93b088db 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -355,7 +355,7 @@ void __init numa_reset_distance(void)
 
 	/* numa_distance could be 1LU marking allocation failure, test cnt */
 	if (numa_distance_cnt)
-		memblock_free(__pa(numa_distance), size);
+		memblock_free_ptr(numa_distance, size);
 	numa_distance_cnt = 0;
 	numa_distance = NULL;	/* enable table creation */
 }
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index 737491b13728..e801e30089c4 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -517,8 +517,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
 	}
 
 	/* free the copied physical distance table */
-	if (phys_dist)
-		memblock_free(__pa(phys_dist), phys_size);
+	memblock_free_ptr(phys_dist, phys_size);
 	return;
 
 no_emu:
diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c
index 46c503486e96..00fb4120a5b3 100644
--- a/drivers/base/arch_numa.c
+++ b/drivers/base/arch_numa.c
@@ -264,7 +264,7 @@ void __init numa_free_distance(void)
 	size = numa_distance_cnt * numa_distance_cnt *
 		sizeof(numa_distance[0]);
 
-	memblock_free(__pa(numa_distance), size);
+	memblock_free_ptr(numa_distance, size);
 	numa_distance_cnt = 0;
 	numa_distance = NULL;
 }
diff --git a/drivers/macintosh/smu.c b/drivers/macintosh/smu.c
index 94fb63a7b357..fe63d5ee201b 100644
--- a/drivers/macintosh/smu.c
+++ b/drivers/macintosh/smu.c
@@ -570,7 +570,7 @@ fail_msg_node:
 fail_db_node:
 	of_node_put(smu->db_node);
 fail_bootmem:
-	memblock_free(__pa(smu), sizeof(struct smu_device));
+	memblock_free_ptr(smu, sizeof(struct smu_device));
 	smu = NULL;
 fail_np:
 	of_node_put(np);
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index b066024c62e3..34de69b3b8ba 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -118,6 +118,7 @@ int memblock_mark_nomap(phys_addr_t base, phys_addr_t size);
 int memblock_clear_nomap(phys_addr_t base, phys_addr_t size);
 
 void memblock_free_all(void);
+void memblock_free_ptr(void *ptr, size_t size);
 void reset_node_managed_pages(pg_data_t *pgdat);
 void reset_all_zones_managed_pages(void);
 
diff --git a/init/main.c b/init/main.c
index 5c9a48df90e1..3f7216934441 100644
--- a/init/main.c
+++ b/init/main.c
@@ -924,7 +924,7 @@ static void __init print_unknown_bootoptions(void)
 		end += sprintf(end, " %s", *p);
 
 	pr_notice("Unknown command line parameters:%s\n", unknown_options);
-	memblock_free(__pa(unknown_options), len);
+	memblock_free_ptr(unknown_options, len);
 }
 
 asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 825277e1e742..a8d0a58deebc 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1166,9 +1166,9 @@ void __init setup_log_buf(int early)
 	return;
 
 err_free_descs:
-	memblock_free(__pa(new_descs), new_descs_size);
+	memblock_free_ptr(new_descs, new_descs_size);
 err_free_log_buf:
-	memblock_free(__pa(new_log_buf), new_log_buf_len);
+	memblock_free_ptr(new_log_buf, new_log_buf_len);
 }
 
 static bool __read_mostly ignore_loglevel;
diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index f8419cff1147..5ae248b29373 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -792,7 +792,7 @@ void __init xbc_destroy_all(void)
 	xbc_data = NULL;
 	xbc_data_size = 0;
 	xbc_node_num = 0;
-	memblock_free(__pa(xbc_nodes), sizeof(struct xbc_node) * XBC_NODE_MAX);
+	memblock_free_ptr(xbc_nodes, sizeof(struct xbc_node) * XBC_NODE_MAX);
 	xbc_nodes = NULL;
 	brace_index = 0;
 }
diff --git a/mm/memblock.c b/mm/memblock.c
index 0ab5a749bfa6..184dcd2e5d99 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -472,7 +472,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
 		kfree(old_array);
 	else if (old_array != memblock_memory_init_regions &&
 		 old_array != memblock_reserved_init_regions)
-		memblock_free(__pa(old_array), old_alloc_size);
+		memblock_free_ptr(old_array, old_alloc_size);
 
 	/*
 	 * Reserve the new array if that comes from the memblock.  Otherwise, we
@@ -795,6 +795,20 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
 	return memblock_remove_range(&memblock.memory, base, size);
 }
 
+/**
+ * memblock_free_ptr - free boot memory allocation
+ * @ptr: starting address of the  boot memory allocation
+ * @size: size of the boot memory block in bytes
+ *
+ * Free boot memory block previously allocated by memblock_alloc_xx() API.
+ * The freeing memory will not be released to the buddy allocator.
+ */
+void __init_memblock memblock_free_ptr(void *ptr, size_t size)
+{
+	if (ptr)
+		memblock_free(__pa(ptr), size);
+}
+
 /**
  * memblock_free - free boot memory block
  * @base: phys starting address of the  boot memory block
-- 
cgit v1.2.3-71-gd317


From 7dedd3e18077f996a10c47250ac85d080e5f474e Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 10 Sep 2021 11:19:58 -0600
Subject: Revert "iov_iter: track truncated size"

This reverts commit 2112ff5ce0c1128fe7b4d19cfe7f2b8ce5b595fa.

We no longer need to track the truncation count, the one user that did
need it has been converted to using iov_iter_restore() instead.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/uio.h | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 984c4ab74859..207101a9c5c3 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -53,7 +53,6 @@ struct iov_iter {
 		};
 		loff_t xarray_start;
 	};
-	size_t truncated;
 };
 
 static inline enum iter_type iov_iter_type(const struct iov_iter *i)
@@ -270,10 +269,8 @@ static inline void iov_iter_truncate(struct iov_iter *i, u64 count)
 	 * conversion in assignement is by definition greater than all
 	 * values of size_t, including old i->count.
 	 */
-	if (i->count > count) {
-		i->truncated += i->count - count;
+	if (i->count > count)
 		i->count = count;
-	}
 }
 
 /*
@@ -282,7 +279,6 @@ static inline void iov_iter_truncate(struct iov_iter *i, u64 count)
  */
 static inline void iov_iter_reexpand(struct iov_iter *i, size_t count)
 {
-	i->truncated -= count - i->count;
 	i->count = count;
 }
 
-- 
cgit v1.2.3-71-gd317


From f6b5f1a56987de837f8e25cd560847106b8632a8 Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Tue, 14 Sep 2021 20:52:24 -0700
Subject: compiler.h: Introduce absolute_pointer macro

absolute_pointer() disassociates a pointer from its originating symbol
type and context. Use it to prevent compiler warnings/errors such as

  drivers/net/ethernet/i825xx/82596.c: In function 'i82596_probe':
  arch/m68k/include/asm/string.h:72:25: error:
	'__builtin_memcpy' reading 6 bytes from a region of size 0 [-Werror=stringop-overread]

Such warnings may be reported by gcc 11.x for string and memory
operations on fixed addresses.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compiler.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index b67261a1e3e9..3d5af56337bd 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -188,6 +188,8 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
     (typeof(ptr)) (__ptr + (off)); })
 #endif
 
+#define absolute_pointer(val)	RELOC_HIDE((void *)(val), 0)
+
 #ifndef OPTIMIZER_HIDE_VAR
 /* Make the optimizer believe the variable can be manipulated arbitrarily. */
 #define OPTIMIZER_HIDE_VAR(var)						\
-- 
cgit v1.2.3-71-gd317


From 3c9cfb5269f76d447dbadb67835368f3111a91d7 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 17 Sep 2021 14:17:35 +0300
Subject: net: update NXP copyright text

NXP Legal insists that the following are not fine:

- Saying "NXP Semiconductors" instead of "NXP", since the company's
  registered name is "NXP"

- Putting a "(c)" sign in the copyright string

- Putting a comma in the copyright string

The only accepted copyright string format is "Copyright <year-range> NXP".

This patch changes the copyright headers in the networking files that
were sent by me, or derived from code sent by me.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/ocelot/felix.c                                 | 2 +-
 drivers/net/dsa/ocelot/felix.h                                 | 2 +-
 drivers/net/dsa/ocelot/felix_vsc9959.c                         | 2 +-
 drivers/net/dsa/sja1105/sja1105_clocking.c                     | 2 +-
 drivers/net/dsa/sja1105/sja1105_devlink.c                      | 2 +-
 drivers/net/dsa/sja1105/sja1105_flower.c                       | 2 +-
 drivers/net/dsa/sja1105/sja1105_mdio.c                         | 2 +-
 drivers/net/dsa/sja1105/sja1105_spi.c                          | 2 +-
 drivers/net/dsa/sja1105/sja1105_static_config.c                | 2 +-
 drivers/net/dsa/sja1105/sja1105_static_config.h                | 2 +-
 drivers/net/dsa/sja1105/sja1105_vl.c                           | 2 +-
 drivers/net/dsa/sja1105/sja1105_vl.h                           | 2 +-
 drivers/net/ethernet/freescale/enetc/enetc_ierb.c              | 2 +-
 drivers/net/ethernet/freescale/enetc/enetc_ierb.h              | 2 +-
 drivers/net/ethernet/mscc/ocelot_devlink.c                     | 2 +-
 drivers/net/ethernet/mscc/ocelot_mrp.c                         | 2 +-
 drivers/net/ethernet/mscc/ocelot_net.c                         | 2 +-
 drivers/net/pcs/pcs-xpcs-nxp.c                                 | 2 +-
 include/linux/dsa/ocelot.h                                     | 2 +-
 include/linux/packing.h                                        | 2 +-
 lib/packing.c                                                  | 2 +-
 net/dsa/tag_ocelot.c                                           | 2 +-
 net/dsa/tag_ocelot_8021q.c                                     | 2 +-
 tools/testing/selftests/drivers/net/ocelot/tc_flower_chains.sh | 2 +-
 24 files changed, 24 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c
index 3656e67af789..a3a9636430d6 100644
--- a/drivers/net/dsa/ocelot/felix.c
+++ b/drivers/net/dsa/ocelot/felix.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright 2019-2021 NXP Semiconductors
+/* Copyright 2019-2021 NXP
  *
  * This is an umbrella module for all network switches that are
  * register-compatible with Ocelot and that perform I/O to their host CPU
diff --git a/drivers/net/dsa/ocelot/felix.h b/drivers/net/dsa/ocelot/felix.h
index 5854bab43327..54024b6f9498 100644
--- a/drivers/net/dsa/ocelot/felix.h
+++ b/drivers/net/dsa/ocelot/felix.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright 2019 NXP Semiconductors
+/* Copyright 2019 NXP
  */
 #ifndef _MSCC_FELIX_H
 #define _MSCC_FELIX_H
diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c
index f966a253d1c7..9e2ac8e46619 100644
--- a/drivers/net/dsa/ocelot/felix_vsc9959.c
+++ b/drivers/net/dsa/ocelot/felix_vsc9959.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: (GPL-2.0 OR MIT)
 /* Copyright 2017 Microsemi Corporation
- * Copyright 2018-2019 NXP Semiconductors
+ * Copyright 2018-2019 NXP
  */
 #include <linux/fsl/enetc_mdio.h>
 #include <soc/mscc/ocelot_qsys.h>
diff --git a/drivers/net/dsa/sja1105/sja1105_clocking.c b/drivers/net/dsa/sja1105/sja1105_clocking.c
index 387a1f2f161c..5bbf1707f2af 100644
--- a/drivers/net/dsa/sja1105/sja1105_clocking.c
+++ b/drivers/net/dsa/sja1105/sja1105_clocking.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: BSD-3-Clause
-/* Copyright (c) 2016-2018, NXP Semiconductors
+/* Copyright 2016-2018 NXP
  * Copyright (c) 2018-2019, Vladimir Oltean <olteanv@gmail.com>
  */
 #include <linux/packing.h>
diff --git a/drivers/net/dsa/sja1105/sja1105_devlink.c b/drivers/net/dsa/sja1105/sja1105_devlink.c
index 05c7f4ca3b1a..0569ff066634 100644
--- a/drivers/net/dsa/sja1105/sja1105_devlink.c
+++ b/drivers/net/dsa/sja1105/sja1105_devlink.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2018-2019, Vladimir Oltean <olteanv@gmail.com>
- * Copyright 2020 NXP Semiconductors
+ * Copyright 2020 NXP
  */
 #include "sja1105.h"
 
diff --git a/drivers/net/dsa/sja1105/sja1105_flower.c b/drivers/net/dsa/sja1105/sja1105_flower.c
index 6c10ffa968ce..72b9b39b0989 100644
--- a/drivers/net/dsa/sja1105/sja1105_flower.c
+++ b/drivers/net/dsa/sja1105/sja1105_flower.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright 2020, NXP Semiconductors
+/* Copyright 2020 NXP
  */
 #include "sja1105.h"
 #include "sja1105_vl.h"
diff --git a/drivers/net/dsa/sja1105/sja1105_mdio.c b/drivers/net/dsa/sja1105/sja1105_mdio.c
index 705d3900e43a..215dd17ca790 100644
--- a/drivers/net/dsa/sja1105/sja1105_mdio.c
+++ b/drivers/net/dsa/sja1105/sja1105_mdio.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright 2021, NXP Semiconductors
+/* Copyright 2021 NXP
  */
 #include <linux/pcs/pcs-xpcs.h>
 #include <linux/of_mdio.h>
diff --git a/drivers/net/dsa/sja1105/sja1105_spi.c b/drivers/net/dsa/sja1105/sja1105_spi.c
index d60a530d0272..d3c9ad6d39d4 100644
--- a/drivers/net/dsa/sja1105/sja1105_spi.c
+++ b/drivers/net/dsa/sja1105/sja1105_spi.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: BSD-3-Clause
-/* Copyright (c) 2016-2018, NXP Semiconductors
+/* Copyright 2016-2018 NXP
  * Copyright (c) 2018, Sensor-Technik Wiedemann GmbH
  * Copyright (c) 2018-2019, Vladimir Oltean <olteanv@gmail.com>
  */
diff --git a/drivers/net/dsa/sja1105/sja1105_static_config.c b/drivers/net/dsa/sja1105/sja1105_static_config.c
index 7a422ef4deb6..baba204ad62f 100644
--- a/drivers/net/dsa/sja1105/sja1105_static_config.c
+++ b/drivers/net/dsa/sja1105/sja1105_static_config.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: BSD-3-Clause
-/* Copyright (c) 2016-2018, NXP Semiconductors
+/* Copyright 2016-2018 NXP
  * Copyright (c) 2018-2019, Vladimir Oltean <olteanv@gmail.com>
  */
 #include "sja1105_static_config.h"
diff --git a/drivers/net/dsa/sja1105/sja1105_static_config.h b/drivers/net/dsa/sja1105/sja1105_static_config.h
index bce0f5c03d0b..6a372d5f22ae 100644
--- a/drivers/net/dsa/sja1105/sja1105_static_config.h
+++ b/drivers/net/dsa/sja1105/sja1105_static_config.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: BSD-3-Clause */
-/* Copyright (c) 2016-2018, NXP Semiconductors
+/* Copyright 2016-2018 NXP
  * Copyright (c) 2018-2019, Vladimir Oltean <olteanv@gmail.com>
  */
 #ifndef _SJA1105_STATIC_CONFIG_H
diff --git a/drivers/net/dsa/sja1105/sja1105_vl.c b/drivers/net/dsa/sja1105/sja1105_vl.c
index ec7b65daec20..6802f4057cc0 100644
--- a/drivers/net/dsa/sja1105/sja1105_vl.c
+++ b/drivers/net/dsa/sja1105/sja1105_vl.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright 2020, NXP Semiconductors
+/* Copyright 2020 NXP
  */
 #include <net/tc_act/tc_gate.h>
 #include <linux/dsa/8021q.h>
diff --git a/drivers/net/dsa/sja1105/sja1105_vl.h b/drivers/net/dsa/sja1105/sja1105_vl.h
index 173d78963fed..51fba0dce91a 100644
--- a/drivers/net/dsa/sja1105/sja1105_vl.h
+++ b/drivers/net/dsa/sja1105/sja1105_vl.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright 2020, NXP Semiconductors
+/* Copyright 2020 NXP
  */
 #ifndef _SJA1105_VL_H
 #define _SJA1105_VL_H
diff --git a/drivers/net/ethernet/freescale/enetc/enetc_ierb.c b/drivers/net/ethernet/freescale/enetc/enetc_ierb.c
index ee1468e3eaa3..91f02c505028 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc_ierb.c
+++ b/drivers/net/ethernet/freescale/enetc/enetc_ierb.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause)
-/* Copyright 2021 NXP Semiconductors
+/* Copyright 2021 NXP
  *
  * The Integrated Endpoint Register Block (IERB) is configured by pre-boot
  * software and is supposed to be to ENETC what a NVRAM is to a 'real' PCIe
diff --git a/drivers/net/ethernet/freescale/enetc/enetc_ierb.h b/drivers/net/ethernet/freescale/enetc/enetc_ierb.h
index b3b774e0998a..c2ce47c4be9f 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc_ierb.h
+++ b/drivers/net/ethernet/freescale/enetc/enetc_ierb.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */
-/* Copyright 2021 NXP Semiconductors */
+/* Copyright 2021 NXP */
 
 #include <linux/pci.h>
 #include <linux/platform_device.h>
diff --git a/drivers/net/ethernet/mscc/ocelot_devlink.c b/drivers/net/ethernet/mscc/ocelot_devlink.c
index edafbd37d12c..b8737efd2a85 100644
--- a/drivers/net/ethernet/mscc/ocelot_devlink.c
+++ b/drivers/net/ethernet/mscc/ocelot_devlink.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: (GPL-2.0 OR MIT)
-/* Copyright 2020-2021 NXP Semiconductors
+/* Copyright 2020-2021 NXP
  */
 #include <net/devlink.h>
 #include "ocelot.h"
diff --git a/drivers/net/ethernet/mscc/ocelot_mrp.c b/drivers/net/ethernet/mscc/ocelot_mrp.c
index 08b481a93460..4b0941f09f71 100644
--- a/drivers/net/ethernet/mscc/ocelot_mrp.c
+++ b/drivers/net/ethernet/mscc/ocelot_mrp.c
@@ -2,7 +2,7 @@
 /* Microsemi Ocelot Switch driver
  *
  * Copyright (c) 2017, 2019 Microsemi Corporation
- * Copyright 2020-2021 NXP Semiconductors
+ * Copyright 2020-2021 NXP
  */
 
 #include <linux/if_bridge.h>
diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c
index c0c465a4a981..e54b9fb2a97a 100644
--- a/drivers/net/ethernet/mscc/ocelot_net.c
+++ b/drivers/net/ethernet/mscc/ocelot_net.c
@@ -5,7 +5,7 @@
  * mscc_ocelot_switch_lib.
  *
  * Copyright (c) 2017, 2019 Microsemi Corporation
- * Copyright 2020-2021 NXP Semiconductors
+ * Copyright 2020-2021 NXP
  */
 
 #include <linux/if_bridge.h>
diff --git a/drivers/net/pcs/pcs-xpcs-nxp.c b/drivers/net/pcs/pcs-xpcs-nxp.c
index 984c9f7f16a8..d16fc58cd48d 100644
--- a/drivers/net/pcs/pcs-xpcs-nxp.c
+++ b/drivers/net/pcs/pcs-xpcs-nxp.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright 2021 NXP Semiconductors
+/* Copyright 2021 NXP
  */
 #include <linux/pcs/pcs-xpcs.h>
 #include "pcs-xpcs.h"
diff --git a/include/linux/dsa/ocelot.h b/include/linux/dsa/ocelot.h
index c6bc45ae5e03..435777a0073c 100644
--- a/include/linux/dsa/ocelot.h
+++ b/include/linux/dsa/ocelot.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0
- * Copyright 2019-2021 NXP Semiconductors
+ * Copyright 2019-2021 NXP
  */
 
 #ifndef _NET_DSA_TAG_OCELOT_H
diff --git a/include/linux/packing.h b/include/linux/packing.h
index 54667735cc67..8d6571feb95d 100644
--- a/include/linux/packing.h
+++ b/include/linux/packing.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: BSD-3-Clause
- * Copyright (c) 2016-2018, NXP Semiconductors
+ * Copyright 2016-2018 NXP
  * Copyright (c) 2018-2019, Vladimir Oltean <olteanv@gmail.com>
  */
 #ifndef _LINUX_PACKING_H
diff --git a/lib/packing.c b/lib/packing.c
index 6ed72dccfdb5..9a72f4bbf0e2 100644
--- a/lib/packing.c
+++ b/lib/packing.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
-/* Copyright (c) 2016-2018, NXP Semiconductors
+/* Copyright 2016-2018 NXP
  * Copyright (c) 2018-2019, Vladimir Oltean <olteanv@gmail.com>
  */
 #include <linux/packing.h>
diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c
index d37ab98e7fe1..8025ed778d33 100644
--- a/net/dsa/tag_ocelot.c
+++ b/net/dsa/tag_ocelot.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright 2019 NXP Semiconductors
+/* Copyright 2019 NXP
  */
 #include <linux/dsa/ocelot.h>
 #include <soc/mscc/ocelot.h>
diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c
index 3038a257ba05..59072930cb02 100644
--- a/net/dsa/tag_ocelot_8021q.c
+++ b/net/dsa/tag_ocelot_8021q.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright 2020-2021 NXP Semiconductors
+/* Copyright 2020-2021 NXP
  *
  * An implementation of the software-defined tag_8021q.c tagger format, which
  * also preserves full functionality under a vlan_filtering bridge. It does
diff --git a/tools/testing/selftests/drivers/net/ocelot/tc_flower_chains.sh b/tools/testing/selftests/drivers/net/ocelot/tc_flower_chains.sh
index beee0d5646a6..f7d84549cc3e 100755
--- a/tools/testing/selftests/drivers/net/ocelot/tc_flower_chains.sh
+++ b/tools/testing/selftests/drivers/net/ocelot/tc_flower_chains.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
-# Copyright 2020 NXP Semiconductors
+# Copyright 2020 NXP
 
 WAIT_TIME=1
 NUM_NETIFS=4
-- 
cgit v1.2.3-71-gd317


From cf9579976f724ad517cc15b7caadea728c7e245c Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 17 Sep 2021 16:34:32 +0300
Subject: net: mdio: introduce a shutdown method to mdio device drivers

MDIO-attached devices might have interrupts and other things that might
need quiesced when we kexec into a new kernel. Things are even more
creepy when those interrupt lines are shared, and in that case it is
absolutely mandatory to disable all interrupt sources.

Moreover, MDIO devices might be DSA switches, and DSA needs its own
shutdown method to unlink from the DSA master, which is a new
requirement that appeared after commit 2f1e8ea726e9 ("net: dsa: link
interfaces with the DSA master to get rid of lockdep warnings").

So introduce a ->shutdown method in the MDIO device driver structure.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/mdio_device.c | 11 +++++++++++
 include/linux/mdio.h          |  3 +++
 2 files changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/mdio_device.c b/drivers/net/phy/mdio_device.c
index c94cb5382dc9..250742ffdfd9 100644
--- a/drivers/net/phy/mdio_device.c
+++ b/drivers/net/phy/mdio_device.c
@@ -179,6 +179,16 @@ static int mdio_remove(struct device *dev)
 	return 0;
 }
 
+static void mdio_shutdown(struct device *dev)
+{
+	struct mdio_device *mdiodev = to_mdio_device(dev);
+	struct device_driver *drv = mdiodev->dev.driver;
+	struct mdio_driver *mdiodrv = to_mdio_driver(drv);
+
+	if (mdiodrv->shutdown)
+		mdiodrv->shutdown(mdiodev);
+}
+
 /**
  * mdio_driver_register - register an mdio_driver with the MDIO layer
  * @drv: new mdio_driver to register
@@ -193,6 +203,7 @@ int mdio_driver_register(struct mdio_driver *drv)
 	mdiodrv->driver.bus = &mdio_bus_type;
 	mdiodrv->driver.probe = mdio_probe;
 	mdiodrv->driver.remove = mdio_remove;
+	mdiodrv->driver.shutdown = mdio_shutdown;
 
 	retval = driver_register(&mdiodrv->driver);
 	if (retval) {
diff --git a/include/linux/mdio.h b/include/linux/mdio.h
index ffb787d5ebde..5e6dc38f418e 100644
--- a/include/linux/mdio.h
+++ b/include/linux/mdio.h
@@ -80,6 +80,9 @@ struct mdio_driver {
 
 	/* Clears up any memory if needed */
 	void (*remove)(struct mdio_device *mdiodev);
+
+	/* Quiesces the device on system shutdown, turns off interrupts etc */
+	void (*shutdown)(struct mdio_device *mdiodev);
 };
 
 static inline struct mdio_driver *
-- 
cgit v1.2.3-71-gd317


From e840f42a49925707fca90e6c7a4095118fdb8c4d Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Sun, 19 Sep 2021 14:09:49 +0100
Subject: KVM: arm64: Fix PMU probe ordering

Russell reported that since 5.13, KVM's probing of the PMU has
started to fail on his HW. As it turns out, there is an implicit
ordering dependency between the architectural PMU probing code and
and KVM's own probing. If, due to probe ordering reasons, KVM probes
before the PMU driver, it will fail to detect the PMU and prevent it
from being advertised to guests as well as the VMM.

Obviously, this is one probing too many, and we should be able to
deal with any ordering.

Add a callback from the PMU code into KVM to advertise the registration
of a host CPU PMU, allowing for any probing order.

Fixes: 5421db1be3b1 ("KVM: arm64: Divorce the perf code from oprofile helpers")
Reported-by: "Russell King (Oracle)" <linux@armlinux.org.uk>
Tested-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/YUYRKVflRtUytzy5@shell.armlinux.org.uk
Cc: stable@vger.kernel.org
---
 arch/arm64/kvm/perf.c        | 3 ---
 arch/arm64/kvm/pmu-emul.c    | 9 ++++++++-
 drivers/perf/arm_pmu.c       | 2 ++
 include/kvm/arm_pmu.h        | 3 ---
 include/linux/perf/arm_pmu.h | 6 ++++++
 5 files changed, 16 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/kvm/perf.c b/arch/arm64/kvm/perf.c
index f9bb3b14130e..c84fe24b2ea1 100644
--- a/arch/arm64/kvm/perf.c
+++ b/arch/arm64/kvm/perf.c
@@ -50,9 +50,6 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = {
 
 int kvm_perf_init(void)
 {
-	if (kvm_pmu_probe_pmuver() != ID_AA64DFR0_PMUVER_IMP_DEF && !is_protected_kvm_enabled())
-		static_branch_enable(&kvm_arm_pmu_available);
-
 	return perf_register_guest_info_callbacks(&kvm_guest_cbs);
 }
 
diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index f5065f23b413..2af3c37445e0 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -740,7 +740,14 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
 	kvm_pmu_create_perf_event(vcpu, select_idx);
 }
 
-int kvm_pmu_probe_pmuver(void)
+void kvm_host_pmu_init(struct arm_pmu *pmu)
+{
+	if (pmu->pmuver != 0 && pmu->pmuver != ID_AA64DFR0_PMUVER_IMP_DEF &&
+	    !kvm_arm_support_pmu_v3() && !is_protected_kvm_enabled())
+		static_branch_enable(&kvm_arm_pmu_available);
+}
+
+static int kvm_pmu_probe_pmuver(void)
 {
 	struct perf_event_attr attr = { };
 	struct perf_event *event;
diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index 3cbc3baf087f..295cc7952d0e 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -952,6 +952,8 @@ int armpmu_register(struct arm_pmu *pmu)
 		pmu->name, pmu->num_events,
 		has_nmi ? ", using NMIs" : "");
 
+	kvm_host_pmu_init(pmu);
+
 	return 0;
 
 out_destroy:
diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
index 864b9997efb2..90f21898aad8 100644
--- a/include/kvm/arm_pmu.h
+++ b/include/kvm/arm_pmu.h
@@ -61,7 +61,6 @@ int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu,
 int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu,
 			    struct kvm_device_attr *attr);
 int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu);
-int kvm_pmu_probe_pmuver(void);
 #else
 struct kvm_pmu {
 };
@@ -118,8 +117,6 @@ static inline u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1)
 	return 0;
 }
 
-static inline int kvm_pmu_probe_pmuver(void) { return 0xf; }
-
 #endif
 
 #endif
diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
index 505480217cf1..2512e2f9cd4e 100644
--- a/include/linux/perf/arm_pmu.h
+++ b/include/linux/perf/arm_pmu.h
@@ -163,6 +163,12 @@ int arm_pmu_acpi_probe(armpmu_init_fn init_fn);
 static inline int arm_pmu_acpi_probe(armpmu_init_fn init_fn) { return 0; }
 #endif
 
+#ifdef CONFIG_KVM
+void kvm_host_pmu_init(struct arm_pmu *pmu);
+#else
+#define kvm_host_pmu_init(x)	do { } while(0)
+#endif
+
 /* Internal functions only for core arm_pmu code */
 struct arm_pmu *armpmu_alloc(void);
 struct arm_pmu *armpmu_alloc_atomic(void);
-- 
cgit v1.2.3-71-gd317


From d4ffd5df9d18031b6a53f934388726775b4452d3 Mon Sep 17 00:00:00 2001
From: Jiashuo Liang <liangjs@pku.edu.cn>
Date: Fri, 30 Jul 2021 11:01:52 +0800
Subject: x86/fault: Fix wrong signal when vsyscall fails with pkey

The function __bad_area_nosemaphore() calls kernelmode_fixup_or_oops()
with the parameter @signal being actually @pkey, which will send a
signal numbered with the argument in @pkey.

This bug can be triggered when the kernel fails to access user-given
memory pages that are protected by a pkey, so it can go down the
do_user_addr_fault() path and pass the !user_mode() check in
__bad_area_nosemaphore().

Most cases will simply run the kernel fixup code to make an -EFAULT. But
when another condition current->thread.sig_on_uaccess_err is met, which
is only used to emulate vsyscall, the kernel will generate the wrong
signal.

Add a new parameter @pkey to kernelmode_fixup_or_oops() to fix this.

 [ bp: Massage commit message, fix build error as reported by the 0day
   bot: https://lkml.kernel.org/r/202109202245.APvuT8BX-lkp@intel.com ]

Fixes: 5042d40a264c ("x86/fault: Bypass no_context() for implicit kernel faults from usermode")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Jiashuo Liang <liangjs@pku.edu.cn>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lkml.kernel.org/r/20210730030152.249106-1-liangjs@pku.edu.cn
---
 arch/x86/include/asm/pkeys.h |  2 --
 arch/x86/mm/fault.c          | 26 ++++++++++++++++++--------
 include/linux/pkeys.h        |  2 ++
 3 files changed, 20 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h
index 5c7bcaa79623..1d5f14aff5f6 100644
--- a/arch/x86/include/asm/pkeys.h
+++ b/arch/x86/include/asm/pkeys.h
@@ -2,8 +2,6 @@
 #ifndef _ASM_X86_PKEYS_H
 #define _ASM_X86_PKEYS_H
 
-#define ARCH_DEFAULT_PKEY	0
-
 /*
  * If more than 16 keys are ever supported, a thorough audit
  * will be necessary to ensure that the types that store key
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index b2eefdefc108..84a2c8c4af73 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -710,7 +710,8 @@ oops:
 
 static noinline void
 kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code,
-			 unsigned long address, int signal, int si_code)
+			 unsigned long address, int signal, int si_code,
+			 u32 pkey)
 {
 	WARN_ON_ONCE(user_mode(regs));
 
@@ -735,8 +736,12 @@ kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code,
 
 			set_signal_archinfo(address, error_code);
 
-			/* XXX: hwpoison faults will set the wrong code. */
-			force_sig_fault(signal, si_code, (void __user *)address);
+			if (si_code == SEGV_PKUERR) {
+				force_sig_pkuerr((void __user *)address, pkey);
+			} else {
+				/* XXX: hwpoison faults will set the wrong code. */
+				force_sig_fault(signal, si_code, (void __user *)address);
+			}
 		}
 
 		/*
@@ -798,7 +803,8 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 	struct task_struct *tsk = current;
 
 	if (!user_mode(regs)) {
-		kernelmode_fixup_or_oops(regs, error_code, address, pkey, si_code);
+		kernelmode_fixup_or_oops(regs, error_code, address,
+					 SIGSEGV, si_code, pkey);
 		return;
 	}
 
@@ -930,7 +936,8 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
 {
 	/* Kernel mode? Handle exceptions or die: */
 	if (!user_mode(regs)) {
-		kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, BUS_ADRERR);
+		kernelmode_fixup_or_oops(regs, error_code, address,
+					 SIGBUS, BUS_ADRERR, ARCH_DEFAULT_PKEY);
 		return;
 	}
 
@@ -1396,7 +1403,8 @@ good_area:
 		 */
 		if (!user_mode(regs))
 			kernelmode_fixup_or_oops(regs, error_code, address,
-						 SIGBUS, BUS_ADRERR);
+						 SIGBUS, BUS_ADRERR,
+						 ARCH_DEFAULT_PKEY);
 		return;
 	}
 
@@ -1416,7 +1424,8 @@ good_area:
 		return;
 
 	if (fatal_signal_pending(current) && !user_mode(regs)) {
-		kernelmode_fixup_or_oops(regs, error_code, address, 0, 0);
+		kernelmode_fixup_or_oops(regs, error_code, address,
+					 0, 0, ARCH_DEFAULT_PKEY);
 		return;
 	}
 
@@ -1424,7 +1433,8 @@ good_area:
 		/* Kernel mode? Handle exceptions or die: */
 		if (!user_mode(regs)) {
 			kernelmode_fixup_or_oops(regs, error_code, address,
-						 SIGSEGV, SEGV_MAPERR);
+						 SIGSEGV, SEGV_MAPERR,
+						 ARCH_DEFAULT_PKEY);
 			return;
 		}
 
diff --git a/include/linux/pkeys.h b/include/linux/pkeys.h
index 6beb26b7151d..86be8bf27b41 100644
--- a/include/linux/pkeys.h
+++ b/include/linux/pkeys.h
@@ -4,6 +4,8 @@
 
 #include <linux/mm.h>
 
+#define ARCH_DEFAULT_PKEY	0
+
 #ifdef CONFIG_ARCH_HAS_PKEYS
 #include <asm/pkeys.h>
 #else /* ! CONFIG_ARCH_HAS_PKEYS */
-- 
cgit v1.2.3-71-gd317


From c86a2d9058c5a4a05d20ef89e699b7a6b2c89da6 Mon Sep 17 00:00:00 2001
From: Tobias Klauser <tklauser@distanz.ch>
Date: Fri, 17 Sep 2021 00:27:05 +0200
Subject: cpumask: Omit terminating null byte in
 cpumap_print_{list,bitmask}_to_buf

The changes in the patch series [1] introduced a terminating null byte
when reading from cpulist or cpumap sysfs files, for example:

  $ xxd /sys/devices/system/node/node0/cpulist
  00000000: 302d 310a 00                             0-1..

Before this change, the output looked as follows:

  $ xxd /sys/devices/system/node/node0/cpulist
  00000000: 302d 310a                                0-1.

Fix this regression by excluding the terminating null byte from the
returned length in cpumap_print_list_to_buf and
cpumap_print_bitmask_to_buf.

[1] https://lore.kernel.org/all/20210806110251.560-1-song.bao.hua@hisilicon.com/

Fixes: 1fae562983ca ("cpumask: introduce cpumap_print_list/bitmask_to_buf to support large bitmask and list")
Acked-by: Barry Song <song.bao.hua@hisilicon.com>
Acked-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Tobias Klauser <tklauser@distanz.ch>
Link: https://lore.kernel.org/r/20210916222705.13554-1-tklauser@distanz.ch
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/cpumask.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 5d4d07a9e1ed..1e7399fc69c0 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -996,14 +996,15 @@ cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask)
  * cpumask; Typically used by bin_attribute to export cpumask bitmask
  * ABI.
  *
- * Returns the length of how many bytes have been copied.
+ * Returns the length of how many bytes have been copied, excluding
+ * terminating '\0'.
  */
 static inline ssize_t
 cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask,
 		loff_t off, size_t count)
 {
 	return bitmap_print_bitmask_to_buf(buf, cpumask_bits(mask),
-				   nr_cpu_ids, off, count);
+				   nr_cpu_ids, off, count) - 1;
 }
 
 /**
@@ -1018,7 +1019,7 @@ cpumap_print_list_to_buf(char *buf, const struct cpumask *mask,
 		loff_t off, size_t count)
 {
 	return bitmap_print_list_to_buf(buf, cpumask_bits(mask),
-				   nr_cpu_ids, off, count);
+				   nr_cpu_ids, off, count) - 1;
 }
 
 #if NR_CPUS <= BITS_PER_LONG
-- 
cgit v1.2.3-71-gd317


From 20c36ce2164f1774b487d443ece99b754bc6ad43 Mon Sep 17 00:00:00 2001
From: Bixuan Cui <cuibixuan@huawei.com>
Date: Thu, 16 Sep 2021 10:52:03 +0800
Subject: irqdomain: Change the type of 'size' in __irq_domain_add() to be
 consistent

The 'size' is used in struct_size(domain, revmap, size) and its input
parameter type is 'size_t'(unsigned int).
Changing the size to 'unsigned int' to make the type consistent.

Signed-off-by: Bixuan Cui <cuibixuan@huawei.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20210916025203.44841-1-cuibixuan@huawei.com
---
 include/linux/irqdomain.h | 2 +-
 kernel/irq/irqdomain.c    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 23e4ee523576..9ee238ad29ce 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -251,7 +251,7 @@ static inline struct fwnode_handle *irq_domain_alloc_fwnode(phys_addr_t *pa)
 }
 
 void irq_domain_free_fwnode(struct fwnode_handle *fwnode);
-struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
+struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int size,
 				    irq_hw_number_t hwirq_max, int direct_max,
 				    const struct irq_domain_ops *ops,
 				    void *host_data);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 62be16135e7c..bfa289ed57ab 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -136,7 +136,7 @@ EXPORT_SYMBOL_GPL(irq_domain_free_fwnode);
  * Allocates and initializes an irq_domain structure.
  * Returns pointer to IRQ domain, or NULL on failure.
  */
-struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
+struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int size,
 				    irq_hw_number_t hwirq_max, int direct_max,
 				    const struct irq_domain_ops *ops,
 				    void *host_data)
-- 
cgit v1.2.3-71-gd317


From a68de80f61f6af397bc06fb391ff2e571c9c4d80 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 1 Sep 2021 13:30:27 -0700
Subject: entry: rseq: Call rseq_handle_notify_resume() in
 tracehook_notify_resume()

Invoke rseq_handle_notify_resume() from tracehook_notify_resume() now
that the two function are always called back-to-back by architectures
that have rseq.  The rseq helper is stubbed out for architectures that
don't support rseq, i.e. this is a nop across the board.

Note, tracehook_notify_resume() is horribly named and arguably does not
belong in tracehook.h as literally every line of code in it has nothing
to do with tracing.  But, that's been true since commit a42c6ded827d
("move key_repace_session_keyring() into tracehook_notify_resume()")
first usurped tracehook_notify_resume() back in 2012.  Punt cleaning that
mess up to future patches.

No functional change intended.

Acked-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20210901203030.1292304-3-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/arm/kernel/signal.c     | 1 -
 arch/arm64/kernel/signal.c   | 4 +---
 arch/csky/kernel/signal.c    | 4 +---
 arch/mips/kernel/signal.c    | 4 +---
 arch/powerpc/kernel/signal.c | 4 +---
 include/linux/tracehook.h    | 2 ++
 kernel/entry/common.c        | 4 +---
 kernel/entry/kvm.c           | 4 +---
 8 files changed, 8 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index d0a800be0486..a41e27ace391 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -628,7 +628,6 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
 				uprobe_notify_resume(regs);
 			} else {
 				tracehook_notify_resume(regs);
-				rseq_handle_notify_resume(NULL, regs);
 			}
 		}
 		local_irq_disable();
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 9fe70b12b34f..c287b9407f28 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -940,10 +940,8 @@ void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags)
 			if (thread_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
 				do_signal(regs);
 
-			if (thread_flags & _TIF_NOTIFY_RESUME) {
+			if (thread_flags & _TIF_NOTIFY_RESUME)
 				tracehook_notify_resume(regs);
-				rseq_handle_notify_resume(NULL, regs);
-			}
 
 			if (thread_flags & _TIF_FOREIGN_FPSTATE)
 				fpsimd_restore_current_state();
diff --git a/arch/csky/kernel/signal.c b/arch/csky/kernel/signal.c
index 312f046d452d..bc4238b9f709 100644
--- a/arch/csky/kernel/signal.c
+++ b/arch/csky/kernel/signal.c
@@ -260,8 +260,6 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
 	if (thread_info_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
 		do_signal(regs);
 
-	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
+	if (thread_info_flags & _TIF_NOTIFY_RESUME)
 		tracehook_notify_resume(regs);
-		rseq_handle_notify_resume(NULL, regs);
-	}
 }
diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c
index f1e985109da0..c9b2a75563e1 100644
--- a/arch/mips/kernel/signal.c
+++ b/arch/mips/kernel/signal.c
@@ -906,10 +906,8 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, void *unused,
 	if (thread_info_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
 		do_signal(regs);
 
-	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
+	if (thread_info_flags & _TIF_NOTIFY_RESUME)
 		tracehook_notify_resume(regs);
-		rseq_handle_notify_resume(NULL, regs);
-	}
 
 	user_enter();
 }
diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index e600764a926c..b93b87df499d 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -293,10 +293,8 @@ void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags)
 		do_signal(current);
 	}
 
-	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
+	if (thread_info_flags & _TIF_NOTIFY_RESUME)
 		tracehook_notify_resume(regs);
-		rseq_handle_notify_resume(NULL, regs);
-	}
 }
 
 static unsigned long get_tm_stackpointer(struct task_struct *tsk)
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 3e80c4bc66f7..2564b7434b4d 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -197,6 +197,8 @@ static inline void tracehook_notify_resume(struct pt_regs *regs)
 
 	mem_cgroup_handle_over_high();
 	blkcg_maybe_throttle_current();
+
+	rseq_handle_notify_resume(NULL, regs);
 }
 
 /*
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index bf16395b9e13..d5a61d565ad5 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -171,10 +171,8 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
 		if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
 			handle_signal_work(regs, ti_work);
 
-		if (ti_work & _TIF_NOTIFY_RESUME) {
+		if (ti_work & _TIF_NOTIFY_RESUME)
 			tracehook_notify_resume(regs);
-			rseq_handle_notify_resume(NULL, regs);
-		}
 
 		/* Architecture specific TIF work */
 		arch_exit_to_user_mode_work(regs, ti_work);
diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c
index 049fd06b4c3d..49972ee99aff 100644
--- a/kernel/entry/kvm.c
+++ b/kernel/entry/kvm.c
@@ -19,10 +19,8 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
 		if (ti_work & _TIF_NEED_RESCHED)
 			schedule();
 
-		if (ti_work & _TIF_NOTIFY_RESUME) {
+		if (ti_work & _TIF_NOTIFY_RESUME)
 			tracehook_notify_resume(NULL);
-			rseq_handle_notify_resume(NULL, NULL);
-		}
 
 		ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work);
 		if (ret)
-- 
cgit v1.2.3-71-gd317


From 4eeef2424153e79910d65248b5e1abf137d050e9 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 10 Sep 2021 11:32:19 -0700
Subject: KVM: x86: Query vcpu->vcpu_idx directly and drop its accessor

Read vcpu->vcpu_idx directly instead of bouncing through the one-line
wrapper, kvm_vcpu_get_idx(), and drop the wrapper.  The wrapper is a
remnant of the original implementation and serves no purpose; remove it
before it gains more users.

Back when kvm_vcpu_get_idx() was added by commit 497d72d80a78 ("KVM: Add
kvm_vcpu_get_idx to get vcpu index in kvm->vcpus"), the implementation
was more than just a simple wrapper as vcpu->vcpu_idx did not exist and
retrieving the index meant walking over the vCPU array to find the given
vCPU.

When vcpu_idx was introduced by commit 8750e72a79dd ("KVM: remember
position in kvm->vcpus array"), the helper was left behind, likely to
avoid extra thrash (but even then there were only two users, the original
arm usage having been removed at some point in the past).

No functional change intended.

Suggested-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Message-Id: <20210910183220.2397812-2-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/s390/kvm/interrupt.c | 4 ++--
 arch/s390/kvm/kvm-s390.c  | 2 +-
 arch/s390/kvm/kvm-s390.h  | 2 +-
 arch/x86/kvm/hyperv.c     | 7 +++----
 arch/x86/kvm/hyperv.h     | 2 +-
 include/linux/kvm_host.h  | 5 -----
 6 files changed, 8 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 16256e17a544..10722455fd02 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -419,13 +419,13 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu)
 static void __set_cpu_idle(struct kvm_vcpu *vcpu)
 {
 	kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT);
-	set_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask);
+	set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.idle_mask);
 }
 
 static void __unset_cpu_idle(struct kvm_vcpu *vcpu)
 {
 	kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT);
-	clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask);
+	clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.idle_mask);
 }
 
 static void __reset_intercept_indicators(struct kvm_vcpu *vcpu)
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 752a0ffab9bf..6a6dd5e1daf6 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -4066,7 +4066,7 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu)
 		kvm_s390_patch_guest_per_regs(vcpu);
 	}
 
-	clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.gisa_int.kicked_mask);
+	clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.gisa_int.kicked_mask);
 
 	vcpu->arch.sie_block->icptcode = 0;
 	cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags);
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index ecd741ee3276..52bc8fbaa60a 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -79,7 +79,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu)
 
 static inline int is_vcpu_idle(struct kvm_vcpu *vcpu)
 {
-	return test_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask);
+	return test_bit(vcpu->vcpu_idx, vcpu->kvm->arch.idle_mask);
 }
 
 static inline int kvm_is_ucontrol(struct kvm *kvm)
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 232a86a6faaf..d5124b520f76 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -939,7 +939,7 @@ static int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu)
 	for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
 		stimer_init(&hv_vcpu->stimer[i], i);
 
-	hv_vcpu->vp_index = kvm_vcpu_get_idx(vcpu);
+	hv_vcpu->vp_index = vcpu->vcpu_idx;
 
 	return 0;
 }
@@ -1444,7 +1444,6 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
 	switch (msr) {
 	case HV_X64_MSR_VP_INDEX: {
 		struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
-		int vcpu_idx = kvm_vcpu_get_idx(vcpu);
 		u32 new_vp_index = (u32)data;
 
 		if (!host || new_vp_index >= KVM_MAX_VCPUS)
@@ -1459,9 +1458,9 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
 		 * VP index is changing, adjust num_mismatched_vp_indexes if
 		 * it now matches or no longer matches vcpu_idx.
 		 */
-		if (hv_vcpu->vp_index == vcpu_idx)
+		if (hv_vcpu->vp_index == vcpu->vcpu_idx)
 			atomic_inc(&hv->num_mismatched_vp_indexes);
-		else if (new_vp_index == vcpu_idx)
+		else if (new_vp_index == vcpu->vcpu_idx)
 			atomic_dec(&hv->num_mismatched_vp_indexes);
 
 		hv_vcpu->vp_index = new_vp_index;
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 730da8537d05..ed1c4e546d04 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -83,7 +83,7 @@ static inline u32 kvm_hv_get_vpindex(struct kvm_vcpu *vcpu)
 {
 	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
 
-	return hv_vcpu ? hv_vcpu->vp_index : kvm_vcpu_get_idx(vcpu);
+	return hv_vcpu ? hv_vcpu->vp_index : vcpu->vcpu_idx;
 }
 
 int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 041ca7f15ea4..000ea73dd324 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -721,11 +721,6 @@ static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id)
 	return NULL;
 }
 
-static inline int kvm_vcpu_get_idx(struct kvm_vcpu *vcpu)
-{
-	return vcpu->vcpu_idx;
-}
-
 #define kvm_for_each_memslot(memslot, slots)				\
 	for (memslot = &slots->memslots[0];				\
 	     memslot < slots->memslots + slots->used_slots; memslot++)	\
-- 
cgit v1.2.3-71-gd317


From 6bc6db000295332bae2c1e8815d7450b72923d23 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@linux.alibaba.com>
Date: Sat, 18 Sep 2021 08:56:29 +0800
Subject: KVM: Remove tlbs_dirty

There is no user of tlbs_dirty.

Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20210918005636.3675-4-jiangshanlai@gmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h |  1 -
 virt/kvm/kvm_main.c      | 10 ++--------
 2 files changed, 2 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 000ea73dd324..0f18df7fe874 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -608,7 +608,6 @@ struct kvm {
 	unsigned long mmu_notifier_range_start;
 	unsigned long mmu_notifier_range_end;
 #endif
-	long tlbs_dirty;
 	struct list_head devices;
 	u64 manual_dirty_log_protect;
 	struct dentry *debugfs_dentry;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e95e7a9e4d53..7851f3a1b5f7 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -326,13 +326,8 @@ EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);
 #ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
 void kvm_flush_remote_tlbs(struct kvm *kvm)
 {
-	/*
-	 * Read tlbs_dirty before setting KVM_REQ_TLB_FLUSH in
-	 * kvm_make_all_cpus_request.
-	 */
-	long dirty_count = smp_load_acquire(&kvm->tlbs_dirty);
-
 	++kvm->stat.generic.remote_tlb_flush_requests;
+
 	/*
 	 * We want to publish modifications to the page tables before reading
 	 * mode. Pairs with a memory barrier in arch-specific code.
@@ -347,7 +342,6 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
 	if (!kvm_arch_flush_remote_tlb(kvm)
 	    || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
 		++kvm->stat.generic.remote_tlb_flush;
-	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
 }
 EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
 #endif
@@ -552,7 +546,7 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 		}
 	}
 
-	if (range->flush_on_ret && (ret || kvm->tlbs_dirty))
+	if (range->flush_on_ret && ret)
 		kvm_flush_remote_tlbs(kvm);
 
 	if (locked)
-- 
cgit v1.2.3-71-gd317


From 5501765a02a6c324f78581e6bb8209d054fe13ae Mon Sep 17 00:00:00 2001
From: Saravana Kannan <saravanak@google.com>
Date: Wed, 15 Sep 2021 10:09:38 -0700
Subject: driver core: fw_devlink: Add support for
 FWNODE_FLAG_NEEDS_CHILD_BOUND_ON_ADD

If a parent device is also a supplier to a child device, fw_devlink=on by
design delays the probe() of the child device until the probe() of the
parent finishes successfully.

However, some drivers of such parent devices (where parent is also a
supplier) expect the child device to finish probing successfully as soon as
they are added using device_add() and before the probe() of the parent
device has completed successfully. One example of such a case is discussed
in the link mentioned below.

Add a flag to make fw_devlink=on not enforce these supplier-consumer
relationships, so these drivers can continue working.

Link: https://lore.kernel.org/netdev/CAGETcx_uj0V4DChME-gy5HGKTYnxLBX=TH2rag29f_p=UcG+Tg@mail.gmail.com/
Fixes: ea718c699055 ("Revert "Revert "driver core: Set fw_devlink=on by default""")
Signed-off-by: Saravana Kannan <saravanak@google.com>
Link: https://lore.kernel.org/r/20210915170940.617415-3-saravanak@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/core.c    | 19 +++++++++++++++++++
 include/linux/fwnode.h | 11 ++++++++---
 2 files changed, 27 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index 316df6027093..21d4cb5d3767 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -1722,6 +1722,25 @@ static int fw_devlink_create_devlink(struct device *con,
 	struct device *sup_dev;
 	int ret = 0;
 
+	/*
+	 * In some cases, a device P might also be a supplier to its child node
+	 * C. However, this would defer the probe of C until the probe of P
+	 * completes successfully. This is perfectly fine in the device driver
+	 * model. device_add() doesn't guarantee probe completion of the device
+	 * by the time it returns.
+	 *
+	 * However, there are a few drivers that assume C will finish probing
+	 * as soon as it's added and before P finishes probing. So, we provide
+	 * a flag to let fw_devlink know not to delay the probe of C until the
+	 * probe of P completes successfully.
+	 *
+	 * When such a flag is set, we can't create device links where P is the
+	 * supplier of C as that would delay the probe of C.
+	 */
+	if (sup_handle->flags & FWNODE_FLAG_NEEDS_CHILD_BOUND_ON_ADD &&
+	    fwnode_is_ancestor_of(sup_handle, con->fwnode))
+		return -EINVAL;
+
 	sup_dev = get_dev_from_fwnode(sup_handle);
 	if (sup_dev) {
 		/*
diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h
index 59828516ebaf..9f4ad719bfe3 100644
--- a/include/linux/fwnode.h
+++ b/include/linux/fwnode.h
@@ -22,10 +22,15 @@ struct device;
  * LINKS_ADDED:	The fwnode has already be parsed to add fwnode links.
  * NOT_DEVICE:	The fwnode will never be populated as a struct device.
  * INITIALIZED: The hardware corresponding to fwnode has been initialized.
+ * NEEDS_CHILD_BOUND_ON_ADD: For this fwnode/device to probe successfully, its
+ *			     driver needs its child devices to be bound with
+ *			     their respective drivers as soon as they are
+ *			     added.
  */
-#define FWNODE_FLAG_LINKS_ADDED		BIT(0)
-#define FWNODE_FLAG_NOT_DEVICE		BIT(1)
-#define FWNODE_FLAG_INITIALIZED		BIT(2)
+#define FWNODE_FLAG_LINKS_ADDED			BIT(0)
+#define FWNODE_FLAG_NOT_DEVICE			BIT(1)
+#define FWNODE_FLAG_INITIALIZED			BIT(2)
+#define FWNODE_FLAG_NEEDS_CHILD_BOUND_ON_ADD	BIT(3)
 
 struct fwnode_handle {
 	struct fwnode_handle *secondary;
-- 
cgit v1.2.3-71-gd317


From 243418e3925d5b5b0657ae54c322d43035e97eed Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Fri, 24 Sep 2021 15:43:47 -0700
Subject: mm: fs: invalidate bh_lrus for only cold path

The kernel test robot reported the regression of fio.write_iops[1] with
commit 8cc621d2f45d ("mm: fs: invalidate BH LRU during page migration").

Since lru_add_drain is called frequently, invalidate bh_lrus there could
increase bh_lrus cache miss ratio, which needs more IO in the end.

This patch moves the bh_lrus invalidation from the hot path( e.g.,
zap_page_range, pagevec_release) to cold path(i.e., lru_add_drain_all,
lru_cache_disable).

Zhengjun Xing confirmed
 "I test the patch, the regression reduced to -2.9%"

[1] https://lore.kernel.org/lkml/20210520083144.GD14190@xsang-OptiPlex-9020/
[2] 8cc621d2f45d, mm: fs: invalidate BH LRU during page migration

Link: https://lkml.kernel.org/r/20210907212347.1977686-1-minchan@kernel.org
Signed-off-by: Minchan Kim <minchan@kernel.org>
Reported-by: kernel test robot <oliver.sang@intel.com>
Reviewed-by: Chris Goldsworthy <cgoldswo@codeaurora.org>
Tested-by: "Xing, Zhengjun" <zhengjun.xing@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c                 |  8 ++++++--
 include/linux/buffer_head.h |  4 ++--
 mm/swap.c                   | 19 ++++++++++++++++---
 3 files changed, 24 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index ab7573d72dd7..c615387aedca 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1425,12 +1425,16 @@ void invalidate_bh_lrus(void)
 }
 EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
 
-void invalidate_bh_lrus_cpu(int cpu)
+/*
+ * It's called from workqueue context so we need a bh_lru_lock to close
+ * the race with preemption/irq.
+ */
+void invalidate_bh_lrus_cpu(void)
 {
 	struct bh_lru *b;
 
 	bh_lru_lock();
-	b = per_cpu_ptr(&bh_lrus, cpu);
+	b = this_cpu_ptr(&bh_lrus);
 	__invalidate_bh_lrus(b);
 	bh_lru_unlock();
 }
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 6486d3c19463..36f33685c8c0 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -194,7 +194,7 @@ void __breadahead_gfp(struct block_device *, sector_t block, unsigned int size,
 struct buffer_head *__bread_gfp(struct block_device *,
 				sector_t block, unsigned size, gfp_t gfp);
 void invalidate_bh_lrus(void);
-void invalidate_bh_lrus_cpu(int cpu);
+void invalidate_bh_lrus_cpu(void);
 bool has_bh_in_lru(int cpu, void *dummy);
 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
 void free_buffer_head(struct buffer_head * bh);
@@ -408,7 +408,7 @@ static inline int inode_has_buffers(struct inode *inode) { return 0; }
 static inline void invalidate_inode_buffers(struct inode *inode) {}
 static inline int remove_inode_buffers(struct inode *inode) { return 1; }
 static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; }
-static inline void invalidate_bh_lrus_cpu(int cpu) {}
+static inline void invalidate_bh_lrus_cpu(void) {}
 static inline bool has_bh_in_lru(int cpu, void *dummy) { return false; }
 #define buffer_heads_over_limit 0
 
diff --git a/mm/swap.c b/mm/swap.c
index 897200d27dd0..af3cad4e5378 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -620,7 +620,6 @@ void lru_add_drain_cpu(int cpu)
 		pagevec_lru_move_fn(pvec, lru_lazyfree_fn);
 
 	activate_page_drain(cpu);
-	invalidate_bh_lrus_cpu(cpu);
 }
 
 /**
@@ -703,6 +702,20 @@ void lru_add_drain(void)
 	local_unlock(&lru_pvecs.lock);
 }
 
+/*
+ * It's called from per-cpu workqueue context in SMP case so
+ * lru_add_drain_cpu and invalidate_bh_lrus_cpu should run on
+ * the same cpu. It shouldn't be a problem in !SMP case since
+ * the core is only one and the locks will disable preemption.
+ */
+static void lru_add_and_bh_lrus_drain(void)
+{
+	local_lock(&lru_pvecs.lock);
+	lru_add_drain_cpu(smp_processor_id());
+	local_unlock(&lru_pvecs.lock);
+	invalidate_bh_lrus_cpu();
+}
+
 void lru_add_drain_cpu_zone(struct zone *zone)
 {
 	local_lock(&lru_pvecs.lock);
@@ -717,7 +730,7 @@ static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
 
 static void lru_add_drain_per_cpu(struct work_struct *dummy)
 {
-	lru_add_drain();
+	lru_add_and_bh_lrus_drain();
 }
 
 /*
@@ -858,7 +871,7 @@ void lru_cache_disable(void)
 	 */
 	__lru_add_drain_all(true);
 #else
-	lru_add_drain();
+	lru_add_and_bh_lrus_drain();
 #endif
 }
 
-- 
cgit v1.2.3-71-gd317


From 57ed7b4303a1c4d1885019fef03e6a5af2e8468a Mon Sep 17 00:00:00 2001
From: Weizhao Ouyang <o451686892@gmail.com>
Date: Fri, 24 Sep 2021 15:43:53 -0700
Subject: mm/debug: sync up latest migrate_reason to migrate_reason_names

Sync up MR_DEMOTION to migrate_reason_names and add a synch prompt.

Link: https://lkml.kernel.org/r/20210921064553.293905-3-o451686892@gmail.com
Fixes: 26aa2d199d6f ("mm/migrate: demote pages during reclaim")
Signed-off-by: Weizhao Ouyang <o451686892@gmail.com>
Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Yang Shi <yang.shi@linux.alibaba.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Mina Almasry <almasrymina@google.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Wei Xu <weixugc@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/migrate.h | 6 +++++-
 mm/debug.c              | 1 +
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 326250996b4e..c8077e936691 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -19,6 +19,11 @@ struct migration_target_control;
  */
 #define MIGRATEPAGE_SUCCESS		0
 
+/*
+ * Keep sync with:
+ * - macro MIGRATE_REASON in include/trace/events/migrate.h
+ * - migrate_reason_names[MR_TYPES] in mm/debug.c
+ */
 enum migrate_reason {
 	MR_COMPACTION,
 	MR_MEMORY_FAILURE,
@@ -32,7 +37,6 @@ enum migrate_reason {
 	MR_TYPES
 };
 
-/* In mm/debug.c; also keep sync with include/trace/events/migrate.h */
 extern const char *migrate_reason_names[MR_TYPES];
 
 #ifdef CONFIG_MIGRATION
diff --git a/mm/debug.c b/mm/debug.c
index e61037cded98..fae0f81ad831 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -26,6 +26,7 @@ const char *migrate_reason_names[MR_TYPES] = {
 	"numa_misplaced",
 	"contig_range",
 	"longterm_pin",
+	"demotion",
 };
 
 const struct trace_print_flags pageflag_names[] = {
-- 
cgit v1.2.3-71-gd317


From f792565326825ed806626da50c6f9a928f1079c1 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Wed, 29 Sep 2021 12:43:13 -0700
Subject: perf/core: fix userpage->time_enabled of inactive events

Users of rdpmc rely on the mmapped user page to calculate accurate
time_enabled. Currently, userpage->time_enabled is only updated when the
event is added to the pmu. As a result, inactive event (due to counter
multiplexing) does not have accurate userpage->time_enabled. This can
be reproduced with something like:

   /* open 20 task perf_event "cycles", to create multiplexing */

   fd = perf_event_open();  /* open task perf_event "cycles" */
   userpage = mmap(fd);     /* use mmap and rdmpc */

   while (true) {
     time_enabled_mmap = xxx; /* use logic in perf_event_mmap_page */
     time_enabled_read = read(fd).time_enabled;
     if (time_enabled_mmap > time_enabled_read)
         BUG();
   }

Fix this by updating userpage for inactive events in merge_sched_in.

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reported-and-tested-by: Lucian Grijincu <lucian@fb.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20210929194313.2398474-1-songliubraving@fb.com
---
 include/linux/perf_event.h |  4 +++-
 kernel/events/core.c       | 34 ++++++++++++++++++++++++++++++----
 2 files changed, 33 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fe156a8170aa..9b60bb89d86a 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -683,7 +683,9 @@ struct perf_event {
 	/*
 	 * timestamp shadows the actual context timing but it can
 	 * be safely used in NMI interrupt context. It reflects the
-	 * context time as it was when the event was last scheduled in.
+	 * context time as it was when the event was last scheduled in,
+	 * or when ctx_sched_in failed to schedule the event because we
+	 * run out of PMC.
 	 *
 	 * ctx_time already accounts for ctx->timestamp. Therefore to
 	 * compute ctx_time for a sample, simply add perf_clock().
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0c000cb01eeb..f23ca260307f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3707,6 +3707,29 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
 	return 0;
 }
 
+static inline bool event_update_userpage(struct perf_event *event)
+{
+	if (likely(!atomic_read(&event->mmap_count)))
+		return false;
+
+	perf_event_update_time(event);
+	perf_set_shadow_time(event, event->ctx);
+	perf_event_update_userpage(event);
+
+	return true;
+}
+
+static inline void group_update_userpage(struct perf_event *group_event)
+{
+	struct perf_event *event;
+
+	if (!event_update_userpage(group_event))
+		return;
+
+	for_each_sibling_event(event, group_event)
+		event_update_userpage(event);
+}
+
 static int merge_sched_in(struct perf_event *event, void *data)
 {
 	struct perf_event_context *ctx = event->ctx;
@@ -3725,14 +3748,15 @@ static int merge_sched_in(struct perf_event *event, void *data)
 	}
 
 	if (event->state == PERF_EVENT_STATE_INACTIVE) {
+		*can_add_hw = 0;
 		if (event->attr.pinned) {
 			perf_cgroup_event_disable(event, ctx);
 			perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+		} else {
+			ctx->rotate_necessary = 1;
+			perf_mux_hrtimer_restart(cpuctx);
+			group_update_userpage(event);
 		}
-
-		*can_add_hw = 0;
-		ctx->rotate_necessary = 1;
-		perf_mux_hrtimer_restart(cpuctx);
 	}
 
 	return 0;
@@ -6324,6 +6348,8 @@ accounting:
 
 		ring_buffer_attach(event, rb);
 
+		perf_event_update_time(event);
+		perf_set_shadow_time(event, event->ctx);
 		perf_event_init_userpage(event);
 		perf_event_update_userpage(event);
 	} else {
-- 
cgit v1.2.3-71-gd317


From 83d40a61046f73103b4e5d8f1310261487ff63b0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 20 Sep 2021 15:31:11 +0200
Subject: sched: Always inline is_percpu_thread()

  vmlinux.o: warning: objtool: check_preemption_disabled()+0x81: call to is_percpu_thread() leaves .noinstr.text section

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20210928084218.063371959@infradead.org
---
 include/linux/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 39039ce8ac4c..c1a927ddec64 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1720,7 +1720,7 @@ extern struct pid *cad_pid;
 #define tsk_used_math(p)			((p)->flags & PF_USED_MATH)
 #define used_math()				tsk_used_math(current)
 
-static inline bool is_percpu_thread(void)
+static __always_inline bool is_percpu_thread(void)
 {
 #ifdef CONFIG_SMP
 	return (current->flags & PF_NO_SETAFFINITY) &&
-- 
cgit v1.2.3-71-gd317


From 3f6cffb8604b537e3d7ea040d7f4368689638eaf Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 4 Oct 2021 16:01:40 -0700
Subject: etherdevice: use __dev_addr_set()

Andrew points out that eth_hw_addr_set() replaces memcpy()
calls so we can't use ether_addr_copy() which assumes
both arguments are 2-bytes aligned.

Reported-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/etherdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 928c411bd509..c58d50451485 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -308,7 +308,7 @@ static inline void ether_addr_copy(u8 *dst, const u8 *src)
  */
 static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr)
 {
-	ether_addr_copy(dev->dev_addr, addr);
+	__dev_addr_set(dev, addr, ETH_ALEN);
 }
 
 /**
-- 
cgit v1.2.3-71-gd317


From 94ad8aacbc2d4908b052c8bdb5ae13bc702f77ea Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 27 Sep 2021 16:40:50 +0200
Subject: ARM: omap1: move omap15xx local bus handling to usb.c

Commit 38225f2ef2f4 ("ARM/omap1: switch to use dma_direct_set_offset for
lbus DMA offsets") removed a lot of mach/memory.h, but left the USB
offset handling split into arch/arm/mach-omap1/usb.c and
drivers/usb/host/ohci-omap.c.

This can cause a randconfig build warning that now fails the build
with -Werror:

arch/arm/mach-omap1/usb.c:561:30: error: 'omap_1510_usb_ohci_nb' defined but not used [-Werror=unused-variable]
  561 | static struct notifier_block omap_1510_usb_ohci_nb = {
      |                              ^~~~~~~~~~~~~~~~~~~~~

Move it all into the platform file to get rid of the final
location that relies on mach/memory.h.

Acked-by: Felipe Balbi <felipe.balbi@linux.intel.com>
Acked-by: Alan Stern <stern@rowland.harvard.edu>
Acked-by: Tony Lindgren <tony@atomide.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/20210927144118.2464881-1-arnd@kernel.org'
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/arm/mach-omap1/include/mach/memory.h |  12 ----
 arch/arm/mach-omap1/usb.c                 | 116 +++++++++++++++++++++---------
 drivers/usb/host/ohci-omap.c              |  72 +------------------
 include/linux/platform_data/usb-omap1.h   |   2 +
 4 files changed, 86 insertions(+), 116 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-omap1/include/mach/memory.h b/arch/arm/mach-omap1/include/mach/memory.h
index 36bc0000cb6a..ba3a350479c8 100644
--- a/arch/arm/mach-omap1/include/mach/memory.h
+++ b/arch/arm/mach-omap1/include/mach/memory.h
@@ -9,16 +9,4 @@
 /* REVISIT: omap1 legacy drivers still rely on this */
 #include <mach/soc.h>
 
-/*
- * Bus address is physical address, except for OMAP-1510 Local Bus.
- * OMAP-1510 bus address is translated into a Local Bus address if the
- * OMAP bus type is lbus. We do the address translation based on the
- * device overriding the defaults used in the dma-mapping API.
- */
-
-/*
- * OMAP-1510 Local Bus address offset
- */
-#define OMAP1510_LB_OFFSET	UL(0x30000000)
-
 #endif
diff --git a/arch/arm/mach-omap1/usb.c b/arch/arm/mach-omap1/usb.c
index 86d3b3c157af..e60831c82b78 100644
--- a/arch/arm/mach-omap1/usb.c
+++ b/arch/arm/mach-omap1/usb.c
@@ -11,6 +11,7 @@
 #include <linux/platform_device.h>
 #include <linux/dma-map-ops.h>
 #include <linux/io.h>
+#include <linux/delay.h>
 
 #include <asm/irq.h>
 
@@ -206,8 +207,6 @@ static inline void udc_device_init(struct omap_usb_config *pdata)
 
 #endif
 
-#if	IS_ENABLED(CONFIG_USB_OHCI_HCD)
-
 /* The dmamask must be set for OHCI to work */
 static u64 ohci_dmamask = ~(u32)0;
 
@@ -236,20 +235,15 @@ static struct platform_device ohci_device = {
 
 static inline void ohci_device_init(struct omap_usb_config *pdata)
 {
+	if (!IS_ENABLED(CONFIG_USB_OHCI_HCD))
+		return;
+
 	if (cpu_is_omap7xx())
 		ohci_resources[1].start = INT_7XX_USB_HHC_1;
 	pdata->ohci_device = &ohci_device;
 	pdata->ocpi_enable = &ocpi_enable;
 }
 
-#else
-
-static inline void ohci_device_init(struct omap_usb_config *pdata)
-{
-}
-
-#endif
-
 #if	defined(CONFIG_USB_OTG) && defined(CONFIG_ARCH_OMAP_OTG)
 
 static struct resource otg_resources[] = {
@@ -534,33 +528,87 @@ bad:
 }
 
 #ifdef	CONFIG_ARCH_OMAP15XX
+/* OMAP-1510 OHCI has its own MMU for DMA */
+#define OMAP1510_LB_MEMSIZE	32	/* Should be same as SDRAM size */
+#define OMAP1510_LB_CLOCK_DIV	0xfffec10c
+#define OMAP1510_LB_MMU_CTL	0xfffec208
+#define OMAP1510_LB_MMU_LCK	0xfffec224
+#define OMAP1510_LB_MMU_LD_TLB	0xfffec228
+#define OMAP1510_LB_MMU_CAM_H	0xfffec22c
+#define OMAP1510_LB_MMU_CAM_L	0xfffec230
+#define OMAP1510_LB_MMU_RAM_H	0xfffec234
+#define OMAP1510_LB_MMU_RAM_L	0xfffec238
 
-/* ULPD_DPLL_CTRL */
-#define DPLL_IOB		(1 << 13)
-#define DPLL_PLL_ENABLE		(1 << 4)
-#define DPLL_LOCK		(1 << 0)
+/*
+ * Bus address is physical address, except for OMAP-1510 Local Bus.
+ * OMAP-1510 bus address is translated into a Local Bus address if the
+ * OMAP bus type is lbus.
+ */
+#define OMAP1510_LB_OFFSET	   UL(0x30000000)
 
-/* ULPD_APLL_CTRL */
-#define APLL_NDPLL_SWITCH	(1 << 0)
+/*
+ * OMAP-1510 specific Local Bus clock on/off
+ */
+static int omap_1510_local_bus_power(int on)
+{
+	if (on) {
+		omap_writel((1 << 1) | (1 << 0), OMAP1510_LB_MMU_CTL);
+		udelay(200);
+	} else {
+		omap_writel(0, OMAP1510_LB_MMU_CTL);
+	}
 
-static int omap_1510_usb_ohci_notifier(struct notifier_block *nb,
-		unsigned long event, void *data)
+	return 0;
+}
+
+/*
+ * OMAP-1510 specific Local Bus initialization
+ * NOTE: This assumes 32MB memory size in OMAP1510LB_MEMSIZE.
+ *       See also arch/mach-omap/memory.h for __virt_to_dma() and
+ *       __dma_to_virt() which need to match with the physical
+ *       Local Bus address below.
+ */
+static int omap_1510_local_bus_init(void)
 {
-	struct device *dev = data;
+	unsigned int tlb;
+	unsigned long lbaddr, physaddr;
+
+	omap_writel((omap_readl(OMAP1510_LB_CLOCK_DIV) & 0xfffffff8) | 0x4,
+	       OMAP1510_LB_CLOCK_DIV);
+
+	/* Configure the Local Bus MMU table */
+	for (tlb = 0; tlb < OMAP1510_LB_MEMSIZE; tlb++) {
+		lbaddr = tlb * 0x00100000 + OMAP1510_LB_OFFSET;
+		physaddr = tlb * 0x00100000 + PHYS_OFFSET;
+		omap_writel((lbaddr & 0x0fffffff) >> 22, OMAP1510_LB_MMU_CAM_H);
+		omap_writel(((lbaddr & 0x003ffc00) >> 6) | 0xc,
+		       OMAP1510_LB_MMU_CAM_L);
+		omap_writel(physaddr >> 16, OMAP1510_LB_MMU_RAM_H);
+		omap_writel((physaddr & 0x0000fc00) | 0x300, OMAP1510_LB_MMU_RAM_L);
+		omap_writel(tlb << 4, OMAP1510_LB_MMU_LCK);
+		omap_writel(0x1, OMAP1510_LB_MMU_LD_TLB);
+	}
 
-	if (event != BUS_NOTIFY_ADD_DEVICE)
-		return NOTIFY_DONE;
+	/* Enable the walking table */
+	omap_writel(omap_readl(OMAP1510_LB_MMU_CTL) | (1 << 3), OMAP1510_LB_MMU_CTL);
+	udelay(200);
 
-	if (strncmp(dev_name(dev), "ohci", 4) == 0 &&
-	    dma_direct_set_offset(dev, PHYS_OFFSET, OMAP1510_LB_OFFSET,
-			(u64)-1))
-		WARN_ONCE(1, "failed to set DMA offset\n");
-	return NOTIFY_OK;
+	return 0;
 }
 
-static struct notifier_block omap_1510_usb_ohci_nb = {
-	.notifier_call		= omap_1510_usb_ohci_notifier,
-};
+static void omap_1510_local_bus_reset(void)
+{
+	omap_1510_local_bus_power(1);
+	omap_1510_local_bus_init();
+}
+
+/* ULPD_DPLL_CTRL */
+#define DPLL_IOB		(1 << 13)
+#define DPLL_PLL_ENABLE		(1 << 4)
+#define DPLL_LOCK		(1 << 0)
+
+/* ULPD_APLL_CTRL */
+#define APLL_NDPLL_SWITCH	(1 << 0)
 
 static void __init omap_1510_usb_init(struct omap_usb_config *config)
 {
@@ -616,19 +664,19 @@ static void __init omap_1510_usb_init(struct omap_usb_config *config)
 	}
 #endif
 
-#if	IS_ENABLED(CONFIG_USB_OHCI_HCD)
-	if (config->register_host) {
+	if (IS_ENABLED(CONFIG_USB_OHCI_HCD) && config->register_host) {
 		int status;
 
-		bus_register_notifier(&platform_bus_type,
-				      &omap_1510_usb_ohci_nb);
 		ohci_device.dev.platform_data = config;
+		dma_direct_set_offset(&ohci_device.dev, PHYS_OFFSET,
+				      OMAP1510_LB_OFFSET, (u64)-1);
 		status = platform_device_register(&ohci_device);
 		if (status)
 			pr_debug("can't register OHCI device, %d\n", status);
 		/* hcd explicitly gates 48MHz */
+
+		config->lb_reset = omap_1510_local_bus_reset;
 	}
-#endif
 }
 
 #else
diff --git a/drivers/usb/host/ohci-omap.c b/drivers/usb/host/ohci-omap.c
index 0b3722770760..ded9738392e4 100644
--- a/drivers/usb/host/ohci-omap.c
+++ b/drivers/usb/host/ohci-omap.c
@@ -40,17 +40,6 @@
 #include <mach/usb.h>
 
 
-/* OMAP-1510 OHCI has its own MMU for DMA */
-#define OMAP1510_LB_MEMSIZE	32	/* Should be same as SDRAM size */
-#define OMAP1510_LB_CLOCK_DIV	0xfffec10c
-#define OMAP1510_LB_MMU_CTL	0xfffec208
-#define OMAP1510_LB_MMU_LCK	0xfffec224
-#define OMAP1510_LB_MMU_LD_TLB	0xfffec228
-#define OMAP1510_LB_MMU_CAM_H	0xfffec22c
-#define OMAP1510_LB_MMU_CAM_L	0xfffec230
-#define OMAP1510_LB_MMU_RAM_H	0xfffec234
-#define OMAP1510_LB_MMU_RAM_L	0xfffec238
-
 #define DRIVER_DESC "OHCI OMAP driver"
 
 struct ohci_omap_priv {
@@ -104,61 +93,6 @@ static int omap_ohci_transceiver_power(struct ohci_omap_priv *priv, int on)
 	return 0;
 }
 
-#ifdef CONFIG_ARCH_OMAP15XX
-/*
- * OMAP-1510 specific Local Bus clock on/off
- */
-static int omap_1510_local_bus_power(int on)
-{
-	if (on) {
-		omap_writel((1 << 1) | (1 << 0), OMAP1510_LB_MMU_CTL);
-		udelay(200);
-	} else {
-		omap_writel(0, OMAP1510_LB_MMU_CTL);
-	}
-
-	return 0;
-}
-
-/*
- * OMAP-1510 specific Local Bus initialization
- * NOTE: This assumes 32MB memory size in OMAP1510LB_MEMSIZE.
- *       See also arch/mach-omap/memory.h for __virt_to_dma() and
- *       __dma_to_virt() which need to match with the physical
- *       Local Bus address below.
- */
-static int omap_1510_local_bus_init(void)
-{
-	unsigned int tlb;
-	unsigned long lbaddr, physaddr;
-
-	omap_writel((omap_readl(OMAP1510_LB_CLOCK_DIV) & 0xfffffff8) | 0x4,
-	       OMAP1510_LB_CLOCK_DIV);
-
-	/* Configure the Local Bus MMU table */
-	for (tlb = 0; tlb < OMAP1510_LB_MEMSIZE; tlb++) {
-		lbaddr = tlb * 0x00100000 + OMAP1510_LB_OFFSET;
-		physaddr = tlb * 0x00100000 + PHYS_OFFSET;
-		omap_writel((lbaddr & 0x0fffffff) >> 22, OMAP1510_LB_MMU_CAM_H);
-		omap_writel(((lbaddr & 0x003ffc00) >> 6) | 0xc,
-		       OMAP1510_LB_MMU_CAM_L);
-		omap_writel(physaddr >> 16, OMAP1510_LB_MMU_RAM_H);
-		omap_writel((physaddr & 0x0000fc00) | 0x300, OMAP1510_LB_MMU_RAM_L);
-		omap_writel(tlb << 4, OMAP1510_LB_MMU_LCK);
-		omap_writel(0x1, OMAP1510_LB_MMU_LD_TLB);
-	}
-
-	/* Enable the walking table */
-	omap_writel(omap_readl(OMAP1510_LB_MMU_CTL) | (1 << 3), OMAP1510_LB_MMU_CTL);
-	udelay(200);
-
-	return 0;
-}
-#else
-#define omap_1510_local_bus_power(x)	{}
-#define omap_1510_local_bus_init()	{}
-#endif
-
 #ifdef	CONFIG_USB_OTG
 
 static void start_hnp(struct ohci_hcd *ohci)
@@ -229,10 +163,8 @@ static int ohci_omap_reset(struct usb_hcd *hcd)
 
 	omap_ohci_clock_power(priv, 1);
 
-	if (cpu_is_omap15xx()) {
-		omap_1510_local_bus_power(1);
-		omap_1510_local_bus_init();
-	}
+	if (config->lb_reset)
+		config->lb_reset();
 
 	ret = ohci_setup(hcd);
 	if (ret < 0)
diff --git a/include/linux/platform_data/usb-omap1.h b/include/linux/platform_data/usb-omap1.h
index 43b5ce139c37..878e572a78bf 100644
--- a/include/linux/platform_data/usb-omap1.h
+++ b/include/linux/platform_data/usb-omap1.h
@@ -48,6 +48,8 @@ struct omap_usb_config {
 	u32 (*usb2_init)(unsigned nwires, unsigned alt_pingroup);
 
 	int (*ocpi_enable)(void);
+
+	void (*lb_reset)(void);
 };
 
 #endif /* __LINUX_USB_OMAP1_H */
-- 
cgit v1.2.3-71-gd317


From 424953cf3c6657f1e67e1a2c5d6e3bb518ea4e9a Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 28 Sep 2021 09:50:27 +0200
Subject: qcom_scm: hide Kconfig symbol

Now that SCM can be a loadable module, we have to add another
dependency to avoid link failures when ipa or adreno-gpu are
built-in:

aarch64-linux-ld: drivers/net/ipa/ipa_main.o: in function `ipa_probe':
ipa_main.c:(.text+0xfc4): undefined reference to `qcom_scm_is_available'

ld.lld: error: undefined symbol: qcom_scm_is_available
>>> referenced by adreno_gpu.c
>>>               gpu/drm/msm/adreno/adreno_gpu.o:(adreno_zap_shader_load) in archive drivers/built-in.a

This can happen when CONFIG_ARCH_QCOM is disabled and we don't select
QCOM_MDT_LOADER, but some other module selects QCOM_SCM. Ideally we'd
use a similar dependency here to what we have for QCOM_RPROC_COMMON,
but that causes dependency loops from other things selecting QCOM_SCM.

This appears to be an endless problem, so try something different this
time:

 - CONFIG_QCOM_SCM becomes a hidden symbol that nothing 'depends on'
   but that is simply selected by all of its users

 - All the stubs in include/linux/qcom_scm.h can go away

 - arm-smccc.h needs to provide a stub for __arm_smccc_smc() to
   allow compile-testing QCOM_SCM on all architectures.

 - To avoid a circular dependency chain involving RESET_CONTROLLER
   and PINCTRL_SUNXI, drop the 'select RESET_CONTROLLER' statement.
   According to my testing this still builds fine, and the QCOM
   platform selects this symbol already.

Acked-by: Kalle Valo <kvalo@codeaurora.org>
Acked-by: Alex Elder <elder@linaro.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 drivers/firmware/Kconfig                   |  5 +--
 drivers/gpu/drm/msm/Kconfig                |  4 +-
 drivers/iommu/Kconfig                      |  3 +-
 drivers/iommu/arm/arm-smmu/Makefile        |  3 +-
 drivers/iommu/arm/arm-smmu/arm-smmu-impl.c |  3 +-
 drivers/media/platform/Kconfig             |  2 +-
 drivers/mmc/host/Kconfig                   |  2 +-
 drivers/net/ipa/Kconfig                    |  1 +
 drivers/net/wireless/ath/ath10k/Kconfig    |  2 +-
 drivers/pinctrl/qcom/Kconfig               |  3 +-
 include/linux/arm-smccc.h                  | 10 +++++
 include/linux/qcom_scm.h                   | 71 ------------------------------
 12 files changed, 24 insertions(+), 85 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig
index 220a58cf0a44..cda7d7162cbb 100644
--- a/drivers/firmware/Kconfig
+++ b/drivers/firmware/Kconfig
@@ -203,10 +203,7 @@ config INTEL_STRATIX10_RSU
 	  Say Y here if you want Intel RSU support.
 
 config QCOM_SCM
-	tristate "Qcom SCM driver"
-	depends on ARM || ARM64
-	depends on HAVE_ARM_SMCCC
-	select RESET_CONTROLLER
+	tristate
 
 config QCOM_SCM_DOWNLOAD_MODE_DEFAULT
 	bool "Qualcomm download mode enabled by default"
diff --git a/drivers/gpu/drm/msm/Kconfig b/drivers/gpu/drm/msm/Kconfig
index e9c6af78b1d7..3ddf739a6f9b 100644
--- a/drivers/gpu/drm/msm/Kconfig
+++ b/drivers/gpu/drm/msm/Kconfig
@@ -17,7 +17,7 @@ config DRM_MSM
 	select DRM_SCHED
 	select SHMEM
 	select TMPFS
-	select QCOM_SCM if ARCH_QCOM
+	select QCOM_SCM
 	select WANT_DEV_COREDUMP
 	select SND_SOC_HDMI_CODEC if SND_SOC
 	select SYNC_FILE
@@ -55,7 +55,7 @@ config DRM_MSM_GPU_SUDO
 
 config DRM_MSM_HDMI_HDCP
 	bool "Enable HDMI HDCP support in MSM DRM driver"
-	depends on DRM_MSM && QCOM_SCM
+	depends on DRM_MSM
 	default y
 	help
 	  Choose this option to enable HDCP state machine
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 124c41adeca1..c5c71b7ab7e8 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -308,7 +308,6 @@ config APPLE_DART
 config ARM_SMMU
 	tristate "ARM Ltd. System MMU (SMMU) Support"
 	depends on ARM64 || ARM || (COMPILE_TEST && !GENERIC_ATOMIC64)
-	depends on QCOM_SCM || !QCOM_SCM #if QCOM_SCM=m this can't be =y
 	select IOMMU_API
 	select IOMMU_IO_PGTABLE_LPAE
 	select ARM_DMA_USE_IOMMU if ARM
@@ -438,7 +437,7 @@ config QCOM_IOMMU
 	# Note: iommu drivers cannot (yet?) be built as modules
 	bool "Qualcomm IOMMU Support"
 	depends on ARCH_QCOM || (COMPILE_TEST && !GENERIC_ATOMIC64)
-	depends on QCOM_SCM=y
+	select QCOM_SCM
 	select IOMMU_API
 	select IOMMU_IO_PGTABLE_LPAE
 	select ARM_DMA_USE_IOMMU
diff --git a/drivers/iommu/arm/arm-smmu/Makefile b/drivers/iommu/arm/arm-smmu/Makefile
index e240a7bcf310..b0cc01aa20c9 100644
--- a/drivers/iommu/arm/arm-smmu/Makefile
+++ b/drivers/iommu/arm/arm-smmu/Makefile
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_QCOM_IOMMU) += qcom_iommu.o
 obj-$(CONFIG_ARM_SMMU) += arm_smmu.o
-arm_smmu-objs += arm-smmu.o arm-smmu-impl.o arm-smmu-nvidia.o arm-smmu-qcom.o
+arm_smmu-objs += arm-smmu.o arm-smmu-impl.o arm-smmu-nvidia.o
+arm_smmu-$(CONFIG_ARM_SMMU_QCOM) += arm-smmu-qcom.o
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
index 9f465e146799..2c25cce38060 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
@@ -215,7 +215,8 @@ struct arm_smmu_device *arm_smmu_impl_init(struct arm_smmu_device *smmu)
 	    of_device_is_compatible(np, "nvidia,tegra186-smmu"))
 		return nvidia_smmu_impl_init(smmu);
 
-	smmu = qcom_smmu_impl_init(smmu);
+	if (IS_ENABLED(CONFIG_ARM_SMMU_QCOM))
+		smmu = qcom_smmu_impl_init(smmu);
 
 	if (of_device_is_compatible(np, "marvell,ap806-smmu-500"))
 		smmu->impl = &mrvl_mmu500_impl;
diff --git a/drivers/media/platform/Kconfig b/drivers/media/platform/Kconfig
index 157c924686e4..80321e03809a 100644
--- a/drivers/media/platform/Kconfig
+++ b/drivers/media/platform/Kconfig
@@ -565,7 +565,7 @@ config VIDEO_QCOM_VENUS
 	depends on VIDEO_DEV && VIDEO_V4L2 && QCOM_SMEM
 	depends on (ARCH_QCOM && IOMMU_DMA) || COMPILE_TEST
 	select QCOM_MDT_LOADER if ARCH_QCOM
-	select QCOM_SCM if ARCH_QCOM
+	select QCOM_SCM
 	select VIDEOBUF2_DMA_CONTIG
 	select V4L2_MEM2MEM_DEV
 	help
diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig
index 71313961cc54..95b3511b0560 100644
--- a/drivers/mmc/host/Kconfig
+++ b/drivers/mmc/host/Kconfig
@@ -547,7 +547,7 @@ config MMC_SDHCI_MSM
 	depends on MMC_SDHCI_PLTFM
 	select MMC_SDHCI_IO_ACCESSORS
 	select MMC_CQHCI
-	select QCOM_SCM if MMC_CRYPTO && ARCH_QCOM
+	select QCOM_SCM if MMC_CRYPTO
 	help
 	  This selects the Secure Digital Host Controller Interface (SDHCI)
 	  support present in Qualcomm SOCs. The controller supports
diff --git a/drivers/net/ipa/Kconfig b/drivers/net/ipa/Kconfig
index 8f99cfa14680..d037682fb7ad 100644
--- a/drivers/net/ipa/Kconfig
+++ b/drivers/net/ipa/Kconfig
@@ -4,6 +4,7 @@ config QCOM_IPA
 	depends on ARCH_QCOM || COMPILE_TEST
 	depends on QCOM_RPROC_COMMON || (QCOM_RPROC_COMMON=n && COMPILE_TEST)
 	select QCOM_MDT_LOADER if ARCH_QCOM
+	select QCOM_SCM
 	select QCOM_QMI_HELPERS
 	help
 	  Choose Y or M here to include support for the Qualcomm
diff --git a/drivers/net/wireless/ath/ath10k/Kconfig b/drivers/net/wireless/ath/ath10k/Kconfig
index 741289e385d5..ca007b800f75 100644
--- a/drivers/net/wireless/ath/ath10k/Kconfig
+++ b/drivers/net/wireless/ath/ath10k/Kconfig
@@ -44,7 +44,7 @@ config ATH10K_SNOC
 	tristate "Qualcomm ath10k SNOC support"
 	depends on ATH10K
 	depends on ARCH_QCOM || COMPILE_TEST
-	depends on QCOM_SCM || !QCOM_SCM #if QCOM_SCM=m this can't be =y
+	select QCOM_SCM
 	select QCOM_QMI_HELPERS
 	help
 	  This module adds support for integrated WCN3990 chip connected
diff --git a/drivers/pinctrl/qcom/Kconfig b/drivers/pinctrl/qcom/Kconfig
index 32ea2a8ec02b..5ff4207df66e 100644
--- a/drivers/pinctrl/qcom/Kconfig
+++ b/drivers/pinctrl/qcom/Kconfig
@@ -3,7 +3,8 @@ if (ARCH_QCOM || COMPILE_TEST)
 
 config PINCTRL_MSM
 	tristate "Qualcomm core pin controller driver"
-	depends on GPIOLIB && (QCOM_SCM || !QCOM_SCM) #if QCOM_SCM=m this can't be =y
+	depends on GPIOLIB
+	select QCOM_SCM
 	select PINMUX
 	select PINCONF
 	select GENERIC_PINCONF
diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
index 7d1cabe15262..63ccb5252190 100644
--- a/include/linux/arm-smccc.h
+++ b/include/linux/arm-smccc.h
@@ -321,10 +321,20 @@ asmlinkage unsigned long __arm_smccc_sve_check(unsigned long x0);
  * from register 0 to 3 on return from the SMC instruction.  An optional
  * quirk structure provides vendor specific behavior.
  */
+#ifdef CONFIG_HAVE_ARM_SMCCC
 asmlinkage void __arm_smccc_smc(unsigned long a0, unsigned long a1,
 			unsigned long a2, unsigned long a3, unsigned long a4,
 			unsigned long a5, unsigned long a6, unsigned long a7,
 			struct arm_smccc_res *res, struct arm_smccc_quirk *quirk);
+#else
+static inline void __arm_smccc_smc(unsigned long a0, unsigned long a1,
+			unsigned long a2, unsigned long a3, unsigned long a4,
+			unsigned long a5, unsigned long a6, unsigned long a7,
+			struct arm_smccc_res *res, struct arm_smccc_quirk *quirk)
+{
+	*res = (struct arm_smccc_res){};
+}
+#endif
 
 /**
  * __arm_smccc_hvc() - make HVC calls
diff --git a/include/linux/qcom_scm.h b/include/linux/qcom_scm.h
index c0475d1c9885..81cad9e1e412 100644
--- a/include/linux/qcom_scm.h
+++ b/include/linux/qcom_scm.h
@@ -61,7 +61,6 @@ enum qcom_scm_ice_cipher {
 #define QCOM_SCM_PERM_RW (QCOM_SCM_PERM_READ | QCOM_SCM_PERM_WRITE)
 #define QCOM_SCM_PERM_RWX (QCOM_SCM_PERM_RW | QCOM_SCM_PERM_EXEC)
 
-#if IS_ENABLED(CONFIG_QCOM_SCM)
 extern bool qcom_scm_is_available(void);
 
 extern int qcom_scm_set_cold_boot_addr(void *entry, const cpumask_t *cpus);
@@ -115,74 +114,4 @@ extern int qcom_scm_lmh_dcvsh(u32 payload_fn, u32 payload_reg, u32 payload_val,
 extern int qcom_scm_lmh_profile_change(u32 profile_id);
 extern bool qcom_scm_lmh_dcvsh_available(void);
 
-#else
-
-#include <linux/errno.h>
-
-static inline bool qcom_scm_is_available(void) { return false; }
-
-static inline int qcom_scm_set_cold_boot_addr(void *entry,
-		const cpumask_t *cpus) { return -ENODEV; }
-static inline int qcom_scm_set_warm_boot_addr(void *entry,
-		const cpumask_t *cpus) { return -ENODEV; }
-static inline void qcom_scm_cpu_power_down(u32 flags) {}
-static inline u32 qcom_scm_set_remote_state(u32 state,u32 id)
-		{ return -ENODEV; }
-
-static inline int qcom_scm_pas_init_image(u32 peripheral, const void *metadata,
-		size_t size) { return -ENODEV; }
-static inline int qcom_scm_pas_mem_setup(u32 peripheral, phys_addr_t addr,
-		phys_addr_t size) { return -ENODEV; }
-static inline int qcom_scm_pas_auth_and_reset(u32 peripheral)
-		{ return -ENODEV; }
-static inline int qcom_scm_pas_shutdown(u32 peripheral) { return -ENODEV; }
-static inline bool qcom_scm_pas_supported(u32 peripheral) { return false; }
-
-static inline int qcom_scm_io_readl(phys_addr_t addr, unsigned int *val)
-		{ return -ENODEV; }
-static inline int qcom_scm_io_writel(phys_addr_t addr, unsigned int val)
-		{ return -ENODEV; }
-
-static inline bool qcom_scm_restore_sec_cfg_available(void) { return false; }
-static inline int qcom_scm_restore_sec_cfg(u32 device_id, u32 spare)
-		{ return -ENODEV; }
-static inline int qcom_scm_iommu_secure_ptbl_size(u32 spare, size_t *size)
-		{ return -ENODEV; }
-static inline int qcom_scm_iommu_secure_ptbl_init(u64 addr, u32 size, u32 spare)
-		{ return -ENODEV; }
-extern inline int qcom_scm_mem_protect_video_var(u32 cp_start, u32 cp_size,
-						 u32 cp_nonpixel_start,
-						 u32 cp_nonpixel_size)
-		{ return -ENODEV; }
-static inline int qcom_scm_assign_mem(phys_addr_t mem_addr, size_t mem_sz,
-		unsigned int *src, const struct qcom_scm_vmperm *newvm,
-		unsigned int dest_cnt) { return -ENODEV; }
-
-static inline bool qcom_scm_ocmem_lock_available(void) { return false; }
-static inline int qcom_scm_ocmem_lock(enum qcom_scm_ocmem_client id, u32 offset,
-		u32 size, u32 mode) { return -ENODEV; }
-static inline int qcom_scm_ocmem_unlock(enum qcom_scm_ocmem_client id,
-		u32 offset, u32 size) { return -ENODEV; }
-
-static inline bool qcom_scm_ice_available(void) { return false; }
-static inline int qcom_scm_ice_invalidate_key(u32 index) { return -ENODEV; }
-static inline int qcom_scm_ice_set_key(u32 index, const u8 *key, u32 key_size,
-				       enum qcom_scm_ice_cipher cipher,
-				       u32 data_unit_size) { return -ENODEV; }
-
-static inline bool qcom_scm_hdcp_available(void) { return false; }
-static inline int qcom_scm_hdcp_req(struct qcom_scm_hdcp_req *req, u32 req_cnt,
-		u32 *resp) { return -ENODEV; }
-
-static inline int qcom_scm_qsmmu500_wait_safe_toggle(bool en)
-		{ return -ENODEV; }
-
-static inline int qcom_scm_lmh_dcvsh(u32 payload_fn, u32 payload_reg, u32 payload_val,
-				     u64 limit_node, u32 node_id, u64 version)
-		{ return -ENODEV; }
-
-static inline int qcom_scm_lmh_profile_change(u32 profile_id) { return -ENODEV; }
-
-static inline bool qcom_scm_lmh_dcvsh_available(void) { return -ENODEV; }
-#endif
 #endif
-- 
cgit v1.2.3-71-gd317


From 5bded8259ee3815a91791462dfb3312480779c3d Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Thu, 7 Oct 2021 19:47:11 +0300
Subject: net: dsa: mv88e6xxx: isolate the ATU databases of standalone and
 bridged ports

Similar to commit 6087175b7991 ("net: dsa: mt7530: use independent VLAN
learning on VLAN-unaware bridges"), software forwarding between an
unoffloaded LAG port (a bonding interface with an unsupported policy)
and a mv88e6xxx user port directly under a bridge is broken.

We adopt the same strategy, which is to make the standalone ports not
find any ATU entry learned on a bridge port.

Theory: the mv88e6xxx ATU is looked up by FID and MAC address. There are
as many FIDs as VIDs (4096). The FID is derived from the VID when
possible (the VTU maps a VID to a FID), with a fallback to the port
based default FID value when not (802.1Q Mode is disabled on the port,
or the classified VID isn't present in the VTU).

The mv88e6xxx driver makes the following use of FIDs and VIDs:

- the port's DefaultVID (to which untagged & pvid-tagged packets get
  classified) is 0 and is absent from the VTU, so this kind of packets is
  processed in FID 0, the default FID assigned by mv88e6xxx_setup_port.

- every time a bridge VLAN is created, mv88e6xxx_port_vlan_join() ->
  mv88e6xxx_atu_new() associates a FID with that VID which increases
  linearly starting from 1. Like this:

  bridge vlan add dev lan0 vid 100 # FID 1
  bridge vlan add dev lan1 vid 100 # still FID 1
  bridge vlan add dev lan2 vid 1024 # FID 2

The FID allocation made by the driver is sub-optimal for the following
reasons:

(a) A standalone port has a DefaultPVID of 0 and a default FID of 0 too.
    A VLAN-unaware bridged port has a DefaultPVID of 0 and a default FID
    of 0 too. The difference is that the bridged ports may learn ATU
    entries, while the standalone port has the requirement that it must
    not, and must not find them either. Standalone ports must not use
    the same FID as ports belonging to a bridge. All standalone ports
    can use the same FID, since the ATU will never have an entry in
    that FID.

(b) Multiple VLAN-unaware bridges will all use a DefaultPVID of 0 and a
    default FID of 0 on all their ports. The FDBs will not be isolated
    between these bridges. Every VLAN-unaware bridge must use the same
    FID on all its ports, different from the FID of other bridge ports.

(c) Each bridge VLAN uses a unique FID which is useful for Independent
    VLAN Learning, but the same VLAN ID on multiple VLAN-aware bridges
    will result in the same FID being used by mv88e6xxx_atu_new().
    The correct behavior is for VLAN 1 in br0 to have a different FID
    compared to VLAN 1 in br1.

This patch cannot fix all the above. Traditionally the DSA framework did
not care about this, and the reality is that DSA core involvement is
needed for the aforementioned issues to be solved. The only thing we can
solve here is an issue which does not require API changes, and that is
issue (a), aka use a different FID for standalone ports vs ports under
VLAN-unaware bridges.

The first step is deciding what VID and FID to use for standalone ports,
and what VID and FID for bridged ports. The 0/0 pair for standalone
ports is what they used up till now, let's keep using that. For bridged
ports, there are 2 cases:

- VLAN-aware ports will never end up using the port default FID, because
  packets will always be classified to a VID in the VTU or dropped
  otherwise. The FID is the one associated with the VID in the VTU.

- On VLAN-unaware ports, we _could_ leave their DefaultVID (pvid) at
  zero (just as in the case of standalone ports), and just change the
  port's default FID from 0 to a different number (say 1).

However, Tobias points out that there is one more requirement to cater to:
cross-chip bridging. The Marvell DSA header does not carry the FID in
it, only the VID. So once a packet crosses a DSA link, if it has a VID
of zero it will get classified to the default FID of that cascade port.
Relying on a port default FID for upstream cascade ports results in
contradictions: a default FID of 0 breaks ATU isolation of bridged ports
on the downstream switch, a default FID of 1 breaks standalone ports on
the downstream switch.

So not only must standalone ports have different FIDs compared to
bridged ports, they must also have different DefaultVID values.
IEEE 802.1Q defines two reserved VID values: 0 and 4095. So we simply
choose 4095 as the DefaultVID of ports belonging to VLAN-unaware
bridges, and VID 4095 maps to FID 1.

For the xmit operation to look up the same ATU database, we need to put
VID 4095 in DSA tags sent to ports belonging to VLAN-unaware bridges
too. All shared ports are configured to map this VID to the bridging
FID, because they are members of that VLAN in the VTU. Shared ports
don't need to have 802.1QMode enabled in any way, they always parse the
VID from the DSA header, they don't need to look at the 802.1Q header.

We install VID 4095 to the VTU in mv88e6xxx_setup_port(), with the
mention that mv88e6xxx_vtu_setup() which was located right below that
call was flushing the VTU so those entries wouldn't be preserved.
So we need to relocate the VTU flushing prior to the port initialization
during ->setup(). Also note that this is why it is safe to assume that
VID 4095 will get associated with FID 1: the user ports haven't been
created, so there is no avenue for the user to create a bridge VLAN
which could otherwise race with the creation of another FID which would
otherwise use up the non-reserved FID value of 1.

[ Currently mv88e6xxx_port_vlan_join() doesn't have the option of
  specifying a preferred FID, it always calls mv88e6xxx_atu_new(). ]

mv88e6xxx_port_db_load_purge() is the function to access the ATU for
FDB/MDB entries, and it used to determine the FID to use for
VLAN-unaware FDB entries (VID=0) using mv88e6xxx_port_get_fid().
But the driver only called mv88e6xxx_port_set_fid() once, during probe,
so no surprises, the port FID was always 0, the call to get_fid() was
redundant. As much as I would have wanted to not touch that code, the
logic is broken when we add a new FID which is not the port-based
default. Now the port-based default FID only corresponds to standalone
ports, and FDB/MDB entries belong to the bridging service. So while in
the future, when the DSA API will support FDB isolation, we will have to
figure out the FID based on the bridge number, for now there's a single
bridging FID, so hardcode that.

Lastly, the tagger needs to check, when it is transmitting a VLAN
untagged skb, whether it is sending it towards a bridged or a standalone
port. When we see it is bridged we assume the bridge is VLAN-unaware.
Not because it cannot be VLAN-aware but:

- if we are transmitting from a VLAN-aware bridge we are likely doing so
  using TX forwarding offload. That code path guarantees that skbs have
  a vlan hwaccel tag in them, so we would not enter the "else" branch
  of the "if (skb->protocol == htons(ETH_P_8021Q))" condition.

- if we are transmitting on behalf of a VLAN-aware bridge but with no TX
  forwarding offload (no PVT support, out of space in the PVT, whatever),
  we would indeed be transmitting with VLAN 4095 instead of the bridge
  device's pvid. However we would be injecting a "From CPU" frame, and
  the switch won't learn from that - it only learns from "Forward" frames.
  So it is inconsequential for address learning. And VLAN 4095 is
  absolutely enough for the frame to exit the switch, since we never
  remove that VLAN from any port.

Fixes: 57e661aae6a8 ("net: dsa: mv88e6xxx: Link aggregation support")
Reported-by: Tobias Waldekranz <tobias@waldekranz.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 MAINTAINERS                      |  1 +
 drivers/net/dsa/mv88e6xxx/chip.c | 67 ++++++++++++++++++++++++++++++++--------
 drivers/net/dsa/mv88e6xxx/chip.h |  3 ++
 include/linux/dsa/mv88e6xxx.h    | 13 ++++++++
 net/dsa/tag_dsa.c                | 12 +++++--
 5 files changed, 80 insertions(+), 16 deletions(-)
 create mode 100644 include/linux/dsa/mv88e6xxx.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index a4a0c2baaf27..17f652b2f653 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11153,6 +11153,7 @@ S:	Maintained
 F:	Documentation/devicetree/bindings/net/dsa/marvell.txt
 F:	Documentation/networking/devlink/mv88e6xxx.rst
 F:	drivers/net/dsa/mv88e6xxx/
+F:	include/linux/dsa/mv88e6xxx.h
 F:	include/linux/platform_data/mv88e6xxx.h
 
 MARVELL ARMADA 3700 PHY DRIVERS
diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index d672112afffd..d7b29792732b 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -12,6 +12,7 @@
 
 #include <linux/bitfield.h>
 #include <linux/delay.h>
+#include <linux/dsa/mv88e6xxx.h>
 #include <linux/etherdevice.h>
 #include <linux/ethtool.h>
 #include <linux/if_bridge.h>
@@ -1681,13 +1682,17 @@ static int mv88e6xxx_port_commit_pvid(struct mv88e6xxx_chip *chip, int port)
 {
 	struct dsa_port *dp = dsa_to_port(chip->ds, port);
 	struct mv88e6xxx_port *p = &chip->ports[port];
+	u16 pvid = MV88E6XXX_VID_STANDALONE;
 	bool drop_untagged = false;
-	u16 pvid = 0;
 	int err;
 
-	if (dp->bridge_dev && br_vlan_enabled(dp->bridge_dev)) {
-		pvid = p->bridge_pvid.vid;
-		drop_untagged = !p->bridge_pvid.valid;
+	if (dp->bridge_dev) {
+		if (br_vlan_enabled(dp->bridge_dev)) {
+			pvid = p->bridge_pvid.vid;
+			drop_untagged = !p->bridge_pvid.valid;
+		} else {
+			pvid = MV88E6XXX_VID_BRIDGED;
+		}
 	}
 
 	err = mv88e6xxx_port_set_pvid(chip, port, pvid);
@@ -1754,11 +1759,15 @@ static int mv88e6xxx_port_db_load_purge(struct mv88e6xxx_chip *chip, int port,
 	u16 fid;
 	int err;
 
-	/* Null VLAN ID corresponds to the port private database */
+	/* Ports have two private address databases: one for when the port is
+	 * standalone and one for when the port is under a bridge and the
+	 * 802.1Q mode is disabled. When the port is standalone, DSA wants its
+	 * address database to remain 100% empty, so we never load an ATU entry
+	 * into a standalone port's database. Therefore, translate the null
+	 * VLAN ID into the port's database used for VLAN-unaware bridging.
+	 */
 	if (vid == 0) {
-		err = mv88e6xxx_port_get_fid(chip, port, &fid);
-		if (err)
-			return err;
+		fid = MV88E6XXX_FID_BRIDGED;
 	} else {
 		err = mv88e6xxx_vtu_get(chip, vid, &vlan);
 		if (err)
@@ -2434,7 +2443,16 @@ static int mv88e6xxx_port_bridge_join(struct dsa_switch *ds, int port,
 	int err;
 
 	mv88e6xxx_reg_lock(chip);
+
 	err = mv88e6xxx_bridge_map(chip, br);
+	if (err)
+		goto unlock;
+
+	err = mv88e6xxx_port_commit_pvid(chip, port);
+	if (err)
+		goto unlock;
+
+unlock:
 	mv88e6xxx_reg_unlock(chip);
 
 	return err;
@@ -2444,11 +2462,20 @@ static void mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port,
 					struct net_device *br)
 {
 	struct mv88e6xxx_chip *chip = ds->priv;
+	int err;
 
 	mv88e6xxx_reg_lock(chip);
+
 	if (mv88e6xxx_bridge_map(chip, br) ||
 	    mv88e6xxx_port_vlan_map(chip, port))
 		dev_err(ds->dev, "failed to remap in-chip Port VLAN\n");
+
+	err = mv88e6xxx_port_commit_pvid(chip, port);
+	if (err)
+		dev_err(ds->dev,
+			"port %d failed to restore standalone pvid: %pe\n",
+			port, ERR_PTR(err));
+
 	mv88e6xxx_reg_unlock(chip);
 }
 
@@ -2894,6 +2921,20 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port)
 	if (err)
 		return err;
 
+	/* Associate MV88E6XXX_VID_BRIDGED with MV88E6XXX_FID_BRIDGED in the
+	 * ATU by virtue of the fact that mv88e6xxx_atu_new() will pick it as
+	 * the first free FID after MV88E6XXX_FID_STANDALONE. This will be used
+	 * as the private PVID on ports under a VLAN-unaware bridge.
+	 * Shared (DSA and CPU) ports must also be members of it, to translate
+	 * the VID from the DSA tag into MV88E6XXX_FID_BRIDGED, instead of
+	 * relying on their port default FID.
+	 */
+	err = mv88e6xxx_port_vlan_join(chip, port, MV88E6XXX_VID_BRIDGED,
+				       MV88E6XXX_G1_VTU_DATA_MEMBER_TAG_UNTAGGED,
+				       false);
+	if (err)
+		return err;
+
 	if (chip->info->ops->port_set_jumbo_size) {
 		err = chip->info->ops->port_set_jumbo_size(chip, port, 10218);
 		if (err)
@@ -2966,7 +3007,7 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port)
 	 * database, and allow bidirectional communication between the
 	 * CPU and DSA port(s), and the other ports.
 	 */
-	err = mv88e6xxx_port_set_fid(chip, port, 0);
+	err = mv88e6xxx_port_set_fid(chip, port, MV88E6XXX_FID_STANDALONE);
 	if (err)
 		return err;
 
@@ -3156,6 +3197,10 @@ static int mv88e6xxx_setup(struct dsa_switch *ds)
 		}
 	}
 
+	err = mv88e6xxx_vtu_setup(chip);
+	if (err)
+		goto unlock;
+
 	/* Setup Switch Port Registers */
 	for (i = 0; i < mv88e6xxx_num_ports(chip); i++) {
 		if (dsa_is_unused_port(ds, i))
@@ -3185,10 +3230,6 @@ static int mv88e6xxx_setup(struct dsa_switch *ds)
 	if (err)
 		goto unlock;
 
-	err = mv88e6xxx_vtu_setup(chip);
-	if (err)
-		goto unlock;
-
 	err = mv88e6xxx_pvt_setup(chip);
 	if (err)
 		goto unlock;
diff --git a/drivers/net/dsa/mv88e6xxx/chip.h b/drivers/net/dsa/mv88e6xxx/chip.h
index 33d067e8396d..8271b8aa7b71 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.h
+++ b/drivers/net/dsa/mv88e6xxx/chip.h
@@ -21,6 +21,9 @@
 #define EDSA_HLEN		8
 #define MV88E6XXX_N_FID		4096
 
+#define MV88E6XXX_FID_STANDALONE	0
+#define MV88E6XXX_FID_BRIDGED		1
+
 /* PVT limits for 4-bit port and 5-bit switch */
 #define MV88E6XXX_MAX_PVT_SWITCHES	32
 #define MV88E6XXX_MAX_PVT_PORTS		16
diff --git a/include/linux/dsa/mv88e6xxx.h b/include/linux/dsa/mv88e6xxx.h
new file mode 100644
index 000000000000..8c3d45eca46b
--- /dev/null
+++ b/include/linux/dsa/mv88e6xxx.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright 2021 NXP
+ */
+
+#ifndef _NET_DSA_TAG_MV88E6XXX_H
+#define _NET_DSA_TAG_MV88E6XXX_H
+
+#include <linux/if_vlan.h>
+
+#define MV88E6XXX_VID_STANDALONE	0
+#define MV88E6XXX_VID_BRIDGED		(VLAN_N_VID - 1)
+
+#endif
diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c
index 68d5ddc3ef35..b3da4b2ea11c 100644
--- a/net/dsa/tag_dsa.c
+++ b/net/dsa/tag_dsa.c
@@ -45,6 +45,7 @@
  *   6    6       2        2      4    2       N
  */
 
+#include <linux/dsa/mv88e6xxx.h>
 #include <linux/etherdevice.h>
 #include <linux/list.h>
 #include <linux/slab.h>
@@ -164,16 +165,21 @@ static struct sk_buff *dsa_xmit_ll(struct sk_buff *skb, struct net_device *dev,
 			dsa_header[2] &= ~0x10;
 		}
 	} else {
+		struct net_device *br = dp->bridge_dev;
+		u16 vid;
+
+		vid = br ? MV88E6XXX_VID_BRIDGED : MV88E6XXX_VID_STANDALONE;
+
 		skb_push(skb, DSA_HLEN + extra);
 		dsa_alloc_etype_header(skb, DSA_HLEN + extra);
 
-		/* Construct untagged DSA tag. */
+		/* Construct DSA header from untagged frame. */
 		dsa_header = dsa_etype_header_pos_tx(skb) + extra;
 
 		dsa_header[0] = (cmd << 6) | tag_dev;
 		dsa_header[1] = tag_port << 3;
-		dsa_header[2] = 0;
-		dsa_header[3] = 0;
+		dsa_header[2] = vid >> 8;
+		dsa_header[3] = vid & 0xff;
 	}
 
 	return skb;
-- 
cgit v1.2.3-71-gd317


From 0bc73ad46a76ed6ece4dcacb28858e7b38561e1c Mon Sep 17 00:00:00 2001
From: Aya Levin <ayal@nvidia.com>
Date: Sun, 26 Sep 2021 17:55:41 +0300
Subject: net/mlx5e: Mutually exclude RX-FCS and RX-port-timestamp

Due to current HW arch limitations, RX-FCS (scattering FCS frame field
to software) and RX-port-timestamp (improved timestamp accuracy on the
receive side) can't work together.
RX-port-timestamp is not controlled by the user and it is enabled by
default when supported by the HW/FW.
This patch sets RX-port-timestamp opposite to RX-FCS configuration.

Fixes: 102722fc6832 ("net/mlx5e: Add support for RXFCS feature flag")
Signed-off-by: Aya Levin <ayal@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 57 +++++++++++++++++++++--
 include/linux/mlx5/mlx5_ifc.h                     | 10 +++-
 2 files changed, 60 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 336aa07313da..09c8b71b186c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3325,20 +3325,67 @@ static int set_feature_rx_all(struct net_device *netdev, bool enable)
 	return mlx5_set_port_fcs(mdev, !enable);
 }
 
+static int mlx5e_set_rx_port_ts(struct mlx5_core_dev *mdev, bool enable)
+{
+	u32 in[MLX5_ST_SZ_DW(pcmr_reg)] = {};
+	bool supported, curr_state;
+	int err;
+
+	if (!MLX5_CAP_GEN(mdev, ports_check))
+		return 0;
+
+	err = mlx5_query_ports_check(mdev, in, sizeof(in));
+	if (err)
+		return err;
+
+	supported = MLX5_GET(pcmr_reg, in, rx_ts_over_crc_cap);
+	curr_state = MLX5_GET(pcmr_reg, in, rx_ts_over_crc);
+
+	if (!supported || enable == curr_state)
+		return 0;
+
+	MLX5_SET(pcmr_reg, in, local_port, 1);
+	MLX5_SET(pcmr_reg, in, rx_ts_over_crc, enable);
+
+	return mlx5_set_ports_check(mdev, in, sizeof(in));
+}
+
 static int set_feature_rx_fcs(struct net_device *netdev, bool enable)
 {
 	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5e_channels *chs = &priv->channels;
+	struct mlx5_core_dev *mdev = priv->mdev;
 	int err;
 
 	mutex_lock(&priv->state_lock);
 
-	priv->channels.params.scatter_fcs_en = enable;
-	err = mlx5e_modify_channels_scatter_fcs(&priv->channels, enable);
-	if (err)
-		priv->channels.params.scatter_fcs_en = !enable;
+	if (enable) {
+		err = mlx5e_set_rx_port_ts(mdev, false);
+		if (err)
+			goto out;
 
-	mutex_unlock(&priv->state_lock);
+		chs->params.scatter_fcs_en = true;
+		err = mlx5e_modify_channels_scatter_fcs(chs, true);
+		if (err) {
+			chs->params.scatter_fcs_en = false;
+			mlx5e_set_rx_port_ts(mdev, true);
+		}
+	} else {
+		chs->params.scatter_fcs_en = false;
+		err = mlx5e_modify_channels_scatter_fcs(chs, false);
+		if (err) {
+			chs->params.scatter_fcs_en = true;
+			goto out;
+		}
+		err = mlx5e_set_rx_port_ts(mdev, true);
+		if (err) {
+			mlx5_core_warn(mdev, "Failed to set RX port timestamp %d\n", err);
+			err = 0;
+		}
+	}
 
+out:
+	mutex_unlock(&priv->state_lock);
 	return err;
 }
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index f3638d09ba77..993204a6c1a1 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -9475,16 +9475,22 @@ struct mlx5_ifc_pcmr_reg_bits {
 	u8         reserved_at_0[0x8];
 	u8         local_port[0x8];
 	u8         reserved_at_10[0x10];
+
 	u8         entropy_force_cap[0x1];
 	u8         entropy_calc_cap[0x1];
 	u8         entropy_gre_calc_cap[0x1];
-	u8         reserved_at_23[0x1b];
+	u8         reserved_at_23[0xf];
+	u8         rx_ts_over_crc_cap[0x1];
+	u8         reserved_at_33[0xb];
 	u8         fcs_cap[0x1];
 	u8         reserved_at_3f[0x1];
+
 	u8         entropy_force[0x1];
 	u8         entropy_calc[0x1];
 	u8         entropy_gre_calc[0x1];
-	u8         reserved_at_43[0x1b];
+	u8         reserved_at_43[0xf];
+	u8         rx_ts_over_crc[0x1];
+	u8         reserved_at_53[0xb];
 	u8         fcs_chk[0x1];
 	u8         reserved_at_5f[0x1];
 };
-- 
cgit v1.2.3-71-gd317


From 28da0555c3b542d605e4ca26eea6a740cf2c9174 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 22 Sep 2021 17:37:25 +0300
Subject: net: dsa: move sja1110_process_meta_tstamp inside the tagging
 protocol driver

The problem is that DSA tagging protocols really must not depend on the
switch driver, because this creates a circular dependency at insmod
time, and the switch driver will effectively not load when the tagging
protocol driver is missing.

The code was structured in the way it was for a reason, though. The DSA
driver-facing API for PTP timestamping relies on the assumption that
two-step TX timestamps are provided by the hardware in an out-of-band
manner, typically by raising an interrupt and making that timestamp
available inside some sort of FIFO which is to be accessed over
SPI/MDIO/etc.

So the API puts .port_txtstamp into dsa_switch_ops, because it is
expected that the switch driver needs to save some state (like put the
skb into a queue until its TX timestamp arrives).

On SJA1110, TX timestamps are provided by the switch as Ethernet
packets, so this makes them be received and processed by the tagging
protocol driver. This in itself is great, because the timestamps are
full 64-bit and do not require reconstruction, and since Ethernet is the
fastest I/O method available to/from the switch, PTP timestamps arrive
very quickly, no matter how bottlenecked the SPI connection is, because
SPI interaction is not needed at all.

DSA's code structure and strict isolation between the tagging protocol
driver and the switch driver break the natural code organization.

When the tagging protocol driver receives a packet which is classified
as a metadata packet containing timestamps, it passes those timestamps
one by one to the switch driver, which then proceeds to compare them
based on the recorded timestamp ID that was generated in .port_txtstamp.

The communication between the tagging protocol and the switch driver is
done through a method exported by the switch driver, sja1110_process_meta_tstamp.
To satisfy build requirements, we force a dependency to build the
tagging protocol driver as a module when the switch driver is a module.
However, as explained in the first paragraph, that causes the circular
dependency.

To solve this, move the skb queue from struct sja1105_private :: struct
sja1105_ptp_data to struct sja1105_private :: struct sja1105_tagger_data.
The latter is a data structure for which hacks have already been put
into place to be able to create persistent storage per switch that is
accessible from the tagging protocol driver (see sja1105_setup_ports).

With the skb queue directly accessible from the tagging protocol driver,
we can now move sja1110_process_meta_tstamp into the tagging driver
itself, and avoid exporting a symbol.

Fixes: 566b18c8b752 ("net: dsa: sja1105: implement TX timestamping for SJA1110")
Link: https://lore.kernel.org/netdev/20210908220834.d7gmtnwrorhharna@skbuf/
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/dsa/sja1105/sja1105_ptp.c | 45 +++++------------------------------
 drivers/net/dsa/sja1105/sja1105_ptp.h | 19 ---------------
 include/linux/dsa/sja1105.h           | 29 +++++++++++-----------
 net/dsa/tag_sja1105.c                 | 43 +++++++++++++++++++++++++++++++++
 4 files changed, 63 insertions(+), 73 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.c b/drivers/net/dsa/sja1105/sja1105_ptp.c
index 691f6dd7e669..54396992a919 100644
--- a/drivers/net/dsa/sja1105/sja1105_ptp.c
+++ b/drivers/net/dsa/sja1105/sja1105_ptp.c
@@ -64,6 +64,7 @@ enum sja1105_ptp_clk_mode {
 static int sja1105_change_rxtstamping(struct sja1105_private *priv,
 				      bool on)
 {
+	struct sja1105_tagger_data *tagger_data = &priv->tagger_data;
 	struct sja1105_ptp_data *ptp_data = &priv->ptp_data;
 	struct sja1105_general_params_entry *general_params;
 	struct sja1105_table *table;
@@ -79,7 +80,7 @@ static int sja1105_change_rxtstamping(struct sja1105_private *priv,
 		priv->tagger_data.stampable_skb = NULL;
 	}
 	ptp_cancel_worker_sync(ptp_data->clock);
-	skb_queue_purge(&ptp_data->skb_txtstamp_queue);
+	skb_queue_purge(&tagger_data->skb_txtstamp_queue);
 	skb_queue_purge(&ptp_data->skb_rxtstamp_queue);
 
 	return sja1105_static_config_reload(priv, SJA1105_RX_HWTSTAMPING);
@@ -452,40 +453,6 @@ bool sja1105_port_rxtstamp(struct dsa_switch *ds, int port,
 	return priv->info->rxtstamp(ds, port, skb);
 }
 
-void sja1110_process_meta_tstamp(struct dsa_switch *ds, int port, u8 ts_id,
-				 enum sja1110_meta_tstamp dir, u64 tstamp)
-{
-	struct sja1105_private *priv = ds->priv;
-	struct sja1105_ptp_data *ptp_data = &priv->ptp_data;
-	struct sk_buff *skb, *skb_tmp, *skb_match = NULL;
-	struct skb_shared_hwtstamps shwt = {0};
-
-	/* We don't care about RX timestamps on the CPU port */
-	if (dir == SJA1110_META_TSTAMP_RX)
-		return;
-
-	spin_lock(&ptp_data->skb_txtstamp_queue.lock);
-
-	skb_queue_walk_safe(&ptp_data->skb_txtstamp_queue, skb, skb_tmp) {
-		if (SJA1105_SKB_CB(skb)->ts_id != ts_id)
-			continue;
-
-		__skb_unlink(skb, &ptp_data->skb_txtstamp_queue);
-		skb_match = skb;
-
-		break;
-	}
-
-	spin_unlock(&ptp_data->skb_txtstamp_queue.lock);
-
-	if (WARN_ON(!skb_match))
-		return;
-
-	shwt.hwtstamp = ns_to_ktime(sja1105_ticks_to_ns(tstamp));
-	skb_complete_tx_timestamp(skb_match, &shwt);
-}
-EXPORT_SYMBOL_GPL(sja1110_process_meta_tstamp);
-
 /* In addition to cloning the skb which is done by the common
  * sja1105_port_txtstamp, we need to generate a timestamp ID and save the
  * packet to the TX timestamping queue.
@@ -494,7 +461,6 @@ void sja1110_txtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb)
 {
 	struct sk_buff *clone = SJA1105_SKB_CB(skb)->clone;
 	struct sja1105_private *priv = ds->priv;
-	struct sja1105_ptp_data *ptp_data = &priv->ptp_data;
 	struct sja1105_port *sp = &priv->ports[port];
 	u8 ts_id;
 
@@ -510,7 +476,7 @@ void sja1110_txtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb)
 
 	spin_unlock(&sp->data->meta_lock);
 
-	skb_queue_tail(&ptp_data->skb_txtstamp_queue, clone);
+	skb_queue_tail(&sp->data->skb_txtstamp_queue, clone);
 }
 
 /* Called from dsa_skb_tx_timestamp. This callback is just to clone
@@ -953,7 +919,7 @@ int sja1105_ptp_clock_register(struct dsa_switch *ds)
 	/* Only used on SJA1105 */
 	skb_queue_head_init(&ptp_data->skb_rxtstamp_queue);
 	/* Only used on SJA1110 */
-	skb_queue_head_init(&ptp_data->skb_txtstamp_queue);
+	skb_queue_head_init(&tagger_data->skb_txtstamp_queue);
 	spin_lock_init(&tagger_data->meta_lock);
 
 	ptp_data->clock = ptp_clock_register(&ptp_data->caps, ds->dev);
@@ -971,6 +937,7 @@ int sja1105_ptp_clock_register(struct dsa_switch *ds)
 void sja1105_ptp_clock_unregister(struct dsa_switch *ds)
 {
 	struct sja1105_private *priv = ds->priv;
+	struct sja1105_tagger_data *tagger_data = &priv->tagger_data;
 	struct sja1105_ptp_data *ptp_data = &priv->ptp_data;
 
 	if (IS_ERR_OR_NULL(ptp_data->clock))
@@ -978,7 +945,7 @@ void sja1105_ptp_clock_unregister(struct dsa_switch *ds)
 
 	del_timer_sync(&ptp_data->extts_timer);
 	ptp_cancel_worker_sync(ptp_data->clock);
-	skb_queue_purge(&ptp_data->skb_txtstamp_queue);
+	skb_queue_purge(&tagger_data->skb_txtstamp_queue);
 	skb_queue_purge(&ptp_data->skb_rxtstamp_queue);
 	ptp_clock_unregister(ptp_data->clock);
 	ptp_data->clock = NULL;
diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.h b/drivers/net/dsa/sja1105/sja1105_ptp.h
index 3c874bb4c17b..3ae6b9fdd492 100644
--- a/drivers/net/dsa/sja1105/sja1105_ptp.h
+++ b/drivers/net/dsa/sja1105/sja1105_ptp.h
@@ -8,21 +8,6 @@
 
 #if IS_ENABLED(CONFIG_NET_DSA_SJA1105_PTP)
 
-/* Timestamps are in units of 8 ns clock ticks (equivalent to
- * a fixed 125 MHz clock).
- */
-#define SJA1105_TICK_NS			8
-
-static inline s64 ns_to_sja1105_ticks(s64 ns)
-{
-	return ns / SJA1105_TICK_NS;
-}
-
-static inline s64 sja1105_ticks_to_ns(s64 ticks)
-{
-	return ticks * SJA1105_TICK_NS;
-}
-
 /* Calculate the first base_time in the future that satisfies this
  * relationship:
  *
@@ -77,10 +62,6 @@ struct sja1105_ptp_data {
 	struct timer_list extts_timer;
 	/* Used only on SJA1105 to reconstruct partial timestamps */
 	struct sk_buff_head skb_rxtstamp_queue;
-	/* Used on SJA1110 where meta frames are generated only for
-	 * 2-step TX timestamps
-	 */
-	struct sk_buff_head skb_txtstamp_queue;
 	struct ptp_clock_info caps;
 	struct ptp_clock *clock;
 	struct sja1105_ptp_cmd cmd;
diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index 171106202fe5..0485ab2fcc46 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -48,6 +48,10 @@ struct sja1105_tagger_data {
 	spinlock_t meta_lock;
 	unsigned long state;
 	u8 ts_id;
+	/* Used on SJA1110 where meta frames are generated only for
+	 * 2-step TX timestamps
+	 */
+	struct sk_buff_head skb_txtstamp_queue;
 };
 
 struct sja1105_skb_cb {
@@ -69,25 +73,20 @@ struct sja1105_port {
 	bool hwts_tx_en;
 };
 
-enum sja1110_meta_tstamp {
-	SJA1110_META_TSTAMP_TX = 0,
-	SJA1110_META_TSTAMP_RX = 1,
-};
-
-#if IS_ENABLED(CONFIG_NET_DSA_SJA1105_PTP)
-
-void sja1110_process_meta_tstamp(struct dsa_switch *ds, int port, u8 ts_id,
-				 enum sja1110_meta_tstamp dir, u64 tstamp);
-
-#else
+/* Timestamps are in units of 8 ns clock ticks (equivalent to
+ * a fixed 125 MHz clock).
+ */
+#define SJA1105_TICK_NS			8
 
-static inline void sja1110_process_meta_tstamp(struct dsa_switch *ds, int port,
-					       u8 ts_id, enum sja1110_meta_tstamp dir,
-					       u64 tstamp)
+static inline s64 ns_to_sja1105_ticks(s64 ns)
 {
+	return ns / SJA1105_TICK_NS;
 }
 
-#endif /* IS_ENABLED(CONFIG_NET_DSA_SJA1105_PTP) */
+static inline s64 sja1105_ticks_to_ns(s64 ticks)
+{
+	return ticks * SJA1105_TICK_NS;
+}
 
 #if IS_ENABLED(CONFIG_NET_DSA_SJA1105)
 
diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index c054f48541c8..2edede9ddac9 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -4,6 +4,7 @@
 #include <linux/if_vlan.h>
 #include <linux/dsa/sja1105.h>
 #include <linux/dsa/8021q.h>
+#include <linux/skbuff.h>
 #include <linux/packing.h>
 #include "dsa_priv.h"
 
@@ -53,6 +54,11 @@
 #define SJA1110_TX_TRAILER_LEN			4
 #define SJA1110_MAX_PADDING_LEN			15
 
+enum sja1110_meta_tstamp {
+	SJA1110_META_TSTAMP_TX = 0,
+	SJA1110_META_TSTAMP_RX = 1,
+};
+
 /* Similar to is_link_local_ether_addr(hdr->h_dest) but also covers PTP */
 static inline bool sja1105_is_link_local(const struct sk_buff *skb)
 {
@@ -520,6 +526,43 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
 					      is_meta);
 }
 
+static void sja1110_process_meta_tstamp(struct dsa_switch *ds, int port,
+					u8 ts_id, enum sja1110_meta_tstamp dir,
+					u64 tstamp)
+{
+	struct sk_buff *skb, *skb_tmp, *skb_match = NULL;
+	struct dsa_port *dp = dsa_to_port(ds, port);
+	struct skb_shared_hwtstamps shwt = {0};
+	struct sja1105_port *sp = dp->priv;
+
+	if (!dsa_port_is_sja1105(dp))
+		return;
+
+	/* We don't care about RX timestamps on the CPU port */
+	if (dir == SJA1110_META_TSTAMP_RX)
+		return;
+
+	spin_lock(&sp->data->skb_txtstamp_queue.lock);
+
+	skb_queue_walk_safe(&sp->data->skb_txtstamp_queue, skb, skb_tmp) {
+		if (SJA1105_SKB_CB(skb)->ts_id != ts_id)
+			continue;
+
+		__skb_unlink(skb, &sp->data->skb_txtstamp_queue);
+		skb_match = skb;
+
+		break;
+	}
+
+	spin_unlock(&sp->data->skb_txtstamp_queue.lock);
+
+	if (WARN_ON(!skb_match))
+		return;
+
+	shwt.hwtstamp = ns_to_ktime(sja1105_ticks_to_ns(tstamp));
+	skb_complete_tx_timestamp(skb_match, &shwt);
+}
+
 static struct sk_buff *sja1110_rcv_meta(struct sk_buff *skb, u16 rx_header)
 {
 	u8 *buf = dsa_etype_header_pos_rx(skb) + SJA1110_HEADER_LEN;
-- 
cgit v1.2.3-71-gd317


From 4ac0567e40b334b54988e3c28a2425ff9c8bdd35 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 22 Sep 2021 17:37:26 +0300
Subject: net: dsa: sja1105: break dependency between dsa_port_is_sja1105 and
 switch driver

It's nice to be able to test a tagging protocol with dsa_loop, but not
at the cost of losing the ability of building the tagging protocol and
switch driver as modules, because as things stand, there is a circular
dependency between the two. Tagging protocol drivers cannot depend on
switch drivers, that is a hard fact.

The reasoning behind the blamed patch was that accessing dp->priv should
first make sure that the structure behind that pointer is what we really
think it is.

Currently the "sja1105" and "sja1110" tagging protocols only operate
with the sja1105 switch driver, just like any other tagging protocol and
switch combination. The only way to mix and match them is by modifying
the code, and this applies to dsa_loop as well (by default that uses
DSA_TAG_PROTO_NONE). So while in principle there is an issue, in
practice there isn't one.

Until we extend dsa_loop to allow user space configuration, treat the
problem as a non-issue and just say that DSA ports found by tag_sja1105
are always sja1105 ports, which is in fact true. But keep the
dsa_port_is_sja1105 function so that it's easy to patch it during
testing, and rely on dead code elimination.

Fixes: 994d2cbb08ca ("net: dsa: tag_sja1105: be dsa_loop-safe")
Link: https://lore.kernel.org/netdev/20210908220834.d7gmtnwrorhharna@skbuf/
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/dsa/sja1105/sja1105_main.c |  3 +--
 include/linux/dsa/sja1105.h            | 15 +--------------
 net/dsa/Kconfig                        |  1 -
 3 files changed, 2 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c
index 7c0db80eff00..924c3f129992 100644
--- a/drivers/net/dsa/sja1105/sja1105_main.c
+++ b/drivers/net/dsa/sja1105/sja1105_main.c
@@ -3117,7 +3117,7 @@ static void sja1105_teardown(struct dsa_switch *ds)
 	sja1105_static_config_free(&priv->static_config);
 }
 
-const struct dsa_switch_ops sja1105_switch_ops = {
+static const struct dsa_switch_ops sja1105_switch_ops = {
 	.get_tag_protocol	= sja1105_get_tag_protocol,
 	.setup			= sja1105_setup,
 	.teardown		= sja1105_teardown,
@@ -3166,7 +3166,6 @@ const struct dsa_switch_ops sja1105_switch_ops = {
 	.port_bridge_tx_fwd_offload = dsa_tag_8021q_bridge_tx_fwd_offload,
 	.port_bridge_tx_fwd_unoffload = dsa_tag_8021q_bridge_tx_fwd_unoffload,
 };
-EXPORT_SYMBOL_GPL(sja1105_switch_ops);
 
 static const struct of_device_id sja1105_dt_ids[];
 
diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index 0485ab2fcc46..9e07079528a5 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -88,22 +88,9 @@ static inline s64 sja1105_ticks_to_ns(s64 ticks)
 	return ticks * SJA1105_TICK_NS;
 }
 
-#if IS_ENABLED(CONFIG_NET_DSA_SJA1105)
-
-extern const struct dsa_switch_ops sja1105_switch_ops;
-
-static inline bool dsa_port_is_sja1105(struct dsa_port *dp)
-{
-	return dp->ds->ops == &sja1105_switch_ops;
-}
-
-#else
-
 static inline bool dsa_port_is_sja1105(struct dsa_port *dp)
 {
-	return false;
+	return true;
 }
 
-#endif
-
 #endif /* _NET_DSA_SJA1105_H */
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 548285539752..bca1b5d66df2 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -138,7 +138,6 @@ config NET_DSA_TAG_LAN9303
 
 config NET_DSA_TAG_SJA1105
 	tristate "Tag driver for NXP SJA1105 switches"
-	depends on NET_DSA_SJA1105 || !NET_DSA_SJA1105
 	select PACKING
 	help
 	  Say Y or M if you want to enable support for tagging frames with the
-- 
cgit v1.2.3-71-gd317


From deab6b1cd9789bb9bd466d5e76aecb8b336259b4 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 12 Oct 2021 14:40:40 +0300
Subject: net: dsa: tag_ocelot: break circular dependency with ocelot switch
 lib driver

As explained here:
https://lore.kernel.org/netdev/20210908220834.d7gmtnwrorhharna@skbuf/
DSA tagging protocol drivers cannot depend on symbols exported by switch
drivers, because this creates a circular dependency that breaks module
autoloading.

The tag_ocelot.c file depends on the ocelot_ptp_rew_op() function
exported by the common ocelot switch lib. This function looks at
OCELOT_SKB_CB(skb) and computes how to populate the REW_OP field of the
DSA tag, for PTP timestamping (the command: one-step/two-step, and the
TX timestamp identifier).

None of that requires deep insight into the driver, it is quite
stateless, as it only depends upon the skb->cb. So let's make it a
static inline function and put it in include/linux/dsa/ocelot.h, a
file that despite its name is used by the ocelot switch driver for
populating the injection header too - since commit 40d3f295b5fe ("net:
mscc: ocelot: use common tag parsing code with DSA").

With that function declared as static inline, its body is expanded
inside each call site, so the dependency is broken and the DSA tagger
can be built without the switch library, upon which the felix driver
depends.

Fixes: 39e5308b3250 ("net: mscc: ocelot: support PTP Sync one-step timestamping")
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mscc/ocelot.c     | 17 ----------------
 drivers/net/ethernet/mscc/ocelot_net.c |  1 +
 include/linux/dsa/ocelot.h             | 37 ++++++++++++++++++++++++++++++++++
 include/soc/mscc/ocelot.h              | 24 ----------------------
 net/dsa/Kconfig                        |  2 --
 net/dsa/tag_ocelot.c                   |  1 -
 net/dsa/tag_ocelot_8021q.c             |  1 +
 7 files changed, 39 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index f0044329e3d7..a08e4f530c1c 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -601,23 +601,6 @@ static int ocelot_port_add_txtstamp_skb(struct ocelot *ocelot, int port,
 	return 0;
 }
 
-u32 ocelot_ptp_rew_op(struct sk_buff *skb)
-{
-	struct sk_buff *clone = OCELOT_SKB_CB(skb)->clone;
-	u8 ptp_cmd = OCELOT_SKB_CB(skb)->ptp_cmd;
-	u32 rew_op = 0;
-
-	if (ptp_cmd == IFH_REW_OP_TWO_STEP_PTP && clone) {
-		rew_op = ptp_cmd;
-		rew_op |= OCELOT_SKB_CB(clone)->ts_id << 3;
-	} else if (ptp_cmd == IFH_REW_OP_ORIGIN_PTP) {
-		rew_op = ptp_cmd;
-	}
-
-	return rew_op;
-}
-EXPORT_SYMBOL(ocelot_ptp_rew_op);
-
 static bool ocelot_ptp_is_onestep_sync(struct sk_buff *skb,
 				       unsigned int ptp_class)
 {
diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c
index 2a85bcb5d0c2..2545727fd5b2 100644
--- a/drivers/net/ethernet/mscc/ocelot_net.c
+++ b/drivers/net/ethernet/mscc/ocelot_net.c
@@ -8,6 +8,7 @@
  * Copyright 2020-2021 NXP
  */
 
+#include <linux/dsa/ocelot.h>
 #include <linux/if_bridge.h>
 #include <linux/of_net.h>
 #include <linux/phy/phy.h>
diff --git a/include/linux/dsa/ocelot.h b/include/linux/dsa/ocelot.h
index 435777a0073c..50641a7529ad 100644
--- a/include/linux/dsa/ocelot.h
+++ b/include/linux/dsa/ocelot.h
@@ -6,6 +6,26 @@
 #define _NET_DSA_TAG_OCELOT_H
 
 #include <linux/packing.h>
+#include <linux/skbuff.h>
+
+struct ocelot_skb_cb {
+	struct sk_buff *clone;
+	unsigned int ptp_class; /* valid only for clones */
+	u8 ptp_cmd;
+	u8 ts_id;
+};
+
+#define OCELOT_SKB_CB(skb) \
+	((struct ocelot_skb_cb *)((skb)->cb))
+
+#define IFH_TAG_TYPE_C			0
+#define IFH_TAG_TYPE_S			1
+
+#define IFH_REW_OP_NOOP			0x0
+#define IFH_REW_OP_DSCP			0x1
+#define IFH_REW_OP_ONE_STEP_PTP		0x2
+#define IFH_REW_OP_TWO_STEP_PTP		0x3
+#define IFH_REW_OP_ORIGIN_PTP		0x5
 
 #define OCELOT_TAG_LEN			16
 #define OCELOT_SHORT_PREFIX_LEN		4
@@ -215,4 +235,21 @@ static inline void ocelot_ifh_set_vid(void *injection, u64 vid)
 	packing(injection, &vid, 11, 0, OCELOT_TAG_LEN, PACK, 0);
 }
 
+/* Determine the PTP REW_OP to use for injecting the given skb */
+static inline u32 ocelot_ptp_rew_op(struct sk_buff *skb)
+{
+	struct sk_buff *clone = OCELOT_SKB_CB(skb)->clone;
+	u8 ptp_cmd = OCELOT_SKB_CB(skb)->ptp_cmd;
+	u32 rew_op = 0;
+
+	if (ptp_cmd == IFH_REW_OP_TWO_STEP_PTP && clone) {
+		rew_op = ptp_cmd;
+		rew_op |= OCELOT_SKB_CB(clone)->ts_id << 3;
+	} else if (ptp_cmd == IFH_REW_OP_ORIGIN_PTP) {
+		rew_op = ptp_cmd;
+	}
+
+	return rew_op;
+}
+
 #endif
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index cabacef8731c..66b2e65c1179 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -89,15 +89,6 @@
 /* Source PGIDs, one per physical port */
 #define PGID_SRC			80
 
-#define IFH_TAG_TYPE_C			0
-#define IFH_TAG_TYPE_S			1
-
-#define IFH_REW_OP_NOOP			0x0
-#define IFH_REW_OP_DSCP			0x1
-#define IFH_REW_OP_ONE_STEP_PTP		0x2
-#define IFH_REW_OP_TWO_STEP_PTP		0x3
-#define IFH_REW_OP_ORIGIN_PTP		0x5
-
 #define OCELOT_NUM_TC			8
 
 #define OCELOT_SPEED_2500		0
@@ -695,16 +686,6 @@ struct ocelot_policer {
 	u32 burst; /* bytes */
 };
 
-struct ocelot_skb_cb {
-	struct sk_buff *clone;
-	unsigned int ptp_class; /* valid only for clones */
-	u8 ptp_cmd;
-	u8 ts_id;
-};
-
-#define OCELOT_SKB_CB(skb) \
-	((struct ocelot_skb_cb *)((skb)->cb))
-
 #define ocelot_read_ix(ocelot, reg, gi, ri) __ocelot_read_ix(ocelot, reg, reg##_GSZ * (gi) + reg##_RSZ * (ri))
 #define ocelot_read_gix(ocelot, reg, gi) __ocelot_read_ix(ocelot, reg, reg##_GSZ * (gi))
 #define ocelot_read_rix(ocelot, reg, ri) __ocelot_read_ix(ocelot, reg, reg##_RSZ * (ri))
@@ -765,7 +746,6 @@ void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int grp,
 int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **skb);
 void ocelot_drain_cpu_queue(struct ocelot *ocelot, int grp);
 
-u32 ocelot_ptp_rew_op(struct sk_buff *skb);
 #else
 
 static inline bool ocelot_can_inject(struct ocelot *ocelot, int grp)
@@ -789,10 +769,6 @@ static inline void ocelot_drain_cpu_queue(struct ocelot *ocelot, int grp)
 {
 }
 
-static inline u32 ocelot_ptp_rew_op(struct sk_buff *skb)
-{
-	return 0;
-}
 #endif
 
 /* Hardware initialization */
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index bca1b5d66df2..d166377d7085 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -101,8 +101,6 @@ config NET_DSA_TAG_RTL4_A
 
 config NET_DSA_TAG_OCELOT
 	tristate "Tag driver for Ocelot family of switches, using NPI port"
-	depends on MSCC_OCELOT_SWITCH_LIB || \
-		   (MSCC_OCELOT_SWITCH_LIB=n && COMPILE_TEST)
 	select PACKING
 	help
 	  Say Y or M if you want to enable NPI tagging for the Ocelot switches
diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c
index 8025ed778d33..605b51ca6921 100644
--- a/net/dsa/tag_ocelot.c
+++ b/net/dsa/tag_ocelot.c
@@ -2,7 +2,6 @@
 /* Copyright 2019 NXP
  */
 #include <linux/dsa/ocelot.h>
-#include <soc/mscc/ocelot.h>
 #include "dsa_priv.h"
 
 static void ocelot_xmit_common(struct sk_buff *skb, struct net_device *netdev,
diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c
index 59072930cb02..1e4e66ea6796 100644
--- a/net/dsa/tag_ocelot_8021q.c
+++ b/net/dsa/tag_ocelot_8021q.c
@@ -9,6 +9,7 @@
  *   that on egress
  */
 #include <linux/dsa/8021q.h>
+#include <linux/dsa/ocelot.h>
 #include <soc/mscc/ocelot.h>
 #include <soc/mscc/ocelot_ptp.h>
 #include "dsa_priv.h"
-- 
cgit v1.2.3-71-gd317


From 49f885b2d97093451410e7279aa29d81e094e108 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 12 Oct 2021 14:40:41 +0300
Subject: net: dsa: tag_ocelot_8021q: break circular dependency with ocelot
 switch lib

Michael reported that when using the "ocelot-8021q" tagging protocol,
the switch driver module must be manually loaded before the tagging
protocol can be loaded/is available.

This appears to be the same problem described here:
https://lore.kernel.org/netdev/20210908220834.d7gmtnwrorhharna@skbuf/
where due to the fact that DSA tagging protocols make use of symbols
exported by the switch drivers, circular dependencies appear and this
breaks module autoloading.

The ocelot_8021q driver needs the ocelot_can_inject() and
ocelot_port_inject_frame() functions from the switch library. Previously
the wrong approach was taken to solve that dependency: shims were
provided for the case where the ocelot switch library was compiled out,
but that turns out to be insufficient, because the dependency when the
switch lib _is_ compiled is problematic too.

We cannot declare ocelot_can_inject() and ocelot_port_inject_frame() as
static inline functions, because these access I/O functions like
__ocelot_write_ix() which is called by ocelot_write_rix(). Making those
static inline basically means exposing the whole guts of the ocelot
switch library, not ideal...

We already have one tagging protocol driver which calls into the switch
driver during xmit but not using any exported symbol: sja1105_defer_xmit.
We can do the same thing here: create a kthread worker and one work item
per skb, and let the switch driver itself do the register accesses to
send the skb, and then consume it.

Fixes: 0a6f17c6ae21 ("net: dsa: tag_ocelot_8021q: add support for PTP timestamping")
Reported-by: Michael Walle <michael@walle.cc>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/dsa/ocelot/felix.c | 96 ++++++++++++++++++++++++++++++++++++++++--
 drivers/net/dsa/ocelot/felix.h |  1 +
 include/linux/dsa/ocelot.h     | 12 ++++++
 include/soc/mscc/ocelot.h      | 27 ------------
 net/dsa/Kconfig                |  2 -
 net/dsa/tag_ocelot_8021q.c     | 38 +++++++++++------
 6 files changed, 130 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c
index 50ef20724958..f8603e068e7c 100644
--- a/drivers/net/dsa/ocelot/felix.c
+++ b/drivers/net/dsa/ocelot/felix.c
@@ -1074,6 +1074,73 @@ static int felix_init_structs(struct felix *felix, int num_phys_ports)
 	return 0;
 }
 
+#define work_to_xmit_work(w) \
+		container_of((w), struct felix_deferred_xmit_work, work)
+
+static void felix_port_deferred_xmit(struct kthread_work *work)
+{
+	struct felix_deferred_xmit_work *xmit_work = work_to_xmit_work(work);
+	struct dsa_switch *ds = xmit_work->dp->ds;
+	struct sk_buff *skb = xmit_work->skb;
+	u32 rew_op = ocelot_ptp_rew_op(skb);
+	struct ocelot *ocelot = ds->priv;
+	int port = xmit_work->dp->index;
+	int retries = 10;
+
+	do {
+		if (ocelot_can_inject(ocelot, 0))
+			break;
+
+		cpu_relax();
+	} while (--retries);
+
+	if (!retries) {
+		dev_err(ocelot->dev, "port %d failed to inject skb\n",
+			port);
+		kfree_skb(skb);
+		return;
+	}
+
+	ocelot_port_inject_frame(ocelot, port, 0, rew_op, skb);
+
+	consume_skb(skb);
+	kfree(xmit_work);
+}
+
+static int felix_port_setup_tagger_data(struct dsa_switch *ds, int port)
+{
+	struct dsa_port *dp = dsa_to_port(ds, port);
+	struct ocelot *ocelot = ds->priv;
+	struct felix *felix = ocelot_to_felix(ocelot);
+	struct felix_port *felix_port;
+
+	if (!dsa_port_is_user(dp))
+		return 0;
+
+	felix_port = kzalloc(sizeof(*felix_port), GFP_KERNEL);
+	if (!felix_port)
+		return -ENOMEM;
+
+	felix_port->xmit_worker = felix->xmit_worker;
+	felix_port->xmit_work_fn = felix_port_deferred_xmit;
+
+	dp->priv = felix_port;
+
+	return 0;
+}
+
+static void felix_port_teardown_tagger_data(struct dsa_switch *ds, int port)
+{
+	struct dsa_port *dp = dsa_to_port(ds, port);
+	struct felix_port *felix_port = dp->priv;
+
+	if (!felix_port)
+		return;
+
+	dp->priv = NULL;
+	kfree(felix_port);
+}
+
 /* Hardware initialization done here so that we can allocate structures with
  * devm without fear of dsa_register_switch returning -EPROBE_DEFER and causing
  * us to allocate structures twice (leak memory) and map PCI memory twice
@@ -1102,6 +1169,12 @@ static int felix_setup(struct dsa_switch *ds)
 		}
 	}
 
+	felix->xmit_worker = kthread_create_worker(0, "felix_xmit");
+	if (IS_ERR(felix->xmit_worker)) {
+		err = PTR_ERR(felix->xmit_worker);
+		goto out_deinit_timestamp;
+	}
+
 	for (port = 0; port < ds->num_ports; port++) {
 		if (dsa_is_unused_port(ds, port))
 			continue;
@@ -1112,6 +1185,14 @@ static int felix_setup(struct dsa_switch *ds)
 		 * bits of vlan tag.
 		 */
 		felix_port_qos_map_init(ocelot, port);
+
+		err = felix_port_setup_tagger_data(ds, port);
+		if (err) {
+			dev_err(ds->dev,
+				"port %d failed to set up tagger data: %pe\n",
+				port, ERR_PTR(err));
+			goto out_deinit_ports;
+		}
 	}
 
 	err = ocelot_devlink_sb_register(ocelot);
@@ -1138,9 +1219,13 @@ out_deinit_ports:
 		if (dsa_is_unused_port(ds, port))
 			continue;
 
+		felix_port_teardown_tagger_data(ds, port);
 		ocelot_deinit_port(ocelot, port);
 	}
 
+	kthread_destroy_worker(felix->xmit_worker);
+
+out_deinit_timestamp:
 	ocelot_deinit_timestamp(ocelot);
 	ocelot_deinit(ocelot);
 
@@ -1164,17 +1249,20 @@ static void felix_teardown(struct dsa_switch *ds)
 		felix_del_tag_protocol(ds, port, felix->tag_proto);
 	}
 
-	ocelot_devlink_sb_unregister(ocelot);
-	ocelot_deinit_timestamp(ocelot);
-	ocelot_deinit(ocelot);
-
 	for (port = 0; port < ocelot->num_phys_ports; port++) {
 		if (dsa_is_unused_port(ds, port))
 			continue;
 
+		felix_port_teardown_tagger_data(ds, port);
 		ocelot_deinit_port(ocelot, port);
 	}
 
+	kthread_destroy_worker(felix->xmit_worker);
+
+	ocelot_devlink_sb_unregister(ocelot);
+	ocelot_deinit_timestamp(ocelot);
+	ocelot_deinit(ocelot);
+
 	if (felix->info->mdio_bus_free)
 		felix->info->mdio_bus_free(ocelot);
 }
diff --git a/drivers/net/dsa/ocelot/felix.h b/drivers/net/dsa/ocelot/felix.h
index 54024b6f9498..be3e42e135c0 100644
--- a/drivers/net/dsa/ocelot/felix.h
+++ b/drivers/net/dsa/ocelot/felix.h
@@ -62,6 +62,7 @@ struct felix {
 	resource_size_t			switch_base;
 	resource_size_t			imdio_base;
 	enum dsa_tag_protocol		tag_proto;
+	struct kthread_worker		*xmit_worker;
 };
 
 struct net_device *felix_port_to_netdev(struct ocelot *ocelot, int port);
diff --git a/include/linux/dsa/ocelot.h b/include/linux/dsa/ocelot.h
index 50641a7529ad..8ae999f587c4 100644
--- a/include/linux/dsa/ocelot.h
+++ b/include/linux/dsa/ocelot.h
@@ -5,6 +5,7 @@
 #ifndef _NET_DSA_TAG_OCELOT_H
 #define _NET_DSA_TAG_OCELOT_H
 
+#include <linux/kthread.h>
 #include <linux/packing.h>
 #include <linux/skbuff.h>
 
@@ -160,6 +161,17 @@ struct ocelot_skb_cb {
  *         +------+------+------+------+------+------+------+------+
  */
 
+struct felix_deferred_xmit_work {
+	struct dsa_port *dp;
+	struct sk_buff *skb;
+	struct kthread_work work;
+};
+
+struct felix_port {
+	void (*xmit_work_fn)(struct kthread_work *work);
+	struct kthread_worker *xmit_worker;
+};
+
 static inline void ocelot_xfh_get_rew_val(void *extraction, u64 *rew_val)
 {
 	packing(extraction, rew_val, 116, 85, OCELOT_TAG_LEN, UNPACK, 0);
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index 66b2e65c1179..d7055b41982d 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -737,8 +737,6 @@ u32 __ocelot_target_read_ix(struct ocelot *ocelot, enum ocelot_target target,
 void __ocelot_target_write_ix(struct ocelot *ocelot, enum ocelot_target target,
 			      u32 val, u32 reg, u32 offset);
 
-#if IS_ENABLED(CONFIG_MSCC_OCELOT_SWITCH_LIB)
-
 /* Packet I/O */
 bool ocelot_can_inject(struct ocelot *ocelot, int grp);
 void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int grp,
@@ -746,31 +744,6 @@ void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int grp,
 int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **skb);
 void ocelot_drain_cpu_queue(struct ocelot *ocelot, int grp);
 
-#else
-
-static inline bool ocelot_can_inject(struct ocelot *ocelot, int grp)
-{
-	return false;
-}
-
-static inline void ocelot_port_inject_frame(struct ocelot *ocelot, int port,
-					    int grp, u32 rew_op,
-					    struct sk_buff *skb)
-{
-}
-
-static inline int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp,
-					struct sk_buff **skb)
-{
-	return -EIO;
-}
-
-static inline void ocelot_drain_cpu_queue(struct ocelot *ocelot, int grp)
-{
-}
-
-#endif
-
 /* Hardware initialization */
 int ocelot_regfields_init(struct ocelot *ocelot,
 			  const struct reg_field *const regfields);
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index d166377d7085..d8ee15f1c7a9 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -112,8 +112,6 @@ config NET_DSA_TAG_OCELOT
 
 config NET_DSA_TAG_OCELOT_8021Q
 	tristate "Tag driver for Ocelot family of switches, using VLAN"
-	depends on MSCC_OCELOT_SWITCH_LIB || \
-	          (MSCC_OCELOT_SWITCH_LIB=n && COMPILE_TEST)
 	help
 	  Say Y or M if you want to enable support for tagging frames with a
 	  custom VLAN-based header. Frames that require timestamping, such as
diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c
index 1e4e66ea6796..d05c352f96e5 100644
--- a/net/dsa/tag_ocelot_8021q.c
+++ b/net/dsa/tag_ocelot_8021q.c
@@ -10,10 +10,31 @@
  */
 #include <linux/dsa/8021q.h>
 #include <linux/dsa/ocelot.h>
-#include <soc/mscc/ocelot.h>
-#include <soc/mscc/ocelot_ptp.h>
 #include "dsa_priv.h"
 
+static struct sk_buff *ocelot_defer_xmit(struct dsa_port *dp,
+					 struct sk_buff *skb)
+{
+	struct felix_deferred_xmit_work *xmit_work;
+	struct felix_port *felix_port = dp->priv;
+
+	xmit_work = kzalloc(sizeof(*xmit_work), GFP_ATOMIC);
+	if (!xmit_work)
+		return NULL;
+
+	/* Calls felix_port_deferred_xmit in felix.c */
+	kthread_init_work(&xmit_work->work, felix_port->xmit_work_fn);
+	/* Increase refcount so the kfree_skb in dsa_slave_xmit
+	 * won't really free the packet.
+	 */
+	xmit_work->dp = dp;
+	xmit_work->skb = skb_get(skb);
+
+	kthread_queue_work(felix_port->xmit_worker, &xmit_work->work);
+
+	return NULL;
+}
+
 static struct sk_buff *ocelot_xmit(struct sk_buff *skb,
 				   struct net_device *netdev)
 {
@@ -21,18 +42,9 @@ static struct sk_buff *ocelot_xmit(struct sk_buff *skb,
 	u16 tx_vid = dsa_8021q_tx_vid(dp->ds, dp->index);
 	u16 queue_mapping = skb_get_queue_mapping(skb);
 	u8 pcp = netdev_txq_to_tc(netdev, queue_mapping);
-	struct ocelot *ocelot = dp->ds->priv;
-	int port = dp->index;
-	u32 rew_op = 0;
 
-	rew_op = ocelot_ptp_rew_op(skb);
-	if (rew_op) {
-		if (!ocelot_can_inject(ocelot, 0))
-			return NULL;
-
-		ocelot_port_inject_frame(ocelot, port, 0, rew_op, skb);
-		return NULL;
-	}
+	if (ocelot_ptp_rew_op(skb))
+		return ocelot_defer_xmit(dp, skb);
 
 	return dsa_8021q_xmit(skb, netdev, ETH_P_8021Q,
 			      ((pcp << VLAN_PRIO_SHIFT) | tx_vid));
-- 
cgit v1.2.3-71-gd317


From 6098475d4cb48d821bdf453c61118c56e26294f0 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Fri, 8 Oct 2021 14:31:57 +0100
Subject: spi: Fix deadlock when adding SPI controllers on SPI buses
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently we have a global spi_add_lock which we take when adding new
devices so that we can check that we're not trying to reuse a chip
select that's already controlled.  This means that if the SPI device is
itself a SPI controller and triggers the instantiation of further SPI
devices we trigger a deadlock as we try to register and instantiate
those devices while in the process of doing so for the parent controller
and hence already holding the global spi_add_lock.  Since we only care
about concurrency within a single SPI bus move the lock to be per
controller, avoiding the deadlock.

This can be easily triggered in the case of spi-mux.

Reported-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi.c       | 17 ++++++-----------
 include/linux/spi/spi.h |  3 +++
 2 files changed, 9 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index aea037c65985..412a10586233 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -478,12 +478,6 @@ static LIST_HEAD(spi_controller_list);
  */
 static DEFINE_MUTEX(board_lock);
 
-/*
- * Prevents addition of devices with same chip select and
- * addition of devices below an unregistering controller.
- */
-static DEFINE_MUTEX(spi_add_lock);
-
 /**
  * spi_alloc_device - Allocate a new SPI device
  * @ctlr: Controller to which device is connected
@@ -636,9 +630,9 @@ int spi_add_device(struct spi_device *spi)
 	 * chipselect **BEFORE** we call setup(), else we'll trash
 	 * its configuration.  Lock against concurrent add() calls.
 	 */
-	mutex_lock(&spi_add_lock);
+	mutex_lock(&ctlr->add_lock);
 	status = __spi_add_device(spi);
-	mutex_unlock(&spi_add_lock);
+	mutex_unlock(&ctlr->add_lock);
 	return status;
 }
 EXPORT_SYMBOL_GPL(spi_add_device);
@@ -658,7 +652,7 @@ static int spi_add_device_locked(struct spi_device *spi)
 	/* Set the bus ID string */
 	spi_dev_set_name(spi);
 
-	WARN_ON(!mutex_is_locked(&spi_add_lock));
+	WARN_ON(!mutex_is_locked(&ctlr->add_lock));
 	return __spi_add_device(spi);
 }
 
@@ -2830,6 +2824,7 @@ int spi_register_controller(struct spi_controller *ctlr)
 	spin_lock_init(&ctlr->bus_lock_spinlock);
 	mutex_init(&ctlr->bus_lock_mutex);
 	mutex_init(&ctlr->io_mutex);
+	mutex_init(&ctlr->add_lock);
 	ctlr->bus_lock_flag = 0;
 	init_completion(&ctlr->xfer_completion);
 	if (!ctlr->max_dma_len)
@@ -2966,7 +2961,7 @@ void spi_unregister_controller(struct spi_controller *ctlr)
 
 	/* Prevent addition of new devices, unregister existing ones */
 	if (IS_ENABLED(CONFIG_SPI_DYNAMIC))
-		mutex_lock(&spi_add_lock);
+		mutex_lock(&ctlr->add_lock);
 
 	device_for_each_child(&ctlr->dev, NULL, __unregister);
 
@@ -2997,7 +2992,7 @@ void spi_unregister_controller(struct spi_controller *ctlr)
 	mutex_unlock(&board_lock);
 
 	if (IS_ENABLED(CONFIG_SPI_DYNAMIC))
-		mutex_unlock(&spi_add_lock);
+		mutex_unlock(&ctlr->add_lock);
 }
 EXPORT_SYMBOL_GPL(spi_unregister_controller);
 
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 8371bca13729..6b0b686f6f90 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -531,6 +531,9 @@ struct spi_controller {
 	/* I/O mutex */
 	struct mutex		io_mutex;
 
+	/* Used to avoid adding the same CS twice */
+	struct mutex		add_lock;
+
 	/* lock and mutex for SPI bus locking */
 	spinlock_t		bus_lock_spinlock;
 	struct mutex		bus_lock_mutex;
-- 
cgit v1.2.3-71-gd317


From 8e141f9eb803e209714a80aa6ec073893f94c526 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 29 Sep 2021 09:12:40 +0200
Subject: block: drain file system I/O on del_gendisk

Instead of delaying draining of file system I/O related items like the
blk-qos queues, the integrity read workqueue and timeouts only when the
request_queue is removed, do that when del_gendisk is called.  This is
important for SCSI where the upper level drivers that control the gendisk
are separate entities, and the disk can be freed much earlier than the
request_queue, or can even be unbound without tearing down the queue.

Fixes: edb0872f44ec ("block: move the bdi from the request_queue to the gendisk")
Reported-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Darrick J. Wong <djwong@kernel.org>
Link: https://lore.kernel.org/r/20210929071241.934472-5-hch@lst.de
Tested-by: Yi Zhang <yi.zhang@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c      | 27 ++++++++++++---------------
 block/blk.h           |  1 +
 block/genhd.c         | 21 +++++++++++++++++++++
 include/linux/genhd.h |  1 +
 4 files changed, 35 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 43f5da707d8e..4d8f5fe91588 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -49,7 +49,6 @@
 #include "blk-mq.h"
 #include "blk-mq-sched.h"
 #include "blk-pm.h"
-#include "blk-rq-qos.h"
 
 struct dentry *blk_debugfs_root;
 
@@ -337,23 +336,25 @@ void blk_put_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_put_queue);
 
-void blk_set_queue_dying(struct request_queue *q)
+void blk_queue_start_drain(struct request_queue *q)
 {
-	blk_queue_flag_set(QUEUE_FLAG_DYING, q);
-
 	/*
 	 * When queue DYING flag is set, we need to block new req
 	 * entering queue, so we call blk_freeze_queue_start() to
 	 * prevent I/O from crossing blk_queue_enter().
 	 */
 	blk_freeze_queue_start(q);
-
 	if (queue_is_mq(q))
 		blk_mq_wake_waiters(q);
-
 	/* Make blk_queue_enter() reexamine the DYING flag. */
 	wake_up_all(&q->mq_freeze_wq);
 }
+
+void blk_set_queue_dying(struct request_queue *q)
+{
+	blk_queue_flag_set(QUEUE_FLAG_DYING, q);
+	blk_queue_start_drain(q);
+}
 EXPORT_SYMBOL_GPL(blk_set_queue_dying);
 
 /**
@@ -385,13 +386,8 @@ void blk_cleanup_queue(struct request_queue *q)
 	 */
 	blk_freeze_queue(q);
 
-	rq_qos_exit(q);
-
 	blk_queue_flag_set(QUEUE_FLAG_DEAD, q);
 
-	/* for synchronous bio-based driver finish in-flight integrity i/o */
-	blk_flush_integrity();
-
 	blk_sync_queue(q);
 	if (queue_is_mq(q))
 		blk_mq_exit_queue(q);
@@ -474,11 +470,12 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
 
 static inline int bio_queue_enter(struct bio *bio)
 {
-	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
+	struct gendisk *disk = bio->bi_bdev->bd_disk;
+	struct request_queue *q = disk->queue;
 
 	while (!blk_try_enter_queue(q, false)) {
 		if (bio->bi_opf & REQ_NOWAIT) {
-			if (blk_queue_dying(q))
+			if (test_bit(GD_DEAD, &disk->state))
 				goto dead;
 			bio_wouldblock_error(bio);
 			return -EBUSY;
@@ -495,8 +492,8 @@ static inline int bio_queue_enter(struct bio *bio)
 		wait_event(q->mq_freeze_wq,
 			   (!q->mq_freeze_depth &&
 			    blk_pm_resume_queue(false, q)) ||
-			   blk_queue_dying(q));
-		if (blk_queue_dying(q))
+			   test_bit(GD_DEAD, &disk->state));
+		if (test_bit(GD_DEAD, &disk->state))
 			goto dead;
 	}
 
diff --git a/block/blk.h b/block/blk.h
index 7d2a0ba7ed21..e2ed2257709a 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -51,6 +51,7 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
 void blk_free_flush_queue(struct blk_flush_queue *q);
 
 void blk_freeze_queue(struct request_queue *q);
+void blk_queue_start_drain(struct request_queue *q);
 
 #define BIO_INLINE_VECS 4
 struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
diff --git a/block/genhd.c b/block/genhd.c
index 496e8458c357..1fe816be9bcd 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -26,6 +26,7 @@
 #include <linux/badblocks.h>
 
 #include "blk.h"
+#include "blk-rq-qos.h"
 
 static struct kobject *block_depr;
 
@@ -559,6 +560,8 @@ EXPORT_SYMBOL(device_add_disk);
  */
 void del_gendisk(struct gendisk *disk)
 {
+	struct request_queue *q = disk->queue;
+
 	might_sleep();
 
 	if (WARN_ON_ONCE(!disk_live(disk) && !(disk->flags & GENHD_FL_HIDDEN)))
@@ -575,8 +578,26 @@ void del_gendisk(struct gendisk *disk)
 	fsync_bdev(disk->part0);
 	__invalidate_device(disk->part0, true);
 
+	/*
+	 * Fail any new I/O.
+	 */
+	set_bit(GD_DEAD, &disk->state);
 	set_capacity(disk, 0);
 
+	/*
+	 * Prevent new I/O from crossing bio_queue_enter().
+	 */
+	blk_queue_start_drain(q);
+	blk_mq_freeze_queue_wait(q);
+
+	rq_qos_exit(q);
+	blk_sync_queue(q);
+	blk_flush_integrity();
+	/*
+	 * Allow using passthrough request again after the queue is torn down.
+	 */
+	blk_mq_unfreeze_queue(q);
+
 	if (!(disk->flags & GENHD_FL_HIDDEN)) {
 		sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
 
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index c68d83c87f83..0f5315c2b5a3 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -149,6 +149,7 @@ struct gendisk {
 	unsigned long state;
 #define GD_NEED_PART_SCAN		0
 #define GD_READ_ONLY			1
+#define GD_DEAD				2
 
 	struct mutex open_mutex;	/* open/close mutex */
 	unsigned open_partitions;	/* number of open partitions */
-- 
cgit v1.2.3-71-gd317