From f56b919fa4f1b27c589e71f7d90e9785f9196bf1 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Wed, 16 Feb 2022 09:39:22 +0100
Subject: linux/fb.h: Spelling s/palette/palette/

Fix a misspelling of "palette" in a comment.

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Reviewed-by: Pekka Paalanen <pekka.paalanen@collabora.com>
Signed-off-by: Helge Deller <deller@gmx.de>
---
 include/uapi/linux/fb.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/fb.h b/include/uapi/linux/fb.h
index 4c14e8be7267..3a49913d006c 100644
--- a/include/uapi/linux/fb.h
+++ b/include/uapi/linux/fb.h
@@ -182,7 +182,7 @@ struct fb_fix_screeninfo {
  *
  * For pseudocolor: offset and length should be the same for all color
  * components. Offset specifies the position of the least significant bit
- * of the pallette index in a pixel value. Length indicates the number
+ * of the palette index in a pixel value. Length indicates the number
  * of available palette entries (i.e. # of entries = 1 << length).
  */
 struct fb_bitfield {
-- 
cgit v1.2.3-71-gd317


From d4e5268a08b211b536fed29beb24271ecd85187e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 8 Apr 2022 11:45:55 +0200
Subject: x86,objtool: Mark cpu_startup_entry() __noreturn

GCC-8 isn't clever enough to figure out that cpu_start_entry() is a
noreturn while objtool is. This results in code after the call in
start_secondary(). Give GCC a hand so that they all agree on things.

  vmlinux.o: warning: objtool: start_secondary()+0x10e: unreachable

Reported-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Link: https://lore.kernel.org/r/20220408094718.383658532@infradead.org
---
 include/linux/cpu.h   | 2 +-
 tools/objtool/check.c | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 9cf51e41e697..54dc2f9a2d56 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -167,7 +167,7 @@ static inline int suspend_disable_secondary_cpus(void) { return 0; }
 static inline void suspend_enable_secondary_cpus(void) { }
 #endif /* !CONFIG_PM_SLEEP_SMP */
 
-void cpu_startup_entry(enum cpuhp_state state);
+void __noreturn cpu_startup_entry(enum cpuhp_state state);
 
 void cpu_idle_poll_ctrl(bool enable);
 
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index bd0c2c828940..e3a675d6a704 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -184,6 +184,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func,
 		"do_group_exit",
 		"stop_this_cpu",
 		"__invalid_creds",
+               "cpu_startup_entry",
 	};
 
 	if (!func)
-- 
cgit v1.2.3-71-gd317


From 37c5f9e80e015d0df17d0c377c18523002986851 Mon Sep 17 00:00:00 2001
From: Oleksandr Ocheretnyi <oocheret@cisco.com>
Date: Sun, 17 Apr 2022 11:46:47 -0700
Subject: mtd: fix 'part' field data corruption in mtd_info

Commit 46b5889cc2c5 ("mtd: implement proper partition handling")
started using "mtd_get_master_ofs()" in mtd callbacks to determine
memory offsets by means of 'part' field from mtd_info, what previously
was smashed accessing 'master' field in the mtd_set_dev_defaults() method.
That provides wrong offset what causes hardware access errors.

Just make 'part', 'master' as separate fields, rather than using
union type to avoid 'part' data corruption when mtd_set_dev_defaults()
is called.

Fixes: 46b5889cc2c5 ("mtd: implement proper partition handling")
Signed-off-by: Oleksandr Ocheretnyi <oocheret@cisco.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20220417184649.449289-1-oocheret@cisco.com
---
 include/linux/mtd/mtd.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index 151607e9d64a..955aee14b0f7 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -389,10 +389,8 @@ struct mtd_info {
 	/* List of partitions attached to this MTD device */
 	struct list_head partitions;
 
-	union {
-		struct mtd_part part;
-		struct mtd_master master;
-	};
+	struct mtd_part part;
+	struct mtd_master master;
 };
 
 static inline struct mtd_info *mtd_get_master(struct mtd_info *mtd)
-- 
cgit v1.2.3-71-gd317


From 7e842d70fe599bc13594b650b2144c4b6e6d6bf1 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Wed, 20 Apr 2022 09:05:26 +0200
Subject: memory: renesas-rpc-if: Fix HF/OSPI data transfer in Manual Mode

HyperFlash devices fail to probe:

    rpc-if-hyperflash rpc-if-hyperflash: probing of hyperbus device failed

In HyperFlash or Octal-SPI Flash mode, the Transfer Data Enable bits
(SPIDE) in the Manual Mode Enable Setting Register (SMENR) are derived
from half of the transfer size, cfr. the rpcif_bits_set() helper
function.  However, rpcif_reg_{read,write}() does not take the bus size
into account, and does not double all Manual Mode Data Register access
sizes when communicating with a HyperFlash or Octal-SPI Flash device.

Fix this, and avoid the back-and-forth conversion between transfer size
and Transfer Data Enable bits, by explicitly storing the transfer size
in struct rpcif, and using that value to determine access size in
rpcif_reg_{read,write}().

Enforce that the "high" Manual Mode Read/Write Data Registers
(SM[RW]DR1) are only used for 8-byte data accesses.
While at it, forbid writing to the Manual Mode Read Data Registers,
as they are read-only.

Fixes: fff53a551db50f5e ("memory: renesas-rpc-if: Correct QSPI data transfer in Manual mode")
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Tested-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Tested-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Reviewed-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Link: https://lore.kernel.org/r/cde9bfacf704c81865f57b15d1b48a4793da4286.1649681476.git.geert+renesas@glider.be
Link: https://lore.kernel.org/r/20220420070526.9367-1-krzysztof.kozlowski@linaro.org'
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 drivers/memory/renesas-rpc-if.c | 60 +++++++++++++++++++++++++++++++----------
 include/memory/renesas-rpc-if.h |  1 +
 2 files changed, 47 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/drivers/memory/renesas-rpc-if.c b/drivers/memory/renesas-rpc-if.c
index 2e545f473cc6..019a0822bde0 100644
--- a/drivers/memory/renesas-rpc-if.c
+++ b/drivers/memory/renesas-rpc-if.c
@@ -164,25 +164,39 @@ static const struct regmap_access_table rpcif_volatile_table = {
 
 
 /*
- * Custom accessor functions to ensure SMRDR0 and SMWDR0 are always accessed
- * with proper width. Requires SMENR_SPIDE to be correctly set before!
+ * Custom accessor functions to ensure SM[RW]DR[01] are always accessed with
+ * proper width.  Requires rpcif.xfer_size to be correctly set before!
  */
 static int rpcif_reg_read(void *context, unsigned int reg, unsigned int *val)
 {
 	struct rpcif *rpc = context;
 
-	if (reg == RPCIF_SMRDR0 || reg == RPCIF_SMWDR0) {
-		u32 spide = readl(rpc->base + RPCIF_SMENR) & RPCIF_SMENR_SPIDE(0xF);
-
-		if (spide == 0x8) {
+	switch (reg) {
+	case RPCIF_SMRDR0:
+	case RPCIF_SMWDR0:
+		switch (rpc->xfer_size) {
+		case 1:
 			*val = readb(rpc->base + reg);
 			return 0;
-		} else if (spide == 0xC) {
+
+		case 2:
 			*val = readw(rpc->base + reg);
 			return 0;
-		} else if (spide != 0xF) {
+
+		case 4:
+		case 8:
+			*val = readl(rpc->base + reg);
+			return 0;
+
+		default:
 			return -EILSEQ;
 		}
+
+	case RPCIF_SMRDR1:
+	case RPCIF_SMWDR1:
+		if (rpc->xfer_size != 8)
+			return -EILSEQ;
+		break;
 	}
 
 	*val = readl(rpc->base + reg);
@@ -193,18 +207,34 @@ static int rpcif_reg_write(void *context, unsigned int reg, unsigned int val)
 {
 	struct rpcif *rpc = context;
 
-	if (reg == RPCIF_SMRDR0 || reg == RPCIF_SMWDR0) {
-		u32 spide = readl(rpc->base + RPCIF_SMENR) & RPCIF_SMENR_SPIDE(0xF);
-
-		if (spide == 0x8) {
+	switch (reg) {
+	case RPCIF_SMWDR0:
+		switch (rpc->xfer_size) {
+		case 1:
 			writeb(val, rpc->base + reg);
 			return 0;
-		} else if (spide == 0xC) {
+
+		case 2:
 			writew(val, rpc->base + reg);
 			return 0;
-		} else if (spide != 0xF) {
+
+		case 4:
+		case 8:
+			writel(val, rpc->base + reg);
+			return 0;
+
+		default:
 			return -EILSEQ;
 		}
+
+	case RPCIF_SMWDR1:
+		if (rpc->xfer_size != 8)
+			return -EILSEQ;
+		break;
+
+	case RPCIF_SMRDR0:
+	case RPCIF_SMRDR1:
+		return -EPERM;
 	}
 
 	writel(val, rpc->base + reg);
@@ -469,6 +499,7 @@ int rpcif_manual_xfer(struct rpcif *rpc)
 
 			smenr |= RPCIF_SMENR_SPIDE(rpcif_bits_set(rpc, nbytes));
 			regmap_write(rpc->regmap, RPCIF_SMENR, smenr);
+			rpc->xfer_size = nbytes;
 
 			memcpy(data, rpc->buffer + pos, nbytes);
 			if (nbytes == 8) {
@@ -533,6 +564,7 @@ int rpcif_manual_xfer(struct rpcif *rpc)
 			regmap_write(rpc->regmap, RPCIF_SMENR, smenr);
 			regmap_write(rpc->regmap, RPCIF_SMCR,
 				     rpc->smcr | RPCIF_SMCR_SPIE);
+			rpc->xfer_size = nbytes;
 			ret = wait_msg_xfer_end(rpc);
 			if (ret)
 				goto err_out;
diff --git a/include/memory/renesas-rpc-if.h b/include/memory/renesas-rpc-if.h
index 7c93f5177532..9c0ad64b8d29 100644
--- a/include/memory/renesas-rpc-if.h
+++ b/include/memory/renesas-rpc-if.h
@@ -72,6 +72,7 @@ struct rpcif {
 	enum rpcif_type type;
 	enum rpcif_data_dir dir;
 	u8 bus_size;
+	u8 xfer_size;
 	void *buffer;
 	u32 xferlen;
 	u32 smcr;
-- 
cgit v1.2.3-71-gd317


From 8d084b2eae7fc5fcfc9f143cd7321a88e1cd76aa Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Tue, 5 Apr 2022 17:15:13 +0200
Subject: usb: typec: tcpm: Fix undefined behavior due to shift overflowing the
 constant
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix:

  drivers/usb/typec/tcpm/tcpm.c: In function ‘run_state_machine’:
  drivers/usb/typec/tcpm/tcpm.c:4724:3: error: case label does not reduce to an integer constant
     case BDO_MODE_TESTDATA:
     ^~~~

See https://lore.kernel.org/r/YkwQ6%2BtIH8GQpuct@zn.tnic for the gory
details as to why it triggers with older gccs only.

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: linux-usb@vger.kernel.org
Link: https://lore.kernel.org/r/20220405151517.29753-8-bp@alien8.de
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/pd_bdo.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/usb/pd_bdo.h b/include/linux/usb/pd_bdo.h
index 033fe3e17141..7c25b88d79f9 100644
--- a/include/linux/usb/pd_bdo.h
+++ b/include/linux/usb/pd_bdo.h
@@ -15,7 +15,7 @@
 #define BDO_MODE_CARRIER2	(5 << 28)
 #define BDO_MODE_CARRIER3	(6 << 28)
 #define BDO_MODE_EYE		(7 << 28)
-#define BDO_MODE_TESTDATA	(8 << 28)
+#define BDO_MODE_TESTDATA	(8U << 28)
 
 #define BDO_MODE_MASK(mode)	((mode) & 0xf0000000)
 
-- 
cgit v1.2.3-71-gd317


From b253a0680ceadc5d7b4acca7aa2d870326cad8ad Mon Sep 17 00:00:00 2001
From: Pengcheng Yang <yangpc@wangsu.com>
Date: Wed, 20 Apr 2022 10:34:41 +0800
Subject: tcp: ensure to use the most recently sent skb when filling the rate
 sample

If an ACK (s)acks multiple skbs, we favor the information
from the most recently sent skb by choosing the skb with
the highest prior_delivered count. But in the interval
between receiving ACKs, we send multiple skbs with the same
prior_delivered, because the tp->delivered only changes
when we receive an ACK.

We used RACK's solution, copying tcp_rack_sent_after() as
tcp_skb_sent_after() helper to determine "which packet was
sent last?". Later, we will use tcp_skb_sent_after() instead
in RACK.

Fixes: b9f64820fb22 ("tcp: track data delivery rate for a TCP connection")
Signed-off-by: Pengcheng Yang <yangpc@wangsu.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Tested-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/1650422081-22153-1-git-send-email-yangpc@wangsu.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tcp.h   |  6 ++++++
 net/ipv4/tcp_rate.c | 11 ++++++++---
 2 files changed, 14 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 70ca4a5e330a..be712fb9ddd7 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1042,6 +1042,7 @@ struct rate_sample {
 	int  losses;		/* number of packets marked lost upon ACK */
 	u32  acked_sacked;	/* number of packets newly (S)ACKed upon ACK */
 	u32  prior_in_flight;	/* in flight before this ACK */
+	u32  last_end_seq;	/* end_seq of most recently ACKed packet */
 	bool is_app_limited;	/* is sample from packet with bubble in pipe? */
 	bool is_retrans;	/* is sample from retransmission? */
 	bool is_ack_delayed;	/* is this (likely) a delayed ACK? */
@@ -1164,6 +1165,11 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
 		  bool is_sack_reneg, struct rate_sample *rs);
 void tcp_rate_check_app_limited(struct sock *sk);
 
+static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
+{
+	return t1 > t2 || (t1 == t2 && after(seq1, seq2));
+}
+
 /* These functions determine how the current flow behaves in respect of SACK
  * handling. SACK is negotiated with the peer, and therefore it can vary
  * between different flows.
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
index fbab921670cc..9a8e014d9b5b 100644
--- a/net/ipv4/tcp_rate.c
+++ b/net/ipv4/tcp_rate.c
@@ -74,27 +74,32 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
  *
  * If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is
  * called multiple times. We favor the information from the most recently
- * sent skb, i.e., the skb with the highest prior_delivered count.
+ * sent skb, i.e., the skb with the most recently sent time and the highest
+ * sequence.
  */
 void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
 			    struct rate_sample *rs)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+	u64 tx_tstamp;
 
 	if (!scb->tx.delivered_mstamp)
 		return;
 
+	tx_tstamp = tcp_skb_timestamp_us(skb);
 	if (!rs->prior_delivered ||
-	    after(scb->tx.delivered, rs->prior_delivered)) {
+	    tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp,
+			       scb->end_seq, rs->last_end_seq)) {
 		rs->prior_delivered_ce  = scb->tx.delivered_ce;
 		rs->prior_delivered  = scb->tx.delivered;
 		rs->prior_mstamp     = scb->tx.delivered_mstamp;
 		rs->is_app_limited   = scb->tx.is_app_limited;
 		rs->is_retrans	     = scb->sacked & TCPCB_RETRANS;
+		rs->last_end_seq     = scb->end_seq;
 
 		/* Record send time of most recently ACKed packet: */
-		tp->first_tx_mstamp  = tcp_skb_timestamp_us(skb);
+		tp->first_tx_mstamp  = tx_tstamp;
 		/* Find the duration of the "send phase" of this window: */
 		rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp,
 						     scb->tx.first_tx_mstamp);
-- 
cgit v1.2.3-71-gd317


From 8be99c7b8bb17f2b82af4a0a6798b795f4d74436 Mon Sep 17 00:00:00 2001
From: Conor Dooley <conor.dooley@microchip.com>
Date: Wed, 13 Apr 2022 08:58:32 +0100
Subject: dt-bindings: clk: mpfs: add defines for two new clocks

The RTC reference and MSSPLL were previously not documented or defined,
as they were unused. Add their defines to the PolarFire SoC header.

Fixes: 2145bb687e3f ("dt-bindings: clk: microchip: Add Microchip PolarFire host binding")
Reviewed-by: Daire McNamara <daire.mcnamara@microchip.com>
Signed-off-by: Conor Dooley <conor.dooley@microchip.com>
Link: https://lore.kernel.org/r/20220413075835.3354193-6-conor.dooley@microchip.com
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Acked-by: Palmer Dabbelt <palmer@rivosinc.com>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/dt-bindings/clock/microchip,mpfs-clock.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/dt-bindings/clock/microchip,mpfs-clock.h b/include/dt-bindings/clock/microchip,mpfs-clock.h
index 73f2a9324857..4048669bf756 100644
--- a/include/dt-bindings/clock/microchip,mpfs-clock.h
+++ b/include/dt-bindings/clock/microchip,mpfs-clock.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
 /*
  * Daire McNamara,<daire.mcnamara@microchip.com>
- * Copyright (C) 2020 Microchip Technology Inc.  All rights reserved.
+ * Copyright (C) 2020-2022 Microchip Technology Inc.  All rights reserved.
  */
 
 #ifndef _DT_BINDINGS_CLK_MICROCHIP_MPFS_H_
@@ -42,4 +42,7 @@
 #define CLK_ATHENA	31
 #define CLK_CFM		32
 
+#define CLK_RTCREF	33
+#define CLK_MSSPLL	34
+
 #endif	/* _DT_BINDINGS_CLK_MICROCHIP_MPFS_H_ */
-- 
cgit v1.2.3-71-gd317


From 31c417c948d7f6909cb63f0ac3298f3c38f8ce20 Mon Sep 17 00:00:00 2001
From: Peilin Ye <peilin.ye@bytedance.com>
Date: Thu, 21 Apr 2022 15:09:02 -0700
Subject: ip_gre, ip6_gre: Fix race condition on o_seqno in collect_md mode

As pointed out by Jakub Kicinski, currently using TUNNEL_SEQ in
collect_md mode is racy for [IP6]GRE[TAP] devices.  Consider the
following sequence of events:

1. An [IP6]GRE[TAP] device is created in collect_md mode using "ip link
   add ... external".  "ip" ignores "[o]seq" if "external" is specified,
   so TUNNEL_SEQ is off, and the device is marked as NETIF_F_LLTX (i.e.
   it uses lockless TX);
2. Someone sets TUNNEL_SEQ on outgoing skb's, using e.g.
   bpf_skb_set_tunnel_key() in an eBPF program attached to this device;
3. gre_fb_xmit() or __gre6_xmit() processes these skb's:

	gre_build_header(skb, tun_hlen,
			 flags, protocol,
			 tunnel_id_to_key32(tun_info->key.tun_id),
			 (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++)
					      : 0);   ^^^^^^^^^^^^^^^^^

Since we are not using the TX lock (&txq->_xmit_lock), multiple CPUs may
try to do this tunnel->o_seqno++ in parallel, which is racy.  Fix it by
making o_seqno atomic_t.

As mentioned by Eric Dumazet in commit b790e01aee74 ("ip_gre: lockless
xmit"), making o_seqno atomic_t increases "chance for packets being out
of order at receiver" when NETIF_F_LLTX is on.

Maybe a better fix would be:

1. Do not ignore "oseq" in external mode.  Users MUST specify "oseq" if
   they want the kernel to allow sequencing of outgoing packets;
2. Reject all outgoing TUNNEL_SEQ packets if the device was not created
   with "oseq".

Unfortunately, that would break userspace.

We could now make [IP6]GRE[TAP] devices always NETIF_F_LLTX, but let us
do it in separate patches to keep this fix minimal.

Suggested-by: Jakub Kicinski <kuba@kernel.org>
Fixes: 77a5196a804e ("gre: add sequence number for collect md mode.")
Signed-off-by: Peilin Ye <peilin.ye@bytedance.com>
Acked-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_tunnel.h | 2 +-
 include/net/ip_tunnels.h | 2 +-
 net/ipv4/ip_gre.c        | 6 +++---
 net/ipv6/ip6_gre.c       | 7 ++++---
 4 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h
index a38c4f1e4e5c..74b369bddf49 100644
--- a/include/net/ip6_tunnel.h
+++ b/include/net/ip6_tunnel.h
@@ -58,7 +58,7 @@ struct ip6_tnl {
 
 	/* These fields used only by GRE */
 	__u32 i_seqno;	/* The last seen seqno	*/
-	__u32 o_seqno;	/* The last output seqno */
+	atomic_t o_seqno;	/* The last output seqno */
 	int hlen;       /* tun_hlen + encap_hlen */
 	int tun_hlen;	/* Precalculated header length */
 	int encap_hlen; /* Encap header length (FOU,GUE) */
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 88dee57eac8a..c24fa934221d 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -116,7 +116,7 @@ struct ip_tunnel {
 
 	/* These four fields used only by GRE */
 	u32		i_seqno;	/* The last seen seqno	*/
-	u32		o_seqno;	/* The last output seqno */
+	atomic_t	o_seqno;	/* The last output seqno */
 	int		tun_hlen;	/* Precalculated header length */
 
 	/* These four fields used only by ERSPAN */
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 21a8943f6fa4..aacee9dd771b 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -464,7 +464,7 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
 	/* Push GRE header. */
 	gre_build_header(skb, tunnel->tun_hlen,
 			 flags, proto, tunnel->parms.o_key,
-			 (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0);
+			 (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0);
 
 	ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
 }
@@ -502,7 +502,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
 		(TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ);
 	gre_build_header(skb, tunnel_hlen, flags, proto,
 			 tunnel_id_to_key32(tun_info->key.tun_id),
-			 (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0);
+			 (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0);
 
 	ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen);
 
@@ -579,7 +579,7 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
 	}
 
 	gre_build_header(skb, 8, TUNNEL_SEQ,
-			 proto, 0, htonl(tunnel->o_seqno++));
+			 proto, 0, htonl(atomic_fetch_inc(&tunnel->o_seqno)));
 
 	ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen);
 
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index d9e4ac94eab4..5136959b3dc5 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -766,7 +766,7 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
 		gre_build_header(skb, tun_hlen,
 				 flags, protocol,
 				 tunnel_id_to_key32(tun_info->key.tun_id),
-				 (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++)
+				 (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno))
 						      : 0);
 
 	} else {
@@ -777,7 +777,8 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
 
 		gre_build_header(skb, tunnel->tun_hlen, flags,
 				 protocol, tunnel->parms.o_key,
-				 (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0);
+				 (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno))
+						      : 0);
 	}
 
 	return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu,
@@ -1055,7 +1056,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
 	/* Push GRE header. */
 	proto = (t->parms.erspan_ver == 1) ? htons(ETH_P_ERSPAN)
 					   : htons(ETH_P_ERSPAN2);
-	gre_build_header(skb, 8, TUNNEL_SEQ, proto, 0, htonl(t->o_seqno++));
+	gre_build_header(skb, 8, TUNNEL_SEQ, proto, 0, htonl(atomic_fetch_inc(&t->o_seqno)));
 
 	/* TooBig packet may have updated dst->dev's mtu */
 	if (!t->parms.collect_md && dst && dst_mtu(dst) > dst->dev->mtu)
-- 
cgit v1.2.3-71-gd317


From 4bfe744ff1644fbc0a991a2677dc874475dd6776 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 24 Apr 2022 17:34:07 -0700
Subject: tcp: fix potential xmit stalls caused by TCP_NOTSENT_LOWAT

I had this bug sitting for too long in my pile, it is time to fix it.

Thanks to Doug Porter for reminding me of it!

We had various attempts in the past, including commit
0cbe6a8f089e ("tcp: remove SOCK_QUEUE_SHRUNK"),
but the issue is that TCP stack currently only generates
EPOLLOUT from input path, when tp->snd_una has advanced
and skb(s) cleaned from rtx queue.

If a flow has a big RTT, and/or receives SACKs, it is possible
that the notsent part (tp->write_seq - tp->snd_nxt) reaches 0
and no more data can be sent until tp->snd_una finally advances.

What is needed is to also check if POLLOUT needs to be generated
whenever tp->snd_nxt is advanced, from output path.

This bug triggers more often after an idle period, as
we do not receive ACK for at least one RTT. tcp_notsent_lowat
could be a fraction of what CWND and pacing rate would allow to
send during this RTT.

In a followup patch, I will remove the bogus call
to tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED)
from tcp_check_space(). Fact that we have decided to generate
an EPOLLOUT does not mean the application has immediately
refilled the transmit queue. This optimistic call
might have been the reason the bug seemed not too serious.

Tested:

200 ms rtt, 1% packet loss, 32 MB tcp_rmem[2] and tcp_wmem[2]

$ echo 500000 >/proc/sys/net/ipv4/tcp_notsent_lowat
$ cat bench_rr.sh
SUM=0
for i in {1..10}
do
 V=`netperf -H remote_host -l30 -t TCP_RR -- -r 10000000,10000 -o LOCAL_BYTES_SENT | egrep -v "MIGRATED|Bytes"`
 echo $V
 SUM=$(($SUM + $V))
done
echo SUM=$SUM

Before patch:
$ bench_rr.sh
130000000
80000000
140000000
140000000
140000000
140000000
130000000
40000000
90000000
110000000
SUM=1140000000

After patch:
$ bench_rr.sh
430000000
590000000
530000000
450000000
450000000
350000000
450000000
490000000
480000000
460000000
SUM=4680000000  # This is 410 % of the value before patch.

Fixes: c9bee3b7fdec ("tcp: TCP_NOTSENT_LOWAT socket option")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Doug Porter <dsp@fb.com>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     |  1 +
 net/ipv4/tcp_input.c  | 12 +++++++++++-
 net/ipv4/tcp_output.c |  1 +
 3 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index be712fb9ddd7..b99d9d9cbd99 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -620,6 +620,7 @@ void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
 void tcp_reset(struct sock *sk, struct sk_buff *skb);
 void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb);
 void tcp_fin(struct sock *sk);
+void tcp_check_space(struct sock *sk);
 
 /* tcp_timer.c */
 void tcp_init_xmit_timers(struct sock *);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2088f93fa37b..48f607522860 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5454,7 +5454,17 @@ static void tcp_new_space(struct sock *sk)
 	INDIRECT_CALL_1(sk->sk_write_space, sk_stream_write_space, sk);
 }
 
-static void tcp_check_space(struct sock *sk)
+/* Caller made space either from:
+ * 1) Freeing skbs in rtx queues (after tp->snd_una has advanced)
+ * 2) Sent skbs from output queue (and thus advancing tp->snd_nxt)
+ *
+ * We might be able to generate EPOLLOUT to the application if:
+ * 1) Space consumed in output/rtx queues is below sk->sk_sndbuf/2
+ * 2) notsent amount (tp->write_seq - tp->snd_nxt) became
+ *    small enough that tcp_stream_memory_free() decides it
+ *    is time to generate EPOLLOUT.
+ */
+void tcp_check_space(struct sock *sk)
 {
 	/* pairs with tcp_poll() */
 	smp_mb();
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 9ede847f4199..1ca2f28c9981 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -82,6 +82,7 @@ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
 
 	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
 		      tcp_skb_pcount(skb));
+	tcp_check_space(sk);
 }
 
 /* SND.NXT, if window was not shrunk or the amount of shrunk was less than one
-- 
cgit v1.2.3-71-gd317


From ba5a4fdd63ae0c575707030db0b634b160baddd7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 24 Apr 2022 13:35:09 -0700
Subject: tcp: make sure treq->af_specific is initialized

syzbot complained about a recent change in TCP stack,
hitting a NULL pointer [1]

tcp request sockets have an af_specific pointer, which
was used before the blamed change only for SYNACK generation
in non SYNCOOKIE mode.

tcp requests sockets momentarily created when third packet
coming from client in SYNCOOKIE mode were not using
treq->af_specific.

Make sure this field is populated, in the same way normal
TCP requests sockets do in tcp_conn_request().

[1]
TCP: request_sock_TCPv6: Possible SYN flooding on port 20002. Sending cookies.  Check SNMP counters.
general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] PREEMPT SMP KASAN
KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f]
CPU: 1 PID: 3695 Comm: syz-executor864 Not tainted 5.18.0-rc3-syzkaller-00224-g5fd1fe4807f9 #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:tcp_create_openreq_child+0xe16/0x16b0 net/ipv4/tcp_minisocks.c:534
Code: 48 c1 ea 03 80 3c 02 00 0f 85 e5 07 00 00 4c 8b b3 28 01 00 00 48 b8 00 00 00 00 00 fc ff df 49 8d 7e 08 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 c9 07 00 00 48 8b 3c 24 48 89 de 41 ff 56 08 48
RSP: 0018:ffffc90000de0588 EFLAGS: 00010202
RAX: dffffc0000000000 RBX: ffff888076490330 RCX: 0000000000000100
RDX: 0000000000000001 RSI: ffffffff87d67ff0 RDI: 0000000000000008
RBP: ffff88806ee1c7f8 R08: 0000000000000000 R09: 0000000000000000
R10: ffffffff87d67f00 R11: 0000000000000000 R12: ffff88806ee1bfc0
R13: ffff88801b0e0368 R14: 0000000000000000 R15: 0000000000000000
FS:  00007f517fe58700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007ffcead76960 CR3: 000000006f97b000 CR4: 00000000003506e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 <IRQ>
 tcp_v6_syn_recv_sock+0x199/0x23b0 net/ipv6/tcp_ipv6.c:1267
 tcp_get_cookie_sock+0xc9/0x850 net/ipv4/syncookies.c:207
 cookie_v6_check+0x15c3/0x2340 net/ipv6/syncookies.c:258
 tcp_v6_cookie_check net/ipv6/tcp_ipv6.c:1131 [inline]
 tcp_v6_do_rcv+0x1148/0x13b0 net/ipv6/tcp_ipv6.c:1486
 tcp_v6_rcv+0x3305/0x3840 net/ipv6/tcp_ipv6.c:1725
 ip6_protocol_deliver_rcu+0x2e9/0x1900 net/ipv6/ip6_input.c:422
 ip6_input_finish+0x14c/0x2c0 net/ipv6/ip6_input.c:464
 NF_HOOK include/linux/netfilter.h:307 [inline]
 NF_HOOK include/linux/netfilter.h:301 [inline]
 ip6_input+0x9c/0xd0 net/ipv6/ip6_input.c:473
 dst_input include/net/dst.h:461 [inline]
 ip6_rcv_finish net/ipv6/ip6_input.c:76 [inline]
 NF_HOOK include/linux/netfilter.h:307 [inline]
 NF_HOOK include/linux/netfilter.h:301 [inline]
 ipv6_rcv+0x27f/0x3b0 net/ipv6/ip6_input.c:297
 __netif_receive_skb_one_core+0x114/0x180 net/core/dev.c:5405
 __netif_receive_skb+0x24/0x1b0 net/core/dev.c:5519
 process_backlog+0x3a0/0x7c0 net/core/dev.c:5847
 __napi_poll+0xb3/0x6e0 net/core/dev.c:6413
 napi_poll net/core/dev.c:6480 [inline]
 net_rx_action+0x8ec/0xc60 net/core/dev.c:6567
 __do_softirq+0x29b/0x9c2 kernel/softirq.c:558
 invoke_softirq kernel/softirq.c:432 [inline]
 __irq_exit_rcu+0x123/0x180 kernel/softirq.c:637
 irq_exit_rcu+0x5/0x20 kernel/softirq.c:649
 sysvec_apic_timer_interrupt+0x93/0xc0 arch/x86/kernel/apic/apic.c:1097

Fixes: 5b0b9e4c2c89 ("tcp: md5: incorrect tcp_header_len for incoming connections")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Francesco Ruggeri <fruggeri@arista.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     | 1 +
 net/ipv4/syncookies.c | 8 +++++++-
 net/ipv6/syncookies.c | 3 ++-
 3 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index b99d9d9cbd99..cc1295037533 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -480,6 +480,7 @@ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
 		      u32 cookie);
 struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb);
 struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
+					    const struct tcp_request_sock_ops *af_ops,
 					    struct sock *sk, struct sk_buff *skb);
 #ifdef CONFIG_SYN_COOKIES
 
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 2cb3b852d148..f33c31dd7366 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -281,6 +281,7 @@ bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt,
 EXPORT_SYMBOL(cookie_ecn_ok);
 
 struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
+					    const struct tcp_request_sock_ops *af_ops,
 					    struct sock *sk,
 					    struct sk_buff *skb)
 {
@@ -297,6 +298,10 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
 		return NULL;
 
 	treq = tcp_rsk(req);
+
+	/* treq->af_specific might be used to perform TCP_MD5 lookup */
+	treq->af_specific = af_ops;
+
 	treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
 #if IS_ENABLED(CONFIG_MPTCP)
 	treq->is_mptcp = sk_is_mptcp(sk);
@@ -364,7 +369,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 		goto out;
 
 	ret = NULL;
-	req = cookie_tcp_reqsk_alloc(&tcp_request_sock_ops, sk, skb);
+	req = cookie_tcp_reqsk_alloc(&tcp_request_sock_ops,
+				     &tcp_request_sock_ipv4_ops, sk, skb);
 	if (!req)
 		goto out;
 
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index d1b61d00368e..9cc123f000fb 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -170,7 +170,8 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 		goto out;
 
 	ret = NULL;
-	req = cookie_tcp_reqsk_alloc(&tcp6_request_sock_ops, sk, skb);
+	req = cookie_tcp_reqsk_alloc(&tcp6_request_sock_ops,
+				     &tcp_request_sock_ipv6_ops, sk, skb);
 	if (!req)
 		goto out;
 
-- 
cgit v1.2.3-71-gd317


From 1fa568e26f001e951b634d62ef3accdc80a87c7b Mon Sep 17 00:00:00 2001
From: Shida Zhang <zhangshida@kylinos.cn>
Date: Tue, 26 Apr 2022 11:20:07 +0800
Subject: bug: Have __warn() prototype defined unconditionally
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The __warn() prototype is declared in CONFIG_BUG scope but the function
definition in panic.c is unconditional. The IBT enablement started using
it unconditionally but a CONFIG_X86_KERNEL_IBT=y, CONFIG_BUG=n .config
will trigger a

  arch/x86/kernel/traps.c: In function ‘__exc_control_protection’:
  arch/x86/kernel/traps.c:249:17: error: implicit declaration of function \
  	  ‘__warn’; did you mean ‘pr_warn’? [-Werror=implicit-function-declaration]

Pull up the declarations so that they're unconditionally visible too.

  [ bp: Rewrite commit message. ]

Fixes: 991625f3dd2c ("x86/ibt: Add IBT feature, MSR and #CP handling")
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Shida Zhang <zhangshida@kylinos.cn>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lore.kernel.org/r/20220426032007.510245-1-starzhangzsd@gmail.com
---
 include/asm-generic/bug.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index edb0e2a602a8..ba1f860af38b 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -21,6 +21,12 @@
 #include <linux/panic.h>
 #include <linux/printk.h>
 
+struct warn_args;
+struct pt_regs;
+
+void __warn(const char *file, int line, void *caller, unsigned taint,
+	    struct pt_regs *regs, struct warn_args *args);
+
 #ifdef CONFIG_BUG
 
 #ifdef CONFIG_GENERIC_BUG
@@ -110,11 +116,6 @@ extern __printf(1, 2) void __warn_printk(const char *fmt, ...);
 #endif
 
 /* used internally by panic.c */
-struct warn_args;
-struct pt_regs;
-
-void __warn(const char *file, int line, void *caller, unsigned taint,
-	    struct pt_regs *regs, struct warn_args *args);
 
 #ifndef WARN_ON
 #define WARN_ON(condition) ({						\
-- 
cgit v1.2.3-71-gd317


From ba3beec2ec1d3b4fd8672ca6e781dac4b3267f6e Mon Sep 17 00:00:00 2001
From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Date: Mon, 25 Apr 2022 17:37:45 +0200
Subject: xsk: Fix possible crash when multiple sockets are created

Fix a crash that happens if an Rx only socket is created first, then a
second socket is created that is Tx only and bound to the same umem as
the first socket and also the same netdev and queue_id together with the
XDP_SHARED_UMEM flag. In this specific case, the tx_descs array page
pool was not created by the first socket as it was an Rx only socket.
When the second socket is bound it needs this tx_descs array of this
shared page pool as it has a Tx component, but unfortunately it was
never allocated, leading to a crash. Note that this array is only used
for zero-copy drivers using the batched Tx APIs, currently only ice and
i40e.

[ 5511.150360] BUG: kernel NULL pointer dereference, address: 0000000000000008
[ 5511.158419] #PF: supervisor write access in kernel mode
[ 5511.164472] #PF: error_code(0x0002) - not-present page
[ 5511.170416] PGD 0 P4D 0
[ 5511.173347] Oops: 0002 [#1] PREEMPT SMP PTI
[ 5511.178186] CPU: 0 PID: 0 Comm: swapper/0 Tainted: G            E     5.18.0-rc1+ #97
[ 5511.187245] Hardware name: Intel Corp. GRANTLEY/GRANTLEY, BIOS GRRFCRB1.86B.0276.D07.1605190235 05/19/2016
[ 5511.198418] RIP: 0010:xsk_tx_peek_release_desc_batch+0x198/0x310
[ 5511.205375] Code: c0 83 c6 01 84 c2 74 6d 8d 46 ff 23 07 44 89 e1 48 83 c0 14 48 c1 e1 04 48 c1 e0 04 48 03 47 10 4c 01 c1 48 8b 50 08 48 8b 00 <48> 89 51 08 48 89 01 41 80 bd d7 00 00 00 00 75 82 48 8b 19 49 8b
[ 5511.227091] RSP: 0018:ffffc90000003dd0 EFLAGS: 00010246
[ 5511.233135] RAX: 0000000000000000 RBX: ffff88810c8da600 RCX: 0000000000000000
[ 5511.241384] RDX: 000000000000003c RSI: 0000000000000001 RDI: ffff888115f555c0
[ 5511.249634] RBP: ffffc90000003e08 R08: 0000000000000000 R09: ffff889092296b48
[ 5511.257886] R10: 0000ffffffffffff R11: ffff889092296800 R12: 0000000000000000
[ 5511.266138] R13: ffff88810c8db500 R14: 0000000000000040 R15: 0000000000000100
[ 5511.274387] FS:  0000000000000000(0000) GS:ffff88903f800000(0000) knlGS:0000000000000000
[ 5511.283746] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 5511.290389] CR2: 0000000000000008 CR3: 00000001046e2001 CR4: 00000000003706f0
[ 5511.298640] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 5511.306892] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 5511.315142] Call Trace:
[ 5511.317972]  <IRQ>
[ 5511.320301]  ice_xmit_zc+0x68/0x2f0 [ice]
[ 5511.324977]  ? ktime_get+0x38/0xa0
[ 5511.328913]  ice_napi_poll+0x7a/0x6a0 [ice]
[ 5511.333784]  __napi_poll+0x2c/0x160
[ 5511.337821]  net_rx_action+0xdd/0x200
[ 5511.342058]  __do_softirq+0xe6/0x2dd
[ 5511.346198]  irq_exit_rcu+0xb5/0x100
[ 5511.350339]  common_interrupt+0xa4/0xc0
[ 5511.354777]  </IRQ>
[ 5511.357201]  <TASK>
[ 5511.359625]  asm_common_interrupt+0x1e/0x40
[ 5511.364466] RIP: 0010:cpuidle_enter_state+0xd2/0x360
[ 5511.370211] Code: 49 89 c5 0f 1f 44 00 00 31 ff e8 e9 00 7b ff 45 84 ff 74 12 9c 58 f6 c4 02 0f 85 72 02 00 00 31 ff e8 02 0c 80 ff fb 45 85 f6 <0f> 88 11 01 00 00 49 63 c6 4c 2b 2c 24 48 8d 14 40 48 8d 14 90 49
[ 5511.391921] RSP: 0018:ffffffff82a03e60 EFLAGS: 00000202
[ 5511.397962] RAX: ffff88903f800000 RBX: 0000000000000001 RCX: 000000000000001f
[ 5511.406214] RDX: 0000000000000000 RSI: ffffffff823400b9 RDI: ffffffff8234c046
[ 5511.424646] RBP: ffff88810a384800 R08: 000005032a28c046 R09: 0000000000000008
[ 5511.443233] R10: 000000000000000b R11: 0000000000000006 R12: ffffffff82bcf700
[ 5511.461922] R13: 000005032a28c046 R14: 0000000000000001 R15: 0000000000000000
[ 5511.480300]  cpuidle_enter+0x29/0x40
[ 5511.494329]  do_idle+0x1c7/0x250
[ 5511.507610]  cpu_startup_entry+0x19/0x20
[ 5511.521394]  start_kernel+0x649/0x66e
[ 5511.534626]  secondary_startup_64_no_verify+0xc3/0xcb
[ 5511.549230]  </TASK>

Detect such case during bind() and allocate this memory region via newly
introduced xp_alloc_tx_descs(). Also, use kvcalloc instead of kcalloc as
for other buffer pool allocations, so that it matches the kvfree() from
xp_destroy().

Fixes: d1bc532e99be ("i40e: xsk: Move tmp desc array from driver to pool")
Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
Link: https://lore.kernel.org/bpf/20220425153745.481322-1-maciej.fijalkowski@intel.com
---
 include/net/xsk_buff_pool.h |  1 +
 net/xdp/xsk.c               | 13 +++++++++++++
 net/xdp/xsk_buff_pool.c     | 16 ++++++++++++----
 3 files changed, 26 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index 5554ee75e7da..647722e847b4 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -97,6 +97,7 @@ int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *dev,
 		  u16 queue_id, u16 flags);
 int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem,
 			 struct net_device *dev, u16 queue_id);
+int xp_alloc_tx_descs(struct xsk_buff_pool *pool, struct xdp_sock *xs);
 void xp_destroy(struct xsk_buff_pool *pool);
 void xp_get_pool(struct xsk_buff_pool *pool);
 bool xp_put_pool(struct xsk_buff_pool *pool);
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 7d3a00cb24ec..3a9348030e20 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -967,6 +967,19 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 
 			xp_get_pool(umem_xs->pool);
 			xs->pool = umem_xs->pool;
+
+			/* If underlying shared umem was created without Tx
+			 * ring, allocate Tx descs array that Tx batching API
+			 * utilizes
+			 */
+			if (xs->tx && !xs->pool->tx_descs) {
+				err = xp_alloc_tx_descs(xs->pool, xs);
+				if (err) {
+					xp_put_pool(xs->pool);
+					sockfd_put(sock);
+					goto out_unlock;
+				}
+			}
 		}
 
 		xdp_get_umem(umem_xs->umem);
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index af040ffa14ff..87bdd71c7bb6 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -42,6 +42,16 @@ void xp_destroy(struct xsk_buff_pool *pool)
 	kvfree(pool);
 }
 
+int xp_alloc_tx_descs(struct xsk_buff_pool *pool, struct xdp_sock *xs)
+{
+	pool->tx_descs = kvcalloc(xs->tx->nentries, sizeof(*pool->tx_descs),
+				  GFP_KERNEL);
+	if (!pool->tx_descs)
+		return -ENOMEM;
+
+	return 0;
+}
+
 struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
 						struct xdp_umem *umem)
 {
@@ -59,11 +69,9 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
 	if (!pool->heads)
 		goto out;
 
-	if (xs->tx) {
-		pool->tx_descs = kcalloc(xs->tx->nentries, sizeof(*pool->tx_descs), GFP_KERNEL);
-		if (!pool->tx_descs)
+	if (xs->tx)
+		if (xp_alloc_tx_descs(pool, xs))
 			goto out;
-	}
 
 	pool->chunk_mask = ~((u64)umem->chunk_size - 1);
 	pool->addrs_cnt = umem->size;
-- 
cgit v1.2.3-71-gd317


From c86cc5a3ec70f5644f1fa21610b943d0441bc1f7 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Fri, 22 Apr 2022 12:58:16 -0700
Subject: Bluetooth: hci_event: Fix checking for invalid handle on error status

Commit d5ebaa7c5f6f6 introduces checks for handle range
(e.g HCI_CONN_HANDLE_MAX) but controllers like Intel AX200 don't seem
to respect the valid range int case of error status:

> HCI Event: Connect Complete (0x03) plen 11
        Status: Page Timeout (0x04)
        Handle: 65535
        Address: 94:DB:56:XX:XX:XX (Sony Home Entertainment&
	Sound Products Inc)
        Link type: ACL (0x01)
        Encryption: Disabled (0x00)
[1644965.827560] Bluetooth: hci0: Ignoring HCI_Connection_Complete for invalid handle

Because of it is impossible to cleanup the connections properly since
the stack would attempt to cancel the connection which is no longer in
progress causing the following trace:

< HCI Command: Create Connection Cancel (0x01|0x0008) plen 6
        Address: 94:DB:56:XX:XX:XX (Sony Home Entertainment&
	Sound Products Inc)
= bluetoothd: src/profile.c:record_cb() Unable to get Hands-Free Voice
	gateway SDP record: Connection timed out
> HCI Event: Command Complete (0x0e) plen 10
      Create Connection Cancel (0x01|0x0008) ncmd 1
        Status: Unknown Connection Identifier (0x02)
        Address: 94:DB:56:XX:XX:XX (Sony Home Entertainment&
	Sound Products Inc)
< HCI Command: Create Connection Cancel (0x01|0x0008) plen 6
        Address: 94:DB:56:XX:XX:XX (Sony Home Entertainment&
	Sound Products Inc)

Fixes: d5ebaa7c5f6f6 ("Bluetooth: hci_event: Ignore multiple conn complete events")
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/hci.h |  1 +
 net/bluetooth/hci_event.c   | 65 +++++++++++++++++++++++++--------------------
 2 files changed, 37 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 5cb095b09a94..69ef31cea582 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -578,6 +578,7 @@ enum {
 #define HCI_ERROR_CONNECTION_TIMEOUT	0x08
 #define HCI_ERROR_REJ_LIMITED_RESOURCES	0x0d
 #define HCI_ERROR_REJ_BAD_ADDR		0x0f
+#define HCI_ERROR_INVALID_PARAMETERS	0x12
 #define HCI_ERROR_REMOTE_USER_TERM	0x13
 #define HCI_ERROR_REMOTE_LOW_RESOURCES	0x14
 #define HCI_ERROR_REMOTE_POWER_OFF	0x15
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index abaabfae19cc..3a9071b987f4 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -3067,13 +3067,9 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data,
 {
 	struct hci_ev_conn_complete *ev = data;
 	struct hci_conn *conn;
+	u8 status = ev->status;
 
-	if (__le16_to_cpu(ev->handle) > HCI_CONN_HANDLE_MAX) {
-		bt_dev_err(hdev, "Ignoring HCI_Connection_Complete for invalid handle");
-		return;
-	}
-
-	bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
+	bt_dev_dbg(hdev, "status 0x%2.2x", status);
 
 	hci_dev_lock(hdev);
 
@@ -3122,8 +3118,14 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data,
 		goto unlock;
 	}
 
-	if (!ev->status) {
+	if (!status) {
 		conn->handle = __le16_to_cpu(ev->handle);
+		if (conn->handle > HCI_CONN_HANDLE_MAX) {
+			bt_dev_err(hdev, "Invalid handle: 0x%4.4x > 0x%4.4x",
+				   conn->handle, HCI_CONN_HANDLE_MAX);
+			status = HCI_ERROR_INVALID_PARAMETERS;
+			goto done;
+		}
 
 		if (conn->type == ACL_LINK) {
 			conn->state = BT_CONFIG;
@@ -3164,18 +3166,18 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data,
 			hci_send_cmd(hdev, HCI_OP_CHANGE_CONN_PTYPE, sizeof(cp),
 				     &cp);
 		}
-	} else {
-		conn->state = BT_CLOSED;
-		if (conn->type == ACL_LINK)
-			mgmt_connect_failed(hdev, &conn->dst, conn->type,
-					    conn->dst_type, ev->status);
 	}
 
 	if (conn->type == ACL_LINK)
 		hci_sco_setup(conn, ev->status);
 
-	if (ev->status) {
-		hci_connect_cfm(conn, ev->status);
+done:
+	if (status) {
+		conn->state = BT_CLOSED;
+		if (conn->type == ACL_LINK)
+			mgmt_connect_failed(hdev, &conn->dst, conn->type,
+					    conn->dst_type, status);
+		hci_connect_cfm(conn, status);
 		hci_conn_del(conn);
 	} else if (ev->link_type == SCO_LINK) {
 		switch (conn->setting & SCO_AIRMODE_MASK) {
@@ -3185,7 +3187,7 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data,
 			break;
 		}
 
-		hci_connect_cfm(conn, ev->status);
+		hci_connect_cfm(conn, status);
 	}
 
 unlock:
@@ -4676,6 +4678,7 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, void *data,
 {
 	struct hci_ev_sync_conn_complete *ev = data;
 	struct hci_conn *conn;
+	u8 status = ev->status;
 
 	switch (ev->link_type) {
 	case SCO_LINK:
@@ -4690,12 +4693,7 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, void *data,
 		return;
 	}
 
-	if (__le16_to_cpu(ev->handle) > HCI_CONN_HANDLE_MAX) {
-		bt_dev_err(hdev, "Ignoring HCI_Sync_Conn_Complete for invalid handle");
-		return;
-	}
-
-	bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
+	bt_dev_dbg(hdev, "status 0x%2.2x", status);
 
 	hci_dev_lock(hdev);
 
@@ -4729,9 +4727,17 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, void *data,
 		goto unlock;
 	}
 
-	switch (ev->status) {
+	switch (status) {
 	case 0x00:
 		conn->handle = __le16_to_cpu(ev->handle);
+		if (conn->handle > HCI_CONN_HANDLE_MAX) {
+			bt_dev_err(hdev, "Invalid handle: 0x%4.4x > 0x%4.4x",
+				   conn->handle, HCI_CONN_HANDLE_MAX);
+			status = HCI_ERROR_INVALID_PARAMETERS;
+			conn->state = BT_CLOSED;
+			break;
+		}
+
 		conn->state  = BT_CONNECTED;
 		conn->type   = ev->link_type;
 
@@ -4775,8 +4781,8 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, void *data,
 		}
 	}
 
-	hci_connect_cfm(conn, ev->status);
-	if (ev->status)
+	hci_connect_cfm(conn, status);
+	if (status)
 		hci_conn_del(conn);
 
 unlock:
@@ -5527,11 +5533,6 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status,
 	struct smp_irk *irk;
 	u8 addr_type;
 
-	if (handle > HCI_CONN_HANDLE_MAX) {
-		bt_dev_err(hdev, "Ignoring HCI_LE_Connection_Complete for invalid handle");
-		return;
-	}
-
 	hci_dev_lock(hdev);
 
 	/* All controllers implicitly stop advertising in the event of a
@@ -5603,6 +5604,12 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status,
 
 	conn->dst_type = ev_bdaddr_type(hdev, conn->dst_type, NULL);
 
+	if (handle > HCI_CONN_HANDLE_MAX) {
+		bt_dev_err(hdev, "Invalid handle: 0x%4.4x > 0x%4.4x", handle,
+			   HCI_CONN_HANDLE_MAX);
+		status = HCI_ERROR_INVALID_PARAMETERS;
+	}
+
 	if (status) {
 		hci_le_conn_failed(conn, status);
 		goto unlock;
-- 
cgit v1.2.3-71-gd317


From 9b3628d79b46f06157affc56fdb218fdd4988321 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Fri, 22 Apr 2022 12:58:18 -0700
Subject: Bluetooth: hci_sync: Cleanup hci_conn if it cannot be aborted

This attempts to cleanup the hci_conn if it cannot be aborted as
otherwise it would likely result in having the controller and host
stack out of sync with respect to connection handle.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/hci_core.h |  2 +-
 net/bluetooth/hci_conn.c         | 32 ++++++++++++++++++++++++--------
 net/bluetooth/hci_event.c        | 13 ++++---------
 net/bluetooth/hci_sync.c         | 11 ++++++++++-
 4 files changed, 39 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index d5377740e99c..8abd08245326 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1156,7 +1156,7 @@ int hci_conn_switch_role(struct hci_conn *conn, __u8 role);
 
 void hci_conn_enter_active_mode(struct hci_conn *conn, __u8 force_active);
 
-void hci_le_conn_failed(struct hci_conn *conn, u8 status);
+void hci_conn_failed(struct hci_conn *conn, u8 status);
 
 /*
  * hci_conn_get() and hci_conn_put() are used to control the life-time of an
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 84312c836549..fe803bee419a 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -670,7 +670,7 @@ static void le_conn_timeout(struct work_struct *work)
 		/* Disable LE Advertising */
 		le_disable_advertising(hdev);
 		hci_dev_lock(hdev);
-		hci_le_conn_failed(conn, HCI_ERROR_ADVERTISING_TIMEOUT);
+		hci_conn_failed(conn, HCI_ERROR_ADVERTISING_TIMEOUT);
 		hci_dev_unlock(hdev);
 		return;
 	}
@@ -873,7 +873,7 @@ struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src, uint8_t src_type)
 EXPORT_SYMBOL(hci_get_route);
 
 /* This function requires the caller holds hdev->lock */
-void hci_le_conn_failed(struct hci_conn *conn, u8 status)
+static void hci_le_conn_failed(struct hci_conn *conn, u8 status)
 {
 	struct hci_dev *hdev = conn->hdev;
 	struct hci_conn_params *params;
@@ -886,8 +886,6 @@ void hci_le_conn_failed(struct hci_conn *conn, u8 status)
 		params->conn = NULL;
 	}
 
-	conn->state = BT_CLOSED;
-
 	/* If the status indicates successful cancellation of
 	 * the attempt (i.e. Unknown Connection Id) there's no point of
 	 * notifying failure since we'll go back to keep trying to
@@ -899,10 +897,6 @@ void hci_le_conn_failed(struct hci_conn *conn, u8 status)
 		mgmt_connect_failed(hdev, &conn->dst, conn->type,
 				    conn->dst_type, status);
 
-	hci_connect_cfm(conn, status);
-
-	hci_conn_del(conn);
-
 	/* Since we may have temporarily stopped the background scanning in
 	 * favor of connection establishment, we should restart it.
 	 */
@@ -914,6 +908,28 @@ void hci_le_conn_failed(struct hci_conn *conn, u8 status)
 	hci_enable_advertising(hdev);
 }
 
+/* This function requires the caller holds hdev->lock */
+void hci_conn_failed(struct hci_conn *conn, u8 status)
+{
+	struct hci_dev *hdev = conn->hdev;
+
+	bt_dev_dbg(hdev, "status 0x%2.2x", status);
+
+	switch (conn->type) {
+	case LE_LINK:
+		hci_le_conn_failed(conn, status);
+		break;
+	case ACL_LINK:
+		mgmt_connect_failed(hdev, &conn->dst, conn->type,
+				    conn->dst_type, status);
+		break;
+	}
+
+	conn->state = BT_CLOSED;
+	hci_connect_cfm(conn, status);
+	hci_conn_del(conn);
+}
+
 static void create_le_conn_complete(struct hci_dev *hdev, void *data, int err)
 {
 	struct hci_conn *conn = data;
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 5a6c8afc51a0..66451661283c 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -2834,7 +2834,7 @@ static void hci_cs_le_create_conn(struct hci_dev *hdev, u8 status)
 	bt_dev_dbg(hdev, "status 0x%2.2x", status);
 
 	/* All connection failure handling is taken care of by the
-	 * hci_le_conn_failed function which is triggered by the HCI
+	 * hci_conn_failed function which is triggered by the HCI
 	 * request completion callbacks used for connecting.
 	 */
 	if (status)
@@ -2859,7 +2859,7 @@ static void hci_cs_le_ext_create_conn(struct hci_dev *hdev, u8 status)
 	bt_dev_dbg(hdev, "status 0x%2.2x", status);
 
 	/* All connection failure handling is taken care of by the
-	 * hci_le_conn_failed function which is triggered by the HCI
+	 * hci_conn_failed function which is triggered by the HCI
 	 * request completion callbacks used for connecting.
 	 */
 	if (status)
@@ -3179,12 +3179,7 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data,
 
 done:
 	if (status) {
-		conn->state = BT_CLOSED;
-		if (conn->type == ACL_LINK)
-			mgmt_connect_failed(hdev, &conn->dst, conn->type,
-					    conn->dst_type, status);
-		hci_connect_cfm(conn, status);
-		hci_conn_del(conn);
+		hci_conn_failed(conn, status);
 	} else if (ev->link_type == SCO_LINK) {
 		switch (conn->setting & SCO_AIRMODE_MASK) {
 		case SCO_AIRMODE_CVSD:
@@ -5623,7 +5618,7 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status,
 	}
 
 	if (status) {
-		hci_le_conn_failed(conn, status);
+		hci_conn_failed(conn, status);
 		goto unlock;
 	}
 
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 8f4c5698913d..13600bf120b0 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -4408,12 +4408,21 @@ static int hci_reject_conn_sync(struct hci_dev *hdev, struct hci_conn *conn,
 static int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn,
 			       u8 reason)
 {
+	int err;
+
 	switch (conn->state) {
 	case BT_CONNECTED:
 	case BT_CONFIG:
 		return hci_disconnect_sync(hdev, conn, reason);
 	case BT_CONNECT:
-		return hci_connect_cancel_sync(hdev, conn);
+		err = hci_connect_cancel_sync(hdev, conn);
+		/* Cleanup hci_conn object if it cannot be cancelled as it
+		 * likelly means the controller and host stack are out of sync.
+		 */
+		if (err)
+			hci_conn_failed(conn, err);
+
+		return err;
 	case BT_CONNECT2:
 		return hci_reject_conn_sync(hdev, conn, reason);
 	default:
-- 
cgit v1.2.3-71-gd317


From 6510ea973d8d9d4a0cb2fb557b36bd1ab3eb49f6 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 25 Apr 2022 18:39:46 +0200
Subject: net: Use this_cpu_inc() to increment net->core_stats

The macro dev_core_stats_##FIELD##_inc() disables preemption and invokes
netdev_core_stats_alloc() to return a per-CPU pointer.
netdev_core_stats_alloc() will allocate memory on its first invocation
which breaks on PREEMPT_RT because it requires non-atomic context for
memory allocation.

This can be avoided by enabling preemption in netdev_core_stats_alloc()
assuming the caller always disables preemption.

It might be better to replace local_inc() with this_cpu_inc() now that
dev_core_stats_##FIELD##_inc() gained a preempt-disable section and does
not rely on already disabled preemption. This results in less
instructions on x86-64:
local_inc:
|          incl %gs:__preempt_count(%rip)  # __preempt_count
|          movq    488(%rdi), %rax # _1->core_stats, _22
|          testq   %rax, %rax      # _22
|          je      .L585   #,
|          add %gs:this_cpu_off(%rip), %rax        # this_cpu_off, tcp_ptr__
|  .L586:
|          testq   %rax, %rax      # _27
|          je      .L587   #,
|          incq (%rax)            # _6->a.counter
|  .L587:
|          decl %gs:__preempt_count(%rip)  # __preempt_count

this_cpu_inc(), this patch:
|         movq    488(%rdi), %rax # _1->core_stats, _5
|         testq   %rax, %rax      # _5
|         je      .L591   #,
| .L585:
|         incq %gs:(%rax) # _18->rx_dropped

Use unsigned long as type for the counter. Use this_cpu_inc() to
increment the counter. Use a plain read of the counter.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/YmbO0pxgtKpCw4SY@linutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 21 +++++++++------------
 net/core/dev.c            | 14 +++++---------
 2 files changed, 14 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 59e27a2b7bf0..b1fbe21650bb 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -199,10 +199,10 @@ struct net_device_stats {
  * Try to fit them in a single cache line, for dev_get_stats() sake.
  */
 struct net_device_core_stats {
-	local_t		rx_dropped;
-	local_t		tx_dropped;
-	local_t		rx_nohandler;
-} __aligned(4 * sizeof(local_t));
+	unsigned long	rx_dropped;
+	unsigned long	tx_dropped;
+	unsigned long	rx_nohandler;
+} __aligned(4 * sizeof(unsigned long));
 
 #include <linux/cache.h>
 #include <linux/skbuff.h>
@@ -3843,15 +3843,15 @@ static __always_inline bool __is_skb_forwardable(const struct net_device *dev,
 	return false;
 }
 
-struct net_device_core_stats *netdev_core_stats_alloc(struct net_device *dev);
+struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev);
 
-static inline struct net_device_core_stats *dev_core_stats(struct net_device *dev)
+static inline struct net_device_core_stats __percpu *dev_core_stats(struct net_device *dev)
 {
 	/* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
 	struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats);
 
 	if (likely(p))
-		return this_cpu_ptr(p);
+		return p;
 
 	return netdev_core_stats_alloc(dev);
 }
@@ -3859,14 +3859,11 @@ static inline struct net_device_core_stats *dev_core_stats(struct net_device *de
 #define DEV_CORE_STATS_INC(FIELD)						\
 static inline void dev_core_stats_##FIELD##_inc(struct net_device *dev)		\
 {										\
-	struct net_device_core_stats *p;					\
+	struct net_device_core_stats __percpu *p;				\
 										\
-	preempt_disable();							\
 	p = dev_core_stats(dev);						\
-										\
 	if (p)									\
-		local_inc(&p->FIELD);						\
-	preempt_enable();							\
+		this_cpu_inc(p->FIELD);						\
 }
 DEV_CORE_STATS_INC(rx_dropped)
 DEV_CORE_STATS_INC(tx_dropped)
diff --git a/net/core/dev.c b/net/core/dev.c
index 8c6c08446556..1461c2d9dec8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10304,7 +10304,7 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
 }
 EXPORT_SYMBOL(netdev_stats_to_stats64);
 
-struct net_device_core_stats *netdev_core_stats_alloc(struct net_device *dev)
+struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev)
 {
 	struct net_device_core_stats __percpu *p;
 
@@ -10315,11 +10315,7 @@ struct net_device_core_stats *netdev_core_stats_alloc(struct net_device *dev)
 		free_percpu(p);
 
 	/* This READ_ONCE() pairs with the cmpxchg() above */
-	p = READ_ONCE(dev->core_stats);
-	if (!p)
-		return NULL;
-
-	return this_cpu_ptr(p);
+	return READ_ONCE(dev->core_stats);
 }
 EXPORT_SYMBOL(netdev_core_stats_alloc);
 
@@ -10356,9 +10352,9 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
 
 		for_each_possible_cpu(i) {
 			core_stats = per_cpu_ptr(p, i);
-			storage->rx_dropped += local_read(&core_stats->rx_dropped);
-			storage->tx_dropped += local_read(&core_stats->tx_dropped);
-			storage->rx_nohandler += local_read(&core_stats->rx_nohandler);
+			storage->rx_dropped += READ_ONCE(core_stats->rx_dropped);
+			storage->tx_dropped += READ_ONCE(core_stats->tx_dropped);
+			storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler);
 		}
 	}
 	return storage;
-- 
cgit v1.2.3-71-gd317


From e5be15767e7e284351853cbaba80cde8620341fb Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 25 Apr 2022 08:07:48 -0400
Subject: hex2bin: make the function hex_to_bin constant-time

The function hex2bin is used to load cryptographic keys into device
mapper targets dm-crypt and dm-integrity.  It should take constant time
independent on the processed data, so that concurrently running
unprivileged code can't infer any information about the keys via
microarchitectural convert channels.

This patch changes the function hex_to_bin so that it contains no
branches and no memory accesses.

Note that this shouldn't cause performance degradation because the size
of the new function is the same as the size of the old function (on
x86-64) - and the new function causes no branch misprediction penalties.

I compile-tested this function with gcc on aarch64 alpha arm hppa hppa64
i386 ia64 m68k mips32 mips64 powerpc powerpc64 riscv sh4 s390x sparc32
sparc64 x86_64 and with clang on aarch64 arm hexagon i386 mips32 mips64
powerpc powerpc64 s390x sparc32 sparc64 x86_64 to verify that there are
no branches in the generated code.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Cc: stable@vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kernel.h |  2 +-
 lib/hexdump.c          | 32 +++++++++++++++++++++++++-------
 2 files changed, 26 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index a890428bcc1a..fe6efb24d151 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -285,7 +285,7 @@ static inline char *hex_byte_pack_upper(char *buf, u8 byte)
 	return buf;
 }
 
-extern int hex_to_bin(char ch);
+extern int hex_to_bin(unsigned char ch);
 extern int __must_check hex2bin(u8 *dst, const char *src, size_t count);
 extern char *bin2hex(char *dst, const void *src, size_t count);
 
diff --git a/lib/hexdump.c b/lib/hexdump.c
index 9301578f98e8..369420ce553a 100644
--- a/lib/hexdump.c
+++ b/lib/hexdump.c
@@ -22,15 +22,33 @@ EXPORT_SYMBOL(hex_asc_upper);
  *
  * hex_to_bin() converts one hex digit to its actual value or -1 in case of bad
  * input.
+ *
+ * This function is used to load cryptographic keys, so it is coded in such a
+ * way that there are no conditions or memory accesses that depend on data.
+ *
+ * Explanation of the logic:
+ * (ch - '9' - 1) is negative if ch <= '9'
+ * ('0' - 1 - ch) is negative if ch >= '0'
+ * we "and" these two values, so the result is negative if ch is in the range
+ *	'0' ... '9'
+ * we are only interested in the sign, so we do a shift ">> 8"; note that right
+ *	shift of a negative value is implementation-defined, so we cast the
+ *	value to (unsigned) before the shift --- we have 0xffffff if ch is in
+ *	the range '0' ... '9', 0 otherwise
+ * we "and" this value with (ch - '0' + 1) --- we have a value 1 ... 10 if ch is
+ *	in the range '0' ... '9', 0 otherwise
+ * we add this value to -1 --- we have a value 0 ... 9 if ch is in the range '0'
+ *	... '9', -1 otherwise
+ * the next line is similar to the previous one, but we need to decode both
+ *	uppercase and lowercase letters, so we use (ch & 0xdf), which converts
+ *	lowercase to uppercase
  */
-int hex_to_bin(char ch)
+int hex_to_bin(unsigned char ch)
 {
-	if ((ch >= '0') && (ch <= '9'))
-		return ch - '0';
-	ch = tolower(ch);
-	if ((ch >= 'a') && (ch <= 'f'))
-		return ch - 'a' + 10;
-	return -1;
+	unsigned char cu = ch & 0xdf;
+	return -1 +
+		((ch - '0' +  1) & (unsigned)((ch - '9' - 1) & ('0' - 1 - ch)) >> 8) +
+		((cu - 'A' + 11) & (unsigned)((cu - 'F' - 1) & ('A' - 1 - cu)) >> 8);
 }
 EXPORT_SYMBOL(hex_to_bin);
 
-- 
cgit v1.2.3-71-gd317


From ed36d04e8f8d7b00db451b0fa56a54e8e02ec43e Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Mon, 25 Apr 2022 13:42:02 +0100
Subject: iommu: Introduce device_iommu_capable()

iommu_capable() only really works for systems where all IOMMU instances
are completely homogeneous, and all devices are IOMMU-mapped. Implement
the new variant which will be able to give a more accurate answer for
whichever device the caller is actually interested in, and even more so
once all the external users have been converted and we can reliably pass
the device pointer through the internal driver interface too.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/8407eb9586677995b7a9fd70d0fd82d85929a9bb.1650878781.git.robin.murphy@arm.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c | 23 +++++++++++++++++++++++
 include/linux/iommu.h |  6 ++++++
 2 files changed, 29 insertions(+)

(limited to 'include')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index f2c45b85b9fc..780c11734979 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1906,6 +1906,29 @@ bool iommu_present(struct bus_type *bus)
 }
 EXPORT_SYMBOL_GPL(iommu_present);
 
+/**
+ * device_iommu_capable() - check for a general IOMMU capability
+ * @dev: device to which the capability would be relevant, if available
+ * @cap: IOMMU capability
+ *
+ * Return: true if an IOMMU is present and supports the given capability
+ * for the given device, otherwise false.
+ */
+bool device_iommu_capable(struct device *dev, enum iommu_cap cap)
+{
+	const struct iommu_ops *ops;
+
+	if (!dev->iommu || !dev->iommu->iommu_dev)
+		return false;
+
+	ops = dev_iommu_ops(dev);
+	if (!ops->capable)
+		return false;
+
+	return ops->capable(cap);
+}
+EXPORT_SYMBOL_GPL(device_iommu_capable);
+
 bool iommu_capable(struct bus_type *bus, enum iommu_cap cap)
 {
 	if (!bus->iommu_ops || !bus->iommu_ops->capable)
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 9208eca4b0d1..e26cf84e5d82 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -417,6 +417,7 @@ static inline const struct iommu_ops *dev_iommu_ops(struct device *dev)
 extern int bus_set_iommu(struct bus_type *bus, const struct iommu_ops *ops);
 extern int bus_iommu_probe(struct bus_type *bus);
 extern bool iommu_present(struct bus_type *bus);
+extern bool device_iommu_capable(struct device *dev, enum iommu_cap cap);
 extern bool iommu_capable(struct bus_type *bus, enum iommu_cap cap);
 extern struct iommu_domain *iommu_domain_alloc(struct bus_type *bus);
 extern struct iommu_group *iommu_group_get_by_id(int id);
@@ -689,6 +690,11 @@ static inline bool iommu_present(struct bus_type *bus)
 	return false;
 }
 
+static inline bool device_iommu_capable(struct device *dev, enum iommu_cap cap)
+{
+	return false;
+}
+
 static inline bool iommu_capable(struct bus_type *bus, enum iommu_cap cap)
 {
 	return false;
-- 
cgit v1.2.3-71-gd317


From d0be55fbeb6ac694d15af5d1aad19cdec8cd64e5 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Mon, 25 Apr 2022 13:42:03 +0100
Subject: iommu: Add capability for pre-boot DMA protection

VT-d's dmar_platform_optin() actually represents a combination of
properties fairly well standardised by Microsoft as "Pre-boot DMA
Protection" and "Kernel DMA Protection"[1]. As such, we can provide
interested consumers with an abstracted capability rather than
driver-specific interfaces that won't scale. We name it for the former
aspect since that's what external callers are most likely to be
interested in; the latter is for the IOMMU layer to handle itself.

[1] https://docs.microsoft.com/en-us/windows-hardware/design/device-experiences/oem-kernel-dma-protection

Suggested-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/d6218dff2702472da80db6aec2c9589010684551.1650878781.git.robin.murphy@arm.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel/iommu.c | 2 ++
 include/linux/iommu.h       | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'include')

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index df5c62ecf942..0edf6084dc14 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -4551,6 +4551,8 @@ static bool intel_iommu_capable(enum iommu_cap cap)
 		return domain_update_iommu_snooping(NULL);
 	if (cap == IOMMU_CAP_INTR_REMAP)
 		return irq_remapping_enabled == 1;
+	if (cap == IOMMU_CAP_PRE_BOOT_PROTECTION)
+		return dmar_platform_optin();
 
 	return false;
 }
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index e26cf84e5d82..4123693ae319 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -107,6 +107,8 @@ enum iommu_cap {
 					   transactions */
 	IOMMU_CAP_INTR_REMAP,		/* IOMMU supports interrupt isolation */
 	IOMMU_CAP_NOEXEC,		/* IOMMU_NOEXEC flag */
+	IOMMU_CAP_PRE_BOOT_PROTECTION,	/* Firmware says it used the IOMMU for
+					   DMA protection and we should too */
 };
 
 /* These are the possible reserved region types */
-- 
cgit v1.2.3-71-gd317


From 86eaf4a5b4312bea8676fb79399d9e08b53d8e71 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Mon, 25 Apr 2022 13:42:04 +0100
Subject: thunderbolt: Make iommu_dma_protection more accurate

Between me trying to get rid of iommu_present() and Mario wanting to
support the AMD equivalent of DMAR_PLATFORM_OPT_IN, scrutiny has shown
that the iommu_dma_protection attribute is being far too optimistic.
Even if an IOMMU might be present for some PCI segment in the system,
that doesn't necessarily mean it provides translation for the device(s)
we care about. Furthermore, all that DMAR_PLATFORM_OPT_IN really does
is tell us that memory was protected before the kernel was loaded, and
prevent the user from disabling the intel-iommu driver entirely. While
that lets us assume kernel integrity, what matters for actual runtime
DMA protection is whether we trust individual devices, based on the
"external facing" property that we expect firmware to describe for
Thunderbolt ports.

It's proven challenging to determine the appropriate ports accurately
given the variety of possible topologies, so while still not getting a
perfect answer, by putting enough faith in firmware we can at least get
a good bit closer. If we can see that any device near a Thunderbolt NHI
has all the requisites for Kernel DMA Protection, chances are that it
*is* a relevant port, but moreover that implies that firmware is playing
the game overall, so we'll use that to assume that all Thunderbolt ports
should be correctly marked and thus will end up fully protected.

CC: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/b153f208bc9eafab5105bad0358b77366509d2d4.1650878781.git.robin.murphy@arm.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/thunderbolt/domain.c | 12 +++---------
 drivers/thunderbolt/nhi.c    | 44 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/thunderbolt.h  |  2 ++
 3 files changed, 49 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/thunderbolt/domain.c b/drivers/thunderbolt/domain.c
index 7018d959f775..2889a214dadc 100644
--- a/drivers/thunderbolt/domain.c
+++ b/drivers/thunderbolt/domain.c
@@ -7,9 +7,7 @@
  */
 
 #include <linux/device.h>
-#include <linux/dmar.h>
 #include <linux/idr.h>
-#include <linux/iommu.h>
 #include <linux/module.h>
 #include <linux/pm_runtime.h>
 #include <linux/slab.h>
@@ -257,13 +255,9 @@ static ssize_t iommu_dma_protection_show(struct device *dev,
 					 struct device_attribute *attr,
 					 char *buf)
 {
-	/*
-	 * Kernel DMA protection is a feature where Thunderbolt security is
-	 * handled natively using IOMMU. It is enabled when IOMMU is
-	 * enabled and ACPI DMAR table has DMAR_PLATFORM_OPT_IN set.
-	 */
-	return sprintf(buf, "%d\n",
-		       iommu_present(&pci_bus_type) && dmar_platform_optin());
+	struct tb *tb = container_of(dev, struct tb, dev);
+
+	return sysfs_emit(buf, "%d\n", tb->nhi->iommu_dma_protection);
 }
 static DEVICE_ATTR_RO(iommu_dma_protection);
 
diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c
index 4a582183f675..4bc87b0f003a 100644
--- a/drivers/thunderbolt/nhi.c
+++ b/drivers/thunderbolt/nhi.c
@@ -15,9 +15,11 @@
 #include <linux/pci.h>
 #include <linux/dma-mapping.h>
 #include <linux/interrupt.h>
+#include <linux/iommu.h>
 #include <linux/module.h>
 #include <linux/delay.h>
 #include <linux/property.h>
+#include <linux/string_helpers.h>
 
 #include "nhi.h"
 #include "nhi_regs.h"
@@ -1103,6 +1105,47 @@ static void nhi_check_quirks(struct tb_nhi *nhi)
 		nhi->quirks |= QUIRK_AUTO_CLEAR_INT;
 }
 
+static int nhi_check_iommu_pdev(struct pci_dev *pdev, void *data)
+{
+	if (!pdev->external_facing ||
+	    !device_iommu_capable(&pdev->dev, IOMMU_CAP_PRE_BOOT_PROTECTION))
+		return 0;
+	*(bool *)data = true;
+	return 1; /* Stop walking */
+}
+
+static void nhi_check_iommu(struct tb_nhi *nhi)
+{
+	struct pci_bus *bus = nhi->pdev->bus;
+	bool port_ok = false;
+
+	/*
+	 * Ideally what we'd do here is grab every PCI device that
+	 * represents a tunnelling adapter for this NHI and check their
+	 * status directly, but unfortunately USB4 seems to make it
+	 * obnoxiously difficult to reliably make any correlation.
+	 *
+	 * So for now we'll have to bodge it... Hoping that the system
+	 * is at least sane enough that an adapter is in the same PCI
+	 * segment as its NHI, if we can find *something* on that segment
+	 * which meets the requirements for Kernel DMA Protection, we'll
+	 * take that to imply that firmware is aware and has (hopefully)
+	 * done the right thing in general. We need to know that the PCI
+	 * layer has seen the ExternalFacingPort property which will then
+	 * inform the IOMMU layer to enforce the complete "untrusted DMA"
+	 * flow, but also that the IOMMU driver itself can be trusted not
+	 * to have been subverted by a pre-boot DMA attack.
+	 */
+	while (bus->parent)
+		bus = bus->parent;
+
+	pci_walk_bus(bus, nhi_check_iommu_pdev, &port_ok);
+
+	nhi->iommu_dma_protection = port_ok;
+	dev_dbg(&nhi->pdev->dev, "IOMMU DMA protection is %s\n",
+		str_enabled_disabled(port_ok));
+}
+
 static int nhi_init_msi(struct tb_nhi *nhi)
 {
 	struct pci_dev *pdev = nhi->pdev;
@@ -1220,6 +1263,7 @@ static int nhi_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		return -ENOMEM;
 
 	nhi_check_quirks(nhi);
+	nhi_check_iommu(nhi);
 
 	res = nhi_init_msi(nhi);
 	if (res) {
diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h
index 124e13cb1469..7a8ad984e651 100644
--- a/include/linux/thunderbolt.h
+++ b/include/linux/thunderbolt.h
@@ -465,6 +465,7 @@ static inline struct tb_xdomain *tb_service_parent(struct tb_service *svc)
  * @msix_ida: Used to allocate MSI-X vectors for rings
  * @going_away: The host controller device is about to disappear so when
  *		this flag is set, avoid touching the hardware anymore.
+ * @iommu_dma_protection: An IOMMU will isolate external-facing ports.
  * @interrupt_work: Work scheduled to handle ring interrupt when no
  *		    MSI-X is used.
  * @hop_count: Number of rings (end point hops) supported by NHI.
@@ -479,6 +480,7 @@ struct tb_nhi {
 	struct tb_ring **rx_rings;
 	struct ida msix_ida;
 	bool going_away;
+	bool iommu_dma_protection;
 	struct work_struct interrupt_work;
 	u32 hop_count;
 	unsigned long quirks;
-- 
cgit v1.2.3-71-gd317


From c35fe2a68f29a0bda15ae994154cacaae5f69791 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Mon, 25 Apr 2022 16:18:33 +0100
Subject: elf: Fix the arm64 MTE ELF segment name and value

Unfortunately, the name/value choice for the MTE ELF segment type
(PT_ARM_MEMTAG_MTE) was pretty poor: LOPROC+1 is already in use by
PT_AARCH64_UNWIND, as defined in the AArch64 ELF ABI
(https://github.com/ARM-software/abi-aa/blob/main/aaelf64/aaelf64.rst).

Update the ELF segment type value to LOPROC+2 and also change the define
to PT_AARCH64_MEMTAG_MTE to match the AArch64 ELF ABI namespace. The
AArch64 ELF ABI document is updating accordingly (segment type not
previously mentioned in the document).

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Fixes: 761b9b366cec ("elf: Introduce the ARM MTE ELF segment type")
Cc: Will Deacon <will@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Luis Machado <luis.machado@arm.com>
Cc: Richard Earnshaw <Richard.Earnshaw@arm.com>
Link: https://lore.kernel.org/r/20220425151833.2603830-1-catalin.marinas@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 Documentation/arm64/memory-tagging-extension.rst | 4 ++--
 arch/arm64/kernel/elfcore.c                      | 2 +-
 include/uapi/linux/elf.h                         | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/Documentation/arm64/memory-tagging-extension.rst b/Documentation/arm64/memory-tagging-extension.rst
index dd27f78d7608..dbae47bba25e 100644
--- a/Documentation/arm64/memory-tagging-extension.rst
+++ b/Documentation/arm64/memory-tagging-extension.rst
@@ -228,10 +228,10 @@ Core dump support
 -----------------
 
 The allocation tags for user memory mapped with ``PROT_MTE`` are dumped
-in the core file as additional ``PT_ARM_MEMTAG_MTE`` segments. The
+in the core file as additional ``PT_AARCH64_MEMTAG_MTE`` segments. The
 program header for such segment is defined as:
 
-:``p_type``: ``PT_ARM_MEMTAG_MTE``
+:``p_type``: ``PT_AARCH64_MEMTAG_MTE``
 :``p_flags``: 0
 :``p_offset``: segment file offset
 :``p_vaddr``: segment virtual address, same as the corresponding
diff --git a/arch/arm64/kernel/elfcore.c b/arch/arm64/kernel/elfcore.c
index 2b3f3d0544b9..98d67444a5b6 100644
--- a/arch/arm64/kernel/elfcore.c
+++ b/arch/arm64/kernel/elfcore.c
@@ -95,7 +95,7 @@ int elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)
 	for_each_mte_vma(current, vma) {
 		struct elf_phdr phdr;
 
-		phdr.p_type = PT_ARM_MEMTAG_MTE;
+		phdr.p_type = PT_AARCH64_MEMTAG_MTE;
 		phdr.p_offset = offset;
 		phdr.p_vaddr = vma->vm_start;
 		phdr.p_paddr = 0;
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index 787c657bfae8..7ce993e6786c 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -42,7 +42,7 @@ typedef __s64	Elf64_Sxword;
 
 
 /* ARM MTE memory tag segment type */
-#define PT_ARM_MEMTAG_MTE	(PT_LOPROC + 0x1)
+#define PT_AARCH64_MEMTAG_MTE	(PT_LOPROC + 0x2)
 
 /*
  * Extended Numbering
-- 
cgit v1.2.3-71-gd317


From 1ea2a07a532b0e22aabe7e8483f935c672b9e7ed Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Mon, 18 Apr 2022 08:49:50 +0800
Subject: iommu: Add DMA ownership management interfaces

Multiple devices may be placed in the same IOMMU group because they
cannot be isolated from each other. These devices must either be
entirely under kernel control or userspace control, never a mixture.

This adds dma ownership management in iommu core and exposes several
interfaces for the device drivers and the device userspace assignment
framework (i.e. VFIO), so that any conflict between user and kernel
controlled dma could be detected at the beginning.

The device driver oriented interfaces are,

	int iommu_device_use_default_domain(struct device *dev);
	void iommu_device_unuse_default_domain(struct device *dev);

By calling iommu_device_use_default_domain(), the device driver tells
the iommu layer that the device dma is handled through the kernel DMA
APIs. The iommu layer will manage the IOVA and use the default domain
for DMA address translation.

The device user-space assignment framework oriented interfaces are,

	int iommu_group_claim_dma_owner(struct iommu_group *group,
					void *owner);
	void iommu_group_release_dma_owner(struct iommu_group *group);
	bool iommu_group_dma_owner_claimed(struct iommu_group *group);

The device userspace assignment must be disallowed if the DMA owner
claiming interface returns failure.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/20220418005000.897664-2-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c | 153 +++++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/iommu.h |  31 ++++++++++
 2 files changed, 181 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index f2c45b85b9fc..eba8e8ccf19d 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -48,6 +48,8 @@ struct iommu_group {
 	struct iommu_domain *default_domain;
 	struct iommu_domain *domain;
 	struct list_head entry;
+	unsigned int owner_cnt;
+	void *owner;
 };
 
 struct group_device {
@@ -294,7 +296,11 @@ int iommu_probe_device(struct device *dev)
 	mutex_lock(&group->mutex);
 	iommu_alloc_default_domain(group, dev);
 
-	if (group->default_domain) {
+	/*
+	 * If device joined an existing group which has been claimed, don't
+	 * attach the default domain.
+	 */
+	if (group->default_domain && !group->owner) {
 		ret = __iommu_attach_device(group->default_domain, dev);
 		if (ret) {
 			mutex_unlock(&group->mutex);
@@ -2109,7 +2115,7 @@ static int __iommu_attach_group(struct iommu_domain *domain,
 {
 	int ret;
 
-	if (group->default_domain && group->domain != group->default_domain)
+	if (group->domain && group->domain != group->default_domain)
 		return -EBUSY;
 
 	ret = __iommu_group_for_each_dev(group, domain,
@@ -2146,7 +2152,11 @@ static void __iommu_detach_group(struct iommu_domain *domain,
 {
 	int ret;
 
-	if (!group->default_domain) {
+	/*
+	 * If the group has been claimed already, do not re-attach the default
+	 * domain.
+	 */
+	if (!group->default_domain || group->owner) {
 		__iommu_group_for_each_dev(group, domain,
 					   iommu_group_do_detach_device);
 		group->domain = NULL;
@@ -3095,3 +3105,140 @@ out:
 
 	return ret;
 }
+
+/**
+ * iommu_device_use_default_domain() - Device driver wants to handle device
+ *                                     DMA through the kernel DMA API.
+ * @dev: The device.
+ *
+ * The device driver about to bind @dev wants to do DMA through the kernel
+ * DMA API. Return 0 if it is allowed, otherwise an error.
+ */
+int iommu_device_use_default_domain(struct device *dev)
+{
+	struct iommu_group *group = iommu_group_get(dev);
+	int ret = 0;
+
+	if (!group)
+		return 0;
+
+	mutex_lock(&group->mutex);
+	if (group->owner_cnt) {
+		if (group->domain != group->default_domain ||
+		    group->owner) {
+			ret = -EBUSY;
+			goto unlock_out;
+		}
+	}
+
+	group->owner_cnt++;
+
+unlock_out:
+	mutex_unlock(&group->mutex);
+	iommu_group_put(group);
+
+	return ret;
+}
+
+/**
+ * iommu_device_unuse_default_domain() - Device driver stops handling device
+ *                                       DMA through the kernel DMA API.
+ * @dev: The device.
+ *
+ * The device driver doesn't want to do DMA through kernel DMA API anymore.
+ * It must be called after iommu_device_use_default_domain().
+ */
+void iommu_device_unuse_default_domain(struct device *dev)
+{
+	struct iommu_group *group = iommu_group_get(dev);
+
+	if (!group)
+		return;
+
+	mutex_lock(&group->mutex);
+	if (!WARN_ON(!group->owner_cnt))
+		group->owner_cnt--;
+
+	mutex_unlock(&group->mutex);
+	iommu_group_put(group);
+}
+
+/**
+ * iommu_group_claim_dma_owner() - Set DMA ownership of a group
+ * @group: The group.
+ * @owner: Caller specified pointer. Used for exclusive ownership.
+ *
+ * This is to support backward compatibility for vfio which manages
+ * the dma ownership in iommu_group level. New invocations on this
+ * interface should be prohibited.
+ */
+int iommu_group_claim_dma_owner(struct iommu_group *group, void *owner)
+{
+	int ret = 0;
+
+	mutex_lock(&group->mutex);
+	if (group->owner_cnt) {
+		ret = -EPERM;
+		goto unlock_out;
+	} else {
+		if (group->domain && group->domain != group->default_domain) {
+			ret = -EBUSY;
+			goto unlock_out;
+		}
+
+		group->owner = owner;
+		if (group->domain)
+			__iommu_detach_group(group->domain, group);
+	}
+
+	group->owner_cnt++;
+unlock_out:
+	mutex_unlock(&group->mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_group_claim_dma_owner);
+
+/**
+ * iommu_group_release_dma_owner() - Release DMA ownership of a group
+ * @group: The group.
+ *
+ * Release the DMA ownership claimed by iommu_group_claim_dma_owner().
+ */
+void iommu_group_release_dma_owner(struct iommu_group *group)
+{
+	mutex_lock(&group->mutex);
+	if (WARN_ON(!group->owner_cnt || !group->owner))
+		goto unlock_out;
+
+	group->owner_cnt = 0;
+	/*
+	 * The UNMANAGED domain should be detached before all USER
+	 * owners have been released.
+	 */
+	if (!WARN_ON(group->domain) && group->default_domain)
+		__iommu_attach_group(group->default_domain, group);
+	group->owner = NULL;
+unlock_out:
+	mutex_unlock(&group->mutex);
+}
+EXPORT_SYMBOL_GPL(iommu_group_release_dma_owner);
+
+/**
+ * iommu_group_dma_owner_claimed() - Query group dma ownership status
+ * @group: The group.
+ *
+ * This provides status query on a given group. It is racy and only for
+ * non-binding status reporting.
+ */
+bool iommu_group_dma_owner_claimed(struct iommu_group *group)
+{
+	unsigned int user;
+
+	mutex_lock(&group->mutex);
+	user = group->owner_cnt;
+	mutex_unlock(&group->mutex);
+
+	return user;
+}
+EXPORT_SYMBOL_GPL(iommu_group_dma_owner_claimed);
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 9208eca4b0d1..77972ef978b5 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -675,6 +675,13 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev,
 void iommu_sva_unbind_device(struct iommu_sva *handle);
 u32 iommu_sva_get_pasid(struct iommu_sva *handle);
 
+int iommu_device_use_default_domain(struct device *dev);
+void iommu_device_unuse_default_domain(struct device *dev);
+
+int iommu_group_claim_dma_owner(struct iommu_group *group, void *owner);
+void iommu_group_release_dma_owner(struct iommu_group *group);
+bool iommu_group_dma_owner_claimed(struct iommu_group *group);
+
 #else /* CONFIG_IOMMU_API */
 
 struct iommu_ops {};
@@ -1031,6 +1038,30 @@ static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev)
 {
 	return NULL;
 }
+
+static inline int iommu_device_use_default_domain(struct device *dev)
+{
+	return 0;
+}
+
+static inline void iommu_device_unuse_default_domain(struct device *dev)
+{
+}
+
+static inline int
+iommu_group_claim_dma_owner(struct iommu_group *group, void *owner)
+{
+	return -ENODEV;
+}
+
+static inline void iommu_group_release_dma_owner(struct iommu_group *group)
+{
+}
+
+static inline bool iommu_group_dma_owner_claimed(struct iommu_group *group)
+{
+	return false;
+}
 #endif /* CONFIG_IOMMU_API */
 
 /**
-- 
cgit v1.2.3-71-gd317


From 25f3bcfc54bcf7b0e45d140ec8bfbbf10ba11869 Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Mon, 18 Apr 2022 08:49:51 +0800
Subject: driver core: Add dma_cleanup callback in bus_type

The bus_type structure defines dma_configure() callback for bus drivers
to configure DMA on the devices. This adds the paired dma_cleanup()
callback and calls it during driver unbinding so that bus drivers can do
some cleanup work.

One use case for this paired DMA callbacks is for the bus driver to check
for DMA ownership conflicts during driver binding, where multiple devices
belonging to a same IOMMU group (the minimum granularity of isolation and
protection) may be assigned to kernel drivers or user space respectively.

Without this change, for example, the vfio driver has to listen to a bus
BOUND_DRIVER event and then BUG_ON() in case of dma ownership conflict.
This leads to bad user experience since careless driver binding operation
may crash the system if the admin overlooks the group restriction. Aside
from bad design, this leads to a security problem as a root user, even with
lockdown=integrity, can force the kernel to BUG.

With this change, the bus driver could check and set the DMA ownership in
driver binding process and fail on ownership conflicts. The DMA ownership
should be released during driver unbinding.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20220418005000.897664-3-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/base/dd.c          | 5 +++++
 include/linux/device/bus.h | 3 +++
 2 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/drivers/base/dd.c b/drivers/base/dd.c
index 3fc3b5940bb3..94b7ac9bf459 100644
--- a/drivers/base/dd.c
+++ b/drivers/base/dd.c
@@ -671,6 +671,8 @@ sysfs_failed:
 	if (dev->bus)
 		blocking_notifier_call_chain(&dev->bus->p->bus_notifier,
 					     BUS_NOTIFY_DRIVER_NOT_BOUND, dev);
+	if (dev->bus && dev->bus->dma_cleanup)
+		dev->bus->dma_cleanup(dev);
 pinctrl_bind_failed:
 	device_links_no_driver(dev);
 	device_unbind_cleanup(dev);
@@ -1199,6 +1201,9 @@ static void __device_release_driver(struct device *dev, struct device *parent)
 
 		device_remove(dev);
 
+		if (dev->bus && dev->bus->dma_cleanup)
+			dev->bus->dma_cleanup(dev);
+
 		device_links_driver_cleanup(dev);
 		device_unbind_cleanup(dev);
 
diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h
index a039ab809753..d8b29ccd07e5 100644
--- a/include/linux/device/bus.h
+++ b/include/linux/device/bus.h
@@ -59,6 +59,8 @@ struct fwnode_handle;
  *		bus supports.
  * @dma_configure:	Called to setup DMA configuration on a device on
  *			this bus.
+ * @dma_cleanup:	Called to cleanup DMA configuration on a device on
+ *			this bus.
  * @pm:		Power management operations of this bus, callback the specific
  *		device driver's pm-ops.
  * @iommu_ops:  IOMMU specific operations for this bus, used to attach IOMMU
@@ -103,6 +105,7 @@ struct bus_type {
 	int (*num_vf)(struct device *dev);
 
 	int (*dma_configure)(struct device *dev);
+	void (*dma_cleanup)(struct device *dev);
 
 	const struct dev_pm_ops *pm;
 
-- 
cgit v1.2.3-71-gd317


From 4a6d9dd564d0e7339fc15ecc5ce66db4ad842be2 Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Mon, 18 Apr 2022 08:49:52 +0800
Subject: amba: Stop sharing platform_dma_configure()

Stop sharing platform_dma_configure() helper as they are about to have
their own bus dma_configure callbacks.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20220418005000.897664-4-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/amba/bus.c              | 19 ++++++++++++++++++-
 drivers/base/platform.c         |  3 +--
 include/linux/platform_device.h |  2 --
 3 files changed, 19 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/amba/bus.c b/drivers/amba/bus.c
index d3bd14aaabf6..76b52bd2c2a4 100644
--- a/drivers/amba/bus.c
+++ b/drivers/amba/bus.c
@@ -20,6 +20,8 @@
 #include <linux/platform_device.h>
 #include <linux/reset.h>
 #include <linux/of_irq.h>
+#include <linux/of_device.h>
+#include <linux/acpi.h>
 
 #define to_amba_driver(d)	container_of(d, struct amba_driver, drv)
 
@@ -273,6 +275,21 @@ static void amba_shutdown(struct device *dev)
 		drv->shutdown(to_amba_device(dev));
 }
 
+static int amba_dma_configure(struct device *dev)
+{
+	enum dev_dma_attr attr;
+	int ret = 0;
+
+	if (dev->of_node) {
+		ret = of_dma_configure(dev, dev->of_node, true);
+	} else if (has_acpi_companion(dev)) {
+		attr = acpi_get_dma_attr(to_acpi_device_node(dev->fwnode));
+		ret = acpi_dma_configure(dev, attr);
+	}
+
+	return ret;
+}
+
 #ifdef CONFIG_PM
 /*
  * Hooks to provide runtime PM of the pclk (bus clock).  It is safe to
@@ -341,7 +358,7 @@ struct bus_type amba_bustype = {
 	.probe		= amba_probe,
 	.remove		= amba_remove,
 	.shutdown	= amba_shutdown,
-	.dma_configure	= platform_dma_configure,
+	.dma_configure	= amba_dma_configure,
 	.pm		= &amba_pm,
 };
 EXPORT_SYMBOL_GPL(amba_bustype);
diff --git a/drivers/base/platform.c b/drivers/base/platform.c
index 8cc272fd5c99..d7915734d931 100644
--- a/drivers/base/platform.c
+++ b/drivers/base/platform.c
@@ -1454,8 +1454,7 @@ static void platform_shutdown(struct device *_dev)
 		drv->shutdown(dev);
 }
 
-
-int platform_dma_configure(struct device *dev)
+static int platform_dma_configure(struct device *dev)
 {
 	enum dev_dma_attr attr;
 	int ret = 0;
diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h
index 7c96f169d274..17fde717df68 100644
--- a/include/linux/platform_device.h
+++ b/include/linux/platform_device.h
@@ -328,8 +328,6 @@ extern int platform_pm_restore(struct device *dev);
 #define platform_pm_restore		NULL
 #endif
 
-extern int platform_dma_configure(struct device *dev);
-
 #ifdef CONFIG_PM_SLEEP
 #define USE_PLATFORM_PM_SLEEP_OPS \
 	.suspend = platform_pm_suspend, \
-- 
cgit v1.2.3-71-gd317


From 512881eacfa72c2136b27b9934b7b27504a9efc2 Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Mon, 18 Apr 2022 08:49:53 +0800
Subject: bus: platform,amba,fsl-mc,PCI: Add device DMA ownership management

The devices on platform/amba/fsl-mc/PCI buses could be bound to drivers
with the device DMA managed by kernel drivers or user-space applications.
Unfortunately, multiple devices may be placed in the same IOMMU group
because they cannot be isolated from each other. The DMA on these devices
must either be entirely under kernel control or userspace control, never
a mixture. Otherwise the driver integrity is not guaranteed because they
could access each other through the peer-to-peer accesses which by-pass
the IOMMU protection.

This checks and sets the default DMA mode during driver binding, and
cleanups during driver unbinding. In the default mode, the device DMA is
managed by the device driver which handles DMA operations through the
kernel DMA APIs (see Documentation/core-api/dma-api.rst).

For cases where the devices are assigned for userspace control through the
userspace driver framework(i.e. VFIO), the drivers(for example, vfio_pci/
vfio_platfrom etc.) may set a new flag (driver_managed_dma) to skip this
default setting in the assumption that the drivers know what they are
doing with the device DMA.

Calling iommu_device_use_default_domain() before {of,acpi}_dma_configure
is currently a problem. As things stand, the IOMMU driver ignored the
initial iommu_probe_device() call when the device was added, since at
that point it had no fwspec yet. In this situation,
{of,acpi}_iommu_configure() are retriggering iommu_probe_device() after
the IOMMU driver has seen the firmware data via .of_xlate to learn that
it actually responsible for the given device. As the result, before
that gets fixed, iommu_use_default_domain() goes at the end, and calls
arch_teardown_dma_ops() if it fails.

Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Stuart Yoder <stuyoder@gmail.com>
Cc: Laurentiu Tudor <laurentiu.tudor@nxp.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Link: https://lore.kernel.org/r/20220418005000.897664-5-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/amba/bus.c              | 18 ++++++++++++++++++
 drivers/base/platform.c         | 18 ++++++++++++++++++
 drivers/bus/fsl-mc/fsl-mc-bus.c | 24 ++++++++++++++++++++++--
 drivers/pci/pci-driver.c        | 18 ++++++++++++++++++
 include/linux/amba/bus.h        |  8 ++++++++
 include/linux/fsl/mc.h          |  8 ++++++++
 include/linux/pci.h             |  8 ++++++++
 include/linux/platform_device.h |  8 ++++++++
 8 files changed, 108 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/amba/bus.c b/drivers/amba/bus.c
index 76b52bd2c2a4..a0ec61232b6c 100644
--- a/drivers/amba/bus.c
+++ b/drivers/amba/bus.c
@@ -22,6 +22,8 @@
 #include <linux/of_irq.h>
 #include <linux/of_device.h>
 #include <linux/acpi.h>
+#include <linux/iommu.h>
+#include <linux/dma-map-ops.h>
 
 #define to_amba_driver(d)	container_of(d, struct amba_driver, drv)
 
@@ -277,6 +279,7 @@ static void amba_shutdown(struct device *dev)
 
 static int amba_dma_configure(struct device *dev)
 {
+	struct amba_driver *drv = to_amba_driver(dev->driver);
 	enum dev_dma_attr attr;
 	int ret = 0;
 
@@ -287,9 +290,23 @@ static int amba_dma_configure(struct device *dev)
 		ret = acpi_dma_configure(dev, attr);
 	}
 
+	if (!ret && !drv->driver_managed_dma) {
+		ret = iommu_device_use_default_domain(dev);
+		if (ret)
+			arch_teardown_dma_ops(dev);
+	}
+
 	return ret;
 }
 
+static void amba_dma_cleanup(struct device *dev)
+{
+	struct amba_driver *drv = to_amba_driver(dev->driver);
+
+	if (!drv->driver_managed_dma)
+		iommu_device_unuse_default_domain(dev);
+}
+
 #ifdef CONFIG_PM
 /*
  * Hooks to provide runtime PM of the pclk (bus clock).  It is safe to
@@ -359,6 +376,7 @@ struct bus_type amba_bustype = {
 	.remove		= amba_remove,
 	.shutdown	= amba_shutdown,
 	.dma_configure	= amba_dma_configure,
+	.dma_cleanup	= amba_dma_cleanup,
 	.pm		= &amba_pm,
 };
 EXPORT_SYMBOL_GPL(amba_bustype);
diff --git a/drivers/base/platform.c b/drivers/base/platform.c
index d7915734d931..70bc30cf575c 100644
--- a/drivers/base/platform.c
+++ b/drivers/base/platform.c
@@ -30,6 +30,8 @@
 #include <linux/property.h>
 #include <linux/kmemleak.h>
 #include <linux/types.h>
+#include <linux/iommu.h>
+#include <linux/dma-map-ops.h>
 
 #include "base.h"
 #include "power/power.h"
@@ -1456,6 +1458,7 @@ static void platform_shutdown(struct device *_dev)
 
 static int platform_dma_configure(struct device *dev)
 {
+	struct platform_driver *drv = to_platform_driver(dev->driver);
 	enum dev_dma_attr attr;
 	int ret = 0;
 
@@ -1466,9 +1469,23 @@ static int platform_dma_configure(struct device *dev)
 		ret = acpi_dma_configure(dev, attr);
 	}
 
+	if (!ret && !drv->driver_managed_dma) {
+		ret = iommu_device_use_default_domain(dev);
+		if (ret)
+			arch_teardown_dma_ops(dev);
+	}
+
 	return ret;
 }
 
+static void platform_dma_cleanup(struct device *dev)
+{
+	struct platform_driver *drv = to_platform_driver(dev->driver);
+
+	if (!drv->driver_managed_dma)
+		iommu_device_unuse_default_domain(dev);
+}
+
 static const struct dev_pm_ops platform_dev_pm_ops = {
 	SET_RUNTIME_PM_OPS(pm_generic_runtime_suspend, pm_generic_runtime_resume, NULL)
 	USE_PLATFORM_PM_SLEEP_OPS
@@ -1483,6 +1500,7 @@ struct bus_type platform_bus_type = {
 	.remove		= platform_remove,
 	.shutdown	= platform_shutdown,
 	.dma_configure	= platform_dma_configure,
+	.dma_cleanup	= platform_dma_cleanup,
 	.pm		= &platform_dev_pm_ops,
 };
 EXPORT_SYMBOL_GPL(platform_bus_type);
diff --git a/drivers/bus/fsl-mc/fsl-mc-bus.c b/drivers/bus/fsl-mc/fsl-mc-bus.c
index 8fd4a356a86e..76648c4fdaf4 100644
--- a/drivers/bus/fsl-mc/fsl-mc-bus.c
+++ b/drivers/bus/fsl-mc/fsl-mc-bus.c
@@ -21,6 +21,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/acpi.h>
 #include <linux/iommu.h>
+#include <linux/dma-map-ops.h>
 
 #include "fsl-mc-private.h"
 
@@ -140,15 +141,33 @@ static int fsl_mc_dma_configure(struct device *dev)
 {
 	struct device *dma_dev = dev;
 	struct fsl_mc_device *mc_dev = to_fsl_mc_device(dev);
+	struct fsl_mc_driver *mc_drv = to_fsl_mc_driver(dev->driver);
 	u32 input_id = mc_dev->icid;
+	int ret;
 
 	while (dev_is_fsl_mc(dma_dev))
 		dma_dev = dma_dev->parent;
 
 	if (dev_of_node(dma_dev))
-		return of_dma_configure_id(dev, dma_dev->of_node, 0, &input_id);
+		ret = of_dma_configure_id(dev, dma_dev->of_node, 0, &input_id);
+	else
+		ret = acpi_dma_configure_id(dev, DEV_DMA_COHERENT, &input_id);
+
+	if (!ret && !mc_drv->driver_managed_dma) {
+		ret = iommu_device_use_default_domain(dev);
+		if (ret)
+			arch_teardown_dma_ops(dev);
+	}
+
+	return ret;
+}
+
+static void fsl_mc_dma_cleanup(struct device *dev)
+{
+	struct fsl_mc_driver *mc_drv = to_fsl_mc_driver(dev->driver);
 
-	return acpi_dma_configure_id(dev, DEV_DMA_COHERENT, &input_id);
+	if (!mc_drv->driver_managed_dma)
+		iommu_device_unuse_default_domain(dev);
 }
 
 static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
@@ -312,6 +331,7 @@ struct bus_type fsl_mc_bus_type = {
 	.match = fsl_mc_bus_match,
 	.uevent = fsl_mc_bus_uevent,
 	.dma_configure  = fsl_mc_dma_configure,
+	.dma_cleanup = fsl_mc_dma_cleanup,
 	.dev_groups = fsl_mc_dev_groups,
 	.bus_groups = fsl_mc_bus_groups,
 };
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 4ceeb75fc899..f83f7fbac68f 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -20,6 +20,7 @@
 #include <linux/of_device.h>
 #include <linux/acpi.h>
 #include <linux/dma-map-ops.h>
+#include <linux/iommu.h>
 #include "pci.h"
 #include "pcie/portdrv.h"
 
@@ -1601,6 +1602,7 @@ static int pci_bus_num_vf(struct device *dev)
  */
 static int pci_dma_configure(struct device *dev)
 {
+	struct pci_driver *driver = to_pci_driver(dev->driver);
 	struct device *bridge;
 	int ret = 0;
 
@@ -1616,9 +1618,24 @@ static int pci_dma_configure(struct device *dev)
 	}
 
 	pci_put_host_bridge_device(bridge);
+
+	if (!ret && !driver->driver_managed_dma) {
+		ret = iommu_device_use_default_domain(dev);
+		if (ret)
+			arch_teardown_dma_ops(dev);
+	}
+
 	return ret;
 }
 
+static void pci_dma_cleanup(struct device *dev)
+{
+	struct pci_driver *driver = to_pci_driver(dev->driver);
+
+	if (!driver->driver_managed_dma)
+		iommu_device_unuse_default_domain(dev);
+}
+
 struct bus_type pci_bus_type = {
 	.name		= "pci",
 	.match		= pci_bus_match,
@@ -1632,6 +1649,7 @@ struct bus_type pci_bus_type = {
 	.pm		= PCI_PM_OPS_PTR,
 	.num_vf		= pci_bus_num_vf,
 	.dma_configure	= pci_dma_configure,
+	.dma_cleanup	= pci_dma_cleanup,
 };
 EXPORT_SYMBOL(pci_bus_type);
 
diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h
index 6562f543c3e0..2ddce9bcd00e 100644
--- a/include/linux/amba/bus.h
+++ b/include/linux/amba/bus.h
@@ -79,6 +79,14 @@ struct amba_driver {
 	void			(*remove)(struct amba_device *);
 	void			(*shutdown)(struct amba_device *);
 	const struct amba_id	*id_table;
+	/*
+	 * For most device drivers, no need to care about this flag as long as
+	 * all DMAs are handled through the kernel DMA API. For some special
+	 * ones, for example VFIO drivers, they know how to manage the DMA
+	 * themselves and set this flag so that the IOMMU layer will allow them
+	 * to setup and manage their own I/O address space.
+	 */
+	bool driver_managed_dma;
 };
 
 /*
diff --git a/include/linux/fsl/mc.h b/include/linux/fsl/mc.h
index 7b6c42bfb660..27efef8affb1 100644
--- a/include/linux/fsl/mc.h
+++ b/include/linux/fsl/mc.h
@@ -32,6 +32,13 @@ struct fsl_mc_io;
  * @shutdown: Function called at shutdown time to quiesce the device
  * @suspend: Function called when a device is stopped
  * @resume: Function called when a device is resumed
+ * @driver_managed_dma: Device driver doesn't use kernel DMA API for DMA.
+ *		For most device drivers, no need to care about this flag
+ *		as long as all DMAs are handled through the kernel DMA API.
+ *		For some special ones, for example VFIO drivers, they know
+ *		how to manage the DMA themselves and set this flag so that
+ *		the IOMMU layer will allow them to setup and manage their
+ *		own I/O address space.
  *
  * Generic DPAA device driver object for device drivers that are registered
  * with a DPRC bus. This structure is to be embedded in each device-specific
@@ -45,6 +52,7 @@ struct fsl_mc_driver {
 	void (*shutdown)(struct fsl_mc_device *dev);
 	int (*suspend)(struct fsl_mc_device *dev, pm_message_t state);
 	int (*resume)(struct fsl_mc_device *dev);
+	bool driver_managed_dma;
 };
 
 #define to_fsl_mc_driver(_drv) \
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 60adf42460ab..b933d2b08d4d 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -895,6 +895,13 @@ struct module;
  *              created once it is bound to the driver.
  * @driver:	Driver model structure.
  * @dynids:	List of dynamically added device IDs.
+ * @driver_managed_dma: Device driver doesn't use kernel DMA API for DMA.
+ *		For most device drivers, no need to care about this flag
+ *		as long as all DMAs are handled through the kernel DMA API.
+ *		For some special ones, for example VFIO drivers, they know
+ *		how to manage the DMA themselves and set this flag so that
+ *		the IOMMU layer will allow them to setup and manage their
+ *		own I/O address space.
  */
 struct pci_driver {
 	struct list_head	node;
@@ -913,6 +920,7 @@ struct pci_driver {
 	const struct attribute_group **dev_groups;
 	struct device_driver	driver;
 	struct pci_dynids	dynids;
+	bool driver_managed_dma;
 };
 
 static inline struct pci_driver *to_pci_driver(struct device_driver *drv)
diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h
index 17fde717df68..b3d9c744f1e5 100644
--- a/include/linux/platform_device.h
+++ b/include/linux/platform_device.h
@@ -210,6 +210,14 @@ struct platform_driver {
 	struct device_driver driver;
 	const struct platform_device_id *id_table;
 	bool prevent_deferred_probe;
+	/*
+	 * For most device drivers, no need to care about this flag as long as
+	 * all DMAs are handled through the kernel DMA API. For some special
+	 * ones, for example VFIO drivers, they know how to manage the DMA
+	 * themselves and set this flag so that the IOMMU layer will allow them
+	 * to setup and manage their own I/O address space.
+	 */
+	bool driver_managed_dma;
 };
 
 #define to_platform_driver(drv)	(container_of((drv), struct platform_driver, \
-- 
cgit v1.2.3-71-gd317


From a5f1bd1afacd7b1e088f93f66af5453df0d8be9a Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Mon, 18 Apr 2022 08:50:00 +0800
Subject: iommu: Remove iommu group changes notifier

The iommu group changes notifer is not referenced in the tree. Remove it
to avoid dead code.

Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20220418005000.897664-12-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c | 75 ---------------------------------------------------
 include/linux/iommu.h | 23 ----------------
 2 files changed, 98 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index eba8e8ccf19d..0c42ece25854 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -18,7 +18,6 @@
 #include <linux/errno.h>
 #include <linux/iommu.h>
 #include <linux/idr.h>
-#include <linux/notifier.h>
 #include <linux/err.h>
 #include <linux/pci.h>
 #include <linux/bitops.h>
@@ -40,7 +39,6 @@ struct iommu_group {
 	struct kobject *devices_kobj;
 	struct list_head devices;
 	struct mutex mutex;
-	struct blocking_notifier_head notifier;
 	void *iommu_data;
 	void (*iommu_data_release)(void *iommu_data);
 	char *name;
@@ -632,7 +630,6 @@ struct iommu_group *iommu_group_alloc(void)
 	mutex_init(&group->mutex);
 	INIT_LIST_HEAD(&group->devices);
 	INIT_LIST_HEAD(&group->entry);
-	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
 
 	ret = ida_simple_get(&iommu_group_ida, 0, 0, GFP_KERNEL);
 	if (ret < 0) {
@@ -905,10 +902,6 @@ rename:
 	if (ret)
 		goto err_put_group;
 
-	/* Notify any listeners about change to group. */
-	blocking_notifier_call_chain(&group->notifier,
-				     IOMMU_GROUP_NOTIFY_ADD_DEVICE, dev);
-
 	trace_add_device_to_group(group->id, dev);
 
 	dev_info(dev, "Adding to iommu group %d\n", group->id);
@@ -950,10 +943,6 @@ void iommu_group_remove_device(struct device *dev)
 
 	dev_info(dev, "Removing from iommu group %d\n", group->id);
 
-	/* Pre-notify listeners that a device is being removed. */
-	blocking_notifier_call_chain(&group->notifier,
-				     IOMMU_GROUP_NOTIFY_DEL_DEVICE, dev);
-
 	mutex_lock(&group->mutex);
 	list_for_each_entry(tmp_device, &group->devices, list) {
 		if (tmp_device->dev == dev) {
@@ -1075,36 +1064,6 @@ void iommu_group_put(struct iommu_group *group)
 }
 EXPORT_SYMBOL_GPL(iommu_group_put);
 
-/**
- * iommu_group_register_notifier - Register a notifier for group changes
- * @group: the group to watch
- * @nb: notifier block to signal
- *
- * This function allows iommu group users to track changes in a group.
- * See include/linux/iommu.h for actions sent via this notifier.  Caller
- * should hold a reference to the group throughout notifier registration.
- */
-int iommu_group_register_notifier(struct iommu_group *group,
-				  struct notifier_block *nb)
-{
-	return blocking_notifier_chain_register(&group->notifier, nb);
-}
-EXPORT_SYMBOL_GPL(iommu_group_register_notifier);
-
-/**
- * iommu_group_unregister_notifier - Unregister a notifier
- * @group: the group to watch
- * @nb: notifier block to signal
- *
- * Unregister a previously registered group notifier block.
- */
-int iommu_group_unregister_notifier(struct iommu_group *group,
-				    struct notifier_block *nb)
-{
-	return blocking_notifier_chain_unregister(&group->notifier, nb);
-}
-EXPORT_SYMBOL_GPL(iommu_group_unregister_notifier);
-
 /**
  * iommu_register_device_fault_handler() - Register a device fault handler
  * @dev: the device
@@ -1650,14 +1609,8 @@ static int remove_iommu_group(struct device *dev, void *data)
 static int iommu_bus_notifier(struct notifier_block *nb,
 			      unsigned long action, void *data)
 {
-	unsigned long group_action = 0;
 	struct device *dev = data;
-	struct iommu_group *group;
 
-	/*
-	 * ADD/DEL call into iommu driver ops if provided, which may
-	 * result in ADD/DEL notifiers to group->notifier
-	 */
 	if (action == BUS_NOTIFY_ADD_DEVICE) {
 		int ret;
 
@@ -1668,34 +1621,6 @@ static int iommu_bus_notifier(struct notifier_block *nb,
 		return NOTIFY_OK;
 	}
 
-	/*
-	 * Remaining BUS_NOTIFYs get filtered and republished to the
-	 * group, if anyone is listening
-	 */
-	group = iommu_group_get(dev);
-	if (!group)
-		return 0;
-
-	switch (action) {
-	case BUS_NOTIFY_BIND_DRIVER:
-		group_action = IOMMU_GROUP_NOTIFY_BIND_DRIVER;
-		break;
-	case BUS_NOTIFY_BOUND_DRIVER:
-		group_action = IOMMU_GROUP_NOTIFY_BOUND_DRIVER;
-		break;
-	case BUS_NOTIFY_UNBIND_DRIVER:
-		group_action = IOMMU_GROUP_NOTIFY_UNBIND_DRIVER;
-		break;
-	case BUS_NOTIFY_UNBOUND_DRIVER:
-		group_action = IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER;
-		break;
-	}
-
-	if (group_action)
-		blocking_notifier_call_chain(&group->notifier,
-					     group_action, dev);
-
-	iommu_group_put(group);
 	return 0;
 }
 
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 77972ef978b5..6ef2df258673 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -407,13 +407,6 @@ static inline const struct iommu_ops *dev_iommu_ops(struct device *dev)
 	return dev->iommu->iommu_dev->ops;
 }
 
-#define IOMMU_GROUP_NOTIFY_ADD_DEVICE		1 /* Device added */
-#define IOMMU_GROUP_NOTIFY_DEL_DEVICE		2 /* Pre Device removed */
-#define IOMMU_GROUP_NOTIFY_BIND_DRIVER		3 /* Pre Driver bind */
-#define IOMMU_GROUP_NOTIFY_BOUND_DRIVER		4 /* Post Driver bind */
-#define IOMMU_GROUP_NOTIFY_UNBIND_DRIVER	5 /* Pre Driver unbind */
-#define IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER	6 /* Post Driver unbind */
-
 extern int bus_set_iommu(struct bus_type *bus, const struct iommu_ops *ops);
 extern int bus_iommu_probe(struct bus_type *bus);
 extern bool iommu_present(struct bus_type *bus);
@@ -478,10 +471,6 @@ extern int iommu_group_for_each_dev(struct iommu_group *group, void *data,
 extern struct iommu_group *iommu_group_get(struct device *dev);
 extern struct iommu_group *iommu_group_ref_get(struct iommu_group *group);
 extern void iommu_group_put(struct iommu_group *group);
-extern int iommu_group_register_notifier(struct iommu_group *group,
-					 struct notifier_block *nb);
-extern int iommu_group_unregister_notifier(struct iommu_group *group,
-					   struct notifier_block *nb);
 extern int iommu_register_device_fault_handler(struct device *dev,
 					iommu_dev_fault_handler_t handler,
 					void *data);
@@ -878,18 +867,6 @@ static inline void iommu_group_put(struct iommu_group *group)
 {
 }
 
-static inline int iommu_group_register_notifier(struct iommu_group *group,
-						struct notifier_block *nb)
-{
-	return -ENODEV;
-}
-
-static inline int iommu_group_unregister_notifier(struct iommu_group *group,
-						  struct notifier_block *nb)
-{
-	return 0;
-}
-
 static inline
 int iommu_register_device_fault_handler(struct device *dev,
 					iommu_dev_fault_handler_t handler,
-- 
cgit v1.2.3-71-gd317


From 6043257b1de069fbb5a2a52d7211c0275bc8c0e0 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Mon, 11 Apr 2022 12:16:05 -0300
Subject: iommu: Introduce the domain op enforce_cache_coherency()

This new mechanism will replace using IOMMU_CAP_CACHE_COHERENCY and
IOMMU_CACHE to control the no-snoop blocking behavior of the IOMMU.

Currently only Intel and AMD IOMMUs are known to support this
feature. They both implement it as an IOPTE bit, that when set, will cause
PCIe TLPs to that IOVA with the no-snoop bit set to be treated as though
the no-snoop bit was clear.

The new API is triggered by calling enforce_cache_coherency() before
mapping any IOVA to the domain which globally switches on no-snoop
blocking. This allows other implementations that might block no-snoop
globally and outside the IOPTE - AMD also documents such a HW capability.

Leave AMD out of sync with Intel and have it block no-snoop even for
in-kernel users. This can be trivially resolved in a follow up patch.

Only VFIO needs to call this API because it does not have detailed control
over the device to avoid requesting no-snoop behavior at the device
level. Other places using domains with real kernel drivers should simply
avoid asking their devices to set the no-snoop bit.

Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Acked-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/1-v3-2cf356649677+a32-intel_no_snoop_jgg@nvidia.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/amd/iommu.c   |  7 +++++++
 drivers/iommu/intel/iommu.c | 14 +++++++++++++-
 include/linux/intel-iommu.h |  1 +
 include/linux/iommu.h       |  4 ++++
 4 files changed, 25 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 038e104b922c..840831d5d2ad 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -2266,6 +2266,12 @@ static int amd_iommu_def_domain_type(struct device *dev)
 	return 0;
 }
 
+static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain)
+{
+	/* IOMMU_PTE_FC is always set */
+	return true;
+}
+
 const struct iommu_ops amd_iommu_ops = {
 	.capable = amd_iommu_capable,
 	.domain_alloc = amd_iommu_domain_alloc,
@@ -2288,6 +2294,7 @@ const struct iommu_ops amd_iommu_ops = {
 		.flush_iotlb_all = amd_iommu_flush_iotlb_all,
 		.iotlb_sync	= amd_iommu_iotlb_sync,
 		.free		= amd_iommu_domain_free,
+		.enforce_cache_coherency = amd_iommu_enforce_cache_coherency,
 	}
 };
 
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 0edf6084dc14..161199f62270 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -4422,7 +4422,8 @@ static int intel_iommu_map(struct iommu_domain *domain,
 		prot |= DMA_PTE_READ;
 	if (iommu_prot & IOMMU_WRITE)
 		prot |= DMA_PTE_WRITE;
-	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
+	if (((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping) ||
+	    dmar_domain->force_snooping)
 		prot |= DMA_PTE_SNP;
 
 	max_addr = iova + size;
@@ -4545,6 +4546,16 @@ static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
 	return phys;
 }
 
+static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
+{
+	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+
+	if (!dmar_domain->iommu_snooping)
+		return false;
+	dmar_domain->force_snooping = true;
+	return true;
+}
+
 static bool intel_iommu_capable(enum iommu_cap cap)
 {
 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
@@ -4900,6 +4911,7 @@ const struct iommu_ops intel_iommu_ops = {
 		.iotlb_sync		= intel_iommu_tlb_sync,
 		.iova_to_phys		= intel_iommu_iova_to_phys,
 		.free			= intel_iommu_domain_free,
+		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
 	}
 };
 
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 2f9891cb3d00..4c2baf2446c2 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -540,6 +540,7 @@ struct dmar_domain {
 	u8 has_iotlb_device: 1;
 	u8 iommu_coherency: 1;		/* indicate coherency of iommu access */
 	u8 iommu_snooping: 1;		/* indicate snooping control feature */
+	u8 force_snooping : 1;		/* Create IOPTEs with snoop control */
 
 	struct list_head devices;	/* all devices' list */
 	struct iova_domain iovad;	/* iova's that belong to this domain */
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 4123693ae319..c7ad6b10e261 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -274,6 +274,9 @@ struct iommu_ops {
  * @iotlb_sync: Flush all queued ranges from the hardware TLBs and empty flush
  *            queue
  * @iova_to_phys: translate iova to physical address
+ * @enforce_cache_coherency: Prevent any kind of DMA from bypassing IOMMU_CACHE,
+ *                           including no-snoop TLPs on PCIe or other platform
+ *                           specific mechanisms.
  * @enable_nesting: Enable nesting
  * @set_pgtable_quirks: Set io page table quirks (IO_PGTABLE_QUIRK_*)
  * @free: Release the domain after use.
@@ -302,6 +305,7 @@ struct iommu_domain_ops {
 	phys_addr_t (*iova_to_phys)(struct iommu_domain *domain,
 				    dma_addr_t iova);
 
+	bool (*enforce_cache_coherency)(struct iommu_domain *domain);
 	int (*enable_nesting)(struct iommu_domain *domain);
 	int (*set_pgtable_quirks)(struct iommu_domain *domain,
 				  unsigned long quirks);
-- 
cgit v1.2.3-71-gd317


From 71cfafda9c9bd9812cdb62ddb94daf65a1af12c1 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Mon, 11 Apr 2022 12:16:06 -0300
Subject: vfio: Move the Intel no-snoop control off of IOMMU_CACHE

IOMMU_CACHE means "normal DMA to this iommu_domain's IOVA should be cache
coherent" and is used by the DMA API. The definition allows for special
non-coherent DMA to exist - ie processing of the no-snoop flag in PCIe
TLPs - so long as this behavior is opt-in by the device driver.

The flag is mainly used by the DMA API to synchronize the IOMMU setting
with the expected cache behavior of the DMA master. eg based on
dev_is_dma_coherent() in some case.

For Intel IOMMU IOMMU_CACHE was redefined to mean 'force all DMA to be
cache coherent' which has the practical effect of causing the IOMMU to
ignore the no-snoop bit in a PCIe TLP.

x86 platforms are always IOMMU_CACHE, so Intel should ignore this flag.

Instead use the new domain op enforce_cache_coherency() which causes every
IOPTE created in the domain to have the no-snoop blocking behavior.

Reconfigure VFIO to always use IOMMU_CACHE and call
enforce_cache_coherency() to operate the special Intel behavior.

Remove the IOMMU_CACHE test from Intel IOMMU.

Ultimately VFIO plumbs the result of enforce_cache_coherency() back into
the x86 platform code through kvm_arch_register_noncoherent_dma() which
controls if the WBINVD instruction is available in the guest.  No other
archs implement kvm_arch_register_noncoherent_dma() nor are there any
other known consumers of VFIO_DMA_CC_IOMMU that might be affected by the
user visible result change on non-x86 archs.

Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Acked-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/2-v3-2cf356649677+a32-intel_no_snoop_jgg@nvidia.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel/iommu.c     |  7 ++-----
 drivers/vfio/vfio_iommu_type1.c | 30 +++++++++++++++++++-----------
 include/linux/intel-iommu.h     |  1 -
 3 files changed, 21 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 161199f62270..38441cb06c8c 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -641,7 +641,6 @@ static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
 static void domain_update_iommu_cap(struct dmar_domain *domain)
 {
 	domain_update_iommu_coherency(domain);
-	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
 
 	/*
@@ -4283,7 +4282,6 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width)
 	domain->agaw = width_to_agaw(adjust_width);
 
 	domain->iommu_coherency = false;
-	domain->iommu_snooping = false;
 	domain->iommu_superpage = 0;
 	domain->max_addr = 0;
 
@@ -4422,8 +4420,7 @@ static int intel_iommu_map(struct iommu_domain *domain,
 		prot |= DMA_PTE_READ;
 	if (iommu_prot & IOMMU_WRITE)
 		prot |= DMA_PTE_WRITE;
-	if (((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping) ||
-	    dmar_domain->force_snooping)
+	if (dmar_domain->force_snooping)
 		prot |= DMA_PTE_SNP;
 
 	max_addr = iova + size;
@@ -4550,7 +4547,7 @@ static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
 {
 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
 
-	if (!dmar_domain->iommu_snooping)
+	if (!domain_update_iommu_snooping(NULL))
 		return false;
 	dmar_domain->force_snooping = true;
 	return true;
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 9394aa9444c1..c13b9290e357 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -84,8 +84,8 @@ struct vfio_domain {
 	struct iommu_domain	*domain;
 	struct list_head	next;
 	struct list_head	group_list;
-	int			prot;		/* IOMMU_CACHE */
-	bool			fgsp;		/* Fine-grained super pages */
+	bool			fgsp : 1;	/* Fine-grained super pages */
+	bool			enforce_cache_coherency : 1;
 };
 
 struct vfio_dma {
@@ -1461,7 +1461,7 @@ static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
 
 	list_for_each_entry(d, &iommu->domain_list, next) {
 		ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
-				npage << PAGE_SHIFT, prot | d->prot);
+				npage << PAGE_SHIFT, prot | IOMMU_CACHE);
 		if (ret)
 			goto unwind;
 
@@ -1771,7 +1771,7 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
 			}
 
 			ret = iommu_map(domain->domain, iova, phys,
-					size, dma->prot | domain->prot);
+					size, dma->prot | IOMMU_CACHE);
 			if (ret) {
 				if (!dma->iommu_mapped) {
 					vfio_unpin_pages_remote(dma, iova,
@@ -1859,7 +1859,7 @@ static void vfio_test_domain_fgsp(struct vfio_domain *domain)
 		return;
 
 	ret = iommu_map(domain->domain, 0, page_to_phys(pages), PAGE_SIZE * 2,
-			IOMMU_READ | IOMMU_WRITE | domain->prot);
+			IOMMU_READ | IOMMU_WRITE | IOMMU_CACHE);
 	if (!ret) {
 		size_t unmapped = iommu_unmap(domain->domain, 0, PAGE_SIZE);
 
@@ -2267,8 +2267,15 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 		goto out_detach;
 	}
 
-	if (iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY))
-		domain->prot |= IOMMU_CACHE;
+	/*
+	 * If the IOMMU can block non-coherent operations (ie PCIe TLPs with
+	 * no-snoop set) then VFIO always turns this feature on because on Intel
+	 * platforms it optimizes KVM to disable wbinvd emulation.
+	 */
+	if (domain->domain->ops->enforce_cache_coherency)
+		domain->enforce_cache_coherency =
+			domain->domain->ops->enforce_cache_coherency(
+				domain->domain);
 
 	/*
 	 * Try to match an existing compatible domain.  We don't want to
@@ -2279,7 +2286,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 	 */
 	list_for_each_entry(d, &iommu->domain_list, next) {
 		if (d->domain->ops == domain->domain->ops &&
-		    d->prot == domain->prot) {
+		    d->enforce_cache_coherency ==
+			    domain->enforce_cache_coherency) {
 			iommu_detach_group(domain->domain, group->iommu_group);
 			if (!iommu_attach_group(d->domain,
 						group->iommu_group)) {
@@ -2611,14 +2619,14 @@ static void vfio_iommu_type1_release(void *iommu_data)
 	kfree(iommu);
 }
 
-static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
+static int vfio_domains_have_enforce_cache_coherency(struct vfio_iommu *iommu)
 {
 	struct vfio_domain *domain;
 	int ret = 1;
 
 	mutex_lock(&iommu->lock);
 	list_for_each_entry(domain, &iommu->domain_list, next) {
-		if (!(domain->prot & IOMMU_CACHE)) {
+		if (!(domain->enforce_cache_coherency)) {
 			ret = 0;
 			break;
 		}
@@ -2641,7 +2649,7 @@ static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
 	case VFIO_DMA_CC_IOMMU:
 		if (!iommu)
 			return 0;
-		return vfio_domains_have_iommu_cache(iommu);
+		return vfio_domains_have_enforce_cache_coherency(iommu);
 	default:
 		return 0;
 	}
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 4c2baf2446c2..72e5d7900e71 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -539,7 +539,6 @@ struct dmar_domain {
 
 	u8 has_iotlb_device: 1;
 	u8 iommu_coherency: 1;		/* indicate coherency of iommu access */
-	u8 iommu_snooping: 1;		/* indicate snooping control feature */
 	u8 force_snooping : 1;		/* Create IOPTEs with snoop control */
 
 	struct list_head devices;	/* all devices' list */
-- 
cgit v1.2.3-71-gd317


From f78dc1dad829e505d83e33dc0879887f074c52e1 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Mon, 11 Apr 2022 12:16:07 -0300
Subject: iommu: Redefine IOMMU_CAP_CACHE_COHERENCY as the cap flag for
 IOMMU_CACHE

While the comment was correct that this flag was intended to convey the
block no-snoop support in the IOMMU, it has become widely implemented and
used to mean the IOMMU supports IOMMU_CACHE as a map flag. Only the Intel
driver was different.

Now that the Intel driver is using enforce_cache_coherency() update the
comment to make it clear that IOMMU_CAP_CACHE_COHERENCY is only about
IOMMU_CACHE.  Fix the Intel driver to return true since IOMMU_CACHE always
works.

The two places that test this flag, usnic and vdpa, are both assigning
userspace pages to a driver controlled iommu_domain and require
IOMMU_CACHE behavior as they offer no way for userspace to synchronize
caches.

Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Acked-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/3-v3-2cf356649677+a32-intel_no_snoop_jgg@nvidia.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel/iommu.c | 2 +-
 include/linux/iommu.h       | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 38441cb06c8c..efcecfa5952a 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -4556,7 +4556,7 @@ static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
 static bool intel_iommu_capable(enum iommu_cap cap)
 {
 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
-		return domain_update_iommu_snooping(NULL);
+		return true;
 	if (cap == IOMMU_CAP_INTR_REMAP)
 		return irq_remapping_enabled == 1;
 	if (cap == IOMMU_CAP_PRE_BOOT_PROTECTION)
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index c7ad6b10e261..575ab27ede5b 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -103,8 +103,7 @@ static inline bool iommu_is_dma_domain(struct iommu_domain *domain)
 }
 
 enum iommu_cap {
-	IOMMU_CAP_CACHE_COHERENCY,	/* IOMMU can enforce cache coherent DMA
-					   transactions */
+	IOMMU_CAP_CACHE_COHERENCY,	/* IOMMU_CACHE is supported */
 	IOMMU_CAP_INTR_REMAP,		/* IOMMU supports interrupt isolation */
 	IOMMU_CAP_NOEXEC,		/* IOMMU_NOEXEC flag */
 	IOMMU_CAP_PRE_BOOT_PROTECTION,	/* Firmware says it used the IOMMU for
-- 
cgit v1.2.3-71-gd317


From 892de36fd4a98fab3298d417c051d9099af5448d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Fri, 29 Apr 2022 12:22:10 -0400
Subject: SUNRPC: Ensure gss-proxy connects on setup

For reasons best known to the author, gss-proxy does not implement a
NULL procedure, and returns RPC_PROC_UNAVAIL. However we still want to
ensure that we connect to the service at setup time.
So add a quirk-flag specially for this case.

Fixes: 1d658336b05f ("SUNRPC: Add RPC based upcall mechanism for RPCGSS auth")
Cc: stable@vger.kernel.org
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/clnt.h          | 1 +
 net/sunrpc/auth_gss/gss_rpc_upcall.c | 2 +-
 net/sunrpc/clnt.c                    | 3 +++
 3 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 267b7aeaf1a6..db5149567305 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -160,6 +160,7 @@ struct rpc_add_xprt_test {
 #define RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT	(1UL << 9)
 #define RPC_CLNT_CREATE_SOFTERR		(1UL << 10)
 #define RPC_CLNT_CREATE_REUSEPORT	(1UL << 11)
+#define RPC_CLNT_CREATE_IGNORE_NULL_UNAVAIL (1UL << 12)
 
 struct rpc_clnt *rpc_create(struct rpc_create_args *args);
 struct rpc_clnt	*rpc_bind_new_program(struct rpc_clnt *,
diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c
index 61c276bddaf2..8ca1d809b78d 100644
--- a/net/sunrpc/auth_gss/gss_rpc_upcall.c
+++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c
@@ -97,7 +97,7 @@ static int gssp_rpc_create(struct net *net, struct rpc_clnt **_clnt)
 		 * timeout, which would result in reconnections being
 		 * done without the correct namespace:
 		 */
-		.flags		= RPC_CLNT_CREATE_NOPING |
+		.flags		= RPC_CLNT_CREATE_IGNORE_NULL_UNAVAIL |
 				  RPC_CLNT_CREATE_NO_IDLE_TIMEOUT
 	};
 	struct rpc_clnt *clnt;
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 98133aa54f19..22c28cf43eba 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -479,6 +479,9 @@ static struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args,
 
 	if (!(args->flags & RPC_CLNT_CREATE_NOPING)) {
 		int err = rpc_ping(clnt);
+		if ((args->flags & RPC_CLNT_CREATE_IGNORE_NULL_UNAVAIL) &&
+		    err == -EOPNOTSUPP)
+			err = 0;
 		if (err != 0) {
 			rpc_shutdown_client(clnt);
 			return ERR_PTR(err);
-- 
cgit v1.2.3-71-gd317


From d495f942f40aa412f8d4d65951152648cfa09903 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 22 Apr 2022 12:30:13 +0200
Subject: KVM: fix bad user ABI for KVM_EXIT_SYSTEM_EVENT

When KVM_EXIT_SYSTEM_EVENT was introduced, it included a flags
member that at the time was unused.  Unfortunately this extensibility
mechanism has several issues:

- x86 is not writing the member, so it would not be possible to use it
  on x86 except for new events

- the member is not aligned to 64 bits, so the definition of the
  uAPI struct is incorrect for 32- on 64-bit userspace.  This is a
  problem for RISC-V, which supports CONFIG_KVM_COMPAT, but fortunately
  usage of flags was only introduced in 5.18.

Since padding has to be introduced, place a new field in there
that tells if the flags field is valid.  To allow further extensibility,
in fact, change flags to an array of 16 values, and store how many
of the values are valid.  The availability of the new ndata field
is tied to a system capability; all architectures are changed to
fill in the field.

To avoid breaking compilation of userspace that was using the flags
field, provide a userspace-only union to overlap flags with data[0].
The new field is placed at the same offset for both 32- and 64-bit
userspace.

Cc: Will Deacon <will@kernel.org>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Peter Gonda <pgonda@google.com>
Cc: Sean Christopherson <seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reported-by: kernel test robot <lkp@intel.com>
Message-Id: <20220422103013.34832-1-pbonzini@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/api.rst | 24 +++++++++++++++++-------
 arch/arm64/kvm/psci.c          |  3 ++-
 arch/riscv/kvm/vcpu_sbi.c      |  5 +++--
 arch/x86/kvm/x86.c             |  2 ++
 include/uapi/linux/kvm.h       | 10 +++++++++-
 virt/kvm/kvm_main.c            |  1 +
 6 files changed, 34 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 85c7abc51af5..4a900cdbc62e 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -5986,16 +5986,16 @@ should put the acknowledged interrupt vector into the 'epr' field.
   #define KVM_SYSTEM_EVENT_RESET          2
   #define KVM_SYSTEM_EVENT_CRASH          3
 			__u32 type;
-			__u64 flags;
+                        __u32 ndata;
+                        __u64 data[16];
 		} system_event;
 
 If exit_reason is KVM_EXIT_SYSTEM_EVENT then the vcpu has triggered
 a system-level event using some architecture specific mechanism (hypercall
 or some special instruction). In case of ARM64, this is triggered using
-HVC instruction based PSCI call from the vcpu. The 'type' field describes
-the system-level event type. The 'flags' field describes architecture
-specific flags for the system-level event.
+HVC instruction based PSCI call from the vcpu.
 
+The 'type' field describes the system-level event type.
 Valid values for 'type' are:
 
  - KVM_SYSTEM_EVENT_SHUTDOWN -- the guest has requested a shutdown of the
@@ -6010,10 +6010,20 @@ Valid values for 'type' are:
    to ignore the request, or to gather VM memory core dump and/or
    reset/shutdown of the VM.
 
-Valid flags are:
+If KVM_CAP_SYSTEM_EVENT_DATA is present, the 'data' field can contain
+architecture specific information for the system-level event.  Only
+the first `ndata` items (possibly zero) of the data array are valid.
 
- - KVM_SYSTEM_EVENT_RESET_FLAG_PSCI_RESET2 (arm64 only) -- the guest issued
-   a SYSTEM_RESET2 call according to v1.1 of the PSCI specification.
+ - for arm64, data[0] is set to KVM_SYSTEM_EVENT_RESET_FLAG_PSCI_RESET2 if
+   the guest issued a SYSTEM_RESET2 call according to v1.1 of the PSCI
+   specification.
+
+ - for RISC-V, data[0] is set to the value of the second argument of the
+   ``sbi_system_reset`` call.
+
+Previous versions of Linux defined a `flags` member in this struct.  The
+field is now aliased to `data[0]`.  Userspace can assume that it is only
+written if ndata is greater than 0.
 
 ::
 
diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c
index baac2b405f23..708d80e8e60d 100644
--- a/arch/arm64/kvm/psci.c
+++ b/arch/arm64/kvm/psci.c
@@ -181,7 +181,8 @@ static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type, u64 flags)
 
 	memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
 	vcpu->run->system_event.type = type;
-	vcpu->run->system_event.flags = flags;
+	vcpu->run->system_event.ndata = 1;
+	vcpu->run->system_event.data[0] = flags;
 	vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
 }
 
diff --git a/arch/riscv/kvm/vcpu_sbi.c b/arch/riscv/kvm/vcpu_sbi.c
index a09ecb97b890..d45e7da3f0d3 100644
--- a/arch/riscv/kvm/vcpu_sbi.c
+++ b/arch/riscv/kvm/vcpu_sbi.c
@@ -83,7 +83,7 @@ void kvm_riscv_vcpu_sbi_forward(struct kvm_vcpu *vcpu, struct kvm_run *run)
 
 void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu,
 				     struct kvm_run *run,
-				     u32 type, u64 flags)
+				     u32 type, u64 reason)
 {
 	unsigned long i;
 	struct kvm_vcpu *tmp;
@@ -94,7 +94,8 @@ void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu,
 
 	memset(&run->system_event, 0, sizeof(run->system_event));
 	run->system_event.type = type;
-	run->system_event.flags = flags;
+	run->system_event.ndata = 1;
+	run->system_event.data[0] = reason;
 	run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 43174a8d9497..07d789b1d366 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10015,12 +10015,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) {
 			vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
 			vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH;
+			vcpu->run->system_event.ndata = 0;
 			r = 0;
 			goto out;
 		}
 		if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
 			vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
 			vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
+			vcpu->run->system_event.ndata = 0;
 			r = 0;
 			goto out;
 		}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 91a6fe4e02c0..6a184d260c7f 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -445,7 +445,13 @@ struct kvm_run {
 #define KVM_SYSTEM_EVENT_RESET          2
 #define KVM_SYSTEM_EVENT_CRASH          3
 			__u32 type;
-			__u64 flags;
+			__u32 ndata;
+			union {
+#ifndef __KERNEL__
+				__u64 flags;
+#endif
+				__u64 data[16];
+			};
 		} system_event;
 		/* KVM_EXIT_S390_STSI */
 		struct {
@@ -1144,6 +1150,8 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_S390_MEM_OP_EXTENSION 211
 #define KVM_CAP_PMU_CAPABILITY 212
 #define KVM_CAP_DISABLE_QUIRKS2 213
+/* #define KVM_CAP_VM_TSC_CONTROL 214 */
+#define KVM_CAP_SYSTEM_EVENT_DATA 215
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index dfb7dabdbc63..ac57fc2c935f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4333,6 +4333,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 		return 0;
 #endif
 	case KVM_CAP_BINARY_STATS_FD:
+	case KVM_CAP_SYSTEM_EVENT_DATA:
 		return 1;
 	default:
 		break;
-- 
cgit v1.2.3-71-gd317


From 47f753c1108e287edb3e27fad8a7511a9d55578e Mon Sep 17 00:00:00 2001
From: Tan Tee Min <tee.min.tan@linux.intel.com>
Date: Fri, 29 Apr 2022 19:58:07 +0800
Subject: net: stmmac: disable Split Header (SPH) for Intel platforms

Based on DesignWare Ethernet QoS datasheet, we are seeing the limitation
of Split Header (SPH) feature is not supported for Ipv4 fragmented packet.
This SPH limitation will cause ping failure when the packets size exceed
the MTU size. For example, the issue happens once the basic ping packet
size is larger than the configured MTU size and the data is lost inside
the fragmented packet, replaced by zeros/corrupted values, and leads to
ping fail.

So, disable the Split Header for Intel platforms.

v2: Add fixes tag in commit message.

Fixes: 67afd6d1cfdf("net: stmmac: Add Split Header support and enable it in XGMAC cores")
Cc: <stable@vger.kernel.org> # 5.10.x
Suggested-by: Ong, Boon Leong <boon.leong.ong@intel.com>
Signed-off-by: Mohammad Athari Bin Ismail <mohammad.athari.ismail@intel.com>
Signed-off-by: Wong Vee Khee <vee.khee.wong@linux.intel.com>
Signed-off-by: Tan Tee Min <tee.min.tan@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 1 +
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 2 +-
 include/linux/stmmac.h                            | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
index 63754a9c4ba7..0b0be0898ac5 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
@@ -454,6 +454,7 @@ static int intel_mgbe_common_data(struct pci_dev *pdev,
 	plat->has_gmac4 = 1;
 	plat->force_sf_dma_mode = 0;
 	plat->tso_en = 1;
+	plat->sph_disable = 1;
 
 	/* Multiplying factor to the clk_eee_i clock time
 	 * period to make it closer to 100 ns. This value
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 4a4b3651ab3e..2525a80353b7 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -7021,7 +7021,7 @@ int stmmac_dvr_probe(struct device *device,
 		dev_info(priv->device, "TSO feature enabled\n");
 	}
 
-	if (priv->dma_cap.sphen) {
+	if (priv->dma_cap.sphen && !priv->plat->sph_disable) {
 		ndev->hw_features |= NETIF_F_GRO;
 		priv->sph_cap = true;
 		priv->sph = priv->sph_cap;
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 24eea1b05ca2..29917850f079 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -270,5 +270,6 @@ struct plat_stmmacenet_data {
 	int msi_rx_base_vec;
 	int msi_tx_base_vec;
 	bool use_phy_wol;
+	bool sph_disable;
 };
 #endif
-- 
cgit v1.2.3-71-gd317


From 6625ffb90f44efc28a3fe1432191a4eacb08c915 Mon Sep 17 00:00:00 2001
From: Yong Wu <yong.wu@mediatek.com>
Date: Tue, 3 May 2022 15:13:52 +0800
Subject: dt-bindings: mediatek: mt8195: Add binding for MM IOMMU

This patch adds descriptions for mt8195 IOMMU which also use ARM
Short-Descriptor translation table format.

In mt8195, there are two smi-common HW and IOMMU, one is for vdo(video
output), the other is for vpp(video processing pipe). They connects
with different smi-larbs, then some setting(larbid_remap) is different.
Differentiate them with the compatible string.

Something like this:

    IOMMU(VDO)          IOMMU(VPP)
       |                   |
  SMI_COMMON_VDO      SMI_COMMON_VPP
  ---------------     ----------------
  |      |   ...      |      |     ...
larb0 larb2  ...    larb1 larb3    ...

Another change is that we have a new IOMMU that is for infra master like
PCIe and USB. The infra master don't have the larb and ports, thus we
rename the port header file to mt8195-memory-port.h rather than
mt8195-larb-port.h.

Also, the IOMMU is not only for MM, thus, we don't call it "m4u" which
means "MultiMedia Memory Management UNIT". thus, use the "iommu" as the
compatiable string.

Signed-off-by: Yong Wu <yong.wu@mediatek.com>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Reviewed-by: Matthias Brugger <matthias.bgg@gmail.com>
Link: https://lore.kernel.org/r/20220503071427.2285-2-yong.wu@mediatek.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 .../devicetree/bindings/iommu/mediatek,iommu.yaml  |   7 +
 include/dt-bindings/memory/mt8195-memory-port.h    | 390 +++++++++++++++++++++
 2 files changed, 397 insertions(+)
 create mode 100644 include/dt-bindings/memory/mt8195-memory-port.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/iommu/mediatek,iommu.yaml b/Documentation/devicetree/bindings/iommu/mediatek,iommu.yaml
index 97e8c471a5e8..2223408e91a9 100644
--- a/Documentation/devicetree/bindings/iommu/mediatek,iommu.yaml
+++ b/Documentation/devicetree/bindings/iommu/mediatek,iommu.yaml
@@ -77,6 +77,8 @@ properties:
           - mediatek,mt8173-m4u  # generation two
           - mediatek,mt8183-m4u  # generation two
           - mediatek,mt8192-m4u  # generation two
+          - mediatek,mt8195-iommu-vdo        # generation two
+          - mediatek,mt8195-iommu-vpp        # generation two
 
       - description: mt7623 generation one
         items:
@@ -120,6 +122,7 @@ properties:
       dt-binding/memory/mt8173-larb-port.h for mt8173,
       dt-binding/memory/mt8183-larb-port.h for mt8183,
       dt-binding/memory/mt8192-larb-port.h for mt8192.
+      dt-binding/memory/mt8195-memory-port.h for mt8195.
 
   power-domains:
     maxItems: 1
@@ -141,6 +144,8 @@ allOf:
               - mediatek,mt2712-m4u
               - mediatek,mt8173-m4u
               - mediatek,mt8192-m4u
+              - mediatek,mt8195-iommu-vdo
+              - mediatek,mt8195-iommu-vpp
 
     then:
       required:
@@ -151,6 +156,8 @@ allOf:
         compatible:
           enum:
             - mediatek,mt8192-m4u
+            - mediatek,mt8195-iommu-vdo
+            - mediatek,mt8195-iommu-vpp
 
     then:
       required:
diff --git a/include/dt-bindings/memory/mt8195-memory-port.h b/include/dt-bindings/memory/mt8195-memory-port.h
new file mode 100644
index 000000000000..c10e8b61f1e8
--- /dev/null
+++ b/include/dt-bindings/memory/mt8195-memory-port.h
@@ -0,0 +1,390 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2022 MediaTek Inc.
+ * Author: Yong Wu <yong.wu@mediatek.com>
+ */
+#ifndef _DT_BINDINGS_MEMORY_MT8195_LARB_PORT_H_
+#define _DT_BINDINGS_MEMORY_MT8195_LARB_PORT_H_
+
+#include <dt-bindings/memory/mtk-memory-port.h>
+
+/*
+ * MM IOMMU supports 16GB dma address. We separate it to four ranges:
+ * 0 ~ 4G; 4G ~ 8G; 8G ~ 12G; 12G ~ 16G, we could adjust these masters
+ * locate in anyone region. BUT:
+ * a) Make sure all the ports inside a larb are in one range.
+ * b) The iova of any master can NOT cross the 4G/8G/12G boundary.
+ *
+ * This is the suggested mapping in this SoC:
+ *
+ * modules    dma-address-region	larbs-ports
+ * disp         0 ~ 4G                  larb0/1/2/3
+ * vcodec      4G ~ 8G                  larb19/20/21/22/23/24
+ * cam/mdp     8G ~ 12G                 the other larbs.
+ * N/A         12G ~ 16G
+ * CCU0   0x24000_0000 ~ 0x243ff_ffff   larb18: port 0/1
+ * CCU1   0x24400_0000 ~ 0x247ff_ffff   larb18: port 2/3
+ *
+ * This SoC have two IOMMU HWs, this is the detailed connected information:
+ * iommu-vdo: larb0/2/5/7/9/10/11/13/17/19/21/24/25/28
+ * iommu-vpp: larb1/3/4/6/8/12/14/16/18/20/22/23/26/27
+ */
+
+/* MM IOMMU ports */
+/* larb0 */
+#define M4U_PORT_L0_DISP_RDMA0			MTK_M4U_ID(0, 0)
+#define M4U_PORT_L0_DISP_WDMA0			MTK_M4U_ID(0, 1)
+#define M4U_PORT_L0_DISP_OVL0_RDMA0		MTK_M4U_ID(0, 2)
+#define M4U_PORT_L0_DISP_OVL0_RDMA1		MTK_M4U_ID(0, 3)
+#define M4U_PORT_L0_DISP_OVL0_HDR		MTK_M4U_ID(0, 4)
+#define M4U_PORT_L0_DISP_FAKE0			MTK_M4U_ID(0, 5)
+
+/* larb1 */
+#define M4U_PORT_L1_DISP_RDMA0			MTK_M4U_ID(1, 0)
+#define M4U_PORT_L1_DISP_WDMA0			MTK_M4U_ID(1, 1)
+#define M4U_PORT_L1_DISP_OVL0_RDMA0		MTK_M4U_ID(1, 2)
+#define M4U_PORT_L1_DISP_OVL0_RDMA1		MTK_M4U_ID(1, 3)
+#define M4U_PORT_L1_DISP_OVL0_HDR		MTK_M4U_ID(1, 4)
+#define M4U_PORT_L1_DISP_FAKE0			MTK_M4U_ID(1, 5)
+
+/* larb2 */
+#define M4U_PORT_L2_MDP_RDMA0			MTK_M4U_ID(2, 0)
+#define M4U_PORT_L2_MDP_RDMA2			MTK_M4U_ID(2, 1)
+#define M4U_PORT_L2_MDP_RDMA4			MTK_M4U_ID(2, 2)
+#define M4U_PORT_L2_MDP_RDMA6			MTK_M4U_ID(2, 3)
+#define M4U_PORT_L2_DISP_FAKE1			MTK_M4U_ID(2, 4)
+
+/* larb3 */
+#define M4U_PORT_L3_MDP_RDMA1			MTK_M4U_ID(3, 0)
+#define M4U_PORT_L3_MDP_RDMA3			MTK_M4U_ID(3, 1)
+#define M4U_PORT_L3_MDP_RDMA5			MTK_M4U_ID(3, 2)
+#define M4U_PORT_L3_MDP_RDMA7			MTK_M4U_ID(3, 3)
+#define M4U_PORT_L3_HDR_DS			MTK_M4U_ID(3, 4)
+#define M4U_PORT_L3_HDR_ADL			MTK_M4U_ID(3, 5)
+#define M4U_PORT_L3_DISP_FAKE1			MTK_M4U_ID(3, 6)
+
+/* larb4 */
+#define M4U_PORT_L4_MDP_RDMA			MTK_M4U_ID(4, 0)
+#define M4U_PORT_L4_MDP_FG			MTK_M4U_ID(4, 1)
+#define M4U_PORT_L4_MDP_OVL			MTK_M4U_ID(4, 2)
+#define M4U_PORT_L4_MDP_WROT			MTK_M4U_ID(4, 3)
+#define M4U_PORT_L4_FAKE			MTK_M4U_ID(4, 4)
+
+/* larb5 */
+#define M4U_PORT_L5_SVPP1_MDP_RDMA		MTK_M4U_ID(5, 0)
+#define M4U_PORT_L5_SVPP1_MDP_FG		MTK_M4U_ID(5, 1)
+#define M4U_PORT_L5_SVPP1_MDP_OVL		MTK_M4U_ID(5, 2)
+#define M4U_PORT_L5_SVPP1_MDP_WROT		MTK_M4U_ID(5, 3)
+#define M4U_PORT_L5_SVPP2_MDP_RDMA		MTK_M4U_ID(5, 4)
+#define M4U_PORT_L5_SVPP2_MDP_FG		MTK_M4U_ID(5, 5)
+#define M4U_PORT_L5_SVPP2_MDP_WROT		MTK_M4U_ID(5, 6)
+#define M4U_PORT_L5_FAKE			MTK_M4U_ID(5, 7)
+
+/* larb6 */
+#define M4U_PORT_L6_SVPP3_MDP_RDMA		MTK_M4U_ID(6, 0)
+#define M4U_PORT_L6_SVPP3_MDP_FG		MTK_M4U_ID(6, 1)
+#define M4U_PORT_L6_SVPP3_MDP_WROT		MTK_M4U_ID(6, 2)
+#define M4U_PORT_L6_FAKE			MTK_M4U_ID(6, 3)
+
+/* larb7 */
+#define M4U_PORT_L7_IMG_WPE_RDMA0		MTK_M4U_ID(7, 0)
+#define M4U_PORT_L7_IMG_WPE_RDMA1		MTK_M4U_ID(7, 1)
+#define M4U_PORT_L7_IMG_WPE_WDMA0		MTK_M4U_ID(7, 2)
+
+/* larb8 */
+#define M4U_PORT_L8_IMG_WPE_RDMA0		MTK_M4U_ID(8, 0)
+#define M4U_PORT_L8_IMG_WPE_RDMA1		MTK_M4U_ID(8, 1)
+#define M4U_PORT_L8_IMG_WPE_WDMA0		MTK_M4U_ID(8, 2)
+
+/* larb9 */
+#define M4U_PORT_L9_IMG_IMGI_T1_A		MTK_M4U_ID(9, 0)
+#define M4U_PORT_L9_IMG_IMGBI_T1_A		MTK_M4U_ID(9, 1)
+#define M4U_PORT_L9_IMG_IMGCI_T1_A		MTK_M4U_ID(9, 2)
+#define M4U_PORT_L9_IMG_SMTI_T1_A		MTK_M4U_ID(9, 3)
+#define M4U_PORT_L9_IMG_TNCSTI_T1_A		MTK_M4U_ID(9, 4)
+#define M4U_PORT_L9_IMG_TNCSTI_T4_A		MTK_M4U_ID(9, 5)
+#define M4U_PORT_L9_IMG_YUVO_T1_A		MTK_M4U_ID(9, 6)
+#define M4U_PORT_L9_IMG_TIMGO_T1_A		MTK_M4U_ID(9, 7)
+#define M4U_PORT_L9_IMG_YUVO_T2_A		MTK_M4U_ID(9, 8)
+#define M4U_PORT_L9_IMG_IMGI_T1_B		MTK_M4U_ID(9, 9)
+#define M4U_PORT_L9_IMG_IMGBI_T1_B		MTK_M4U_ID(9, 10)
+#define M4U_PORT_L9_IMG_IMGCI_T1_B		MTK_M4U_ID(9, 11)
+#define M4U_PORT_L9_IMG_YUVO_T5_A		MTK_M4U_ID(9, 12)
+#define M4U_PORT_L9_IMG_SMTI_T1_B		MTK_M4U_ID(9, 13)
+#define M4U_PORT_L9_IMG_TNCSO_T1_A		MTK_M4U_ID(9, 14)
+#define M4U_PORT_L9_IMG_SMTO_T1_A		MTK_M4U_ID(9, 15)
+#define M4U_PORT_L9_IMG_TNCSTO_T1_A		MTK_M4U_ID(9, 16)
+#define M4U_PORT_L9_IMG_YUVO_T2_B		MTK_M4U_ID(9, 17)
+#define M4U_PORT_L9_IMG_YUVO_T5_B		MTK_M4U_ID(9, 18)
+#define M4U_PORT_L9_IMG_SMTO_T1_B		MTK_M4U_ID(9, 19)
+
+/* larb10 */
+#define M4U_PORT_L10_IMG_IMGI_D1_A		MTK_M4U_ID(10, 0)
+#define M4U_PORT_L10_IMG_IMGCI_D1_A		MTK_M4U_ID(10, 1)
+#define M4U_PORT_L10_IMG_DEPI_D1_A		MTK_M4U_ID(10, 2)
+#define M4U_PORT_L10_IMG_DMGI_D1_A		MTK_M4U_ID(10, 3)
+#define M4U_PORT_L10_IMG_VIPI_D1_A		MTK_M4U_ID(10, 4)
+#define M4U_PORT_L10_IMG_TNRWI_D1_A		MTK_M4U_ID(10, 5)
+#define M4U_PORT_L10_IMG_RECI_D1_A		MTK_M4U_ID(10, 6)
+#define M4U_PORT_L10_IMG_SMTI_D1_A		MTK_M4U_ID(10, 7)
+#define M4U_PORT_L10_IMG_SMTI_D6_A		MTK_M4U_ID(10, 8)
+#define M4U_PORT_L10_IMG_PIMGI_P1_A		MTK_M4U_ID(10, 9)
+#define M4U_PORT_L10_IMG_PIMGBI_P1_A		MTK_M4U_ID(10, 10)
+#define M4U_PORT_L10_IMG_PIMGCI_P1_A		MTK_M4U_ID(10, 11)
+#define M4U_PORT_L10_IMG_PIMGI_P1_B		MTK_M4U_ID(10, 12)
+#define M4U_PORT_L10_IMG_PIMGBI_P1_B		MTK_M4U_ID(10, 13)
+#define M4U_PORT_L10_IMG_PIMGCI_P1_B		MTK_M4U_ID(10, 14)
+#define M4U_PORT_L10_IMG_IMG3O_D1_A		MTK_M4U_ID(10, 15)
+#define M4U_PORT_L10_IMG_IMG4O_D1_A		MTK_M4U_ID(10, 16)
+#define M4U_PORT_L10_IMG_IMG3CO_D1_A		MTK_M4U_ID(10, 17)
+#define M4U_PORT_L10_IMG_FEO_D1_A		MTK_M4U_ID(10, 18)
+#define M4U_PORT_L10_IMG_IMG2O_D1_A		MTK_M4U_ID(10, 19)
+#define M4U_PORT_L10_IMG_TNRWO_D1_A		MTK_M4U_ID(10, 20)
+#define M4U_PORT_L10_IMG_SMTO_D1_A		MTK_M4U_ID(10, 21)
+#define M4U_PORT_L10_IMG_WROT_P1_A		MTK_M4U_ID(10, 22)
+#define M4U_PORT_L10_IMG_WROT_P1_B		MTK_M4U_ID(10, 23)
+
+/* larb11 */
+#define M4U_PORT_L11_IMG_WPE_EIS_RDMA0_A	MTK_M4U_ID(11, 0)
+#define M4U_PORT_L11_IMG_WPE_EIS_RDMA1_A	MTK_M4U_ID(11, 1)
+#define M4U_PORT_L11_IMG_WPE_EIS_WDMA0_A	MTK_M4U_ID(11, 2)
+#define M4U_PORT_L11_IMG_WPE_TNR_RDMA0_A	MTK_M4U_ID(11, 3)
+#define M4U_PORT_L11_IMG_WPE_TNR_RDMA1_A	MTK_M4U_ID(11, 4)
+#define M4U_PORT_L11_IMG_WPE_TNR_WDMA0_A	MTK_M4U_ID(11, 5)
+#define M4U_PORT_L11_IMG_WPE_EIS_CQ0_A		MTK_M4U_ID(11, 6)
+#define M4U_PORT_L11_IMG_WPE_EIS_CQ1_A		MTK_M4U_ID(11, 7)
+#define M4U_PORT_L11_IMG_WPE_TNR_CQ0_A		MTK_M4U_ID(11, 8)
+#define M4U_PORT_L11_IMG_WPE_TNR_CQ1_A		MTK_M4U_ID(11, 9)
+
+/* larb12 */
+#define M4U_PORT_L12_IMG_FDVT_RDA		MTK_M4U_ID(12, 0)
+#define M4U_PORT_L12_IMG_FDVT_RDB		MTK_M4U_ID(12, 1)
+#define M4U_PORT_L12_IMG_FDVT_WRA		MTK_M4U_ID(12, 2)
+#define M4U_PORT_L12_IMG_FDVT_WRB		MTK_M4U_ID(12, 3)
+#define M4U_PORT_L12_IMG_ME_RDMA		MTK_M4U_ID(12, 4)
+#define M4U_PORT_L12_IMG_ME_WDMA		MTK_M4U_ID(12, 5)
+#define M4U_PORT_L12_IMG_DVS_RDMA		MTK_M4U_ID(12, 6)
+#define M4U_PORT_L12_IMG_DVS_WDMA		MTK_M4U_ID(12, 7)
+#define M4U_PORT_L12_IMG_DVP_RDMA		MTK_M4U_ID(12, 8)
+#define M4U_PORT_L12_IMG_DVP_WDMA		MTK_M4U_ID(12, 9)
+
+/* larb13 */
+#define M4U_PORT_L13_CAM_CAMSV_CQI_E1		MTK_M4U_ID(13, 0)
+#define M4U_PORT_L13_CAM_CAMSV_CQI_E2		MTK_M4U_ID(13, 1)
+#define M4U_PORT_L13_CAM_GCAMSV_A_IMGO_0	MTK_M4U_ID(13, 2)
+#define M4U_PORT_L13_CAM_SCAMSV_A_IMGO_0	MTK_M4U_ID(13, 3)
+#define M4U_PORT_L13_CAM_GCAMSV_B_IMGO_0	MTK_M4U_ID(13, 4)
+#define M4U_PORT_L13_CAM_GCAMSV_B_IMGO_1	MTK_M4U_ID(13, 5)
+#define M4U_PORT_L13_CAM_GCAMSV_A_UFEO_0	MTK_M4U_ID(13, 6)
+#define M4U_PORT_L13_CAM_GCAMSV_B_UFEO_0	MTK_M4U_ID(13, 7)
+#define M4U_PORT_L13_CAM_PDAI_0			MTK_M4U_ID(13, 8)
+#define M4U_PORT_L13_CAM_FAKE			MTK_M4U_ID(13, 9)
+
+/* larb14 */
+#define M4U_PORT_L14_CAM_GCAMSV_A_IMGO_1	MTK_M4U_ID(14, 0)
+#define M4U_PORT_L14_CAM_SCAMSV_A_IMGO_1	MTK_M4U_ID(14, 1)
+#define M4U_PORT_L14_CAM_GCAMSV_B_IMGO_0	MTK_M4U_ID(14, 2)
+#define M4U_PORT_L14_CAM_GCAMSV_B_IMGO_1	MTK_M4U_ID(14, 3)
+#define M4U_PORT_L14_CAM_SCAMSV_B_IMGO_0	MTK_M4U_ID(14, 4)
+#define M4U_PORT_L14_CAM_SCAMSV_B_IMGO_1	MTK_M4U_ID(14, 5)
+#define M4U_PORT_L14_CAM_IPUI			MTK_M4U_ID(14, 6)
+#define M4U_PORT_L14_CAM_IPU2I			MTK_M4U_ID(14, 7)
+#define M4U_PORT_L14_CAM_IPUO			MTK_M4U_ID(14, 8)
+#define M4U_PORT_L14_CAM_IPU2O			MTK_M4U_ID(14, 9)
+#define M4U_PORT_L14_CAM_IPU3O			MTK_M4U_ID(14, 10)
+#define M4U_PORT_L14_CAM_GCAMSV_A_UFEO_1	MTK_M4U_ID(14, 11)
+#define M4U_PORT_L14_CAM_GCAMSV_B_UFEO_1	MTK_M4U_ID(14, 12)
+#define M4U_PORT_L14_CAM_PDAI_1			MTK_M4U_ID(14, 13)
+#define M4U_PORT_L14_CAM_PDAO			MTK_M4U_ID(14, 14)
+
+/* larb15: null */
+
+/* larb16 */
+#define M4U_PORT_L16_CAM_IMGO_R1		MTK_M4U_ID(16, 0)
+#define M4U_PORT_L16_CAM_CQI_R1			MTK_M4U_ID(16, 1)
+#define M4U_PORT_L16_CAM_CQI_R2			MTK_M4U_ID(16, 2)
+#define M4U_PORT_L16_CAM_BPCI_R1		MTK_M4U_ID(16, 3)
+#define M4U_PORT_L16_CAM_LSCI_R1		MTK_M4U_ID(16, 4)
+#define M4U_PORT_L16_CAM_RAWI_R2		MTK_M4U_ID(16, 5)
+#define M4U_PORT_L16_CAM_RAWI_R3		MTK_M4U_ID(16, 6)
+#define M4U_PORT_L16_CAM_UFDI_R2		MTK_M4U_ID(16, 7)
+#define M4U_PORT_L16_CAM_UFDI_R3		MTK_M4U_ID(16, 8)
+#define M4U_PORT_L16_CAM_RAWI_R4		MTK_M4U_ID(16, 9)
+#define M4U_PORT_L16_CAM_RAWI_R5		MTK_M4U_ID(16, 10)
+#define M4U_PORT_L16_CAM_AAI_R1			MTK_M4U_ID(16, 11)
+#define M4U_PORT_L16_CAM_FHO_R1			MTK_M4U_ID(16, 12)
+#define M4U_PORT_L16_CAM_AAO_R1			MTK_M4U_ID(16, 13)
+#define M4U_PORT_L16_CAM_TSFSO_R1		MTK_M4U_ID(16, 14)
+#define M4U_PORT_L16_CAM_FLKO_R1		MTK_M4U_ID(16, 15)
+
+/* larb17 */
+#define M4U_PORT_L17_CAM_YUVO_R1		MTK_M4U_ID(17, 0)
+#define M4U_PORT_L17_CAM_YUVO_R3		MTK_M4U_ID(17, 1)
+#define M4U_PORT_L17_CAM_YUVCO_R1		MTK_M4U_ID(17, 2)
+#define M4U_PORT_L17_CAM_YUVO_R2		MTK_M4U_ID(17, 3)
+#define M4U_PORT_L17_CAM_RZH1N2TO_R1		MTK_M4U_ID(17, 4)
+#define M4U_PORT_L17_CAM_DRZS4NO_R1		MTK_M4U_ID(17, 5)
+#define M4U_PORT_L17_CAM_TNCSO_R1		MTK_M4U_ID(17, 6)
+
+/* larb18 */
+#define M4U_PORT_L18_CAM_CCUI			MTK_M4U_ID(18, 0)
+#define M4U_PORT_L18_CAM_CCUO			MTK_M4U_ID(18, 1)
+#define M4U_PORT_L18_CAM_CCUI2			MTK_M4U_ID(18, 2)
+#define M4U_PORT_L18_CAM_CCUO2			MTK_M4U_ID(18, 3)
+
+/* larb19 */
+#define M4U_PORT_L19_VENC_RCPU			MTK_M4U_ID(19, 0)
+#define M4U_PORT_L19_VENC_REC			MTK_M4U_ID(19, 1)
+#define M4U_PORT_L19_VENC_BSDMA			MTK_M4U_ID(19, 2)
+#define M4U_PORT_L19_VENC_SV_COMV		MTK_M4U_ID(19, 3)
+#define M4U_PORT_L19_VENC_RD_COMV		MTK_M4U_ID(19, 4)
+#define M4U_PORT_L19_VENC_NBM_RDMA		MTK_M4U_ID(19, 5)
+#define M4U_PORT_L19_VENC_NBM_RDMA_LITE		MTK_M4U_ID(19, 6)
+#define M4U_PORT_L19_JPGENC_Y_RDMA		MTK_M4U_ID(19, 7)
+#define M4U_PORT_L19_JPGENC_C_RDMA		MTK_M4U_ID(19, 8)
+#define M4U_PORT_L19_JPGENC_Q_TABLE		MTK_M4U_ID(19, 9)
+#define M4U_PORT_L19_VENC_SUB_W_LUMA		MTK_M4U_ID(19, 10)
+#define M4U_PORT_L19_VENC_FCS_NBM_RDMA		MTK_M4U_ID(19, 11)
+#define M4U_PORT_L19_JPGENC_BSDMA		MTK_M4U_ID(19, 12)
+#define M4U_PORT_L19_JPGDEC_WDMA0		MTK_M4U_ID(19, 13)
+#define M4U_PORT_L19_JPGDEC_BSDMA0		MTK_M4U_ID(19, 14)
+#define M4U_PORT_L19_VENC_NBM_WDMA		MTK_M4U_ID(19, 15)
+#define M4U_PORT_L19_VENC_NBM_WDMA_LITE		MTK_M4U_ID(19, 16)
+#define M4U_PORT_L19_VENC_FCS_NBM_WDMA		MTK_M4U_ID(19, 17)
+#define M4U_PORT_L19_JPGDEC_WDMA1		MTK_M4U_ID(19, 18)
+#define M4U_PORT_L19_JPGDEC_BSDMA1		MTK_M4U_ID(19, 19)
+#define M4U_PORT_L19_JPGDEC_BUFF_OFFSET1	MTK_M4U_ID(19, 20)
+#define M4U_PORT_L19_JPGDEC_BUFF_OFFSET0	MTK_M4U_ID(19, 21)
+#define M4U_PORT_L19_VENC_CUR_LUMA		MTK_M4U_ID(19, 22)
+#define M4U_PORT_L19_VENC_CUR_CHROMA		MTK_M4U_ID(19, 23)
+#define M4U_PORT_L19_VENC_REF_LUMA		MTK_M4U_ID(19, 24)
+#define M4U_PORT_L19_VENC_REF_CHROMA		MTK_M4U_ID(19, 25)
+#define M4U_PORT_L19_VENC_SUB_R_CHROMA		MTK_M4U_ID(19, 26)
+
+/* larb20 */
+#define M4U_PORT_L20_VENC_RCPU			MTK_M4U_ID(20, 0)
+#define M4U_PORT_L20_VENC_REC			MTK_M4U_ID(20, 1)
+#define M4U_PORT_L20_VENC_BSDMA			MTK_M4U_ID(20, 2)
+#define M4U_PORT_L20_VENC_SV_COMV		MTK_M4U_ID(20, 3)
+#define M4U_PORT_L20_VENC_RD_COMV		MTK_M4U_ID(20, 4)
+#define M4U_PORT_L20_VENC_NBM_RDMA		MTK_M4U_ID(20, 5)
+#define M4U_PORT_L20_VENC_NBM_RDMA_LITE		MTK_M4U_ID(20, 6)
+#define M4U_PORT_L20_JPGENC_Y_RDMA		MTK_M4U_ID(20, 7)
+#define M4U_PORT_L20_JPGENC_C_RDMA		MTK_M4U_ID(20, 8)
+#define M4U_PORT_L20_JPGENC_Q_TABLE		MTK_M4U_ID(20, 9)
+#define M4U_PORT_L20_VENC_SUB_W_LUMA		MTK_M4U_ID(20, 10)
+#define M4U_PORT_L20_VENC_FCS_NBM_RDMA		MTK_M4U_ID(20, 11)
+#define M4U_PORT_L20_JPGENC_BSDMA		MTK_M4U_ID(20, 12)
+#define M4U_PORT_L20_JPGDEC_WDMA0		MTK_M4U_ID(20, 13)
+#define M4U_PORT_L20_JPGDEC_BSDMA0		MTK_M4U_ID(20, 14)
+#define M4U_PORT_L20_VENC_NBM_WDMA		MTK_M4U_ID(20, 15)
+#define M4U_PORT_L20_VENC_NBM_WDMA_LITE		MTK_M4U_ID(20, 16)
+#define M4U_PORT_L20_VENC_FCS_NBM_WDMA		MTK_M4U_ID(20, 17)
+#define M4U_PORT_L20_JPGDEC_WDMA1		MTK_M4U_ID(20, 18)
+#define M4U_PORT_L20_JPGDEC_BSDMA1		MTK_M4U_ID(20, 19)
+#define M4U_PORT_L20_JPGDEC_BUFF_OFFSET1	MTK_M4U_ID(20, 20)
+#define M4U_PORT_L20_JPGDEC_BUFF_OFFSET0	MTK_M4U_ID(20, 21)
+#define M4U_PORT_L20_VENC_CUR_LUMA		MTK_M4U_ID(20, 22)
+#define M4U_PORT_L20_VENC_CUR_CHROMA		MTK_M4U_ID(20, 23)
+#define M4U_PORT_L20_VENC_REF_LUMA		MTK_M4U_ID(20, 24)
+#define M4U_PORT_L20_VENC_REF_CHROMA		MTK_M4U_ID(20, 25)
+#define M4U_PORT_L20_VENC_SUB_R_CHROMA		MTK_M4U_ID(20, 26)
+
+/* larb21 */
+#define M4U_PORT_L21_VDEC_MC_EXT		MTK_M4U_ID(21, 0)
+#define M4U_PORT_L21_VDEC_UFO_EXT		MTK_M4U_ID(21, 1)
+#define M4U_PORT_L21_VDEC_PP_EXT		MTK_M4U_ID(21, 2)
+#define M4U_PORT_L21_VDEC_PRED_RD_EXT		MTK_M4U_ID(21, 3)
+#define M4U_PORT_L21_VDEC_PRED_WR_EXT		MTK_M4U_ID(21, 4)
+#define M4U_PORT_L21_VDEC_PPWRAP_EXT		MTK_M4U_ID(21, 5)
+#define M4U_PORT_L21_VDEC_TILE_EXT		MTK_M4U_ID(21, 6)
+#define M4U_PORT_L21_VDEC_VLD_EXT		MTK_M4U_ID(21, 7)
+#define M4U_PORT_L21_VDEC_VLD2_EXT		MTK_M4U_ID(21, 8)
+#define M4U_PORT_L21_VDEC_AVC_MV_EXT		MTK_M4U_ID(21, 9)
+
+/* larb22 */
+#define M4U_PORT_L22_VDEC_MC_EXT		MTK_M4U_ID(22, 0)
+#define M4U_PORT_L22_VDEC_UFO_EXT		MTK_M4U_ID(22, 1)
+#define M4U_PORT_L22_VDEC_PP_EXT		MTK_M4U_ID(22, 2)
+#define M4U_PORT_L22_VDEC_PRED_RD_EXT		MTK_M4U_ID(22, 3)
+#define M4U_PORT_L22_VDEC_PRED_WR_EXT		MTK_M4U_ID(22, 4)
+#define M4U_PORT_L22_VDEC_PPWRAP_EXT		MTK_M4U_ID(22, 5)
+#define M4U_PORT_L22_VDEC_TILE_EXT		MTK_M4U_ID(22, 6)
+#define M4U_PORT_L22_VDEC_VLD_EXT		MTK_M4U_ID(22, 7)
+#define M4U_PORT_L22_VDEC_VLD2_EXT		MTK_M4U_ID(22, 8)
+#define M4U_PORT_L22_VDEC_AVC_MV_EXT		MTK_M4U_ID(22, 9)
+
+/* larb23 */
+#define M4U_PORT_L23_VDEC_UFO_ENC_EXT		MTK_M4U_ID(23, 0)
+#define M4U_PORT_L23_VDEC_RDMA_EXT		MTK_M4U_ID(23, 1)
+
+/* larb24 */
+#define M4U_PORT_L24_VDEC_LAT0_VLD_EXT		MTK_M4U_ID(24, 0)
+#define M4U_PORT_L24_VDEC_LAT0_VLD2_EXT		MTK_M4U_ID(24, 1)
+#define M4U_PORT_L24_VDEC_LAT0_AVC_MC_EXT	MTK_M4U_ID(24, 2)
+#define M4U_PORT_L24_VDEC_LAT0_PRED_RD_EXT	MTK_M4U_ID(24, 3)
+#define M4U_PORT_L24_VDEC_LAT0_TILE_EXT		MTK_M4U_ID(24, 4)
+#define M4U_PORT_L24_VDEC_LAT0_WDMA_EXT		MTK_M4U_ID(24, 5)
+#define M4U_PORT_L24_VDEC_LAT1_VLD_EXT		MTK_M4U_ID(24, 6)
+#define M4U_PORT_L24_VDEC_LAT1_VLD2_EXT		MTK_M4U_ID(24, 7)
+#define M4U_PORT_L24_VDEC_LAT1_AVC_MC_EXT	MTK_M4U_ID(24, 8)
+#define M4U_PORT_L24_VDEC_LAT1_PRED_RD_EXT	MTK_M4U_ID(24, 9)
+#define M4U_PORT_L24_VDEC_LAT1_TILE_EXT		MTK_M4U_ID(24, 10)
+#define M4U_PORT_L24_VDEC_LAT1_WDMA_EXT		MTK_M4U_ID(24, 11)
+
+/* larb25 */
+#define M4U_PORT_L25_CAM_MRAW0_LSCI_M1		MTK_M4U_ID(25, 0)
+#define M4U_PORT_L25_CAM_MRAW0_CQI_M1		MTK_M4U_ID(25, 1)
+#define M4U_PORT_L25_CAM_MRAW0_CQI_M2		MTK_M4U_ID(25, 2)
+#define M4U_PORT_L25_CAM_MRAW0_IMGO_M1		MTK_M4U_ID(25, 3)
+#define M4U_PORT_L25_CAM_MRAW0_IMGBO_M1		MTK_M4U_ID(25, 4)
+#define M4U_PORT_L25_CAM_MRAW2_LSCI_M1		MTK_M4U_ID(25, 5)
+#define M4U_PORT_L25_CAM_MRAW2_CQI_M1		MTK_M4U_ID(25, 6)
+#define M4U_PORT_L25_CAM_MRAW2_CQI_M2		MTK_M4U_ID(25, 7)
+#define M4U_PORT_L25_CAM_MRAW2_IMGO_M1		MTK_M4U_ID(25, 8)
+#define M4U_PORT_L25_CAM_MRAW2_IMGBO_M1		MTK_M4U_ID(25, 9)
+#define M4U_PORT_L25_CAM_MRAW0_AFO_M1		MTK_M4U_ID(25, 10)
+#define M4U_PORT_L25_CAM_MRAW2_AFO_M1		MTK_M4U_ID(25, 11)
+
+/* larb26 */
+#define M4U_PORT_L26_CAM_MRAW1_LSCI_M1		MTK_M4U_ID(26, 0)
+#define M4U_PORT_L26_CAM_MRAW1_CQI_M1		MTK_M4U_ID(26, 1)
+#define M4U_PORT_L26_CAM_MRAW1_CQI_M2		MTK_M4U_ID(26, 2)
+#define M4U_PORT_L26_CAM_MRAW1_IMGO_M1		MTK_M4U_ID(26, 3)
+#define M4U_PORT_L26_CAM_MRAW1_IMGBO_M1		MTK_M4U_ID(26, 4)
+#define M4U_PORT_L26_CAM_MRAW3_LSCI_M1		MTK_M4U_ID(26, 5)
+#define M4U_PORT_L26_CAM_MRAW3_CQI_M1		MTK_M4U_ID(26, 6)
+#define M4U_PORT_L26_CAM_MRAW3_CQI_M2		MTK_M4U_ID(26, 7)
+#define M4U_PORT_L26_CAM_MRAW3_IMGO_M1		MTK_M4U_ID(26, 8)
+#define M4U_PORT_L26_CAM_MRAW3_IMGBO_M1		MTK_M4U_ID(26, 9)
+#define M4U_PORT_L26_CAM_MRAW1_AFO_M1		MTK_M4U_ID(26, 10)
+#define M4U_PORT_L26_CAM_MRAW3_AFO_M1		MTK_M4U_ID(26, 11)
+
+/* larb27 */
+#define M4U_PORT_L27_CAM_IMGO_R1		MTK_M4U_ID(27, 0)
+#define M4U_PORT_L27_CAM_CQI_R1			MTK_M4U_ID(27, 1)
+#define M4U_PORT_L27_CAM_CQI_R2			MTK_M4U_ID(27, 2)
+#define M4U_PORT_L27_CAM_BPCI_R1		MTK_M4U_ID(27, 3)
+#define M4U_PORT_L27_CAM_LSCI_R1		MTK_M4U_ID(27, 4)
+#define M4U_PORT_L27_CAM_RAWI_R2		MTK_M4U_ID(27, 5)
+#define M4U_PORT_L27_CAM_RAWI_R3		MTK_M4U_ID(27, 6)
+#define M4U_PORT_L27_CAM_UFDI_R2		MTK_M4U_ID(27, 7)
+#define M4U_PORT_L27_CAM_UFDI_R3		MTK_M4U_ID(27, 8)
+#define M4U_PORT_L27_CAM_RAWI_R4		MTK_M4U_ID(27, 9)
+#define M4U_PORT_L27_CAM_RAWI_R5		MTK_M4U_ID(27, 10)
+#define M4U_PORT_L27_CAM_AAI_R1			MTK_M4U_ID(27, 11)
+#define M4U_PORT_L27_CAM_FHO_R1			MTK_M4U_ID(27, 12)
+#define M4U_PORT_L27_CAM_AAO_R1			MTK_M4U_ID(27, 13)
+#define M4U_PORT_L27_CAM_TSFSO_R1		MTK_M4U_ID(27, 14)
+#define M4U_PORT_L27_CAM_FLKO_R1		MTK_M4U_ID(27, 15)
+
+/* larb28 */
+#define M4U_PORT_L28_CAM_YUVO_R1		MTK_M4U_ID(28, 0)
+#define M4U_PORT_L28_CAM_YUVO_R3		MTK_M4U_ID(28, 1)
+#define M4U_PORT_L28_CAM_YUVCO_R1		MTK_M4U_ID(28, 2)
+#define M4U_PORT_L28_CAM_YUVO_R2		MTK_M4U_ID(28, 3)
+#define M4U_PORT_L28_CAM_RZH1N2TO_R1		MTK_M4U_ID(28, 4)
+#define M4U_PORT_L28_CAM_DRZS4NO_R1		MTK_M4U_ID(28, 5)
+#define M4U_PORT_L28_CAM_TNCSO_R1		MTK_M4U_ID(28, 6)
+
+#endif
-- 
cgit v1.2.3-71-gd317


From dc1d99342db705efbeec517f9c6187f3b93ab423 Mon Sep 17 00:00:00 2001
From: Yong Wu <yong.wu@mediatek.com>
Date: Tue, 3 May 2022 15:13:53 +0800
Subject: dt-bindings: mediatek: mt8195: Add binding for infra IOMMU

In mt8195, we have a new IOMMU that is for INFRA IOMMU. its masters
mainly are PCIe and USB. Different with MM IOMMU, all these masters
connect with IOMMU directly, there is no mediatek,larbs property for
infra IOMMU.

Another thing is about PCIe ports. currently the function
"of_iommu_configure_dev_id" only support the id number is 1, But our
PCIe have two ports, one is for reading and the other is for writing.
see more about the PCIe patch in this patchset. Thus, I only list
the reading id here and add the other id in our driver.

Signed-off-by: Yong Wu <yong.wu@mediatek.com>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Reviewed-by: Matthias Brugger <matthias.bgg@gmail.com>
Link: https://lore.kernel.org/r/20220503071427.2285-3-yong.wu@mediatek.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 .../devicetree/bindings/iommu/mediatek,iommu.yaml      | 13 ++++++++++++-
 include/dt-bindings/memory/mt8195-memory-port.h        | 18 ++++++++++++++++++
 include/dt-bindings/memory/mtk-memory-port.h           |  2 ++
 3 files changed, 32 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/iommu/mediatek,iommu.yaml b/Documentation/devicetree/bindings/iommu/mediatek,iommu.yaml
index 2223408e91a9..eed59ec00e78 100644
--- a/Documentation/devicetree/bindings/iommu/mediatek,iommu.yaml
+++ b/Documentation/devicetree/bindings/iommu/mediatek,iommu.yaml
@@ -79,6 +79,7 @@ properties:
           - mediatek,mt8192-m4u  # generation two
           - mediatek,mt8195-iommu-vdo        # generation two
           - mediatek,mt8195-iommu-vpp        # generation two
+          - mediatek,mt8195-iommu-infra      # generation two
 
       - description: mt7623 generation one
         items:
@@ -131,7 +132,6 @@ required:
   - compatible
   - reg
   - interrupts
-  - mediatek,larbs
   - '#iommu-cells'
 
 allOf:
@@ -163,6 +163,17 @@ allOf:
       required:
         - power-domains
 
+  - if: # The IOMMUs don't have larbs.
+      not:
+        properties:
+          compatible:
+            contains:
+              const: mediatek,mt8195-iommu-infra
+
+    then:
+      required:
+        - mediatek,larbs
+
 additionalProperties: false
 
 examples:
diff --git a/include/dt-bindings/memory/mt8195-memory-port.h b/include/dt-bindings/memory/mt8195-memory-port.h
index c10e8b61f1e8..70ba9f498eeb 100644
--- a/include/dt-bindings/memory/mt8195-memory-port.h
+++ b/include/dt-bindings/memory/mt8195-memory-port.h
@@ -387,4 +387,22 @@
 #define M4U_PORT_L28_CAM_DRZS4NO_R1		MTK_M4U_ID(28, 5)
 #define M4U_PORT_L28_CAM_TNCSO_R1		MTK_M4U_ID(28, 6)
 
+/* Infra iommu ports */
+/* PCIe1: read: BIT16; write BIT17. */
+#define IOMMU_PORT_INFRA_PCIE1			MTK_IFAIOMMU_PERI_ID(16)
+/* PCIe0: read: BIT18; write BIT19. */
+#define IOMMU_PORT_INFRA_PCIE0			MTK_IFAIOMMU_PERI_ID(18)
+#define IOMMU_PORT_INFRA_SSUSB_P3_R		MTK_IFAIOMMU_PERI_ID(20)
+#define IOMMU_PORT_INFRA_SSUSB_P3_W		MTK_IFAIOMMU_PERI_ID(21)
+#define IOMMU_PORT_INFRA_SSUSB_P2_R		MTK_IFAIOMMU_PERI_ID(22)
+#define IOMMU_PORT_INFRA_SSUSB_P2_W		MTK_IFAIOMMU_PERI_ID(23)
+#define IOMMU_PORT_INFRA_SSUSB_P1_1_R		MTK_IFAIOMMU_PERI_ID(24)
+#define IOMMU_PORT_INFRA_SSUSB_P1_1_W		MTK_IFAIOMMU_PERI_ID(25)
+#define IOMMU_PORT_INFRA_SSUSB_P1_0_R		MTK_IFAIOMMU_PERI_ID(26)
+#define IOMMU_PORT_INFRA_SSUSB_P1_0_W		MTK_IFAIOMMU_PERI_ID(27)
+#define IOMMU_PORT_INFRA_SSUSB2_R		MTK_IFAIOMMU_PERI_ID(28)
+#define IOMMU_PORT_INFRA_SSUSB2_W		MTK_IFAIOMMU_PERI_ID(29)
+#define IOMMU_PORT_INFRA_SSUSB_R		MTK_IFAIOMMU_PERI_ID(30)
+#define IOMMU_PORT_INFRA_SSUSB_W		MTK_IFAIOMMU_PERI_ID(31)
+
 #endif
diff --git a/include/dt-bindings/memory/mtk-memory-port.h b/include/dt-bindings/memory/mtk-memory-port.h
index 7d64103209af..2f68a0511a25 100644
--- a/include/dt-bindings/memory/mtk-memory-port.h
+++ b/include/dt-bindings/memory/mtk-memory-port.h
@@ -12,4 +12,6 @@
 #define MTK_M4U_TO_LARB(id)		(((id) >> 5) & 0x1f)
 #define MTK_M4U_TO_PORT(id)		((id) & 0x1f)
 
+#define MTK_IFAIOMMU_PERI_ID(port)	MTK_M4U_ID(0, port)
+
 #endif
-- 
cgit v1.2.3-71-gd317


From 2d555a3844142b9e0f876925a8475cc830cd472a Mon Sep 17 00:00:00 2001
From: Yong Wu <yong.wu@mediatek.com>
Date: Tue, 3 May 2022 15:13:54 +0800
Subject: dt-bindings: mediatek: mt8186: Add binding for MM iommu

Add mt8186 iommu binding. "-mm" means the iommu is for Multimedia.

Signed-off-by: Yong Wu <yong.wu@mediatek.com>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Reviewed-by: Matthias Brugger <matthias.bgg@gmail.com>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Link: https://lore.kernel.org/r/20220503071427.2285-4-yong.wu@mediatek.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 .../devicetree/bindings/iommu/mediatek,iommu.yaml  |   4 +
 include/dt-bindings/memory/mt8186-memory-port.h    | 217 +++++++++++++++++++++
 2 files changed, 221 insertions(+)
 create mode 100644 include/dt-bindings/memory/mt8186-memory-port.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/iommu/mediatek,iommu.yaml b/Documentation/devicetree/bindings/iommu/mediatek,iommu.yaml
index eed59ec00e78..91a3629a8e6e 100644
--- a/Documentation/devicetree/bindings/iommu/mediatek,iommu.yaml
+++ b/Documentation/devicetree/bindings/iommu/mediatek,iommu.yaml
@@ -76,6 +76,7 @@ properties:
           - mediatek,mt8167-m4u  # generation two
           - mediatek,mt8173-m4u  # generation two
           - mediatek,mt8183-m4u  # generation two
+          - mediatek,mt8186-iommu-mm         # generation two
           - mediatek,mt8192-m4u  # generation two
           - mediatek,mt8195-iommu-vdo        # generation two
           - mediatek,mt8195-iommu-vpp        # generation two
@@ -122,6 +123,7 @@ properties:
       dt-binding/memory/mt8167-larb-port.h for mt8167,
       dt-binding/memory/mt8173-larb-port.h for mt8173,
       dt-binding/memory/mt8183-larb-port.h for mt8183,
+      dt-binding/memory/mt8186-memory-port.h for mt8186,
       dt-binding/memory/mt8192-larb-port.h for mt8192.
       dt-binding/memory/mt8195-memory-port.h for mt8195.
 
@@ -143,6 +145,7 @@ allOf:
               - mediatek,mt2701-m4u
               - mediatek,mt2712-m4u
               - mediatek,mt8173-m4u
+              - mediatek,mt8186-iommu-mm
               - mediatek,mt8192-m4u
               - mediatek,mt8195-iommu-vdo
               - mediatek,mt8195-iommu-vpp
@@ -155,6 +158,7 @@ allOf:
       properties:
         compatible:
           enum:
+            - mediatek,mt8186-iommu-mm
             - mediatek,mt8192-m4u
             - mediatek,mt8195-iommu-vdo
             - mediatek,mt8195-iommu-vpp
diff --git a/include/dt-bindings/memory/mt8186-memory-port.h b/include/dt-bindings/memory/mt8186-memory-port.h
new file mode 100644
index 000000000000..2bc6e4433048
--- /dev/null
+++ b/include/dt-bindings/memory/mt8186-memory-port.h
@@ -0,0 +1,217 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2022 MediaTek Inc.
+ *
+ * Author: Anan Sun <anan.sun@mediatek.com>
+ * Author: Yong Wu <yong.wu@mediatek.com>
+ */
+#ifndef _DT_BINDINGS_MEMORY_MT8186_LARB_PORT_H_
+#define _DT_BINDINGS_MEMORY_MT8186_LARB_PORT_H_
+
+#include <dt-bindings/memory/mtk-memory-port.h>
+
+/*
+ * MM IOMMU supports 16GB dma address. We separate it to four ranges:
+ * 0 ~ 4G; 4G ~ 8G; 8G ~ 12G; 12G ~ 16G, we could adjust these masters
+ * locate in anyone region. BUT:
+ * a) Make sure all the ports inside a larb are in one range.
+ * b) The iova of any master can NOT cross the 4G/8G/12G boundary.
+ *
+ * This is the suggested mapping in this SoC:
+ *
+ * modules    dma-address-region	larbs-ports
+ * disp         0 ~ 4G                  larb0/1/2
+ * vcodec      4G ~ 8G                  larb4/7
+ * cam/mdp     8G ~ 12G                 the other larbs.
+ * N/A         12G ~ 16G
+ * CCU0   0x24000_0000 ~ 0x243ff_ffff   larb13: port 9/10
+ * CCU1   0x24400_0000 ~ 0x247ff_ffff   larb14: port 4/5
+ */
+
+/* MM IOMMU ports */
+/* LARB 0 -- MMSYS */
+#define IOMMU_PORT_L0_DISP_POSTMASK0	MTK_M4U_ID(0, 0)
+#define IOMMU_PORT_L0_REVERSED		MTK_M4U_ID(0, 1)
+#define IOMMU_PORT_L0_OVL_RDMA0		MTK_M4U_ID(0, 2)
+#define IOMMU_PORT_L0_DISP_FAKE0	MTK_M4U_ID(0, 3)
+
+/* LARB 1 -- MMSYS */
+#define IOMMU_PORT_L1_DISP_RDMA1	MTK_M4U_ID(1, 0)
+#define IOMMU_PORT_L1_OVL_2L_RDMA0	MTK_M4U_ID(1, 1)
+#define IOMMU_PORT_L1_DISP_RDMA0	MTK_M4U_ID(1, 2)
+#define IOMMU_PORT_L1_DISP_WDMA0	MTK_M4U_ID(1, 3)
+#define IOMMU_PORT_L1_DISP_FAKE1	MTK_M4U_ID(1, 4)
+
+/* LARB 2 -- MMSYS */
+#define IOMMU_PORT_L2_MDP_RDMA0		MTK_M4U_ID(2, 0)
+#define IOMMU_PORT_L2_MDP_RDMA1		MTK_M4U_ID(2, 1)
+#define IOMMU_PORT_L2_MDP_WROT0		MTK_M4U_ID(2, 2)
+#define IOMMU_PORT_L2_MDP_WROT1		MTK_M4U_ID(2, 3)
+#define IOMMU_PORT_L2_DISP_FAKE0	MTK_M4U_ID(2, 4)
+
+/* LARB 4 -- VDEC */
+#define IOMMU_PORT_L4_HW_VDEC_MC_EXT		MTK_M4U_ID(4, 0)
+#define IOMMU_PORT_L4_HW_VDEC_UFO_EXT		MTK_M4U_ID(4, 1)
+#define IOMMU_PORT_L4_HW_VDEC_PP_EXT		MTK_M4U_ID(4, 2)
+#define IOMMU_PORT_L4_HW_VDEC_PRED_RD_EXT	MTK_M4U_ID(4, 3)
+#define IOMMU_PORT_L4_HW_VDEC_PRED_WR_EXT	MTK_M4U_ID(4, 4)
+#define IOMMU_PORT_L4_HW_VDEC_PPWRAP_EXT	MTK_M4U_ID(4, 5)
+#define IOMMU_PORT_L4_HW_VDEC_TILE_EXT		MTK_M4U_ID(4, 6)
+#define IOMMU_PORT_L4_HW_VDEC_VLD_EXT		MTK_M4U_ID(4, 7)
+#define IOMMU_PORT_L4_HW_VDEC_VLD2_EXT		MTK_M4U_ID(4, 8)
+#define IOMMU_PORT_L4_HW_VDEC_AVC_MV_EXT	MTK_M4U_ID(4, 9)
+#define IOMMU_PORT_L4_HW_VDEC_UFO_ENC_EXT	MTK_M4U_ID(4, 10)
+#define IOMMU_PORT_L4_HW_VDEC_RG_CTRL_DMA_EXT	MTK_M4U_ID(4, 11)
+#define IOMMU_PORT_L4_HW_MINI_MDP_R0_EXT	MTK_M4U_ID(4, 12)
+#define IOMMU_PORT_L4_HW_MINI_MDP_W0_EXT	MTK_M4U_ID(4, 13)
+
+/* LARB 7 -- VENC */
+#define IOMMU_PORT_L7_VENC_RCPU		MTK_M4U_ID(7, 0)
+#define IOMMU_PORT_L7_VENC_REC		MTK_M4U_ID(7, 1)
+#define IOMMU_PORT_L7_VENC_BSDMA	MTK_M4U_ID(7, 2)
+#define IOMMU_PORT_L7_VENC_SV_COMV	MTK_M4U_ID(7, 3)
+#define IOMMU_PORT_L7_VENC_RD_COMV	MTK_M4U_ID(7, 4)
+#define IOMMU_PORT_L7_VENC_CUR_LUMA	MTK_M4U_ID(7, 5)
+#define IOMMU_PORT_L7_VENC_CUR_CHROMA	MTK_M4U_ID(7, 6)
+#define IOMMU_PORT_L7_VENC_REF_LUMA	MTK_M4U_ID(7, 7)
+#define IOMMU_PORT_L7_VENC_REF_CHROMA	MTK_M4U_ID(7, 8)
+#define IOMMU_PORT_L7_JPGENC_Y_RDMA	MTK_M4U_ID(7, 9)
+#define IOMMU_PORT_L7_JPGENC_C_RDMA	MTK_M4U_ID(7, 10)
+#define IOMMU_PORT_L7_JPGENC_Q_TABLE	MTK_M4U_ID(7, 11)
+#define IOMMU_PORT_L7_JPGENC_BSDMA	MTK_M4U_ID(7, 12)
+
+/* LARB 8 -- WPE */
+#define IOMMU_PORT_L8_WPE_RDMA_0	MTK_M4U_ID(8, 0)
+#define IOMMU_PORT_L8_WPE_RDMA_1	MTK_M4U_ID(8, 1)
+#define IOMMU_PORT_L8_WPE_WDMA_0	MTK_M4U_ID(8, 2)
+
+/* LARB 9 -- IMG-1 */
+#define IOMMU_PORT_L9_IMG_IMGI_D1	MTK_M4U_ID(9, 0)
+#define IOMMU_PORT_L9_IMG_IMGBI_D1	MTK_M4U_ID(9, 1)
+#define IOMMU_PORT_L9_IMG_DMGI_D1	MTK_M4U_ID(9, 2)
+#define IOMMU_PORT_L9_IMG_DEPI_D1	MTK_M4U_ID(9, 3)
+#define IOMMU_PORT_L9_IMG_LCE_D1	MTK_M4U_ID(9, 4)
+#define IOMMU_PORT_L9_IMG_SMTI_D1	MTK_M4U_ID(9, 5)
+#define IOMMU_PORT_L9_IMG_SMTO_D2	MTK_M4U_ID(9, 6)
+#define IOMMU_PORT_L9_IMG_SMTO_D1	MTK_M4U_ID(9, 7)
+#define IOMMU_PORT_L9_IMG_CRZO_D1	MTK_M4U_ID(9, 8)
+#define IOMMU_PORT_L9_IMG_IMG3O_D1	MTK_M4U_ID(9, 9)
+#define IOMMU_PORT_L9_IMG_VIPI_D1	MTK_M4U_ID(9, 10)
+#define IOMMU_PORT_L9_IMG_SMTI_D5	MTK_M4U_ID(9, 11)
+#define IOMMU_PORT_L9_IMG_TIMGO_D1	MTK_M4U_ID(9, 12)
+#define IOMMU_PORT_L9_IMG_UFBC_W0	MTK_M4U_ID(9, 13)
+#define IOMMU_PORT_L9_IMG_UFBC_R0	MTK_M4U_ID(9, 14)
+#define IOMMU_PORT_L9_IMG_WPE_RDMA1	MTK_M4U_ID(9, 15)
+#define IOMMU_PORT_L9_IMG_WPE_RDMA0	MTK_M4U_ID(9, 16)
+#define IOMMU_PORT_L9_IMG_WPE_WDMA	MTK_M4U_ID(9, 17)
+#define IOMMU_PORT_L9_IMG_MFB_RDMA0	MTK_M4U_ID(9, 18)
+#define IOMMU_PORT_L9_IMG_MFB_RDMA1	MTK_M4U_ID(9, 19)
+#define IOMMU_PORT_L9_IMG_MFB_RDMA2	MTK_M4U_ID(9, 20)
+#define IOMMU_PORT_L9_IMG_MFB_RDMA3	MTK_M4U_ID(9, 21)
+#define IOMMU_PORT_L9_IMG_MFB_RDMA4	MTK_M4U_ID(9, 22)
+#define IOMMU_PORT_L9_IMG_MFB_RDMA5	MTK_M4U_ID(9, 23)
+#define IOMMU_PORT_L9_IMG_MFB_WDMA0	MTK_M4U_ID(9, 24)
+#define IOMMU_PORT_L9_IMG_MFB_WDMA1	MTK_M4U_ID(9, 25)
+#define IOMMU_PORT_L9_IMG_RESERVE6	MTK_M4U_ID(9, 26)
+#define IOMMU_PORT_L9_IMG_RESERVE7	MTK_M4U_ID(9, 27)
+#define IOMMU_PORT_L9_IMG_RESERVE8	MTK_M4U_ID(9, 28)
+
+/* LARB 11 -- IMG-2 */
+#define IOMMU_PORT_L11_IMG_IMGI_D1	MTK_M4U_ID(11, 0)
+#define IOMMU_PORT_L11_IMG_IMGBI_D1	MTK_M4U_ID(11, 1)
+#define IOMMU_PORT_L11_IMG_DMGI_D1	MTK_M4U_ID(11, 2)
+#define IOMMU_PORT_L11_IMG_DEPI_D1	MTK_M4U_ID(11, 3)
+#define IOMMU_PORT_L11_IMG_LCE_D1	MTK_M4U_ID(11, 4)
+#define IOMMU_PORT_L11_IMG_SMTI_D1	MTK_M4U_ID(11, 5)
+#define IOMMU_PORT_L11_IMG_SMTO_D2	MTK_M4U_ID(11, 6)
+#define IOMMU_PORT_L11_IMG_SMTO_D1	MTK_M4U_ID(11, 7)
+#define IOMMU_PORT_L11_IMG_CRZO_D1	MTK_M4U_ID(11, 8)
+#define IOMMU_PORT_L11_IMG_IMG3O_D1	MTK_M4U_ID(11, 9)
+#define IOMMU_PORT_L11_IMG_VIPI_D1	MTK_M4U_ID(11, 10)
+#define IOMMU_PORT_L11_IMG_SMTI_D5	MTK_M4U_ID(11, 11)
+#define IOMMU_PORT_L11_IMG_TIMGO_D1	MTK_M4U_ID(11, 12)
+#define IOMMU_PORT_L11_IMG_UFBC_W0	MTK_M4U_ID(11, 13)
+#define IOMMU_PORT_L11_IMG_UFBC_R0	MTK_M4U_ID(11, 14)
+#define IOMMU_PORT_L11_IMG_WPE_RDMA1	MTK_M4U_ID(11, 15)
+#define IOMMU_PORT_L11_IMG_WPE_RDMA0	MTK_M4U_ID(11, 16)
+#define IOMMU_PORT_L11_IMG_WPE_WDMA	MTK_M4U_ID(11, 17)
+#define IOMMU_PORT_L11_IMG_MFB_RDMA0	MTK_M4U_ID(11, 18)
+#define IOMMU_PORT_L11_IMG_MFB_RDMA1	MTK_M4U_ID(11, 19)
+#define IOMMU_PORT_L11_IMG_MFB_RDMA2	MTK_M4U_ID(11, 20)
+#define IOMMU_PORT_L11_IMG_MFB_RDMA3	MTK_M4U_ID(11, 21)
+#define IOMMU_PORT_L11_IMG_MFB_RDMA4	MTK_M4U_ID(11, 22)
+#define IOMMU_PORT_L11_IMG_MFB_RDMA5	MTK_M4U_ID(11, 23)
+#define IOMMU_PORT_L11_IMG_MFB_WDMA0	MTK_M4U_ID(11, 24)
+#define IOMMU_PORT_L11_IMG_MFB_WDMA1	MTK_M4U_ID(11, 25)
+#define IOMMU_PORT_L11_IMG_RESERVE6	MTK_M4U_ID(11, 26)
+#define IOMMU_PORT_L11_IMG_RESERVE7	MTK_M4U_ID(11, 27)
+#define IOMMU_PORT_L11_IMG_RESERVE8	MTK_M4U_ID(11, 28)
+
+/* LARB 13 -- CAM */
+#define IOMMU_PORT_L13_CAM_MRAWI	MTK_M4U_ID(13, 0)
+#define IOMMU_PORT_L13_CAM_MRAWO_0	MTK_M4U_ID(13, 1)
+#define IOMMU_PORT_L13_CAM_MRAWO_1	MTK_M4U_ID(13, 2)
+#define IOMMU_PORT_L13_CAM_CAMSV_4	MTK_M4U_ID(13, 6)
+#define IOMMU_PORT_L13_CAM_CAMSV_5	MTK_M4U_ID(13, 7)
+#define IOMMU_PORT_L13_CAM_CAMSV_6	MTK_M4U_ID(13, 8)
+#define IOMMU_PORT_L13_CAM_CCUI		MTK_M4U_ID(13, 9)
+#define IOMMU_PORT_L13_CAM_CCUO		MTK_M4U_ID(13, 10)
+#define IOMMU_PORT_L13_CAM_FAKE		MTK_M4U_ID(13, 11)
+
+/* LARB 14 -- CAM */
+#define IOMMU_PORT_L14_CAM_CCUI		MTK_M4U_ID(14, 4)
+#define IOMMU_PORT_L14_CAM_CCUO		MTK_M4U_ID(14, 5)
+
+/* LARB 16 -- RAW-A */
+#define IOMMU_PORT_L16_CAM_IMGO_R1_A	MTK_M4U_ID(16, 0)
+#define IOMMU_PORT_L16_CAM_RRZO_R1_A	MTK_M4U_ID(16, 1)
+#define IOMMU_PORT_L16_CAM_CQI_R1_A	MTK_M4U_ID(16, 2)
+#define IOMMU_PORT_L16_CAM_BPCI_R1_A	MTK_M4U_ID(16, 3)
+#define IOMMU_PORT_L16_CAM_YUVO_R1_A	MTK_M4U_ID(16, 4)
+#define IOMMU_PORT_L16_CAM_UFDI_R2_A	MTK_M4U_ID(16, 5)
+#define IOMMU_PORT_L16_CAM_RAWI_R2_A	MTK_M4U_ID(16, 6)
+#define IOMMU_PORT_L16_CAM_RAWI_R3_A	MTK_M4U_ID(16, 7)
+#define IOMMU_PORT_L16_CAM_AAO_R1_A	MTK_M4U_ID(16, 8)
+#define IOMMU_PORT_L16_CAM_AFO_R1_A	MTK_M4U_ID(16, 9)
+#define IOMMU_PORT_L16_CAM_FLKO_R1_A	MTK_M4U_ID(16, 10)
+#define IOMMU_PORT_L16_CAM_LCESO_R1_A	MTK_M4U_ID(16, 11)
+#define IOMMU_PORT_L16_CAM_CRZO_R1_A	MTK_M4U_ID(16, 12)
+#define IOMMU_PORT_L16_CAM_LTMSO_R1_A	MTK_M4U_ID(16, 13)
+#define IOMMU_PORT_L16_CAM_RSSO_R1_A	MTK_M4U_ID(16, 14)
+#define IOMMU_PORT_L16_CAM_AAHO_R1_A	MTK_M4U_ID(16, 15)
+#define IOMMU_PORT_L16_CAM_LSCI_R1_A	MTK_M4U_ID(16, 16)
+
+/* LARB 17 -- RAW-B */
+#define IOMMU_PORT_L17_CAM_IMGO_R1_B	MTK_M4U_ID(17, 0)
+#define IOMMU_PORT_L17_CAM_RRZO_R1_B	MTK_M4U_ID(17, 1)
+#define IOMMU_PORT_L17_CAM_CQI_R1_B	MTK_M4U_ID(17, 2)
+#define IOMMU_PORT_L17_CAM_BPCI_R1_B	MTK_M4U_ID(17, 3)
+#define IOMMU_PORT_L17_CAM_YUVO_R1_B	MTK_M4U_ID(17, 4)
+#define IOMMU_PORT_L17_CAM_UFDI_R2_B	MTK_M4U_ID(17, 5)
+#define IOMMU_PORT_L17_CAM_RAWI_R2_B	MTK_M4U_ID(17, 6)
+#define IOMMU_PORT_L17_CAM_RAWI_R3_B	MTK_M4U_ID(17, 7)
+#define IOMMU_PORT_L17_CAM_AAO_R1_B	MTK_M4U_ID(17, 8)
+#define IOMMU_PORT_L17_CAM_AFO_R1_B	MTK_M4U_ID(17, 9)
+#define IOMMU_PORT_L17_CAM_FLKO_R1_B	MTK_M4U_ID(17, 10)
+#define IOMMU_PORT_L17_CAM_LCESO_R1_B	MTK_M4U_ID(17, 11)
+#define IOMMU_PORT_L17_CAM_CRZO_R1_B	MTK_M4U_ID(17, 12)
+#define IOMMU_PORT_L17_CAM_LTMSO_R1_B	MTK_M4U_ID(17, 13)
+#define IOMMU_PORT_L17_CAM_RSSO_R1_B	MTK_M4U_ID(17, 14)
+#define IOMMU_PORT_L17_CAM_AAHO_R1_B	MTK_M4U_ID(17, 15)
+#define IOMMU_PORT_L17_CAM_LSCI_R1_B	MTK_M4U_ID(17, 16)
+
+/* LARB 19 -- IPE */
+#define IOMMU_PORT_L19_IPE_DVS_RDMA	MTK_M4U_ID(19, 0)
+#define IOMMU_PORT_L19_IPE_DVS_WDMA	MTK_M4U_ID(19, 1)
+#define IOMMU_PORT_L19_IPE_DVP_RDMA	MTK_M4U_ID(19, 2)
+#define IOMMU_PORT_L19_IPE_DVP_WDMA	MTK_M4U_ID(19, 3)
+
+/* LARB 20 -- IPE */
+#define IOMMU_PORT_L20_IPE_FDVT_RDA	MTK_M4U_ID(20, 0)
+#define IOMMU_PORT_L20_IPE_FDVT_RDB	MTK_M4U_ID(20, 1)
+#define IOMMU_PORT_L20_IPE_FDVT_WRA	MTK_M4U_ID(20, 2)
+#define IOMMU_PORT_L20_IPE_FDVT_WRB	MTK_M4U_ID(20, 3)
+#define IOMMU_PORT_L20_IPE_RSC_RDMA0	MTK_M4U_ID(20, 4)
+#define IOMMU_PORT_L20_IPE_RSC_WDMA	MTK_M4U_ID(20, 5)
+
+#endif
-- 
cgit v1.2.3-71-gd317


From b2d057560b8107c633b39aabe517ff9d93f285e3 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Mon, 2 May 2022 10:46:08 +0200
Subject: secure_seq: use the 64 bits of the siphash for port offset
 calculation

SipHash replaced MD5 in secure_ipv{4,6}_port_ephemeral() via commit
7cd23e5300c1 ("secure_seq: use SipHash in place of MD5"), but the output
remained truncated to 32-bit only. In order to exploit more bits from the
hash, let's make the functions return the full 64-bit of siphash_3u32().
We also make sure the port offset calculation in __inet_hash_connect()
remains done on 32-bit to avoid the need for div_u64_rem() and an extra
cost on 32-bit systems.

Cc: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Moshe Kol <moshe.kol@mail.huji.ac.il>
Cc: Yossi Gilad <yossi.gilad@mail.huji.ac.il>
Cc: Amit Klein <aksecurity@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/inet_hashtables.h |  2 +-
 include/net/secure_seq.h      |  4 ++--
 net/core/secure_seq.c         |  4 ++--
 net/ipv4/inet_hashtables.c    | 10 ++++++----
 net/ipv6/inet6_hashtables.c   |  4 ++--
 5 files changed, 13 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index f72ec113ae56..98e1ec1a14f0 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -425,7 +425,7 @@ static inline void sk_rcv_saddr_set(struct sock *sk, __be32 addr)
 }
 
 int __inet_hash_connect(struct inet_timewait_death_row *death_row,
-			struct sock *sk, u32 port_offset,
+			struct sock *sk, u64 port_offset,
 			int (*check_established)(struct inet_timewait_death_row *,
 						 struct sock *, __u16,
 						 struct inet_timewait_sock **));
diff --git a/include/net/secure_seq.h b/include/net/secure_seq.h
index d7d2495f83c2..dac91aa38c5a 100644
--- a/include/net/secure_seq.h
+++ b/include/net/secure_seq.h
@@ -4,8 +4,8 @@
 
 #include <linux/types.h>
 
-u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport);
-u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
+u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport);
+u64 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
 			       __be16 dport);
 u32 secure_tcp_seq(__be32 saddr, __be32 daddr,
 		   __be16 sport, __be16 dport);
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index 9b8443774449..55aa5cc258e3 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -94,7 +94,7 @@ u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr,
 }
 EXPORT_SYMBOL(secure_tcpv6_seq);
 
-u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
+u64 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
 			       __be16 dport)
 {
 	const struct {
@@ -142,7 +142,7 @@ u32 secure_tcp_seq(__be32 saddr, __be32 daddr,
 }
 EXPORT_SYMBOL_GPL(secure_tcp_seq);
 
-u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
+u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
 {
 	net_secret_init();
 	return siphash_3u32((__force u32)saddr, (__force u32)daddr,
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 17440840a791..9d24d9319f3d 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -504,7 +504,7 @@ not_unique:
 	return -EADDRNOTAVAIL;
 }
 
-static u32 inet_sk_port_offset(const struct sock *sk)
+static u64 inet_sk_port_offset(const struct sock *sk)
 {
 	const struct inet_sock *inet = inet_sk(sk);
 
@@ -734,7 +734,7 @@ EXPORT_SYMBOL_GPL(inet_unhash);
 static u32 table_perturb[1 << INET_TABLE_PERTURB_SHIFT];
 
 int __inet_hash_connect(struct inet_timewait_death_row *death_row,
-		struct sock *sk, u32 port_offset,
+		struct sock *sk, u64 port_offset,
 		int (*check_established)(struct inet_timewait_death_row *,
 			struct sock *, __u16, struct inet_timewait_sock **))
 {
@@ -777,7 +777,9 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 	net_get_random_once(table_perturb, sizeof(table_perturb));
 	index = hash_32(port_offset, INET_TABLE_PERTURB_SHIFT);
 
-	offset = (READ_ONCE(table_perturb[index]) + port_offset) % remaining;
+	offset = READ_ONCE(table_perturb[index]) + port_offset;
+	offset %= remaining;
+
 	/* In first pass we try ports of @low parity.
 	 * inet_csk_get_port() does the opposite choice.
 	 */
@@ -859,7 +861,7 @@ ok:
 int inet_hash_connect(struct inet_timewait_death_row *death_row,
 		      struct sock *sk)
 {
-	u32 port_offset = 0;
+	u64 port_offset = 0;
 
 	if (!inet_sk(sk)->inet_num)
 		port_offset = inet_sk_port_offset(sk);
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 4740afecf7c6..32ccac10bd62 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -308,7 +308,7 @@ not_unique:
 	return -EADDRNOTAVAIL;
 }
 
-static u32 inet6_sk_port_offset(const struct sock *sk)
+static u64 inet6_sk_port_offset(const struct sock *sk)
 {
 	const struct inet_sock *inet = inet_sk(sk);
 
@@ -320,7 +320,7 @@ static u32 inet6_sk_port_offset(const struct sock *sk)
 int inet6_hash_connect(struct inet_timewait_death_row *death_row,
 		       struct sock *sk)
 {
-	u32 port_offset = 0;
+	u64 port_offset = 0;
 
 	if (!inet_sk(sk)->inet_num)
 		port_offset = inet6_sk_port_offset(sk);
-- 
cgit v1.2.3-71-gd317


From 170f37d6aa6ad4582eefd7459015de79e244536e Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 3 May 2022 00:09:31 -0400
Subject: block: Do not call folio_next() on an unreferenced folio

It is unsafe to call folio_next() on a folio unless you hold a reference
on it that prevents it from being split or freed.  After returning
from the iterator, iomap calls folio_end_writeback() which may drop
the last reference to the page, or allow the page to be split.  If that
happens, the iterator will not advance far enough through the bio_vec,
leading to assertion failures like the BUG() in folio_end_writeback()
that checks we're not trying to end writeback on a page not currently
under writeback.  Other assertion failures were also seen, but they're
all explained by this one bug.

Fix the bug by remembering where the next folio starts before returning
from the iterator.  There are other ways of fixing this bug, but this
seems the simplest.

Reported-by: Darrick J. Wong <djwong@kernel.org>
Tested-by: Darrick J. Wong <djwong@kernel.org>
Reported-by: Brian Foster <bfoster@redhat.com>
Tested-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 include/linux/bio.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 278cc81cc1e7..00450fd86bb4 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -269,6 +269,7 @@ struct folio_iter {
 	size_t offset;
 	size_t length;
 	/* private: for use by the iterator */
+	struct folio *_next;
 	size_t _seg_count;
 	int _i;
 };
@@ -283,6 +284,7 @@ static inline void bio_first_folio(struct folio_iter *fi, struct bio *bio,
 			PAGE_SIZE * (bvec->bv_page - &fi->folio->page);
 	fi->_seg_count = bvec->bv_len;
 	fi->length = min(folio_size(fi->folio) - fi->offset, fi->_seg_count);
+	fi->_next = folio_next(fi->folio);
 	fi->_i = i;
 }
 
@@ -290,9 +292,10 @@ static inline void bio_next_folio(struct folio_iter *fi, struct bio *bio)
 {
 	fi->_seg_count -= fi->length;
 	if (fi->_seg_count) {
-		fi->folio = folio_next(fi->folio);
+		fi->folio = fi->_next;
 		fi->offset = 0;
 		fi->length = min(folio_size(fi->folio), fi->_seg_count);
+		fi->_next = folio_next(fi->folio);
 	} else if (fi->_i + 1 < bio->bi_vcnt) {
 		bio_first_folio(fi, bio, fi->_i + 1);
 	} else {
-- 
cgit v1.2.3-71-gd317


From 85db6352fc8a158a893151baa1716463d34a20d0 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@nvidia.com>
Date: Wed, 4 May 2022 11:09:14 +0300
Subject: net: Fix features skip in for_each_netdev_feature()

The find_next_netdev_feature() macro gets the "remaining length",
not bit index.
Passing "bit - 1" for the following iteration is wrong as it skips
the adjacent bit. Pass "bit" instead.

Fixes: 3b89ea9c5902 ("net: Fix for_each_netdev_feature on Big endian")
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Gal Pressman <gal@nvidia.com>
Link: https://lore.kernel.org/r/20220504080914.1918-1-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdev_features.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 2c6b9e416225..7c2d77d75a88 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -169,7 +169,7 @@ enum {
 #define NETIF_F_HW_HSR_FWD	__NETIF_F(HW_HSR_FWD)
 #define NETIF_F_HW_HSR_DUP	__NETIF_F(HW_HSR_DUP)
 
-/* Finds the next feature with the highest number of the range of start till 0.
+/* Finds the next feature with the highest number of the range of start-1 till 0.
  */
 static inline int find_next_netdev_feature(u64 feature, unsigned long start)
 {
@@ -188,7 +188,7 @@ static inline int find_next_netdev_feature(u64 feature, unsigned long start)
 	for ((bit) = find_next_netdev_feature((mask_addr),		\
 					      NETDEV_FEATURE_COUNT);	\
 	     (bit) >= 0;						\
-	     (bit) = find_next_netdev_feature((mask_addr), (bit) - 1))
+	     (bit) = find_next_netdev_feature((mask_addr), (bit)))
 
 /* Features valid for ethtool to change */
 /* = all defined minus driver/device-class-related */
-- 
cgit v1.2.3-71-gd317


From e1846cff2fe614d93a2f89461b5935678fd34bd9 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Thu, 5 May 2022 02:54:59 +0300
Subject: net: mscc: ocelot: mark traps with a bool instead of keeping them in
 a list

Since the blamed commit, VCAP filters can appear on more than one list.
If their action is "trap", they are chained on ocelot->traps via
filter->trap_list. This is in addition to their normal placement on the
VCAP block->rules list head.

Therefore, when we free a VCAP filter, we must remove it from all lists
it is a member of, including ocelot->traps.

There are at least 2 bugs which are direct consequences of this design
decision.

First is the incorrect usage of list_empty(), meant to denote whether
"filter" is chained into ocelot->traps via filter->trap_list.
This does not do the correct thing, because list_empty() checks whether
"head->next == head", but in our case, head->next == head->prev == NULL.
So we dereference NULL pointers and die when we call list_del().

Second is the fact that not all places that should remove the filter
from ocelot->traps do so. One example is ocelot_vcap_block_remove_filter(),
which is where we have the main kfree(filter). By keeping freed filters
in ocelot->traps we end up in a use-after-free in
felix_update_trapping_destinations().

Attempting to fix all the buggy patterns is a whack-a-mole game which
makes the driver unmaintainable. Actually this is what the previous
patch version attempted to do:
https://patchwork.kernel.org/project/netdevbpf/patch/20220503115728.834457-3-vladimir.oltean@nxp.com/

but it introduced another set of bugs, because there are other places in
which create VCAP filters, not just ocelot_vcap_filter_create():

- ocelot_trap_add()
- felix_tag_8021q_vlan_add_rx()
- felix_tag_8021q_vlan_add_tx()

Relying on the convention that all those code paths must call
INIT_LIST_HEAD(&filter->trap_list) is not going to scale.

So let's do what should have been done in the first place and keep a
bool in struct ocelot_vcap_filter which denotes whether we are looking
at a trapping rule or not. Iterating now happens over the main VCAP IS2
block->rules. The advantage is that we no longer risk having stale
references to a freed filter, since it is only present in that list.

Fixes: e42bd4ed09aa ("net: mscc: ocelot: keep traps in a list")
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/dsa/ocelot/felix.c            |  7 ++++++-
 drivers/net/ethernet/mscc/ocelot.c        | 11 +++--------
 drivers/net/ethernet/mscc/ocelot_flower.c |  4 +---
 include/soc/mscc/ocelot_vcap.h            |  2 +-
 4 files changed, 11 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c
index 9e28219b223d..faccfb3f0158 100644
--- a/drivers/net/dsa/ocelot/felix.c
+++ b/drivers/net/dsa/ocelot/felix.c
@@ -403,6 +403,7 @@ static int felix_update_trapping_destinations(struct dsa_switch *ds,
 {
 	struct ocelot *ocelot = ds->priv;
 	struct felix *felix = ocelot_to_felix(ocelot);
+	struct ocelot_vcap_block *block_vcap_is2;
 	struct ocelot_vcap_filter *trap;
 	enum ocelot_mask_mode mask_mode;
 	unsigned long port_mask;
@@ -422,9 +423,13 @@ static int felix_update_trapping_destinations(struct dsa_switch *ds,
 	/* We are sure that "cpu" was found, otherwise
 	 * dsa_tree_setup_default_cpu() would have failed earlier.
 	 */
+	block_vcap_is2 = &ocelot->block[VCAP_IS2];
 
 	/* Make sure all traps are set up for that destination */
-	list_for_each_entry(trap, &ocelot->traps, trap_list) {
+	list_for_each_entry(trap, &block_vcap_is2->rules, list) {
+		if (!trap->is_trap)
+			continue;
+
 		/* Figure out the current trapping destination */
 		if (using_tag_8021q) {
 			/* Redirect to the tag_8021q CPU port. If timestamps
diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index ca71b62a44dc..20ceac81a2c2 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -1622,7 +1622,7 @@ int ocelot_trap_add(struct ocelot *ocelot, int port,
 		trap->action.mask_mode = OCELOT_MASK_MODE_PERMIT_DENY;
 		trap->action.port_mask = 0;
 		trap->take_ts = take_ts;
-		list_add_tail(&trap->trap_list, &ocelot->traps);
+		trap->is_trap = true;
 		new = true;
 	}
 
@@ -1634,10 +1634,8 @@ int ocelot_trap_add(struct ocelot *ocelot, int port,
 		err = ocelot_vcap_filter_replace(ocelot, trap);
 	if (err) {
 		trap->ingress_port_mask &= ~BIT(port);
-		if (!trap->ingress_port_mask) {
-			list_del(&trap->trap_list);
+		if (!trap->ingress_port_mask)
 			kfree(trap);
-		}
 		return err;
 	}
 
@@ -1657,11 +1655,8 @@ int ocelot_trap_del(struct ocelot *ocelot, int port, unsigned long cookie)
 		return 0;
 
 	trap->ingress_port_mask &= ~BIT(port);
-	if (!trap->ingress_port_mask) {
-		list_del(&trap->trap_list);
-
+	if (!trap->ingress_port_mask)
 		return ocelot_vcap_filter_del(ocelot, trap);
-	}
 
 	return ocelot_vcap_filter_replace(ocelot, trap);
 }
diff --git a/drivers/net/ethernet/mscc/ocelot_flower.c b/drivers/net/ethernet/mscc/ocelot_flower.c
index 03b5e59d033e..a9b26b3002be 100644
--- a/drivers/net/ethernet/mscc/ocelot_flower.c
+++ b/drivers/net/ethernet/mscc/ocelot_flower.c
@@ -295,7 +295,7 @@ static int ocelot_flower_parse_action(struct ocelot *ocelot, int port,
 			filter->action.cpu_copy_ena = true;
 			filter->action.cpu_qu_num = 0;
 			filter->type = OCELOT_VCAP_FILTER_OFFLOAD;
-			list_add_tail(&filter->trap_list, &ocelot->traps);
+			filter->is_trap = true;
 			break;
 		case FLOW_ACTION_POLICE:
 			if (filter->block_id == PSFP_BLOCK_ID) {
@@ -878,8 +878,6 @@ int ocelot_cls_flower_replace(struct ocelot *ocelot, int port,
 
 	ret = ocelot_flower_parse(ocelot, port, ingress, f, filter);
 	if (ret) {
-		if (!list_empty(&filter->trap_list))
-			list_del(&filter->trap_list);
 		kfree(filter);
 		return ret;
 	}
diff --git a/include/soc/mscc/ocelot_vcap.h b/include/soc/mscc/ocelot_vcap.h
index 7b2bf9b1fe69..de26c992f821 100644
--- a/include/soc/mscc/ocelot_vcap.h
+++ b/include/soc/mscc/ocelot_vcap.h
@@ -681,7 +681,6 @@ struct ocelot_vcap_id {
 
 struct ocelot_vcap_filter {
 	struct list_head list;
-	struct list_head trap_list;
 
 	enum ocelot_vcap_filter_type type;
 	int block_id;
@@ -695,6 +694,7 @@ struct ocelot_vcap_filter {
 	struct ocelot_vcap_stats stats;
 	/* For VCAP IS1 and IS2 */
 	bool take_ts;
+	bool is_trap;
 	unsigned long ingress_port_mask;
 	/* For VCAP ES0 */
 	struct ocelot_vcap_port ingress_port;
-- 
cgit v1.2.3-71-gd317


From 3d1b0d351441c2be7de082b92f43d0bfdc95a8f3 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 7 May 2022 13:48:21 -0400
Subject: Revert "SUNRPC: Ensure gss-proxy connects on setup"

This reverts commit 892de36fd4a98fab3298d417c051d9099af5448d.

The gssproxy server is unresponsive when it calls into the kernel to
start the upcall service, so it will not reply to our RPC ping at all.

Reported-by: "J.Bruce Fields" <bfields@fieldses.org>
Fixes: 892de36fd4a9 ("SUNRPC: Ensure gss-proxy connects on setup")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/clnt.h          | 1 -
 net/sunrpc/auth_gss/gss_rpc_upcall.c | 2 +-
 net/sunrpc/clnt.c                    | 3 ---
 3 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index db5149567305..267b7aeaf1a6 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -160,7 +160,6 @@ struct rpc_add_xprt_test {
 #define RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT	(1UL << 9)
 #define RPC_CLNT_CREATE_SOFTERR		(1UL << 10)
 #define RPC_CLNT_CREATE_REUSEPORT	(1UL << 11)
-#define RPC_CLNT_CREATE_IGNORE_NULL_UNAVAIL (1UL << 12)
 
 struct rpc_clnt *rpc_create(struct rpc_create_args *args);
 struct rpc_clnt	*rpc_bind_new_program(struct rpc_clnt *,
diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c
index 8ca1d809b78d..61c276bddaf2 100644
--- a/net/sunrpc/auth_gss/gss_rpc_upcall.c
+++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c
@@ -97,7 +97,7 @@ static int gssp_rpc_create(struct net *net, struct rpc_clnt **_clnt)
 		 * timeout, which would result in reconnections being
 		 * done without the correct namespace:
 		 */
-		.flags		= RPC_CLNT_CREATE_IGNORE_NULL_UNAVAIL |
+		.flags		= RPC_CLNT_CREATE_NOPING |
 				  RPC_CLNT_CREATE_NO_IDLE_TIMEOUT
 	};
 	struct rpc_clnt *clnt;
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 22c28cf43eba..98133aa54f19 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -479,9 +479,6 @@ static struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args,
 
 	if (!(args->flags & RPC_CLNT_CREATE_NOPING)) {
 		int err = rpc_ping(clnt);
-		if ((args->flags & RPC_CLNT_CREATE_IGNORE_NULL_UNAVAIL) &&
-		    err == -EOPNOTSUPP)
-			err = 0;
 		if (err != 0) {
 			rpc_shutdown_client(clnt);
 			return ERR_PTR(err);
-- 
cgit v1.2.3-71-gd317


From fd13359f54ee854f00134abc6be32da94ec53dbf Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 7 May 2022 13:53:59 -0400
Subject: SUNRPC: Ensure that the gssproxy client can start in a connected
 state

Ensure that the gssproxy client connects to the server from the gssproxy
daemon process context so that the AF_LOCAL socket connection is done
using the correct path and namespaces.

Fixes: 1d658336b05f ("SUNRPC: Add RPC based upcall mechanism for RPCGSS auth")
Cc: stable@vger.kernel.org
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/clnt.h          |  1 +
 net/sunrpc/auth_gss/gss_rpc_upcall.c |  1 +
 net/sunrpc/clnt.c                    | 33 +++++++++++++++++++++++++++++++++
 3 files changed, 35 insertions(+)

(limited to 'include')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 267b7aeaf1a6..90501404fa49 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -160,6 +160,7 @@ struct rpc_add_xprt_test {
 #define RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT	(1UL << 9)
 #define RPC_CLNT_CREATE_SOFTERR		(1UL << 10)
 #define RPC_CLNT_CREATE_REUSEPORT	(1UL << 11)
+#define RPC_CLNT_CREATE_CONNECTED	(1UL << 12)
 
 struct rpc_clnt *rpc_create(struct rpc_create_args *args);
 struct rpc_clnt	*rpc_bind_new_program(struct rpc_clnt *,
diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c
index 61c276bddaf2..f549e4c05def 100644
--- a/net/sunrpc/auth_gss/gss_rpc_upcall.c
+++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c
@@ -98,6 +98,7 @@ static int gssp_rpc_create(struct net *net, struct rpc_clnt **_clnt)
 		 * done without the correct namespace:
 		 */
 		.flags		= RPC_CLNT_CREATE_NOPING |
+				  RPC_CLNT_CREATE_CONNECTED |
 				  RPC_CLNT_CREATE_NO_IDLE_TIMEOUT
 	};
 	struct rpc_clnt *clnt;
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 98133aa54f19..e2c6eca0271b 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -76,6 +76,7 @@ static int	rpc_encode_header(struct rpc_task *task,
 static int	rpc_decode_header(struct rpc_task *task,
 				  struct xdr_stream *xdr);
 static int	rpc_ping(struct rpc_clnt *clnt);
+static int	rpc_ping_noreply(struct rpc_clnt *clnt);
 static void	rpc_check_timeout(struct rpc_task *task);
 
 static void rpc_register_client(struct rpc_clnt *clnt)
@@ -483,6 +484,12 @@ static struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args,
 			rpc_shutdown_client(clnt);
 			return ERR_PTR(err);
 		}
+	} else if (args->flags & RPC_CLNT_CREATE_CONNECTED) {
+		int err = rpc_ping_noreply(clnt);
+		if (err != 0) {
+			rpc_shutdown_client(clnt);
+			return ERR_PTR(err);
+		}
 	}
 
 	clnt->cl_softrtry = 1;
@@ -2709,6 +2716,10 @@ static const struct rpc_procinfo rpcproc_null = {
 	.p_decode = rpcproc_decode_null,
 };
 
+static const struct rpc_procinfo rpcproc_null_noreply = {
+	.p_encode = rpcproc_encode_null,
+};
+
 static void
 rpc_null_call_prepare(struct rpc_task *task, void *data)
 {
@@ -2762,6 +2773,28 @@ static int rpc_ping(struct rpc_clnt *clnt)
 	return status;
 }
 
+static int rpc_ping_noreply(struct rpc_clnt *clnt)
+{
+	struct rpc_message msg = {
+		.rpc_proc = &rpcproc_null_noreply,
+	};
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = clnt,
+		.rpc_message = &msg,
+		.callback_ops = &rpc_null_ops,
+		.flags = RPC_TASK_SOFT | RPC_TASK_SOFTCONN | RPC_TASK_NULLCREDS,
+	};
+	struct rpc_task	*task;
+	int status;
+
+	task = rpc_run_task(&task_setup_data);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	status = task->tk_status;
+	rpc_put_task(task);
+	return status;
+}
+
 struct rpc_cb_add_xprt_calldata {
 	struct rpc_xprt_switch *xps;
 	struct rpc_xprt *xprt;
-- 
cgit v1.2.3-71-gd317


From 2e3afb42dd480d755cd7d97ca04586a5616c1a5e Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Sun, 8 May 2022 11:37:09 +0200
Subject: blk-mq: remove the error_count from struct request

The last two users were floppy.c and ataflop.c respectively, it was
verified that no other drivers makes use of this, so let's remove it.

Suggested-by: Linus Torvalds <torvalds@linuxfoundation.org>
Cc: Minh Yuan <yuanmingbuaa@gmail.com>
Cc: Denis Efremov <efremov@linux.com>,
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/blk-mq.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 7aa5c54901a9..9f07061418db 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -163,7 +163,6 @@ struct request {
 		struct rb_node rb_node;	/* sort/lookup */
 		struct bio_vec special_vec;
 		void *completion_data;
-		int error_count; /* for legacy drivers, don't use */
 	};
 
 
-- 
cgit v1.2.3-71-gd317


From a36e07dfe6ee71e209383ea9288cd8d1617e14f9 Mon Sep 17 00:00:00 2001
From: Gleb Fotengauer-Malinovskiy <glebfm@altlinux.org>
Date: Fri, 6 May 2022 17:24:54 +0000
Subject: rfkill: uapi: fix RFKILL_IOCTL_MAX_SIZE ioctl request definition

The definition of RFKILL_IOCTL_MAX_SIZE introduced by commit
54f586a91532 ("rfkill: make new event layout opt-in") is unusable
since it is based on RFKILL_IOC_EXT_SIZE which has not been defined.
Fix that by replacing the undefined constant with the constant which
is intended to be used in this definition.

Fixes: 54f586a91532 ("rfkill: make new event layout opt-in")
Cc: stable@vger.kernel.org # 5.11+
Signed-off-by: Gleb Fotengauer-Malinovskiy <glebfm@altlinux.org>
Signed-off-by: Dmitry V. Levin <ldv@altlinux.org>
Link: https://lore.kernel.org/r/20220506172454.120319-1-glebfm@altlinux.org
[add commit message provided later by Dmitry]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/rfkill.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/rfkill.h b/include/uapi/linux/rfkill.h
index 283c5a7b3f2c..db6c8588c1d0 100644
--- a/include/uapi/linux/rfkill.h
+++ b/include/uapi/linux/rfkill.h
@@ -184,7 +184,7 @@ struct rfkill_event_ext {
 #define RFKILL_IOC_NOINPUT	1
 #define RFKILL_IOCTL_NOINPUT	_IO(RFKILL_IOC_MAGIC, RFKILL_IOC_NOINPUT)
 #define RFKILL_IOC_MAX_SIZE	2
-#define RFKILL_IOCTL_MAX_SIZE	_IOW(RFKILL_IOC_MAGIC, RFKILL_IOC_EXT_SIZE, __u32)
+#define RFKILL_IOCTL_MAX_SIZE	_IOW(RFKILL_IOC_MAGIC, RFKILL_IOC_MAX_SIZE, __u32)
 
 /* and that's all userspace gets */
 
-- 
cgit v1.2.3-71-gd317


From 7ff960a6fe399fdcbca6159063684671ae57eee9 Mon Sep 17 00:00:00 2001
From: Shunsuke Mie <mie@igel.co.jp>
Date: Tue, 10 May 2022 19:27:23 +0900
Subject: virtio: fix virtio transitional ids

This commit fixes the transitional PCI device ID.

Fixes: d61914ea6ada ("virtio: update virtio id table, add transitional ids")
Signed-off-by: Shunsuke Mie <mie@igel.co.jp>
Link: https://lore.kernel.org/r/20220510102723.87666-1-mie@igel.co.jp
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/uapi/linux/virtio_ids.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h
index 80d76b75bccd..7aa2eb766205 100644
--- a/include/uapi/linux/virtio_ids.h
+++ b/include/uapi/linux/virtio_ids.h
@@ -73,12 +73,12 @@
  * Virtio Transitional IDs
  */
 
-#define VIRTIO_TRANS_ID_NET		1000 /* transitional virtio net */
-#define VIRTIO_TRANS_ID_BLOCK		1001 /* transitional virtio block */
-#define VIRTIO_TRANS_ID_BALLOON		1002 /* transitional virtio balloon */
-#define VIRTIO_TRANS_ID_CONSOLE		1003 /* transitional virtio console */
-#define VIRTIO_TRANS_ID_SCSI		1004 /* transitional virtio SCSI */
-#define VIRTIO_TRANS_ID_RNG		1005 /* transitional virtio rng */
-#define VIRTIO_TRANS_ID_9P		1009 /* transitional virtio 9p console */
+#define VIRTIO_TRANS_ID_NET		0x1000 /* transitional virtio net */
+#define VIRTIO_TRANS_ID_BLOCK		0x1001 /* transitional virtio block */
+#define VIRTIO_TRANS_ID_BALLOON		0x1002 /* transitional virtio balloon */
+#define VIRTIO_TRANS_ID_CONSOLE		0x1003 /* transitional virtio console */
+#define VIRTIO_TRANS_ID_SCSI		0x1004 /* transitional virtio SCSI */
+#define VIRTIO_TRANS_ID_RNG		0x1005 /* transitional virtio rng */
+#define VIRTIO_TRANS_ID_9P		0x1009 /* transitional virtio 9p console */
 
 #endif /* _LINUX_VIRTIO_IDS_H */
-- 
cgit v1.2.3-71-gd317


From 8b796475fd7882663a870456466a4fb315cc1bd6 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 10 May 2022 16:57:34 +0200
Subject: net/sched: act_pedit: really ensure the skb is writable

Currently pedit tries to ensure that the accessed skb offset
is writable via skb_unclone(). The action potentially allows
touching any skb bytes, so it may end-up modifying shared data.

The above causes some sporadic MPTCP self-test failures, due to
this code:

	tc -n $ns2 filter add dev ns2eth$i egress \
		protocol ip prio 1000 \
		handle 42 fw \
		action pedit munge offset 148 u8 invert \
		pipe csum tcp \
		index 100

The above modifies a data byte outside the skb head and the skb is
a cloned one, carrying a TCP output packet.

This change addresses the issue by keeping track of a rough
over-estimate highest skb offset accessed by the action and ensuring
such offset is really writable.

Note that this may cause performance regressions in some scenarios,
but hopefully pedit is not in the critical path.

Fixes: db2c24175d14 ("act_pedit: access skb->data safely")
Acked-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Tested-by: Geliang Tang <geliang.tang@suse.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Link: https://lore.kernel.org/r/1fcf78e6679d0a287dd61bb0f04730ce33b3255d.1652194627.git.pabeni@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tc_act/tc_pedit.h |  1 +
 net/sched/act_pedit.c         | 26 ++++++++++++++++++++++----
 2 files changed, 23 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h
index 748cf87a4d7e..3e02709a1df6 100644
--- a/include/net/tc_act/tc_pedit.h
+++ b/include/net/tc_act/tc_pedit.h
@@ -14,6 +14,7 @@ struct tcf_pedit {
 	struct tc_action	common;
 	unsigned char		tcfp_nkeys;
 	unsigned char		tcfp_flags;
+	u32			tcfp_off_max_hint;
 	struct tc_pedit_key	*tcfp_keys;
 	struct tcf_pedit_key_ex	*tcfp_keys_ex;
 };
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 31fcd279c177..0eaaf1f45de1 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -149,7 +149,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 	struct nlattr *pattr;
 	struct tcf_pedit *p;
 	int ret = 0, err;
-	int ksize;
+	int i, ksize;
 	u32 index;
 
 	if (!nla) {
@@ -228,6 +228,18 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 		p->tcfp_nkeys = parm->nkeys;
 	}
 	memcpy(p->tcfp_keys, parm->keys, ksize);
+	p->tcfp_off_max_hint = 0;
+	for (i = 0; i < p->tcfp_nkeys; ++i) {
+		u32 cur = p->tcfp_keys[i].off;
+
+		/* The AT option can read a single byte, we can bound the actual
+		 * value with uchar max.
+		 */
+		cur += (0xff & p->tcfp_keys[i].offmask) >> p->tcfp_keys[i].shift;
+
+		/* Each key touches 4 bytes starting from the computed offset */
+		p->tcfp_off_max_hint = max(p->tcfp_off_max_hint, cur + 4);
+	}
 
 	p->tcfp_flags = parm->flags;
 	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
@@ -308,13 +320,18 @@ static int tcf_pedit_act(struct sk_buff *skb, const struct tc_action *a,
 			 struct tcf_result *res)
 {
 	struct tcf_pedit *p = to_pedit(a);
+	u32 max_offset;
 	int i;
 
-	if (skb_unclone(skb, GFP_ATOMIC))
-		return p->tcf_action;
-
 	spin_lock(&p->tcf_lock);
 
+	max_offset = (skb_transport_header_was_set(skb) ?
+		      skb_transport_offset(skb) :
+		      skb_network_offset(skb)) +
+		     p->tcfp_off_max_hint;
+	if (skb_ensure_writable(skb, min(skb->len, max_offset)))
+		goto unlock;
+
 	tcf_lastuse_update(&p->tcf_tm);
 
 	if (p->tcfp_nkeys > 0) {
@@ -403,6 +420,7 @@ bad:
 	p->tcf_qstats.overlimits++;
 done:
 	bstats_update(&p->tcf_bstats, skb);
+unlock:
 	spin_unlock(&p->tcf_lock);
 	return p->tcf_action;
 }
-- 
cgit v1.2.3-71-gd317


From 9c2136be0878c88c53dea26943ce40bb03ad8d8d Mon Sep 17 00:00:00 2001
From: Delyan Kratunov <delyank@fb.com>
Date: Wed, 11 May 2022 18:28:36 +0000
Subject: sched/tracing: Append prev_state to tp args instead

Commit fa2c3254d7cf (sched/tracing: Don't re-read p->state when emitting
sched_switch event, 2022-01-20) added a new prev_state argument to the
sched_switch tracepoint, before the prev task_struct pointer.

This reordering of arguments broke BPF programs that use the raw
tracepoint (e.g. tp_btf programs). The type of the second argument has
changed and existing programs that assume a task_struct* argument
(e.g. for bpf_task_storage access) will now fail to verify.

If we instead append the new argument to the end, all existing programs
would continue to work and can conditionally extract the prev_state
argument on supported kernel versions.

Fixes: fa2c3254d7cf (sched/tracing: Don't re-read p->state when emitting sched_switch event, 2022-01-20)
Signed-off-by: Delyan Kratunov <delyank@fb.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://lkml.kernel.org/r/c8a6930dfdd58a4a5755fc01732675472979732b.camel@fb.com
---
 include/trace/events/sched.h              | 6 +++---
 kernel/sched/core.c                       | 2 +-
 kernel/trace/fgraph.c                     | 4 ++--
 kernel/trace/ftrace.c                     | 4 ++--
 kernel/trace/trace_events.c               | 8 ++++----
 kernel/trace/trace_osnoise.c              | 4 ++--
 kernel/trace/trace_sched_switch.c         | 4 ++--
 kernel/trace/trace_sched_wakeup.c         | 4 ++--
 samples/trace_events/trace_custom_sched.h | 6 +++---
 9 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 65e786756321..fbb99a61f714 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -222,11 +222,11 @@ static inline long __trace_sched_switch_state(bool preempt,
 TRACE_EVENT(sched_switch,
 
 	TP_PROTO(bool preempt,
-		 unsigned int prev_state,
 		 struct task_struct *prev,
-		 struct task_struct *next),
+		 struct task_struct *next,
+		 unsigned int prev_state),
 
-	TP_ARGS(preempt, prev_state, prev, next),
+	TP_ARGS(preempt, prev, next, prev_state),
 
 	TP_STRUCT__entry(
 		__array(	char,	prev_comm,	TASK_COMM_LEN	)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 51efaabac3e4..d58c0389eb23 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6382,7 +6382,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 		migrate_disable_switch(rq, prev);
 		psi_sched_switch(prev, next, !task_on_rq_queued(prev));
 
-		trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev_state, prev, next);
+		trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
 
 		/* Also unlocks the rq: */
 		rq = context_switch(rq, prev, next, &rf);
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 8f4fb328133a..a7e84c8543cb 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -404,9 +404,9 @@ free:
 
 static void
 ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
-				unsigned int prev_state,
 				struct task_struct *prev,
-				struct task_struct *next)
+				struct task_struct *next,
+				unsigned int prev_state)
 {
 	unsigned long long timestamp;
 	int index;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4f1d2f5e7263..af899b058c8d 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -7420,9 +7420,9 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
 
 static void
 ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
-				     unsigned int prev_state,
 				     struct task_struct *prev,
-				     struct task_struct *next)
+				     struct task_struct *next,
+				     unsigned int prev_state)
 {
 	struct trace_array *tr = data;
 	struct trace_pid_list *pid_list;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e11e167b7809..f97de82d1342 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -773,9 +773,9 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable)
 
 static void
 event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
-					unsigned int prev_state,
 					struct task_struct *prev,
-					struct task_struct *next)
+					struct task_struct *next,
+					unsigned int prev_state)
 {
 	struct trace_array *tr = data;
 	struct trace_pid_list *no_pid_list;
@@ -799,9 +799,9 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
 
 static void
 event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
-					 unsigned int prev_state,
 					 struct task_struct *prev,
-					 struct task_struct *next)
+					 struct task_struct *next,
+					 unsigned int prev_state)
 {
 	struct trace_array *tr = data;
 	struct trace_pid_list *no_pid_list;
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index e9ae1f33a7f0..afb92e2f0aea 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1168,9 +1168,9 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t)
  */
 static void
 trace_sched_switch_callback(void *data, bool preempt,
-			    unsigned int prev_state,
 			    struct task_struct *p,
-			    struct task_struct *n)
+			    struct task_struct *n,
+			    unsigned int prev_state)
 {
 	struct osnoise_variables *osn_var = this_cpu_osn_var();
 
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 45796d8bd4b2..c9ffdcfe622e 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -22,8 +22,8 @@ static DEFINE_MUTEX(sched_register_mutex);
 
 static void
 probe_sched_switch(void *ignore, bool preempt,
-		   unsigned int prev_state,
-		   struct task_struct *prev, struct task_struct *next)
+		   struct task_struct *prev, struct task_struct *next,
+		   unsigned int prev_state)
 {
 	int flags;
 
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 46429f9a96fa..330aee1c1a49 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -426,8 +426,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 
 static void notrace
 probe_wakeup_sched_switch(void *ignore, bool preempt,
-			  unsigned int prev_state,
-			  struct task_struct *prev, struct task_struct *next)
+			  struct task_struct *prev, struct task_struct *next,
+			  unsigned int prev_state)
 {
 	struct trace_array_cpu *data;
 	u64 T0, T1, delta;
diff --git a/samples/trace_events/trace_custom_sched.h b/samples/trace_events/trace_custom_sched.h
index 9fdd8e7c2a45..951388334a3f 100644
--- a/samples/trace_events/trace_custom_sched.h
+++ b/samples/trace_events/trace_custom_sched.h
@@ -25,11 +25,11 @@ TRACE_CUSTOM_EVENT(sched_switch,
 	 * that the custom event is using.
 	 */
 	TP_PROTO(bool preempt,
-		 unsigned int prev_state,
 		 struct task_struct *prev,
-		 struct task_struct *next),
+		 struct task_struct *next,
+		 unsigned int prev_state),
 
-	TP_ARGS(preempt, prev_state, prev, next),
+	TP_ARGS(preempt, prev, next, prev_state),
 
 	/*
 	 * The next fields are where the customization happens.
-- 
cgit v1.2.3-71-gd317


From 103a2f3255a95991252f8f13375c3a96a75011cd Mon Sep 17 00:00:00 2001
From: Itay Iellin <ieitayie@gmail.com>
Date: Sat, 7 May 2022 08:32:48 -0400
Subject: Bluetooth: Fix the creation of hdev->name

Set a size limit of 8 bytes of the written buffer to "hdev->name"
including the terminating null byte, as the size of "hdev->name" is 8
bytes. If an id value which is greater than 9999 is allocated,
then the "snprintf(hdev->name, sizeof(hdev->name), "hci%d", id)"
function call would lead to a truncation of the id value in decimal
notation.

Set an explicit maximum id parameter in the id allocation function call.
The id allocation function defines the maximum allocated id value as the
maximum id parameter value minus one. Therefore, HCI_MAX_ID is defined
as 10000.

Signed-off-by: Itay Iellin <ieitayie@gmail.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h | 3 +++
 net/bluetooth/hci_core.c         | 6 +++---
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 8abd08245326..62d7b81b1cb7 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -36,6 +36,9 @@
 /* HCI priority */
 #define HCI_PRIO_MAX	7
 
+/* HCI maximum id value */
+#define HCI_MAX_ID 10000
+
 /* HCI Core structures */
 struct inquiry_data {
 	bdaddr_t	bdaddr;
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index b4782a6c1025..45c2dd2e1590 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -2555,10 +2555,10 @@ int hci_register_dev(struct hci_dev *hdev)
 	 */
 	switch (hdev->dev_type) {
 	case HCI_PRIMARY:
-		id = ida_simple_get(&hci_index_ida, 0, 0, GFP_KERNEL);
+		id = ida_simple_get(&hci_index_ida, 0, HCI_MAX_ID, GFP_KERNEL);
 		break;
 	case HCI_AMP:
-		id = ida_simple_get(&hci_index_ida, 1, 0, GFP_KERNEL);
+		id = ida_simple_get(&hci_index_ida, 1, HCI_MAX_ID, GFP_KERNEL);
 		break;
 	default:
 		return -EINVAL;
@@ -2567,7 +2567,7 @@ int hci_register_dev(struct hci_dev *hdev)
 	if (id < 0)
 		return id;
 
-	sprintf(hdev->name, "hci%d", id);
+	snprintf(hdev->name, sizeof(hdev->name), "hci%d", id);
 	hdev->id = id;
 
 	BT_DBG("%p name %s bus %d", hdev, hdev->name, hdev->bus);
-- 
cgit v1.2.3-71-gd317


From ea661ad6e1573d5b08c27444ff2ed403bf39ff66 Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Tue, 10 May 2022 10:34:03 +0800
Subject: iommu/vt-d: Size Page Request Queue to avoid overflow condition

PRQ overflow may cause I/O throughput congestion, resulting in unnecessary
degradation of I/O performance. Appropriately increasing the length of PRQ
can greatly reduce the occurrence of PRQ overflow. The count of maximum
page requests that can be generated in parallel by a PCIe device is
statically defined in the Outstanding Page Request Capacity field of the
PCIe ATS configure space.

The new length of PRQ is calculated by summing up the value of Outstanding
Page Request Capacity register across all devices where Page Requests are
supported on the real PR-capable platform (Intel Sapphire Rapids). The
result is round to the nearest higher power of 2.

The PRQ length is also double sized as the VT-d IOMMU driver only updates
the Page Request Queue Head Register (PQH_REG) after processing the entire
queue.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Link: https://lore.kernel.org/r/20220421113558.3504874-1-baolu.lu@linux.intel.com
Link: https://lore.kernel.org/r/20220510023407.2759143-5-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/intel-svm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h
index b3b125b332aa..207ef06ba3e1 100644
--- a/include/linux/intel-svm.h
+++ b/include/linux/intel-svm.h
@@ -9,7 +9,7 @@
 #define __INTEL_SVM_H__
 
 /* Page Request Queue depth */
-#define PRQ_ORDER	2
+#define PRQ_ORDER	4
 #define PRQ_RING_MASK	((0x1000 << PRQ_ORDER) - 0x20)
 #define PRQ_DEPTH	((0x1000 << PRQ_ORDER) >> 5)
 
-- 
cgit v1.2.3-71-gd317


From fc0051cb95909ab56bd8c929f24d48c9870c3e3a Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Tue, 10 May 2022 10:34:05 +0800
Subject: iommu/vt-d: Check domain force_snooping against attached devices

As domain->force_snooping only impacts the devices attached with the
domain, there's no need to check against all IOMMU units. On the other
hand, force_snooping could be set on a domain no matter whether it has
been attached or not, and once set it is an immutable flag. If no
device attached, the operation always succeeds. Then this empty domain
can be only attached to a device of which the IOMMU supports snoop
control.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Link: https://lore.kernel.org/r/20220508123525.1973626-1-baolu.lu@linux.intel.com
Link: https://lore.kernel.org/r/20220510023407.2759143-7-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel/iommu.c | 53 ++++++++++++++++++++++++++++++++++++++++++---
 drivers/iommu/intel/pasid.c | 42 +++++++++++++++++++++++++++++++++++
 drivers/iommu/intel/pasid.h |  2 ++
 include/linux/intel-iommu.h |  1 +
 4 files changed, 95 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 23985cb5c8ee..f366e8c49636 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -2438,7 +2438,7 @@ static int domain_setup_first_level(struct intel_iommu *iommu,
 	if (level == 5)
 		flags |= PASID_FLAG_FL5LP;
 
-	if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED)
+	if (domain->force_snooping)
 		flags |= PASID_FLAG_PAGE_SNOOP;
 
 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
@@ -4410,7 +4410,7 @@ static int intel_iommu_map(struct iommu_domain *domain,
 		prot |= DMA_PTE_READ;
 	if (iommu_prot & IOMMU_WRITE)
 		prot |= DMA_PTE_WRITE;
-	if (dmar_domain->force_snooping)
+	if (dmar_domain->set_pte_snp)
 		prot |= DMA_PTE_SNP;
 
 	max_addr = iova + size;
@@ -4533,13 +4533,60 @@ static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
 	return phys;
 }
 
+static bool domain_support_force_snooping(struct dmar_domain *domain)
+{
+	struct device_domain_info *info;
+	bool support = true;
+
+	assert_spin_locked(&device_domain_lock);
+	list_for_each_entry(info, &domain->devices, link) {
+		if (!ecap_sc_support(info->iommu->ecap)) {
+			support = false;
+			break;
+		}
+	}
+
+	return support;
+}
+
+static void domain_set_force_snooping(struct dmar_domain *domain)
+{
+	struct device_domain_info *info;
+
+	assert_spin_locked(&device_domain_lock);
+
+	/*
+	 * Second level page table supports per-PTE snoop control. The
+	 * iommu_map() interface will handle this by setting SNP bit.
+	 */
+	if (!domain_use_first_level(domain)) {
+		domain->set_pte_snp = true;
+		return;
+	}
+
+	list_for_each_entry(info, &domain->devices, link)
+		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
+						     PASID_RID2PASID);
+}
+
 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
 {
 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+	unsigned long flags;
 
-	if (!domain_update_iommu_snooping(NULL))
+	if (dmar_domain->force_snooping)
+		return true;
+
+	spin_lock_irqsave(&device_domain_lock, flags);
+	if (!domain_support_force_snooping(dmar_domain)) {
+		spin_unlock_irqrestore(&device_domain_lock, flags);
 		return false;
+	}
+
+	domain_set_force_snooping(dmar_domain);
 	dmar_domain->force_snooping = true;
+	spin_unlock_irqrestore(&device_domain_lock, flags);
+
 	return true;
 }
 
diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
index f8d215d85695..d19dd66a670c 100644
--- a/drivers/iommu/intel/pasid.c
+++ b/drivers/iommu/intel/pasid.c
@@ -762,3 +762,45 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu,
 
 	return 0;
 }
+
+/*
+ * Set the page snoop control for a pasid entry which has been set up.
+ */
+void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu,
+					  struct device *dev, u32 pasid)
+{
+	struct pasid_entry *pte;
+	u16 did;
+
+	spin_lock(&iommu->lock);
+	pte = intel_pasid_get_entry(dev, pasid);
+	if (WARN_ON(!pte || !pasid_pte_is_present(pte))) {
+		spin_unlock(&iommu->lock);
+		return;
+	}
+
+	pasid_set_pgsnp(pte);
+	did = pasid_get_domain_id(pte);
+	spin_unlock(&iommu->lock);
+
+	if (!ecap_coherent(iommu->ecap))
+		clflush_cache_range(pte, sizeof(*pte));
+
+	/*
+	 * VT-d spec 3.4 table23 states guides for cache invalidation:
+	 *
+	 * - PASID-selective-within-Domain PASID-cache invalidation
+	 * - PASID-selective PASID-based IOTLB invalidation
+	 * - If (pasid is RID_PASID)
+	 *    - Global Device-TLB invalidation to affected functions
+	 *   Else
+	 *    - PASID-based Device-TLB invalidation (with S=1 and
+	 *      Addr[63:12]=0x7FFFFFFF_FFFFF) to affected functions
+	 */
+	pasid_cache_invalidation_with_pasid(iommu, did, pasid);
+	qi_flush_piotlb(iommu, did, pasid, 0, -1, 0);
+
+	/* Device IOTLB doesn't need to be flushed in caching mode. */
+	if (!cap_caching_mode(iommu->cap))
+		devtlb_invalidation_with_pasid(iommu, dev, pasid);
+}
diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h
index ab4408c824a5..583ea67fc783 100644
--- a/drivers/iommu/intel/pasid.h
+++ b/drivers/iommu/intel/pasid.h
@@ -123,4 +123,6 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu,
 				 bool fault_ignore);
 int vcmd_alloc_pasid(struct intel_iommu *iommu, u32 *pasid);
 void vcmd_free_pasid(struct intel_iommu *iommu, u32 pasid);
+void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu,
+					  struct device *dev, u32 pasid);
 #endif /* __INTEL_PASID_H */
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 72e5d7900e71..4f29139bbfc3 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -540,6 +540,7 @@ struct dmar_domain {
 	u8 has_iotlb_device: 1;
 	u8 iommu_coherency: 1;		/* indicate coherency of iommu access */
 	u8 force_snooping : 1;		/* Create IOPTEs with snoop control */
+	u8 set_pte_snp:1;
 
 	struct list_head devices;	/* all devices' list */
 	struct iova_domain iovad;	/* iova's that belong to this domain */
-- 
cgit v1.2.3-71-gd317