From d4e5268a08b211b536fed29beb24271ecd85187e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Apr 2022 11:45:55 +0200 Subject: x86,objtool: Mark cpu_startup_entry() __noreturn GCC-8 isn't clever enough to figure out that cpu_start_entry() is a noreturn while objtool is. This results in code after the call in start_secondary(). Give GCC a hand so that they all agree on things. vmlinux.o: warning: objtool: start_secondary()+0x10e: unreachable Reported-by: Rick Edgecombe Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20220408094718.383658532@infradead.org --- include/linux/cpu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 9cf51e41e697..54dc2f9a2d56 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -167,7 +167,7 @@ static inline int suspend_disable_secondary_cpus(void) { return 0; } static inline void suspend_enable_secondary_cpus(void) { } #endif /* !CONFIG_PM_SLEEP_SMP */ -void cpu_startup_entry(enum cpuhp_state state); +void __noreturn cpu_startup_entry(enum cpuhp_state state); void cpu_idle_poll_ctrl(bool enable); -- cgit v1.2.3-71-gd317 From 37c5f9e80e015d0df17d0c377c18523002986851 Mon Sep 17 00:00:00 2001 From: Oleksandr Ocheretnyi Date: Sun, 17 Apr 2022 11:46:47 -0700 Subject: mtd: fix 'part' field data corruption in mtd_info Commit 46b5889cc2c5 ("mtd: implement proper partition handling") started using "mtd_get_master_ofs()" in mtd callbacks to determine memory offsets by means of 'part' field from mtd_info, what previously was smashed accessing 'master' field in the mtd_set_dev_defaults() method. That provides wrong offset what causes hardware access errors. Just make 'part', 'master' as separate fields, rather than using union type to avoid 'part' data corruption when mtd_set_dev_defaults() is called. Fixes: 46b5889cc2c5 ("mtd: implement proper partition handling") Signed-off-by: Oleksandr Ocheretnyi Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20220417184649.449289-1-oocheret@cisco.com --- include/linux/mtd/mtd.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 151607e9d64a..955aee14b0f7 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -389,10 +389,8 @@ struct mtd_info { /* List of partitions attached to this MTD device */ struct list_head partitions; - union { - struct mtd_part part; - struct mtd_master master; - }; + struct mtd_part part; + struct mtd_master master; }; static inline struct mtd_info *mtd_get_master(struct mtd_info *mtd) -- cgit v1.2.3-71-gd317 From 8d084b2eae7fc5fcfc9f143cd7321a88e1cd76aa Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 5 Apr 2022 17:15:13 +0200 Subject: usb: typec: tcpm: Fix undefined behavior due to shift overflowing the constant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix: drivers/usb/typec/tcpm/tcpm.c: In function ‘run_state_machine’: drivers/usb/typec/tcpm/tcpm.c:4724:3: error: case label does not reduce to an integer constant case BDO_MODE_TESTDATA: ^~~~ See https://lore.kernel.org/r/YkwQ6%2BtIH8GQpuct@zn.tnic for the gory details as to why it triggers with older gccs only. Signed-off-by: Borislav Petkov Cc: Greg Kroah-Hartman Cc: linux-usb@vger.kernel.org Link: https://lore.kernel.org/r/20220405151517.29753-8-bp@alien8.de Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/pd_bdo.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb/pd_bdo.h b/include/linux/usb/pd_bdo.h index 033fe3e17141..7c25b88d79f9 100644 --- a/include/linux/usb/pd_bdo.h +++ b/include/linux/usb/pd_bdo.h @@ -15,7 +15,7 @@ #define BDO_MODE_CARRIER2 (5 << 28) #define BDO_MODE_CARRIER3 (6 << 28) #define BDO_MODE_EYE (7 << 28) -#define BDO_MODE_TESTDATA (8 << 28) +#define BDO_MODE_TESTDATA (8U << 28) #define BDO_MODE_MASK(mode) ((mode) & 0xf0000000) -- cgit v1.2.3-71-gd317 From 6510ea973d8d9d4a0cb2fb557b36bd1ab3eb49f6 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 25 Apr 2022 18:39:46 +0200 Subject: net: Use this_cpu_inc() to increment net->core_stats The macro dev_core_stats_##FIELD##_inc() disables preemption and invokes netdev_core_stats_alloc() to return a per-CPU pointer. netdev_core_stats_alloc() will allocate memory on its first invocation which breaks on PREEMPT_RT because it requires non-atomic context for memory allocation. This can be avoided by enabling preemption in netdev_core_stats_alloc() assuming the caller always disables preemption. It might be better to replace local_inc() with this_cpu_inc() now that dev_core_stats_##FIELD##_inc() gained a preempt-disable section and does not rely on already disabled preemption. This results in less instructions on x86-64: local_inc: | incl %gs:__preempt_count(%rip) # __preempt_count | movq 488(%rdi), %rax # _1->core_stats, _22 | testq %rax, %rax # _22 | je .L585 #, | add %gs:this_cpu_off(%rip), %rax # this_cpu_off, tcp_ptr__ | .L586: | testq %rax, %rax # _27 | je .L587 #, | incq (%rax) # _6->a.counter | .L587: | decl %gs:__preempt_count(%rip) # __preempt_count this_cpu_inc(), this patch: | movq 488(%rdi), %rax # _1->core_stats, _5 | testq %rax, %rax # _5 | je .L591 #, | .L585: | incq %gs:(%rax) # _18->rx_dropped Use unsigned long as type for the counter. Use this_cpu_inc() to increment the counter. Use a plain read of the counter. Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/YmbO0pxgtKpCw4SY@linutronix.de Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 21 +++++++++------------ net/core/dev.c | 14 +++++--------- 2 files changed, 14 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 59e27a2b7bf0..b1fbe21650bb 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -199,10 +199,10 @@ struct net_device_stats { * Try to fit them in a single cache line, for dev_get_stats() sake. */ struct net_device_core_stats { - local_t rx_dropped; - local_t tx_dropped; - local_t rx_nohandler; -} __aligned(4 * sizeof(local_t)); + unsigned long rx_dropped; + unsigned long tx_dropped; + unsigned long rx_nohandler; +} __aligned(4 * sizeof(unsigned long)); #include #include @@ -3843,15 +3843,15 @@ static __always_inline bool __is_skb_forwardable(const struct net_device *dev, return false; } -struct net_device_core_stats *netdev_core_stats_alloc(struct net_device *dev); +struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev); -static inline struct net_device_core_stats *dev_core_stats(struct net_device *dev) +static inline struct net_device_core_stats __percpu *dev_core_stats(struct net_device *dev) { /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */ struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats); if (likely(p)) - return this_cpu_ptr(p); + return p; return netdev_core_stats_alloc(dev); } @@ -3859,14 +3859,11 @@ static inline struct net_device_core_stats *dev_core_stats(struct net_device *de #define DEV_CORE_STATS_INC(FIELD) \ static inline void dev_core_stats_##FIELD##_inc(struct net_device *dev) \ { \ - struct net_device_core_stats *p; \ + struct net_device_core_stats __percpu *p; \ \ - preempt_disable(); \ p = dev_core_stats(dev); \ - \ if (p) \ - local_inc(&p->FIELD); \ - preempt_enable(); \ + this_cpu_inc(p->FIELD); \ } DEV_CORE_STATS_INC(rx_dropped) DEV_CORE_STATS_INC(tx_dropped) diff --git a/net/core/dev.c b/net/core/dev.c index 8c6c08446556..1461c2d9dec8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10304,7 +10304,7 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, } EXPORT_SYMBOL(netdev_stats_to_stats64); -struct net_device_core_stats *netdev_core_stats_alloc(struct net_device *dev) +struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev) { struct net_device_core_stats __percpu *p; @@ -10315,11 +10315,7 @@ struct net_device_core_stats *netdev_core_stats_alloc(struct net_device *dev) free_percpu(p); /* This READ_ONCE() pairs with the cmpxchg() above */ - p = READ_ONCE(dev->core_stats); - if (!p) - return NULL; - - return this_cpu_ptr(p); + return READ_ONCE(dev->core_stats); } EXPORT_SYMBOL(netdev_core_stats_alloc); @@ -10356,9 +10352,9 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, for_each_possible_cpu(i) { core_stats = per_cpu_ptr(p, i); - storage->rx_dropped += local_read(&core_stats->rx_dropped); - storage->tx_dropped += local_read(&core_stats->tx_dropped); - storage->rx_nohandler += local_read(&core_stats->rx_nohandler); + storage->rx_dropped += READ_ONCE(core_stats->rx_dropped); + storage->tx_dropped += READ_ONCE(core_stats->tx_dropped); + storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler); } } return storage; -- cgit v1.2.3-71-gd317 From e5be15767e7e284351853cbaba80cde8620341fb Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 25 Apr 2022 08:07:48 -0400 Subject: hex2bin: make the function hex_to_bin constant-time The function hex2bin is used to load cryptographic keys into device mapper targets dm-crypt and dm-integrity. It should take constant time independent on the processed data, so that concurrently running unprivileged code can't infer any information about the keys via microarchitectural convert channels. This patch changes the function hex_to_bin so that it contains no branches and no memory accesses. Note that this shouldn't cause performance degradation because the size of the new function is the same as the size of the old function (on x86-64) - and the new function causes no branch misprediction penalties. I compile-tested this function with gcc on aarch64 alpha arm hppa hppa64 i386 ia64 m68k mips32 mips64 powerpc powerpc64 riscv sh4 s390x sparc32 sparc64 x86_64 and with clang on aarch64 arm hexagon i386 mips32 mips64 powerpc powerpc64 s390x sparc32 sparc64 x86_64 to verify that there are no branches in the generated code. Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 2 +- lib/hexdump.c | 32 +++++++++++++++++++++++++------- 2 files changed, 26 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index a890428bcc1a..fe6efb24d151 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -285,7 +285,7 @@ static inline char *hex_byte_pack_upper(char *buf, u8 byte) return buf; } -extern int hex_to_bin(char ch); +extern int hex_to_bin(unsigned char ch); extern int __must_check hex2bin(u8 *dst, const char *src, size_t count); extern char *bin2hex(char *dst, const void *src, size_t count); diff --git a/lib/hexdump.c b/lib/hexdump.c index 9301578f98e8..369420ce553a 100644 --- a/lib/hexdump.c +++ b/lib/hexdump.c @@ -22,15 +22,33 @@ EXPORT_SYMBOL(hex_asc_upper); * * hex_to_bin() converts one hex digit to its actual value or -1 in case of bad * input. + * + * This function is used to load cryptographic keys, so it is coded in such a + * way that there are no conditions or memory accesses that depend on data. + * + * Explanation of the logic: + * (ch - '9' - 1) is negative if ch <= '9' + * ('0' - 1 - ch) is negative if ch >= '0' + * we "and" these two values, so the result is negative if ch is in the range + * '0' ... '9' + * we are only interested in the sign, so we do a shift ">> 8"; note that right + * shift of a negative value is implementation-defined, so we cast the + * value to (unsigned) before the shift --- we have 0xffffff if ch is in + * the range '0' ... '9', 0 otherwise + * we "and" this value with (ch - '0' + 1) --- we have a value 1 ... 10 if ch is + * in the range '0' ... '9', 0 otherwise + * we add this value to -1 --- we have a value 0 ... 9 if ch is in the range '0' + * ... '9', -1 otherwise + * the next line is similar to the previous one, but we need to decode both + * uppercase and lowercase letters, so we use (ch & 0xdf), which converts + * lowercase to uppercase */ -int hex_to_bin(char ch) +int hex_to_bin(unsigned char ch) { - if ((ch >= '0') && (ch <= '9')) - return ch - '0'; - ch = tolower(ch); - if ((ch >= 'a') && (ch <= 'f')) - return ch - 'a' + 10; - return -1; + unsigned char cu = ch & 0xdf; + return -1 + + ((ch - '0' + 1) & (unsigned)((ch - '9' - 1) & ('0' - 1 - ch)) >> 8) + + ((cu - 'A' + 11) & (unsigned)((cu - 'F' - 1) & ('A' - 1 - cu)) >> 8); } EXPORT_SYMBOL(hex_to_bin); -- cgit v1.2.3-71-gd317 From ed36d04e8f8d7b00db451b0fa56a54e8e02ec43e Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 25 Apr 2022 13:42:02 +0100 Subject: iommu: Introduce device_iommu_capable() iommu_capable() only really works for systems where all IOMMU instances are completely homogeneous, and all devices are IOMMU-mapped. Implement the new variant which will be able to give a more accurate answer for whichever device the caller is actually interested in, and even more so once all the external users have been converted and we can reliably pass the device pointer through the internal driver interface too. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/8407eb9586677995b7a9fd70d0fd82d85929a9bb.1650878781.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 23 +++++++++++++++++++++++ include/linux/iommu.h | 6 ++++++ 2 files changed, 29 insertions(+) (limited to 'include/linux') diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index f2c45b85b9fc..780c11734979 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1906,6 +1906,29 @@ bool iommu_present(struct bus_type *bus) } EXPORT_SYMBOL_GPL(iommu_present); +/** + * device_iommu_capable() - check for a general IOMMU capability + * @dev: device to which the capability would be relevant, if available + * @cap: IOMMU capability + * + * Return: true if an IOMMU is present and supports the given capability + * for the given device, otherwise false. + */ +bool device_iommu_capable(struct device *dev, enum iommu_cap cap) +{ + const struct iommu_ops *ops; + + if (!dev->iommu || !dev->iommu->iommu_dev) + return false; + + ops = dev_iommu_ops(dev); + if (!ops->capable) + return false; + + return ops->capable(cap); +} +EXPORT_SYMBOL_GPL(device_iommu_capable); + bool iommu_capable(struct bus_type *bus, enum iommu_cap cap) { if (!bus->iommu_ops || !bus->iommu_ops->capable) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 9208eca4b0d1..e26cf84e5d82 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -417,6 +417,7 @@ static inline const struct iommu_ops *dev_iommu_ops(struct device *dev) extern int bus_set_iommu(struct bus_type *bus, const struct iommu_ops *ops); extern int bus_iommu_probe(struct bus_type *bus); extern bool iommu_present(struct bus_type *bus); +extern bool device_iommu_capable(struct device *dev, enum iommu_cap cap); extern bool iommu_capable(struct bus_type *bus, enum iommu_cap cap); extern struct iommu_domain *iommu_domain_alloc(struct bus_type *bus); extern struct iommu_group *iommu_group_get_by_id(int id); @@ -689,6 +690,11 @@ static inline bool iommu_present(struct bus_type *bus) return false; } +static inline bool device_iommu_capable(struct device *dev, enum iommu_cap cap) +{ + return false; +} + static inline bool iommu_capable(struct bus_type *bus, enum iommu_cap cap) { return false; -- cgit v1.2.3-71-gd317 From d0be55fbeb6ac694d15af5d1aad19cdec8cd64e5 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 25 Apr 2022 13:42:03 +0100 Subject: iommu: Add capability for pre-boot DMA protection VT-d's dmar_platform_optin() actually represents a combination of properties fairly well standardised by Microsoft as "Pre-boot DMA Protection" and "Kernel DMA Protection"[1]. As such, we can provide interested consumers with an abstracted capability rather than driver-specific interfaces that won't scale. We name it for the former aspect since that's what external callers are most likely to be interested in; the latter is for the IOMMU layer to handle itself. [1] https://docs.microsoft.com/en-us/windows-hardware/design/device-experiences/oem-kernel-dma-protection Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Reviewed-by: Lu Baolu Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/d6218dff2702472da80db6aec2c9589010684551.1650878781.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 2 ++ include/linux/iommu.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index df5c62ecf942..0edf6084dc14 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4551,6 +4551,8 @@ static bool intel_iommu_capable(enum iommu_cap cap) return domain_update_iommu_snooping(NULL); if (cap == IOMMU_CAP_INTR_REMAP) return irq_remapping_enabled == 1; + if (cap == IOMMU_CAP_PRE_BOOT_PROTECTION) + return dmar_platform_optin(); return false; } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index e26cf84e5d82..4123693ae319 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -107,6 +107,8 @@ enum iommu_cap { transactions */ IOMMU_CAP_INTR_REMAP, /* IOMMU supports interrupt isolation */ IOMMU_CAP_NOEXEC, /* IOMMU_NOEXEC flag */ + IOMMU_CAP_PRE_BOOT_PROTECTION, /* Firmware says it used the IOMMU for + DMA protection and we should too */ }; /* These are the possible reserved region types */ -- cgit v1.2.3-71-gd317 From 86eaf4a5b4312bea8676fb79399d9e08b53d8e71 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 25 Apr 2022 13:42:04 +0100 Subject: thunderbolt: Make iommu_dma_protection more accurate Between me trying to get rid of iommu_present() and Mario wanting to support the AMD equivalent of DMAR_PLATFORM_OPT_IN, scrutiny has shown that the iommu_dma_protection attribute is being far too optimistic. Even if an IOMMU might be present for some PCI segment in the system, that doesn't necessarily mean it provides translation for the device(s) we care about. Furthermore, all that DMAR_PLATFORM_OPT_IN really does is tell us that memory was protected before the kernel was loaded, and prevent the user from disabling the intel-iommu driver entirely. While that lets us assume kernel integrity, what matters for actual runtime DMA protection is whether we trust individual devices, based on the "external facing" property that we expect firmware to describe for Thunderbolt ports. It's proven challenging to determine the appropriate ports accurately given the variety of possible topologies, so while still not getting a perfect answer, by putting enough faith in firmware we can at least get a good bit closer. If we can see that any device near a Thunderbolt NHI has all the requisites for Kernel DMA Protection, chances are that it *is* a relevant port, but moreover that implies that firmware is playing the game overall, so we'll use that to assume that all Thunderbolt ports should be correctly marked and thus will end up fully protected. CC: Mario Limonciello Reviewed-by: Christoph Hellwig Acked-by: Mika Westerberg Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/b153f208bc9eafab5105bad0358b77366509d2d4.1650878781.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/thunderbolt/domain.c | 12 +++--------- drivers/thunderbolt/nhi.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/thunderbolt.h | 2 ++ 3 files changed, 49 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/thunderbolt/domain.c b/drivers/thunderbolt/domain.c index 7018d959f775..2889a214dadc 100644 --- a/drivers/thunderbolt/domain.c +++ b/drivers/thunderbolt/domain.c @@ -7,9 +7,7 @@ */ #include -#include #include -#include #include #include #include @@ -257,13 +255,9 @@ static ssize_t iommu_dma_protection_show(struct device *dev, struct device_attribute *attr, char *buf) { - /* - * Kernel DMA protection is a feature where Thunderbolt security is - * handled natively using IOMMU. It is enabled when IOMMU is - * enabled and ACPI DMAR table has DMAR_PLATFORM_OPT_IN set. - */ - return sprintf(buf, "%d\n", - iommu_present(&pci_bus_type) && dmar_platform_optin()); + struct tb *tb = container_of(dev, struct tb, dev); + + return sysfs_emit(buf, "%d\n", tb->nhi->iommu_dma_protection); } static DEVICE_ATTR_RO(iommu_dma_protection); diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c index 4a582183f675..4bc87b0f003a 100644 --- a/drivers/thunderbolt/nhi.c +++ b/drivers/thunderbolt/nhi.c @@ -15,9 +15,11 @@ #include #include #include +#include #include #include #include +#include #include "nhi.h" #include "nhi_regs.h" @@ -1103,6 +1105,47 @@ static void nhi_check_quirks(struct tb_nhi *nhi) nhi->quirks |= QUIRK_AUTO_CLEAR_INT; } +static int nhi_check_iommu_pdev(struct pci_dev *pdev, void *data) +{ + if (!pdev->external_facing || + !device_iommu_capable(&pdev->dev, IOMMU_CAP_PRE_BOOT_PROTECTION)) + return 0; + *(bool *)data = true; + return 1; /* Stop walking */ +} + +static void nhi_check_iommu(struct tb_nhi *nhi) +{ + struct pci_bus *bus = nhi->pdev->bus; + bool port_ok = false; + + /* + * Ideally what we'd do here is grab every PCI device that + * represents a tunnelling adapter for this NHI and check their + * status directly, but unfortunately USB4 seems to make it + * obnoxiously difficult to reliably make any correlation. + * + * So for now we'll have to bodge it... Hoping that the system + * is at least sane enough that an adapter is in the same PCI + * segment as its NHI, if we can find *something* on that segment + * which meets the requirements for Kernel DMA Protection, we'll + * take that to imply that firmware is aware and has (hopefully) + * done the right thing in general. We need to know that the PCI + * layer has seen the ExternalFacingPort property which will then + * inform the IOMMU layer to enforce the complete "untrusted DMA" + * flow, but also that the IOMMU driver itself can be trusted not + * to have been subverted by a pre-boot DMA attack. + */ + while (bus->parent) + bus = bus->parent; + + pci_walk_bus(bus, nhi_check_iommu_pdev, &port_ok); + + nhi->iommu_dma_protection = port_ok; + dev_dbg(&nhi->pdev->dev, "IOMMU DMA protection is %s\n", + str_enabled_disabled(port_ok)); +} + static int nhi_init_msi(struct tb_nhi *nhi) { struct pci_dev *pdev = nhi->pdev; @@ -1220,6 +1263,7 @@ static int nhi_probe(struct pci_dev *pdev, const struct pci_device_id *id) return -ENOMEM; nhi_check_quirks(nhi); + nhi_check_iommu(nhi); res = nhi_init_msi(nhi); if (res) { diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index 124e13cb1469..7a8ad984e651 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -465,6 +465,7 @@ static inline struct tb_xdomain *tb_service_parent(struct tb_service *svc) * @msix_ida: Used to allocate MSI-X vectors for rings * @going_away: The host controller device is about to disappear so when * this flag is set, avoid touching the hardware anymore. + * @iommu_dma_protection: An IOMMU will isolate external-facing ports. * @interrupt_work: Work scheduled to handle ring interrupt when no * MSI-X is used. * @hop_count: Number of rings (end point hops) supported by NHI. @@ -479,6 +480,7 @@ struct tb_nhi { struct tb_ring **rx_rings; struct ida msix_ida; bool going_away; + bool iommu_dma_protection; struct work_struct interrupt_work; u32 hop_count; unsigned long quirks; -- cgit v1.2.3-71-gd317 From 1ea2a07a532b0e22aabe7e8483f935c672b9e7ed Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 18 Apr 2022 08:49:50 +0800 Subject: iommu: Add DMA ownership management interfaces Multiple devices may be placed in the same IOMMU group because they cannot be isolated from each other. These devices must either be entirely under kernel control or userspace control, never a mixture. This adds dma ownership management in iommu core and exposes several interfaces for the device drivers and the device userspace assignment framework (i.e. VFIO), so that any conflict between user and kernel controlled dma could be detected at the beginning. The device driver oriented interfaces are, int iommu_device_use_default_domain(struct device *dev); void iommu_device_unuse_default_domain(struct device *dev); By calling iommu_device_use_default_domain(), the device driver tells the iommu layer that the device dma is handled through the kernel DMA APIs. The iommu layer will manage the IOVA and use the default domain for DMA address translation. The device user-space assignment framework oriented interfaces are, int iommu_group_claim_dma_owner(struct iommu_group *group, void *owner); void iommu_group_release_dma_owner(struct iommu_group *group); bool iommu_group_dma_owner_claimed(struct iommu_group *group); The device userspace assignment must be disallowed if the DMA owner claiming interface returns failure. Signed-off-by: Jason Gunthorpe Signed-off-by: Kevin Tian Signed-off-by: Lu Baolu Reviewed-by: Robin Murphy Link: https://lore.kernel.org/r/20220418005000.897664-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 153 +++++++++++++++++++++++++++++++++++++++++++++++++- include/linux/iommu.h | 31 ++++++++++ 2 files changed, 181 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index f2c45b85b9fc..eba8e8ccf19d 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -48,6 +48,8 @@ struct iommu_group { struct iommu_domain *default_domain; struct iommu_domain *domain; struct list_head entry; + unsigned int owner_cnt; + void *owner; }; struct group_device { @@ -294,7 +296,11 @@ int iommu_probe_device(struct device *dev) mutex_lock(&group->mutex); iommu_alloc_default_domain(group, dev); - if (group->default_domain) { + /* + * If device joined an existing group which has been claimed, don't + * attach the default domain. + */ + if (group->default_domain && !group->owner) { ret = __iommu_attach_device(group->default_domain, dev); if (ret) { mutex_unlock(&group->mutex); @@ -2109,7 +2115,7 @@ static int __iommu_attach_group(struct iommu_domain *domain, { int ret; - if (group->default_domain && group->domain != group->default_domain) + if (group->domain && group->domain != group->default_domain) return -EBUSY; ret = __iommu_group_for_each_dev(group, domain, @@ -2146,7 +2152,11 @@ static void __iommu_detach_group(struct iommu_domain *domain, { int ret; - if (!group->default_domain) { + /* + * If the group has been claimed already, do not re-attach the default + * domain. + */ + if (!group->default_domain || group->owner) { __iommu_group_for_each_dev(group, domain, iommu_group_do_detach_device); group->domain = NULL; @@ -3095,3 +3105,140 @@ out: return ret; } + +/** + * iommu_device_use_default_domain() - Device driver wants to handle device + * DMA through the kernel DMA API. + * @dev: The device. + * + * The device driver about to bind @dev wants to do DMA through the kernel + * DMA API. Return 0 if it is allowed, otherwise an error. + */ +int iommu_device_use_default_domain(struct device *dev) +{ + struct iommu_group *group = iommu_group_get(dev); + int ret = 0; + + if (!group) + return 0; + + mutex_lock(&group->mutex); + if (group->owner_cnt) { + if (group->domain != group->default_domain || + group->owner) { + ret = -EBUSY; + goto unlock_out; + } + } + + group->owner_cnt++; + +unlock_out: + mutex_unlock(&group->mutex); + iommu_group_put(group); + + return ret; +} + +/** + * iommu_device_unuse_default_domain() - Device driver stops handling device + * DMA through the kernel DMA API. + * @dev: The device. + * + * The device driver doesn't want to do DMA through kernel DMA API anymore. + * It must be called after iommu_device_use_default_domain(). + */ +void iommu_device_unuse_default_domain(struct device *dev) +{ + struct iommu_group *group = iommu_group_get(dev); + + if (!group) + return; + + mutex_lock(&group->mutex); + if (!WARN_ON(!group->owner_cnt)) + group->owner_cnt--; + + mutex_unlock(&group->mutex); + iommu_group_put(group); +} + +/** + * iommu_group_claim_dma_owner() - Set DMA ownership of a group + * @group: The group. + * @owner: Caller specified pointer. Used for exclusive ownership. + * + * This is to support backward compatibility for vfio which manages + * the dma ownership in iommu_group level. New invocations on this + * interface should be prohibited. + */ +int iommu_group_claim_dma_owner(struct iommu_group *group, void *owner) +{ + int ret = 0; + + mutex_lock(&group->mutex); + if (group->owner_cnt) { + ret = -EPERM; + goto unlock_out; + } else { + if (group->domain && group->domain != group->default_domain) { + ret = -EBUSY; + goto unlock_out; + } + + group->owner = owner; + if (group->domain) + __iommu_detach_group(group->domain, group); + } + + group->owner_cnt++; +unlock_out: + mutex_unlock(&group->mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(iommu_group_claim_dma_owner); + +/** + * iommu_group_release_dma_owner() - Release DMA ownership of a group + * @group: The group. + * + * Release the DMA ownership claimed by iommu_group_claim_dma_owner(). + */ +void iommu_group_release_dma_owner(struct iommu_group *group) +{ + mutex_lock(&group->mutex); + if (WARN_ON(!group->owner_cnt || !group->owner)) + goto unlock_out; + + group->owner_cnt = 0; + /* + * The UNMANAGED domain should be detached before all USER + * owners have been released. + */ + if (!WARN_ON(group->domain) && group->default_domain) + __iommu_attach_group(group->default_domain, group); + group->owner = NULL; +unlock_out: + mutex_unlock(&group->mutex); +} +EXPORT_SYMBOL_GPL(iommu_group_release_dma_owner); + +/** + * iommu_group_dma_owner_claimed() - Query group dma ownership status + * @group: The group. + * + * This provides status query on a given group. It is racy and only for + * non-binding status reporting. + */ +bool iommu_group_dma_owner_claimed(struct iommu_group *group) +{ + unsigned int user; + + mutex_lock(&group->mutex); + user = group->owner_cnt; + mutex_unlock(&group->mutex); + + return user; +} +EXPORT_SYMBOL_GPL(iommu_group_dma_owner_claimed); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 9208eca4b0d1..77972ef978b5 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -675,6 +675,13 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, void iommu_sva_unbind_device(struct iommu_sva *handle); u32 iommu_sva_get_pasid(struct iommu_sva *handle); +int iommu_device_use_default_domain(struct device *dev); +void iommu_device_unuse_default_domain(struct device *dev); + +int iommu_group_claim_dma_owner(struct iommu_group *group, void *owner); +void iommu_group_release_dma_owner(struct iommu_group *group); +bool iommu_group_dma_owner_claimed(struct iommu_group *group); + #else /* CONFIG_IOMMU_API */ struct iommu_ops {}; @@ -1031,6 +1038,30 @@ static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev) { return NULL; } + +static inline int iommu_device_use_default_domain(struct device *dev) +{ + return 0; +} + +static inline void iommu_device_unuse_default_domain(struct device *dev) +{ +} + +static inline int +iommu_group_claim_dma_owner(struct iommu_group *group, void *owner) +{ + return -ENODEV; +} + +static inline void iommu_group_release_dma_owner(struct iommu_group *group) +{ +} + +static inline bool iommu_group_dma_owner_claimed(struct iommu_group *group) +{ + return false; +} #endif /* CONFIG_IOMMU_API */ /** -- cgit v1.2.3-71-gd317 From 25f3bcfc54bcf7b0e45d140ec8bfbbf10ba11869 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 18 Apr 2022 08:49:51 +0800 Subject: driver core: Add dma_cleanup callback in bus_type The bus_type structure defines dma_configure() callback for bus drivers to configure DMA on the devices. This adds the paired dma_cleanup() callback and calls it during driver unbinding so that bus drivers can do some cleanup work. One use case for this paired DMA callbacks is for the bus driver to check for DMA ownership conflicts during driver binding, where multiple devices belonging to a same IOMMU group (the minimum granularity of isolation and protection) may be assigned to kernel drivers or user space respectively. Without this change, for example, the vfio driver has to listen to a bus BOUND_DRIVER event and then BUG_ON() in case of dma ownership conflict. This leads to bad user experience since careless driver binding operation may crash the system if the admin overlooks the group restriction. Aside from bad design, this leads to a security problem as a root user, even with lockdown=integrity, can force the kernel to BUG. With this change, the bus driver could check and set the DMA ownership in driver binding process and fail on ownership conflicts. The DMA ownership should be released during driver unbinding. Signed-off-by: Lu Baolu Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220418005000.897664-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/base/dd.c | 5 +++++ include/linux/device/bus.h | 3 +++ 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/drivers/base/dd.c b/drivers/base/dd.c index 3fc3b5940bb3..94b7ac9bf459 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -671,6 +671,8 @@ sysfs_failed: if (dev->bus) blocking_notifier_call_chain(&dev->bus->p->bus_notifier, BUS_NOTIFY_DRIVER_NOT_BOUND, dev); + if (dev->bus && dev->bus->dma_cleanup) + dev->bus->dma_cleanup(dev); pinctrl_bind_failed: device_links_no_driver(dev); device_unbind_cleanup(dev); @@ -1199,6 +1201,9 @@ static void __device_release_driver(struct device *dev, struct device *parent) device_remove(dev); + if (dev->bus && dev->bus->dma_cleanup) + dev->bus->dma_cleanup(dev); + device_links_driver_cleanup(dev); device_unbind_cleanup(dev); diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h index a039ab809753..d8b29ccd07e5 100644 --- a/include/linux/device/bus.h +++ b/include/linux/device/bus.h @@ -59,6 +59,8 @@ struct fwnode_handle; * bus supports. * @dma_configure: Called to setup DMA configuration on a device on * this bus. + * @dma_cleanup: Called to cleanup DMA configuration on a device on + * this bus. * @pm: Power management operations of this bus, callback the specific * device driver's pm-ops. * @iommu_ops: IOMMU specific operations for this bus, used to attach IOMMU @@ -103,6 +105,7 @@ struct bus_type { int (*num_vf)(struct device *dev); int (*dma_configure)(struct device *dev); + void (*dma_cleanup)(struct device *dev); const struct dev_pm_ops *pm; -- cgit v1.2.3-71-gd317 From 4a6d9dd564d0e7339fc15ecc5ce66db4ad842be2 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 18 Apr 2022 08:49:52 +0800 Subject: amba: Stop sharing platform_dma_configure() Stop sharing platform_dma_configure() helper as they are about to have their own bus dma_configure callbacks. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220418005000.897664-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/amba/bus.c | 19 ++++++++++++++++++- drivers/base/platform.c | 3 +-- include/linux/platform_device.h | 2 -- 3 files changed, 19 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/amba/bus.c b/drivers/amba/bus.c index d3bd14aaabf6..76b52bd2c2a4 100644 --- a/drivers/amba/bus.c +++ b/drivers/amba/bus.c @@ -20,6 +20,8 @@ #include #include #include +#include +#include #define to_amba_driver(d) container_of(d, struct amba_driver, drv) @@ -273,6 +275,21 @@ static void amba_shutdown(struct device *dev) drv->shutdown(to_amba_device(dev)); } +static int amba_dma_configure(struct device *dev) +{ + enum dev_dma_attr attr; + int ret = 0; + + if (dev->of_node) { + ret = of_dma_configure(dev, dev->of_node, true); + } else if (has_acpi_companion(dev)) { + attr = acpi_get_dma_attr(to_acpi_device_node(dev->fwnode)); + ret = acpi_dma_configure(dev, attr); + } + + return ret; +} + #ifdef CONFIG_PM /* * Hooks to provide runtime PM of the pclk (bus clock). It is safe to @@ -341,7 +358,7 @@ struct bus_type amba_bustype = { .probe = amba_probe, .remove = amba_remove, .shutdown = amba_shutdown, - .dma_configure = platform_dma_configure, + .dma_configure = amba_dma_configure, .pm = &amba_pm, }; EXPORT_SYMBOL_GPL(amba_bustype); diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 8cc272fd5c99..d7915734d931 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -1454,8 +1454,7 @@ static void platform_shutdown(struct device *_dev) drv->shutdown(dev); } - -int platform_dma_configure(struct device *dev) +static int platform_dma_configure(struct device *dev) { enum dev_dma_attr attr; int ret = 0; diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 7c96f169d274..17fde717df68 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -328,8 +328,6 @@ extern int platform_pm_restore(struct device *dev); #define platform_pm_restore NULL #endif -extern int platform_dma_configure(struct device *dev); - #ifdef CONFIG_PM_SLEEP #define USE_PLATFORM_PM_SLEEP_OPS \ .suspend = platform_pm_suspend, \ -- cgit v1.2.3-71-gd317 From 512881eacfa72c2136b27b9934b7b27504a9efc2 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 18 Apr 2022 08:49:53 +0800 Subject: bus: platform,amba,fsl-mc,PCI: Add device DMA ownership management The devices on platform/amba/fsl-mc/PCI buses could be bound to drivers with the device DMA managed by kernel drivers or user-space applications. Unfortunately, multiple devices may be placed in the same IOMMU group because they cannot be isolated from each other. The DMA on these devices must either be entirely under kernel control or userspace control, never a mixture. Otherwise the driver integrity is not guaranteed because they could access each other through the peer-to-peer accesses which by-pass the IOMMU protection. This checks and sets the default DMA mode during driver binding, and cleanups during driver unbinding. In the default mode, the device DMA is managed by the device driver which handles DMA operations through the kernel DMA APIs (see Documentation/core-api/dma-api.rst). For cases where the devices are assigned for userspace control through the userspace driver framework(i.e. VFIO), the drivers(for example, vfio_pci/ vfio_platfrom etc.) may set a new flag (driver_managed_dma) to skip this default setting in the assumption that the drivers know what they are doing with the device DMA. Calling iommu_device_use_default_domain() before {of,acpi}_dma_configure is currently a problem. As things stand, the IOMMU driver ignored the initial iommu_probe_device() call when the device was added, since at that point it had no fwspec yet. In this situation, {of,acpi}_iommu_configure() are retriggering iommu_probe_device() after the IOMMU driver has seen the firmware data via .of_xlate to learn that it actually responsible for the given device. As the result, before that gets fixed, iommu_use_default_domain() goes at the end, and calls arch_teardown_dma_ops() if it fails. Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Stuart Yoder Cc: Laurentiu Tudor Signed-off-by: Lu Baolu Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jason Gunthorpe Reviewed-by: Robin Murphy Tested-by: Eric Auger Link: https://lore.kernel.org/r/20220418005000.897664-5-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/amba/bus.c | 18 ++++++++++++++++++ drivers/base/platform.c | 18 ++++++++++++++++++ drivers/bus/fsl-mc/fsl-mc-bus.c | 24 ++++++++++++++++++++++-- drivers/pci/pci-driver.c | 18 ++++++++++++++++++ include/linux/amba/bus.h | 8 ++++++++ include/linux/fsl/mc.h | 8 ++++++++ include/linux/pci.h | 8 ++++++++ include/linux/platform_device.h | 8 ++++++++ 8 files changed, 108 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/amba/bus.c b/drivers/amba/bus.c index 76b52bd2c2a4..a0ec61232b6c 100644 --- a/drivers/amba/bus.c +++ b/drivers/amba/bus.c @@ -22,6 +22,8 @@ #include #include #include +#include +#include #define to_amba_driver(d) container_of(d, struct amba_driver, drv) @@ -277,6 +279,7 @@ static void amba_shutdown(struct device *dev) static int amba_dma_configure(struct device *dev) { + struct amba_driver *drv = to_amba_driver(dev->driver); enum dev_dma_attr attr; int ret = 0; @@ -287,9 +290,23 @@ static int amba_dma_configure(struct device *dev) ret = acpi_dma_configure(dev, attr); } + if (!ret && !drv->driver_managed_dma) { + ret = iommu_device_use_default_domain(dev); + if (ret) + arch_teardown_dma_ops(dev); + } + return ret; } +static void amba_dma_cleanup(struct device *dev) +{ + struct amba_driver *drv = to_amba_driver(dev->driver); + + if (!drv->driver_managed_dma) + iommu_device_unuse_default_domain(dev); +} + #ifdef CONFIG_PM /* * Hooks to provide runtime PM of the pclk (bus clock). It is safe to @@ -359,6 +376,7 @@ struct bus_type amba_bustype = { .remove = amba_remove, .shutdown = amba_shutdown, .dma_configure = amba_dma_configure, + .dma_cleanup = amba_dma_cleanup, .pm = &amba_pm, }; EXPORT_SYMBOL_GPL(amba_bustype); diff --git a/drivers/base/platform.c b/drivers/base/platform.c index d7915734d931..70bc30cf575c 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -30,6 +30,8 @@ #include #include #include +#include +#include #include "base.h" #include "power/power.h" @@ -1456,6 +1458,7 @@ static void platform_shutdown(struct device *_dev) static int platform_dma_configure(struct device *dev) { + struct platform_driver *drv = to_platform_driver(dev->driver); enum dev_dma_attr attr; int ret = 0; @@ -1466,9 +1469,23 @@ static int platform_dma_configure(struct device *dev) ret = acpi_dma_configure(dev, attr); } + if (!ret && !drv->driver_managed_dma) { + ret = iommu_device_use_default_domain(dev); + if (ret) + arch_teardown_dma_ops(dev); + } + return ret; } +static void platform_dma_cleanup(struct device *dev) +{ + struct platform_driver *drv = to_platform_driver(dev->driver); + + if (!drv->driver_managed_dma) + iommu_device_unuse_default_domain(dev); +} + static const struct dev_pm_ops platform_dev_pm_ops = { SET_RUNTIME_PM_OPS(pm_generic_runtime_suspend, pm_generic_runtime_resume, NULL) USE_PLATFORM_PM_SLEEP_OPS @@ -1483,6 +1500,7 @@ struct bus_type platform_bus_type = { .remove = platform_remove, .shutdown = platform_shutdown, .dma_configure = platform_dma_configure, + .dma_cleanup = platform_dma_cleanup, .pm = &platform_dev_pm_ops, }; EXPORT_SYMBOL_GPL(platform_bus_type); diff --git a/drivers/bus/fsl-mc/fsl-mc-bus.c b/drivers/bus/fsl-mc/fsl-mc-bus.c index 8fd4a356a86e..76648c4fdaf4 100644 --- a/drivers/bus/fsl-mc/fsl-mc-bus.c +++ b/drivers/bus/fsl-mc/fsl-mc-bus.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "fsl-mc-private.h" @@ -140,15 +141,33 @@ static int fsl_mc_dma_configure(struct device *dev) { struct device *dma_dev = dev; struct fsl_mc_device *mc_dev = to_fsl_mc_device(dev); + struct fsl_mc_driver *mc_drv = to_fsl_mc_driver(dev->driver); u32 input_id = mc_dev->icid; + int ret; while (dev_is_fsl_mc(dma_dev)) dma_dev = dma_dev->parent; if (dev_of_node(dma_dev)) - return of_dma_configure_id(dev, dma_dev->of_node, 0, &input_id); + ret = of_dma_configure_id(dev, dma_dev->of_node, 0, &input_id); + else + ret = acpi_dma_configure_id(dev, DEV_DMA_COHERENT, &input_id); + + if (!ret && !mc_drv->driver_managed_dma) { + ret = iommu_device_use_default_domain(dev); + if (ret) + arch_teardown_dma_ops(dev); + } + + return ret; +} + +static void fsl_mc_dma_cleanup(struct device *dev) +{ + struct fsl_mc_driver *mc_drv = to_fsl_mc_driver(dev->driver); - return acpi_dma_configure_id(dev, DEV_DMA_COHERENT, &input_id); + if (!mc_drv->driver_managed_dma) + iommu_device_unuse_default_domain(dev); } static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, @@ -312,6 +331,7 @@ struct bus_type fsl_mc_bus_type = { .match = fsl_mc_bus_match, .uevent = fsl_mc_bus_uevent, .dma_configure = fsl_mc_dma_configure, + .dma_cleanup = fsl_mc_dma_cleanup, .dev_groups = fsl_mc_dev_groups, .bus_groups = fsl_mc_bus_groups, }; diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 4ceeb75fc899..f83f7fbac68f 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "pci.h" #include "pcie/portdrv.h" @@ -1601,6 +1602,7 @@ static int pci_bus_num_vf(struct device *dev) */ static int pci_dma_configure(struct device *dev) { + struct pci_driver *driver = to_pci_driver(dev->driver); struct device *bridge; int ret = 0; @@ -1616,9 +1618,24 @@ static int pci_dma_configure(struct device *dev) } pci_put_host_bridge_device(bridge); + + if (!ret && !driver->driver_managed_dma) { + ret = iommu_device_use_default_domain(dev); + if (ret) + arch_teardown_dma_ops(dev); + } + return ret; } +static void pci_dma_cleanup(struct device *dev) +{ + struct pci_driver *driver = to_pci_driver(dev->driver); + + if (!driver->driver_managed_dma) + iommu_device_unuse_default_domain(dev); +} + struct bus_type pci_bus_type = { .name = "pci", .match = pci_bus_match, @@ -1632,6 +1649,7 @@ struct bus_type pci_bus_type = { .pm = PCI_PM_OPS_PTR, .num_vf = pci_bus_num_vf, .dma_configure = pci_dma_configure, + .dma_cleanup = pci_dma_cleanup, }; EXPORT_SYMBOL(pci_bus_type); diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h index 6562f543c3e0..2ddce9bcd00e 100644 --- a/include/linux/amba/bus.h +++ b/include/linux/amba/bus.h @@ -79,6 +79,14 @@ struct amba_driver { void (*remove)(struct amba_device *); void (*shutdown)(struct amba_device *); const struct amba_id *id_table; + /* + * For most device drivers, no need to care about this flag as long as + * all DMAs are handled through the kernel DMA API. For some special + * ones, for example VFIO drivers, they know how to manage the DMA + * themselves and set this flag so that the IOMMU layer will allow them + * to setup and manage their own I/O address space. + */ + bool driver_managed_dma; }; /* diff --git a/include/linux/fsl/mc.h b/include/linux/fsl/mc.h index 7b6c42bfb660..27efef8affb1 100644 --- a/include/linux/fsl/mc.h +++ b/include/linux/fsl/mc.h @@ -32,6 +32,13 @@ struct fsl_mc_io; * @shutdown: Function called at shutdown time to quiesce the device * @suspend: Function called when a device is stopped * @resume: Function called when a device is resumed + * @driver_managed_dma: Device driver doesn't use kernel DMA API for DMA. + * For most device drivers, no need to care about this flag + * as long as all DMAs are handled through the kernel DMA API. + * For some special ones, for example VFIO drivers, they know + * how to manage the DMA themselves and set this flag so that + * the IOMMU layer will allow them to setup and manage their + * own I/O address space. * * Generic DPAA device driver object for device drivers that are registered * with a DPRC bus. This structure is to be embedded in each device-specific @@ -45,6 +52,7 @@ struct fsl_mc_driver { void (*shutdown)(struct fsl_mc_device *dev); int (*suspend)(struct fsl_mc_device *dev, pm_message_t state); int (*resume)(struct fsl_mc_device *dev); + bool driver_managed_dma; }; #define to_fsl_mc_driver(_drv) \ diff --git a/include/linux/pci.h b/include/linux/pci.h index 60adf42460ab..b933d2b08d4d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -895,6 +895,13 @@ struct module; * created once it is bound to the driver. * @driver: Driver model structure. * @dynids: List of dynamically added device IDs. + * @driver_managed_dma: Device driver doesn't use kernel DMA API for DMA. + * For most device drivers, no need to care about this flag + * as long as all DMAs are handled through the kernel DMA API. + * For some special ones, for example VFIO drivers, they know + * how to manage the DMA themselves and set this flag so that + * the IOMMU layer will allow them to setup and manage their + * own I/O address space. */ struct pci_driver { struct list_head node; @@ -913,6 +920,7 @@ struct pci_driver { const struct attribute_group **dev_groups; struct device_driver driver; struct pci_dynids dynids; + bool driver_managed_dma; }; static inline struct pci_driver *to_pci_driver(struct device_driver *drv) diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 17fde717df68..b3d9c744f1e5 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -210,6 +210,14 @@ struct platform_driver { struct device_driver driver; const struct platform_device_id *id_table; bool prevent_deferred_probe; + /* + * For most device drivers, no need to care about this flag as long as + * all DMAs are handled through the kernel DMA API. For some special + * ones, for example VFIO drivers, they know how to manage the DMA + * themselves and set this flag so that the IOMMU layer will allow them + * to setup and manage their own I/O address space. + */ + bool driver_managed_dma; }; #define to_platform_driver(drv) (container_of((drv), struct platform_driver, \ -- cgit v1.2.3-71-gd317 From a5f1bd1afacd7b1e088f93f66af5453df0d8be9a Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 18 Apr 2022 08:50:00 +0800 Subject: iommu: Remove iommu group changes notifier The iommu group changes notifer is not referenced in the tree. Remove it to avoid dead code. Suggested-by: Christoph Hellwig Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220418005000.897664-12-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 75 --------------------------------------------------- include/linux/iommu.h | 23 ---------------- 2 files changed, 98 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index eba8e8ccf19d..0c42ece25854 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include @@ -40,7 +39,6 @@ struct iommu_group { struct kobject *devices_kobj; struct list_head devices; struct mutex mutex; - struct blocking_notifier_head notifier; void *iommu_data; void (*iommu_data_release)(void *iommu_data); char *name; @@ -632,7 +630,6 @@ struct iommu_group *iommu_group_alloc(void) mutex_init(&group->mutex); INIT_LIST_HEAD(&group->devices); INIT_LIST_HEAD(&group->entry); - BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier); ret = ida_simple_get(&iommu_group_ida, 0, 0, GFP_KERNEL); if (ret < 0) { @@ -905,10 +902,6 @@ rename: if (ret) goto err_put_group; - /* Notify any listeners about change to group. */ - blocking_notifier_call_chain(&group->notifier, - IOMMU_GROUP_NOTIFY_ADD_DEVICE, dev); - trace_add_device_to_group(group->id, dev); dev_info(dev, "Adding to iommu group %d\n", group->id); @@ -950,10 +943,6 @@ void iommu_group_remove_device(struct device *dev) dev_info(dev, "Removing from iommu group %d\n", group->id); - /* Pre-notify listeners that a device is being removed. */ - blocking_notifier_call_chain(&group->notifier, - IOMMU_GROUP_NOTIFY_DEL_DEVICE, dev); - mutex_lock(&group->mutex); list_for_each_entry(tmp_device, &group->devices, list) { if (tmp_device->dev == dev) { @@ -1075,36 +1064,6 @@ void iommu_group_put(struct iommu_group *group) } EXPORT_SYMBOL_GPL(iommu_group_put); -/** - * iommu_group_register_notifier - Register a notifier for group changes - * @group: the group to watch - * @nb: notifier block to signal - * - * This function allows iommu group users to track changes in a group. - * See include/linux/iommu.h for actions sent via this notifier. Caller - * should hold a reference to the group throughout notifier registration. - */ -int iommu_group_register_notifier(struct iommu_group *group, - struct notifier_block *nb) -{ - return blocking_notifier_chain_register(&group->notifier, nb); -} -EXPORT_SYMBOL_GPL(iommu_group_register_notifier); - -/** - * iommu_group_unregister_notifier - Unregister a notifier - * @group: the group to watch - * @nb: notifier block to signal - * - * Unregister a previously registered group notifier block. - */ -int iommu_group_unregister_notifier(struct iommu_group *group, - struct notifier_block *nb) -{ - return blocking_notifier_chain_unregister(&group->notifier, nb); -} -EXPORT_SYMBOL_GPL(iommu_group_unregister_notifier); - /** * iommu_register_device_fault_handler() - Register a device fault handler * @dev: the device @@ -1650,14 +1609,8 @@ static int remove_iommu_group(struct device *dev, void *data) static int iommu_bus_notifier(struct notifier_block *nb, unsigned long action, void *data) { - unsigned long group_action = 0; struct device *dev = data; - struct iommu_group *group; - /* - * ADD/DEL call into iommu driver ops if provided, which may - * result in ADD/DEL notifiers to group->notifier - */ if (action == BUS_NOTIFY_ADD_DEVICE) { int ret; @@ -1668,34 +1621,6 @@ static int iommu_bus_notifier(struct notifier_block *nb, return NOTIFY_OK; } - /* - * Remaining BUS_NOTIFYs get filtered and republished to the - * group, if anyone is listening - */ - group = iommu_group_get(dev); - if (!group) - return 0; - - switch (action) { - case BUS_NOTIFY_BIND_DRIVER: - group_action = IOMMU_GROUP_NOTIFY_BIND_DRIVER; - break; - case BUS_NOTIFY_BOUND_DRIVER: - group_action = IOMMU_GROUP_NOTIFY_BOUND_DRIVER; - break; - case BUS_NOTIFY_UNBIND_DRIVER: - group_action = IOMMU_GROUP_NOTIFY_UNBIND_DRIVER; - break; - case BUS_NOTIFY_UNBOUND_DRIVER: - group_action = IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER; - break; - } - - if (group_action) - blocking_notifier_call_chain(&group->notifier, - group_action, dev); - - iommu_group_put(group); return 0; } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 77972ef978b5..6ef2df258673 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -407,13 +407,6 @@ static inline const struct iommu_ops *dev_iommu_ops(struct device *dev) return dev->iommu->iommu_dev->ops; } -#define IOMMU_GROUP_NOTIFY_ADD_DEVICE 1 /* Device added */ -#define IOMMU_GROUP_NOTIFY_DEL_DEVICE 2 /* Pre Device removed */ -#define IOMMU_GROUP_NOTIFY_BIND_DRIVER 3 /* Pre Driver bind */ -#define IOMMU_GROUP_NOTIFY_BOUND_DRIVER 4 /* Post Driver bind */ -#define IOMMU_GROUP_NOTIFY_UNBIND_DRIVER 5 /* Pre Driver unbind */ -#define IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER 6 /* Post Driver unbind */ - extern int bus_set_iommu(struct bus_type *bus, const struct iommu_ops *ops); extern int bus_iommu_probe(struct bus_type *bus); extern bool iommu_present(struct bus_type *bus); @@ -478,10 +471,6 @@ extern int iommu_group_for_each_dev(struct iommu_group *group, void *data, extern struct iommu_group *iommu_group_get(struct device *dev); extern struct iommu_group *iommu_group_ref_get(struct iommu_group *group); extern void iommu_group_put(struct iommu_group *group); -extern int iommu_group_register_notifier(struct iommu_group *group, - struct notifier_block *nb); -extern int iommu_group_unregister_notifier(struct iommu_group *group, - struct notifier_block *nb); extern int iommu_register_device_fault_handler(struct device *dev, iommu_dev_fault_handler_t handler, void *data); @@ -878,18 +867,6 @@ static inline void iommu_group_put(struct iommu_group *group) { } -static inline int iommu_group_register_notifier(struct iommu_group *group, - struct notifier_block *nb) -{ - return -ENODEV; -} - -static inline int iommu_group_unregister_notifier(struct iommu_group *group, - struct notifier_block *nb) -{ - return 0; -} - static inline int iommu_register_device_fault_handler(struct device *dev, iommu_dev_fault_handler_t handler, -- cgit v1.2.3-71-gd317 From 6043257b1de069fbb5a2a52d7211c0275bc8c0e0 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 11 Apr 2022 12:16:05 -0300 Subject: iommu: Introduce the domain op enforce_cache_coherency() This new mechanism will replace using IOMMU_CAP_CACHE_COHERENCY and IOMMU_CACHE to control the no-snoop blocking behavior of the IOMMU. Currently only Intel and AMD IOMMUs are known to support this feature. They both implement it as an IOPTE bit, that when set, will cause PCIe TLPs to that IOVA with the no-snoop bit set to be treated as though the no-snoop bit was clear. The new API is triggered by calling enforce_cache_coherency() before mapping any IOVA to the domain which globally switches on no-snoop blocking. This allows other implementations that might block no-snoop globally and outside the IOPTE - AMD also documents such a HW capability. Leave AMD out of sync with Intel and have it block no-snoop even for in-kernel users. This can be trivially resolved in a follow up patch. Only VFIO needs to call this API because it does not have detailed control over the device to avoid requesting no-snoop behavior at the device level. Other places using domains with real kernel drivers should simply avoid asking their devices to set the no-snoop bit. Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Acked-by: Robin Murphy Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v3-2cf356649677+a32-intel_no_snoop_jgg@nvidia.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 7 +++++++ drivers/iommu/intel/iommu.c | 14 +++++++++++++- include/linux/intel-iommu.h | 1 + include/linux/iommu.h | 4 ++++ 4 files changed, 25 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 038e104b922c..840831d5d2ad 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2266,6 +2266,12 @@ static int amd_iommu_def_domain_type(struct device *dev) return 0; } +static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain) +{ + /* IOMMU_PTE_FC is always set */ + return true; +} + const struct iommu_ops amd_iommu_ops = { .capable = amd_iommu_capable, .domain_alloc = amd_iommu_domain_alloc, @@ -2288,6 +2294,7 @@ const struct iommu_ops amd_iommu_ops = { .flush_iotlb_all = amd_iommu_flush_iotlb_all, .iotlb_sync = amd_iommu_iotlb_sync, .free = amd_iommu_domain_free, + .enforce_cache_coherency = amd_iommu_enforce_cache_coherency, } }; diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 0edf6084dc14..161199f62270 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4422,7 +4422,8 @@ static int intel_iommu_map(struct iommu_domain *domain, prot |= DMA_PTE_READ; if (iommu_prot & IOMMU_WRITE) prot |= DMA_PTE_WRITE; - if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping) + if (((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping) || + dmar_domain->force_snooping) prot |= DMA_PTE_SNP; max_addr = iova + size; @@ -4545,6 +4546,16 @@ static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, return phys; } +static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + + if (!dmar_domain->iommu_snooping) + return false; + dmar_domain->force_snooping = true; + return true; +} + static bool intel_iommu_capable(enum iommu_cap cap) { if (cap == IOMMU_CAP_CACHE_COHERENCY) @@ -4900,6 +4911,7 @@ const struct iommu_ops intel_iommu_ops = { .iotlb_sync = intel_iommu_tlb_sync, .iova_to_phys = intel_iommu_iova_to_phys, .free = intel_iommu_domain_free, + .enforce_cache_coherency = intel_iommu_enforce_cache_coherency, } }; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 2f9891cb3d00..4c2baf2446c2 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -540,6 +540,7 @@ struct dmar_domain { u8 has_iotlb_device: 1; u8 iommu_coherency: 1; /* indicate coherency of iommu access */ u8 iommu_snooping: 1; /* indicate snooping control feature */ + u8 force_snooping : 1; /* Create IOPTEs with snoop control */ struct list_head devices; /* all devices' list */ struct iova_domain iovad; /* iova's that belong to this domain */ diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 4123693ae319..c7ad6b10e261 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -274,6 +274,9 @@ struct iommu_ops { * @iotlb_sync: Flush all queued ranges from the hardware TLBs and empty flush * queue * @iova_to_phys: translate iova to physical address + * @enforce_cache_coherency: Prevent any kind of DMA from bypassing IOMMU_CACHE, + * including no-snoop TLPs on PCIe or other platform + * specific mechanisms. * @enable_nesting: Enable nesting * @set_pgtable_quirks: Set io page table quirks (IO_PGTABLE_QUIRK_*) * @free: Release the domain after use. @@ -302,6 +305,7 @@ struct iommu_domain_ops { phys_addr_t (*iova_to_phys)(struct iommu_domain *domain, dma_addr_t iova); + bool (*enforce_cache_coherency)(struct iommu_domain *domain); int (*enable_nesting)(struct iommu_domain *domain); int (*set_pgtable_quirks)(struct iommu_domain *domain, unsigned long quirks); -- cgit v1.2.3-71-gd317 From 71cfafda9c9bd9812cdb62ddb94daf65a1af12c1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 11 Apr 2022 12:16:06 -0300 Subject: vfio: Move the Intel no-snoop control off of IOMMU_CACHE IOMMU_CACHE means "normal DMA to this iommu_domain's IOVA should be cache coherent" and is used by the DMA API. The definition allows for special non-coherent DMA to exist - ie processing of the no-snoop flag in PCIe TLPs - so long as this behavior is opt-in by the device driver. The flag is mainly used by the DMA API to synchronize the IOMMU setting with the expected cache behavior of the DMA master. eg based on dev_is_dma_coherent() in some case. For Intel IOMMU IOMMU_CACHE was redefined to mean 'force all DMA to be cache coherent' which has the practical effect of causing the IOMMU to ignore the no-snoop bit in a PCIe TLP. x86 platforms are always IOMMU_CACHE, so Intel should ignore this flag. Instead use the new domain op enforce_cache_coherency() which causes every IOPTE created in the domain to have the no-snoop blocking behavior. Reconfigure VFIO to always use IOMMU_CACHE and call enforce_cache_coherency() to operate the special Intel behavior. Remove the IOMMU_CACHE test from Intel IOMMU. Ultimately VFIO plumbs the result of enforce_cache_coherency() back into the x86 platform code through kvm_arch_register_noncoherent_dma() which controls if the WBINVD instruction is available in the guest. No other archs implement kvm_arch_register_noncoherent_dma() nor are there any other known consumers of VFIO_DMA_CC_IOMMU that might be affected by the user visible result change on non-x86 archs. Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Acked-by: Alex Williamson Acked-by: Robin Murphy Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v3-2cf356649677+a32-intel_no_snoop_jgg@nvidia.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 7 ++----- drivers/vfio/vfio_iommu_type1.c | 30 +++++++++++++++++++----------- include/linux/intel-iommu.h | 1 - 3 files changed, 21 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 161199f62270..38441cb06c8c 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -641,7 +641,6 @@ static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) static void domain_update_iommu_cap(struct dmar_domain *domain) { domain_update_iommu_coherency(domain); - domain->iommu_snooping = domain_update_iommu_snooping(NULL); domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); /* @@ -4283,7 +4282,6 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width) domain->agaw = width_to_agaw(adjust_width); domain->iommu_coherency = false; - domain->iommu_snooping = false; domain->iommu_superpage = 0; domain->max_addr = 0; @@ -4422,8 +4420,7 @@ static int intel_iommu_map(struct iommu_domain *domain, prot |= DMA_PTE_READ; if (iommu_prot & IOMMU_WRITE) prot |= DMA_PTE_WRITE; - if (((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping) || - dmar_domain->force_snooping) + if (dmar_domain->force_snooping) prot |= DMA_PTE_SNP; max_addr = iova + size; @@ -4550,7 +4547,7 @@ static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); - if (!dmar_domain->iommu_snooping) + if (!domain_update_iommu_snooping(NULL)) return false; dmar_domain->force_snooping = true; return true; diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 9394aa9444c1..c13b9290e357 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -84,8 +84,8 @@ struct vfio_domain { struct iommu_domain *domain; struct list_head next; struct list_head group_list; - int prot; /* IOMMU_CACHE */ - bool fgsp; /* Fine-grained super pages */ + bool fgsp : 1; /* Fine-grained super pages */ + bool enforce_cache_coherency : 1; }; struct vfio_dma { @@ -1461,7 +1461,7 @@ static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova, list_for_each_entry(d, &iommu->domain_list, next) { ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT, - npage << PAGE_SHIFT, prot | d->prot); + npage << PAGE_SHIFT, prot | IOMMU_CACHE); if (ret) goto unwind; @@ -1771,7 +1771,7 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu, } ret = iommu_map(domain->domain, iova, phys, - size, dma->prot | domain->prot); + size, dma->prot | IOMMU_CACHE); if (ret) { if (!dma->iommu_mapped) { vfio_unpin_pages_remote(dma, iova, @@ -1859,7 +1859,7 @@ static void vfio_test_domain_fgsp(struct vfio_domain *domain) return; ret = iommu_map(domain->domain, 0, page_to_phys(pages), PAGE_SIZE * 2, - IOMMU_READ | IOMMU_WRITE | domain->prot); + IOMMU_READ | IOMMU_WRITE | IOMMU_CACHE); if (!ret) { size_t unmapped = iommu_unmap(domain->domain, 0, PAGE_SIZE); @@ -2267,8 +2267,15 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, goto out_detach; } - if (iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY)) - domain->prot |= IOMMU_CACHE; + /* + * If the IOMMU can block non-coherent operations (ie PCIe TLPs with + * no-snoop set) then VFIO always turns this feature on because on Intel + * platforms it optimizes KVM to disable wbinvd emulation. + */ + if (domain->domain->ops->enforce_cache_coherency) + domain->enforce_cache_coherency = + domain->domain->ops->enforce_cache_coherency( + domain->domain); /* * Try to match an existing compatible domain. We don't want to @@ -2279,7 +2286,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, */ list_for_each_entry(d, &iommu->domain_list, next) { if (d->domain->ops == domain->domain->ops && - d->prot == domain->prot) { + d->enforce_cache_coherency == + domain->enforce_cache_coherency) { iommu_detach_group(domain->domain, group->iommu_group); if (!iommu_attach_group(d->domain, group->iommu_group)) { @@ -2611,14 +2619,14 @@ static void vfio_iommu_type1_release(void *iommu_data) kfree(iommu); } -static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu) +static int vfio_domains_have_enforce_cache_coherency(struct vfio_iommu *iommu) { struct vfio_domain *domain; int ret = 1; mutex_lock(&iommu->lock); list_for_each_entry(domain, &iommu->domain_list, next) { - if (!(domain->prot & IOMMU_CACHE)) { + if (!(domain->enforce_cache_coherency)) { ret = 0; break; } @@ -2641,7 +2649,7 @@ static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu, case VFIO_DMA_CC_IOMMU: if (!iommu) return 0; - return vfio_domains_have_iommu_cache(iommu); + return vfio_domains_have_enforce_cache_coherency(iommu); default: return 0; } diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 4c2baf2446c2..72e5d7900e71 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -539,7 +539,6 @@ struct dmar_domain { u8 has_iotlb_device: 1; u8 iommu_coherency: 1; /* indicate coherency of iommu access */ - u8 iommu_snooping: 1; /* indicate snooping control feature */ u8 force_snooping : 1; /* Create IOPTEs with snoop control */ struct list_head devices; /* all devices' list */ -- cgit v1.2.3-71-gd317 From f78dc1dad829e505d83e33dc0879887f074c52e1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 11 Apr 2022 12:16:07 -0300 Subject: iommu: Redefine IOMMU_CAP_CACHE_COHERENCY as the cap flag for IOMMU_CACHE While the comment was correct that this flag was intended to convey the block no-snoop support in the IOMMU, it has become widely implemented and used to mean the IOMMU supports IOMMU_CACHE as a map flag. Only the Intel driver was different. Now that the Intel driver is using enforce_cache_coherency() update the comment to make it clear that IOMMU_CAP_CACHE_COHERENCY is only about IOMMU_CACHE. Fix the Intel driver to return true since IOMMU_CACHE always works. The two places that test this flag, usnic and vdpa, are both assigning userspace pages to a driver controlled iommu_domain and require IOMMU_CACHE behavior as they offer no way for userspace to synchronize caches. Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Acked-by: Robin Murphy Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v3-2cf356649677+a32-intel_no_snoop_jgg@nvidia.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 2 +- include/linux/iommu.h | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 38441cb06c8c..efcecfa5952a 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4556,7 +4556,7 @@ static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain) static bool intel_iommu_capable(enum iommu_cap cap) { if (cap == IOMMU_CAP_CACHE_COHERENCY) - return domain_update_iommu_snooping(NULL); + return true; if (cap == IOMMU_CAP_INTR_REMAP) return irq_remapping_enabled == 1; if (cap == IOMMU_CAP_PRE_BOOT_PROTECTION) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index c7ad6b10e261..575ab27ede5b 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -103,8 +103,7 @@ static inline bool iommu_is_dma_domain(struct iommu_domain *domain) } enum iommu_cap { - IOMMU_CAP_CACHE_COHERENCY, /* IOMMU can enforce cache coherent DMA - transactions */ + IOMMU_CAP_CACHE_COHERENCY, /* IOMMU_CACHE is supported */ IOMMU_CAP_INTR_REMAP, /* IOMMU supports interrupt isolation */ IOMMU_CAP_NOEXEC, /* IOMMU_NOEXEC flag */ IOMMU_CAP_PRE_BOOT_PROTECTION, /* Firmware says it used the IOMMU for -- cgit v1.2.3-71-gd317 From 892de36fd4a98fab3298d417c051d9099af5448d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 29 Apr 2022 12:22:10 -0400 Subject: SUNRPC: Ensure gss-proxy connects on setup For reasons best known to the author, gss-proxy does not implement a NULL procedure, and returns RPC_PROC_UNAVAIL. However we still want to ensure that we connect to the service at setup time. So add a quirk-flag specially for this case. Fixes: 1d658336b05f ("SUNRPC: Add RPC based upcall mechanism for RPCGSS auth") Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- include/linux/sunrpc/clnt.h | 1 + net/sunrpc/auth_gss/gss_rpc_upcall.c | 2 +- net/sunrpc/clnt.c | 3 +++ 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 267b7aeaf1a6..db5149567305 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -160,6 +160,7 @@ struct rpc_add_xprt_test { #define RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT (1UL << 9) #define RPC_CLNT_CREATE_SOFTERR (1UL << 10) #define RPC_CLNT_CREATE_REUSEPORT (1UL << 11) +#define RPC_CLNT_CREATE_IGNORE_NULL_UNAVAIL (1UL << 12) struct rpc_clnt *rpc_create(struct rpc_create_args *args); struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *, diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c index 61c276bddaf2..8ca1d809b78d 100644 --- a/net/sunrpc/auth_gss/gss_rpc_upcall.c +++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c @@ -97,7 +97,7 @@ static int gssp_rpc_create(struct net *net, struct rpc_clnt **_clnt) * timeout, which would result in reconnections being * done without the correct namespace: */ - .flags = RPC_CLNT_CREATE_NOPING | + .flags = RPC_CLNT_CREATE_IGNORE_NULL_UNAVAIL | RPC_CLNT_CREATE_NO_IDLE_TIMEOUT }; struct rpc_clnt *clnt; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 98133aa54f19..22c28cf43eba 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -479,6 +479,9 @@ static struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args, if (!(args->flags & RPC_CLNT_CREATE_NOPING)) { int err = rpc_ping(clnt); + if ((args->flags & RPC_CLNT_CREATE_IGNORE_NULL_UNAVAIL) && + err == -EOPNOTSUPP) + err = 0; if (err != 0) { rpc_shutdown_client(clnt); return ERR_PTR(err); -- cgit v1.2.3-71-gd317 From 47f753c1108e287edb3e27fad8a7511a9d55578e Mon Sep 17 00:00:00 2001 From: Tan Tee Min Date: Fri, 29 Apr 2022 19:58:07 +0800 Subject: net: stmmac: disable Split Header (SPH) for Intel platforms Based on DesignWare Ethernet QoS datasheet, we are seeing the limitation of Split Header (SPH) feature is not supported for Ipv4 fragmented packet. This SPH limitation will cause ping failure when the packets size exceed the MTU size. For example, the issue happens once the basic ping packet size is larger than the configured MTU size and the data is lost inside the fragmented packet, replaced by zeros/corrupted values, and leads to ping fail. So, disable the Split Header for Intel platforms. v2: Add fixes tag in commit message. Fixes: 67afd6d1cfdf("net: stmmac: Add Split Header support and enable it in XGMAC cores") Cc: # 5.10.x Suggested-by: Ong, Boon Leong Signed-off-by: Mohammad Athari Bin Ismail Signed-off-by: Wong Vee Khee Signed-off-by: Tan Tee Min Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 1 + drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 2 +- include/linux/stmmac.h | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c index 63754a9c4ba7..0b0be0898ac5 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c @@ -454,6 +454,7 @@ static int intel_mgbe_common_data(struct pci_dev *pdev, plat->has_gmac4 = 1; plat->force_sf_dma_mode = 0; plat->tso_en = 1; + plat->sph_disable = 1; /* Multiplying factor to the clk_eee_i clock time * period to make it closer to 100 ns. This value diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 4a4b3651ab3e..2525a80353b7 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -7021,7 +7021,7 @@ int stmmac_dvr_probe(struct device *device, dev_info(priv->device, "TSO feature enabled\n"); } - if (priv->dma_cap.sphen) { + if (priv->dma_cap.sphen && !priv->plat->sph_disable) { ndev->hw_features |= NETIF_F_GRO; priv->sph_cap = true; priv->sph = priv->sph_cap; diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 24eea1b05ca2..29917850f079 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -270,5 +270,6 @@ struct plat_stmmacenet_data { int msi_rx_base_vec; int msi_tx_base_vec; bool use_phy_wol; + bool sph_disable; }; #endif -- cgit v1.2.3-71-gd317 From 170f37d6aa6ad4582eefd7459015de79e244536e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 3 May 2022 00:09:31 -0400 Subject: block: Do not call folio_next() on an unreferenced folio It is unsafe to call folio_next() on a folio unless you hold a reference on it that prevents it from being split or freed. After returning from the iterator, iomap calls folio_end_writeback() which may drop the last reference to the page, or allow the page to be split. If that happens, the iterator will not advance far enough through the bio_vec, leading to assertion failures like the BUG() in folio_end_writeback() that checks we're not trying to end writeback on a page not currently under writeback. Other assertion failures were also seen, but they're all explained by this one bug. Fix the bug by remembering where the next folio starts before returning from the iterator. There are other ways of fixing this bug, but this seems the simplest. Reported-by: Darrick J. Wong Tested-by: Darrick J. Wong Reported-by: Brian Foster Tested-by: Brian Foster Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/bio.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 278cc81cc1e7..00450fd86bb4 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -269,6 +269,7 @@ struct folio_iter { size_t offset; size_t length; /* private: for use by the iterator */ + struct folio *_next; size_t _seg_count; int _i; }; @@ -283,6 +284,7 @@ static inline void bio_first_folio(struct folio_iter *fi, struct bio *bio, PAGE_SIZE * (bvec->bv_page - &fi->folio->page); fi->_seg_count = bvec->bv_len; fi->length = min(folio_size(fi->folio) - fi->offset, fi->_seg_count); + fi->_next = folio_next(fi->folio); fi->_i = i; } @@ -290,9 +292,10 @@ static inline void bio_next_folio(struct folio_iter *fi, struct bio *bio) { fi->_seg_count -= fi->length; if (fi->_seg_count) { - fi->folio = folio_next(fi->folio); + fi->folio = fi->_next; fi->offset = 0; fi->length = min(folio_size(fi->folio), fi->_seg_count); + fi->_next = folio_next(fi->folio); } else if (fi->_i + 1 < bio->bi_vcnt) { bio_first_folio(fi, bio, fi->_i + 1); } else { -- cgit v1.2.3-71-gd317 From 85db6352fc8a158a893151baa1716463d34a20d0 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Wed, 4 May 2022 11:09:14 +0300 Subject: net: Fix features skip in for_each_netdev_feature() The find_next_netdev_feature() macro gets the "remaining length", not bit index. Passing "bit - 1" for the following iteration is wrong as it skips the adjacent bit. Pass "bit" instead. Fixes: 3b89ea9c5902 ("net: Fix for_each_netdev_feature on Big endian") Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Link: https://lore.kernel.org/r/20220504080914.1918-1-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/netdev_features.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 2c6b9e416225..7c2d77d75a88 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -169,7 +169,7 @@ enum { #define NETIF_F_HW_HSR_FWD __NETIF_F(HW_HSR_FWD) #define NETIF_F_HW_HSR_DUP __NETIF_F(HW_HSR_DUP) -/* Finds the next feature with the highest number of the range of start till 0. +/* Finds the next feature with the highest number of the range of start-1 till 0. */ static inline int find_next_netdev_feature(u64 feature, unsigned long start) { @@ -188,7 +188,7 @@ static inline int find_next_netdev_feature(u64 feature, unsigned long start) for ((bit) = find_next_netdev_feature((mask_addr), \ NETDEV_FEATURE_COUNT); \ (bit) >= 0; \ - (bit) = find_next_netdev_feature((mask_addr), (bit) - 1)) + (bit) = find_next_netdev_feature((mask_addr), (bit))) /* Features valid for ethtool to change */ /* = all defined minus driver/device-class-related */ -- cgit v1.2.3-71-gd317 From 3d1b0d351441c2be7de082b92f43d0bfdc95a8f3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 7 May 2022 13:48:21 -0400 Subject: Revert "SUNRPC: Ensure gss-proxy connects on setup" This reverts commit 892de36fd4a98fab3298d417c051d9099af5448d. The gssproxy server is unresponsive when it calls into the kernel to start the upcall service, so it will not reply to our RPC ping at all. Reported-by: "J.Bruce Fields" Fixes: 892de36fd4a9 ("SUNRPC: Ensure gss-proxy connects on setup") Signed-off-by: Trond Myklebust --- include/linux/sunrpc/clnt.h | 1 - net/sunrpc/auth_gss/gss_rpc_upcall.c | 2 +- net/sunrpc/clnt.c | 3 --- 3 files changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index db5149567305..267b7aeaf1a6 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -160,7 +160,6 @@ struct rpc_add_xprt_test { #define RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT (1UL << 9) #define RPC_CLNT_CREATE_SOFTERR (1UL << 10) #define RPC_CLNT_CREATE_REUSEPORT (1UL << 11) -#define RPC_CLNT_CREATE_IGNORE_NULL_UNAVAIL (1UL << 12) struct rpc_clnt *rpc_create(struct rpc_create_args *args); struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *, diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c index 8ca1d809b78d..61c276bddaf2 100644 --- a/net/sunrpc/auth_gss/gss_rpc_upcall.c +++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c @@ -97,7 +97,7 @@ static int gssp_rpc_create(struct net *net, struct rpc_clnt **_clnt) * timeout, which would result in reconnections being * done without the correct namespace: */ - .flags = RPC_CLNT_CREATE_IGNORE_NULL_UNAVAIL | + .flags = RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_NO_IDLE_TIMEOUT }; struct rpc_clnt *clnt; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 22c28cf43eba..98133aa54f19 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -479,9 +479,6 @@ static struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args, if (!(args->flags & RPC_CLNT_CREATE_NOPING)) { int err = rpc_ping(clnt); - if ((args->flags & RPC_CLNT_CREATE_IGNORE_NULL_UNAVAIL) && - err == -EOPNOTSUPP) - err = 0; if (err != 0) { rpc_shutdown_client(clnt); return ERR_PTR(err); -- cgit v1.2.3-71-gd317 From fd13359f54ee854f00134abc6be32da94ec53dbf Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 7 May 2022 13:53:59 -0400 Subject: SUNRPC: Ensure that the gssproxy client can start in a connected state Ensure that the gssproxy client connects to the server from the gssproxy daemon process context so that the AF_LOCAL socket connection is done using the correct path and namespaces. Fixes: 1d658336b05f ("SUNRPC: Add RPC based upcall mechanism for RPCGSS auth") Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- include/linux/sunrpc/clnt.h | 1 + net/sunrpc/auth_gss/gss_rpc_upcall.c | 1 + net/sunrpc/clnt.c | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 35 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 267b7aeaf1a6..90501404fa49 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -160,6 +160,7 @@ struct rpc_add_xprt_test { #define RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT (1UL << 9) #define RPC_CLNT_CREATE_SOFTERR (1UL << 10) #define RPC_CLNT_CREATE_REUSEPORT (1UL << 11) +#define RPC_CLNT_CREATE_CONNECTED (1UL << 12) struct rpc_clnt *rpc_create(struct rpc_create_args *args); struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *, diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c index 61c276bddaf2..f549e4c05def 100644 --- a/net/sunrpc/auth_gss/gss_rpc_upcall.c +++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c @@ -98,6 +98,7 @@ static int gssp_rpc_create(struct net *net, struct rpc_clnt **_clnt) * done without the correct namespace: */ .flags = RPC_CLNT_CREATE_NOPING | + RPC_CLNT_CREATE_CONNECTED | RPC_CLNT_CREATE_NO_IDLE_TIMEOUT }; struct rpc_clnt *clnt; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 98133aa54f19..e2c6eca0271b 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -76,6 +76,7 @@ static int rpc_encode_header(struct rpc_task *task, static int rpc_decode_header(struct rpc_task *task, struct xdr_stream *xdr); static int rpc_ping(struct rpc_clnt *clnt); +static int rpc_ping_noreply(struct rpc_clnt *clnt); static void rpc_check_timeout(struct rpc_task *task); static void rpc_register_client(struct rpc_clnt *clnt) @@ -483,6 +484,12 @@ static struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args, rpc_shutdown_client(clnt); return ERR_PTR(err); } + } else if (args->flags & RPC_CLNT_CREATE_CONNECTED) { + int err = rpc_ping_noreply(clnt); + if (err != 0) { + rpc_shutdown_client(clnt); + return ERR_PTR(err); + } } clnt->cl_softrtry = 1; @@ -2709,6 +2716,10 @@ static const struct rpc_procinfo rpcproc_null = { .p_decode = rpcproc_decode_null, }; +static const struct rpc_procinfo rpcproc_null_noreply = { + .p_encode = rpcproc_encode_null, +}; + static void rpc_null_call_prepare(struct rpc_task *task, void *data) { @@ -2762,6 +2773,28 @@ static int rpc_ping(struct rpc_clnt *clnt) return status; } +static int rpc_ping_noreply(struct rpc_clnt *clnt) +{ + struct rpc_message msg = { + .rpc_proc = &rpcproc_null_noreply, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clnt, + .rpc_message = &msg, + .callback_ops = &rpc_null_ops, + .flags = RPC_TASK_SOFT | RPC_TASK_SOFTCONN | RPC_TASK_NULLCREDS, + }; + struct rpc_task *task; + int status; + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + status = task->tk_status; + rpc_put_task(task); + return status; +} + struct rpc_cb_add_xprt_calldata { struct rpc_xprt_switch *xps; struct rpc_xprt *xprt; -- cgit v1.2.3-71-gd317 From 2e3afb42dd480d755cd7d97ca04586a5616c1a5e Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sun, 8 May 2022 11:37:09 +0200 Subject: blk-mq: remove the error_count from struct request The last two users were floppy.c and ataflop.c respectively, it was verified that no other drivers makes use of this, so let's remove it. Suggested-by: Linus Torvalds Cc: Minh Yuan Cc: Denis Efremov , Cc: Geert Uytterhoeven Cc: Christoph Hellwig Signed-off-by: Willy Tarreau Signed-off-by: Linus Torvalds --- include/linux/blk-mq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 7aa5c54901a9..9f07061418db 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -163,7 +163,6 @@ struct request { struct rb_node rb_node; /* sort/lookup */ struct bio_vec special_vec; void *completion_data; - int error_count; /* for legacy drivers, don't use */ }; -- cgit v1.2.3-71-gd317 From ea661ad6e1573d5b08c27444ff2ed403bf39ff66 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 10 May 2022 10:34:03 +0800 Subject: iommu/vt-d: Size Page Request Queue to avoid overflow condition PRQ overflow may cause I/O throughput congestion, resulting in unnecessary degradation of I/O performance. Appropriately increasing the length of PRQ can greatly reduce the occurrence of PRQ overflow. The count of maximum page requests that can be generated in parallel by a PCIe device is statically defined in the Outstanding Page Request Capacity field of the PCIe ATS configure space. The new length of PRQ is calculated by summing up the value of Outstanding Page Request Capacity register across all devices where Page Requests are supported on the real PR-capable platform (Intel Sapphire Rapids). The result is round to the nearest higher power of 2. The PRQ length is also double sized as the VT-d IOMMU driver only updates the Page Request Queue Head Register (PQH_REG) after processing the entire queue. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20220421113558.3504874-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20220510023407.2759143-5-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/intel-svm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h index b3b125b332aa..207ef06ba3e1 100644 --- a/include/linux/intel-svm.h +++ b/include/linux/intel-svm.h @@ -9,7 +9,7 @@ #define __INTEL_SVM_H__ /* Page Request Queue depth */ -#define PRQ_ORDER 2 +#define PRQ_ORDER 4 #define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20) #define PRQ_DEPTH ((0x1000 << PRQ_ORDER) >> 5) -- cgit v1.2.3-71-gd317 From fc0051cb95909ab56bd8c929f24d48c9870c3e3a Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 10 May 2022 10:34:05 +0800 Subject: iommu/vt-d: Check domain force_snooping against attached devices As domain->force_snooping only impacts the devices attached with the domain, there's no need to check against all IOMMU units. On the other hand, force_snooping could be set on a domain no matter whether it has been attached or not, and once set it is an immutable flag. If no device attached, the operation always succeeds. Then this empty domain can be only attached to a device of which the IOMMU supports snoop control. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20220508123525.1973626-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20220510023407.2759143-7-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 53 ++++++++++++++++++++++++++++++++++++++++++--- drivers/iommu/intel/pasid.c | 42 +++++++++++++++++++++++++++++++++++ drivers/iommu/intel/pasid.h | 2 ++ include/linux/intel-iommu.h | 1 + 4 files changed, 95 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 23985cb5c8ee..f366e8c49636 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2438,7 +2438,7 @@ static int domain_setup_first_level(struct intel_iommu *iommu, if (level == 5) flags |= PASID_FLAG_FL5LP; - if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED) + if (domain->force_snooping) flags |= PASID_FLAG_PAGE_SNOOP; return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, @@ -4410,7 +4410,7 @@ static int intel_iommu_map(struct iommu_domain *domain, prot |= DMA_PTE_READ; if (iommu_prot & IOMMU_WRITE) prot |= DMA_PTE_WRITE; - if (dmar_domain->force_snooping) + if (dmar_domain->set_pte_snp) prot |= DMA_PTE_SNP; max_addr = iova + size; @@ -4533,13 +4533,60 @@ static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, return phys; } +static bool domain_support_force_snooping(struct dmar_domain *domain) +{ + struct device_domain_info *info; + bool support = true; + + assert_spin_locked(&device_domain_lock); + list_for_each_entry(info, &domain->devices, link) { + if (!ecap_sc_support(info->iommu->ecap)) { + support = false; + break; + } + } + + return support; +} + +static void domain_set_force_snooping(struct dmar_domain *domain) +{ + struct device_domain_info *info; + + assert_spin_locked(&device_domain_lock); + + /* + * Second level page table supports per-PTE snoop control. The + * iommu_map() interface will handle this by setting SNP bit. + */ + if (!domain_use_first_level(domain)) { + domain->set_pte_snp = true; + return; + } + + list_for_each_entry(info, &domain->devices, link) + intel_pasid_setup_page_snoop_control(info->iommu, info->dev, + PASID_RID2PASID); +} + static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); + unsigned long flags; - if (!domain_update_iommu_snooping(NULL)) + if (dmar_domain->force_snooping) + return true; + + spin_lock_irqsave(&device_domain_lock, flags); + if (!domain_support_force_snooping(dmar_domain)) { + spin_unlock_irqrestore(&device_domain_lock, flags); return false; + } + + domain_set_force_snooping(dmar_domain); dmar_domain->force_snooping = true; + spin_unlock_irqrestore(&device_domain_lock, flags); + return true; } diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index f8d215d85695..d19dd66a670c 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -762,3 +762,45 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, return 0; } + +/* + * Set the page snoop control for a pasid entry which has been set up. + */ +void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu, + struct device *dev, u32 pasid) +{ + struct pasid_entry *pte; + u16 did; + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (WARN_ON(!pte || !pasid_pte_is_present(pte))) { + spin_unlock(&iommu->lock); + return; + } + + pasid_set_pgsnp(pte); + did = pasid_get_domain_id(pte); + spin_unlock(&iommu->lock); + + if (!ecap_coherent(iommu->ecap)) + clflush_cache_range(pte, sizeof(*pte)); + + /* + * VT-d spec 3.4 table23 states guides for cache invalidation: + * + * - PASID-selective-within-Domain PASID-cache invalidation + * - PASID-selective PASID-based IOTLB invalidation + * - If (pasid is RID_PASID) + * - Global Device-TLB invalidation to affected functions + * Else + * - PASID-based Device-TLB invalidation (with S=1 and + * Addr[63:12]=0x7FFFFFFF_FFFFF) to affected functions + */ + pasid_cache_invalidation_with_pasid(iommu, did, pasid); + qi_flush_piotlb(iommu, did, pasid, 0, -1, 0); + + /* Device IOTLB doesn't need to be flushed in caching mode. */ + if (!cap_caching_mode(iommu->cap)) + devtlb_invalidation_with_pasid(iommu, dev, pasid); +} diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index ab4408c824a5..583ea67fc783 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -123,4 +123,6 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, bool fault_ignore); int vcmd_alloc_pasid(struct intel_iommu *iommu, u32 *pasid); void vcmd_free_pasid(struct intel_iommu *iommu, u32 pasid); +void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu, + struct device *dev, u32 pasid); #endif /* __INTEL_PASID_H */ diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 72e5d7900e71..4f29139bbfc3 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -540,6 +540,7 @@ struct dmar_domain { u8 has_iotlb_device: 1; u8 iommu_coherency: 1; /* indicate coherency of iommu access */ u8 force_snooping : 1; /* Create IOPTEs with snoop control */ + u8 set_pte_snp:1; struct list_head devices; /* all devices' list */ struct iova_domain iovad; /* iova's that belong to this domain */ -- cgit v1.2.3-71-gd317