From 4a4472fdc098fb78f52a0848788faf46674a8423 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 12 Feb 2020 10:43:17 +0100 Subject: of: clk: Make of_clk_get_parent_{count,name}() parameter const of_clk_get_parent_count() and of_clk_get_parent_name() never modify the device nodes passed, so they can be const. Signed-off-by: Geert Uytterhoeven Link: https://lkml.kernel.org/r/20200212094317.1150-1-geert+renesas@glider.be Signed-off-by: Stephen Boyd --- include/linux/of_clk.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of_clk.h b/include/linux/of_clk.h index c86fcad23fc2..31b73a0da9db 100644 --- a/include/linux/of_clk.h +++ b/include/linux/of_clk.h @@ -11,17 +11,17 @@ struct of_device_id; #if defined(CONFIG_COMMON_CLK) && defined(CONFIG_OF) -unsigned int of_clk_get_parent_count(struct device_node *np); -const char *of_clk_get_parent_name(struct device_node *np, int index); +unsigned int of_clk_get_parent_count(const struct device_node *np); +const char *of_clk_get_parent_name(const struct device_node *np, int index); void of_clk_init(const struct of_device_id *matches); #else /* !CONFIG_COMMON_CLK || !CONFIG_OF */ -static inline unsigned int of_clk_get_parent_count(struct device_node *np) +static inline unsigned int of_clk_get_parent_count(const struct device_node *np) { return 0; } -static inline const char *of_clk_get_parent_name(struct device_node *np, +static inline const char *of_clk_get_parent_name(const struct device_node *np, int index) { return NULL; -- cgit v1.2.3-71-gd317 From 02d715b4a8182f4887d82df82a7b83aced647760 Mon Sep 17 00:00:00 2001 From: Amol Grover Date: Sun, 23 Feb 2020 22:25:39 +0530 Subject: iommu/vt-d: Fix RCU list debugging warnings dmar_drhd_units is traversed using list_for_each_entry_rcu() outside of an RCU read side critical section but under the protection of dmar_global_lock. Hence add corresponding lockdep expression to silence the following false-positive warnings: [ 1.603975] ============================= [ 1.603976] WARNING: suspicious RCU usage [ 1.603977] 5.5.4-stable #17 Not tainted [ 1.603978] ----------------------------- [ 1.603980] drivers/iommu/intel-iommu.c:4769 RCU-list traversed in non-reader section!! [ 1.603869] ============================= [ 1.603870] WARNING: suspicious RCU usage [ 1.603872] 5.5.4-stable #17 Not tainted [ 1.603874] ----------------------------- [ 1.603875] drivers/iommu/dmar.c:293 RCU-list traversed in non-reader section!! Tested-by: Madhuparna Bhowmik Signed-off-by: Amol Grover Cc: stable@vger.kernel.org Acked-by: Lu Baolu Signed-off-by: Joerg Roedel --- include/linux/dmar.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dmar.h b/include/linux/dmar.h index f64ca27dc210..712be8bc6a7c 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -69,8 +69,9 @@ struct dmar_pci_notify_info { extern struct rw_semaphore dmar_global_lock; extern struct list_head dmar_drhd_units; -#define for_each_drhd_unit(drhd) \ - list_for_each_entry_rcu(drhd, &dmar_drhd_units, list) +#define for_each_drhd_unit(drhd) \ + list_for_each_entry_rcu(drhd, &dmar_drhd_units, list, \ + dmar_rcu_check()) #define for_each_active_drhd_unit(drhd) \ list_for_each_entry_rcu(drhd, &dmar_drhd_units, list) \ @@ -81,7 +82,8 @@ extern struct list_head dmar_drhd_units; if (i=drhd->iommu, drhd->ignored) {} else #define for_each_iommu(i, drhd) \ - list_for_each_entry_rcu(drhd, &dmar_drhd_units, list) \ + list_for_each_entry_rcu(drhd, &dmar_drhd_units, list, \ + dmar_rcu_check()) \ if (i=drhd->iommu, 0) {} else static inline bool dmar_rcu_check(void) -- cgit v1.2.3-71-gd317 From 8019ad13ef7f64be44d4f892af9c840179009254 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 4 Mar 2020 11:28:31 +0100 Subject: futex: Fix inode life-time issue As reported by Jann, ihold() does not in fact guarantee inode persistence. And instead of making it so, replace the usage of inode pointers with a per boot, machine wide, unique inode identifier. This sequence number is global, but shared (file backed) futexes are rare enough that this should not become a performance issue. Reported-by: Jann Horn Suggested-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) --- fs/inode.c | 1 + include/linux/fs.h | 1 + include/linux/futex.h | 17 ++++++---- kernel/futex.c | 89 ++++++++++++++++++++++++++++++--------------------- 4 files changed, 65 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/fs/inode.c b/fs/inode.c index 7d57068b6b7a..93d9252a00ab 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -138,6 +138,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_sb = sb; inode->i_blkbits = sb->s_blocksize_bits; inode->i_flags = 0; + atomic64_set(&inode->i_sequence, 0); atomic_set(&inode->i_count, 1); inode->i_op = &empty_iops; inode->i_fop = &no_open_fops; diff --git a/include/linux/fs.h b/include/linux/fs.h index 3cd4fe6b845e..abedbffe2c9e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -698,6 +698,7 @@ struct inode { struct rcu_head i_rcu; }; atomic64_t i_version; + atomic64_t i_sequence; /* see futex */ atomic_t i_count; atomic_t i_dio_count; atomic_t i_writecount; diff --git a/include/linux/futex.h b/include/linux/futex.h index 5cc3fed27d4c..b70df27d7e85 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -31,23 +31,26 @@ struct task_struct; union futex_key { struct { + u64 i_seq; unsigned long pgoff; - struct inode *inode; - int offset; + unsigned int offset; } shared; struct { + union { + struct mm_struct *mm; + u64 __tmp; + }; unsigned long address; - struct mm_struct *mm; - int offset; + unsigned int offset; } private; struct { + u64 ptr; unsigned long word; - void *ptr; - int offset; + unsigned int offset; } both; }; -#define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = NULL } } +#define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = 0ULL } } #ifdef CONFIG_FUTEX enum { diff --git a/kernel/futex.c b/kernel/futex.c index 0cf84c8664f2..e14f7cd45dbd 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -429,7 +429,7 @@ static void get_futex_key_refs(union futex_key *key) switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: - ihold(key->shared.inode); /* implies smp_mb(); (B) */ + smp_mb(); /* explicit smp_mb(); (B) */ break; case FUT_OFF_MMSHARED: futex_get_mm(key); /* implies smp_mb(); (B) */ @@ -463,7 +463,6 @@ static void drop_futex_key_refs(union futex_key *key) switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: - iput(key->shared.inode); break; case FUT_OFF_MMSHARED: mmdrop(key->private.mm); @@ -505,6 +504,46 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, return timeout; } +/* + * Generate a machine wide unique identifier for this inode. + * + * This relies on u64 not wrapping in the life-time of the machine; which with + * 1ns resolution means almost 585 years. + * + * This further relies on the fact that a well formed program will not unmap + * the file while it has a (shared) futex waiting on it. This mapping will have + * a file reference which pins the mount and inode. + * + * If for some reason an inode gets evicted and read back in again, it will get + * a new sequence number and will _NOT_ match, even though it is the exact same + * file. + * + * It is important that match_futex() will never have a false-positive, esp. + * for PI futexes that can mess up the state. The above argues that false-negatives + * are only possible for malformed programs. + */ +static u64 get_inode_sequence_number(struct inode *inode) +{ + static atomic64_t i_seq; + u64 old; + + /* Does the inode already have a sequence number? */ + old = atomic64_read(&inode->i_sequence); + if (likely(old)) + return old; + + for (;;) { + u64 new = atomic64_add_return(1, &i_seq); + if (WARN_ON_ONCE(!new)) + continue; + + old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new); + if (old) + return old; + return new; + } +} + /** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex @@ -517,9 +556,15 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, * * The key words are stored in @key on success. * - * For shared mappings, it's (page->index, file_inode(vma->vm_file), - * offset_within_page). For private mappings, it's (uaddr, current->mm). - * We can usually work out the index without swapping in the page. + * For shared mappings (when @fshared), the key is: + * ( inode->i_sequence, page->index, offset_within_page ) + * [ also see get_inode_sequence_number() ] + * + * For private mappings (or when !@fshared), the key is: + * ( current->mm, address, 0 ) + * + * This allows (cross process, where applicable) identification of the futex + * without keeping the page pinned for the duration of the FUTEX_WAIT. * * lock_page() might sleep, the caller should not hold a spinlock. */ @@ -659,8 +704,6 @@ again: key->private.mm = mm; key->private.address = address; - get_futex_key_refs(key); /* implies smp_mb(); (B) */ - } else { struct inode *inode; @@ -692,40 +735,14 @@ again: goto again; } - /* - * Take a reference unless it is about to be freed. Previously - * this reference was taken by ihold under the page lock - * pinning the inode in place so i_lock was unnecessary. The - * only way for this check to fail is if the inode was - * truncated in parallel which is almost certainly an - * application bug. In such a case, just retry. - * - * We are not calling into get_futex_key_refs() in file-backed - * cases, therefore a successful atomic_inc return below will - * guarantee that get_futex_key() will still imply smp_mb(); (B). - */ - if (!atomic_inc_not_zero(&inode->i_count)) { - rcu_read_unlock(); - put_page(page); - - goto again; - } - - /* Should be impossible but lets be paranoid for now */ - if (WARN_ON_ONCE(inode->i_mapping != mapping)) { - err = -EFAULT; - rcu_read_unlock(); - iput(inode); - - goto out; - } - key->both.offset |= FUT_OFF_INODE; /* inode-based key */ - key->shared.inode = inode; + key->shared.i_seq = get_inode_sequence_number(inode); key->shared.pgoff = basepage_index(tail); rcu_read_unlock(); } + get_futex_key_refs(key); /* implies smp_mb(); (B) */ + out: put_page(page); return err; -- cgit v1.2.3-71-gd317 From f5152416528c2295f35dd9c9bd4fb27c4032413d Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 5 Mar 2020 15:15:02 -0500 Subject: iommu/vt-d: Silence RCU-list debugging warnings Similar to the commit 02d715b4a818 ("iommu/vt-d: Fix RCU list debugging warnings"), there are several other places that call list_for_each_entry_rcu() outside of an RCU read side critical section but with dmar_global_lock held. Silence those false positives as well. drivers/iommu/intel-iommu.c:4288 RCU-list traversed in non-reader section!! 1 lock held by swapper/0/1: #0: ffffffff935892c8 (dmar_global_lock){+.+.}, at: intel_iommu_init+0x1ad/0xb97 drivers/iommu/dmar.c:366 RCU-list traversed in non-reader section!! 1 lock held by swapper/0/1: #0: ffffffff935892c8 (dmar_global_lock){+.+.}, at: intel_iommu_init+0x125/0xb97 drivers/iommu/intel-iommu.c:5057 RCU-list traversed in non-reader section!! 1 lock held by swapper/0/1: #0: ffffffffa71892c8 (dmar_global_lock){++++}, at: intel_iommu_init+0x61a/0xb13 Signed-off-by: Qian Cai Acked-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/dmar.c | 3 ++- include/linux/dmar.h | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index 071bb42bbbc5..7b16c4db40b4 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -363,7 +363,8 @@ dmar_find_dmaru(struct acpi_dmar_hardware_unit *drhd) { struct dmar_drhd_unit *dmaru; - list_for_each_entry_rcu(dmaru, &dmar_drhd_units, list) + list_for_each_entry_rcu(dmaru, &dmar_drhd_units, list, + dmar_rcu_check()) if (dmaru->segment == drhd->segment && dmaru->reg_base_addr == drhd->address) return dmaru; diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 712be8bc6a7c..d7bf029df737 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -74,11 +74,13 @@ extern struct list_head dmar_drhd_units; dmar_rcu_check()) #define for_each_active_drhd_unit(drhd) \ - list_for_each_entry_rcu(drhd, &dmar_drhd_units, list) \ + list_for_each_entry_rcu(drhd, &dmar_drhd_units, list, \ + dmar_rcu_check()) \ if (drhd->ignored) {} else #define for_each_active_iommu(i, drhd) \ - list_for_each_entry_rcu(drhd, &dmar_drhd_units, list) \ + list_for_each_entry_rcu(drhd, &dmar_drhd_units, list, \ + dmar_rcu_check()) \ if (i=drhd->iommu, drhd->ignored) {} else #define for_each_iommu(i, drhd) \ -- cgit v1.2.3-71-gd317 From 1292e3efb149ee21d8d33d725eeed4e6b1ade963 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 10 Mar 2020 12:49:43 +0100 Subject: mmc: core: Allow host controllers to require R1B for CMD6 It has turned out that some host controllers can't use R1B for CMD6 and other commands that have R1B associated with them. Therefore invent a new host cap, MMC_CAP_NEED_RSP_BUSY to let them specify this. In __mmc_switch(), let's check the flag and use it to prevent R1B responses from being converted into R1. Note that, this also means that the host are on its own, when it comes to manage the busy timeout. Suggested-by: Sowjanya Komatineni Cc: Tested-by: Anders Roxell Tested-by: Sowjanya Komatineni Tested-by: Faiz Abbas Tested-By: Peter Geis Signed-off-by: Ulf Hansson --- drivers/mmc/core/mmc_ops.c | 6 ++++-- include/linux/mmc/host.h | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/mmc_ops.c b/drivers/mmc/core/mmc_ops.c index da425ee2d9bf..e025604e17d4 100644 --- a/drivers/mmc/core/mmc_ops.c +++ b/drivers/mmc/core/mmc_ops.c @@ -542,9 +542,11 @@ int __mmc_switch(struct mmc_card *card, u8 set, u8 index, u8 value, * If the max_busy_timeout of the host is specified, make sure it's * enough to fit the used timeout_ms. In case it's not, let's instruct * the host to avoid HW busy detection, by converting to a R1 response - * instead of a R1B. + * instead of a R1B. Note, some hosts requires R1B, which also means + * they are on their own when it comes to deal with the busy timeout. */ - if (host->max_busy_timeout && (timeout_ms > host->max_busy_timeout)) + if (!(host->caps & MMC_CAP_NEED_RSP_BUSY) && host->max_busy_timeout && + (timeout_ms > host->max_busy_timeout)) use_r1b_resp = false; cmd.opcode = MMC_SWITCH; diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index ba703384bea0..4c5eb3aa8e72 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -333,6 +333,7 @@ struct mmc_host { MMC_CAP_UHS_SDR50 | MMC_CAP_UHS_SDR104 | \ MMC_CAP_UHS_DDR50) #define MMC_CAP_SYNC_RUNTIME_PM (1 << 21) /* Synced runtime PM suspends. */ +#define MMC_CAP_NEED_RSP_BUSY (1 << 22) /* Commands with R1B can't use R1. */ #define MMC_CAP_DRIVER_TYPE_A (1 << 23) /* Host supports Driver Type A */ #define MMC_CAP_DRIVER_TYPE_C (1 << 24) /* Host supports Driver Type C */ #define MMC_CAP_DRIVER_TYPE_D (1 << 25) /* Host supports Driver Type D */ -- cgit v1.2.3-71-gd317 From b53df2e7442c73a932fb74228147fb946e531585 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Fri, 21 Feb 2020 10:37:08 +0900 Subject: block: Fix partition support for host aware zoned block devices Commit b72053072c0b ("block: allow partitions on host aware zone devices") introduced the helper function disk_has_partitions() to check if a given disk has valid partitions. However, since this function result directly depends on the disk partition table length rather than the actual existence of valid partitions in the table, it returns true even after all partitions are removed from the disk. For host aware zoned block devices, this results in zone management support to be kept disabled even after removing all partitions. Fix this by changing disk_has_partitions() to walk through the partition table entries and return true if and only if a valid non-zero size partition is found. Fixes: b72053072c0b ("block: allow partitions on host aware zone devices") Cc: stable@vger.kernel.org # 5.5 Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Shin'ichiro Kawasaki Signed-off-by: Jens Axboe --- block/genhd.c | 36 ++++++++++++++++++++++++++++++++++++ include/linux/genhd.h | 13 +------------ 2 files changed, 37 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index ff6268970ddc..9c2e13ce0d19 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -301,6 +301,42 @@ struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) } EXPORT_SYMBOL_GPL(disk_map_sector_rcu); +/** + * disk_has_partitions + * @disk: gendisk of interest + * + * Walk through the partition table and check if valid partition exists. + * + * CONTEXT: + * Don't care. + * + * RETURNS: + * True if the gendisk has at least one valid non-zero size partition. + * Otherwise false. + */ +bool disk_has_partitions(struct gendisk *disk) +{ + struct disk_part_tbl *ptbl; + int i; + bool ret = false; + + rcu_read_lock(); + ptbl = rcu_dereference(disk->part_tbl); + + /* Iterate partitions skipping the whole device at index 0 */ + for (i = 1; i < ptbl->len; i++) { + if (rcu_dereference(ptbl->part[i])) { + ret = true; + break; + } + } + + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL_GPL(disk_has_partitions); + /* * Can be deleted altogether. Later. * diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 6fbe58538ad6..07dc91835b98 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -245,18 +245,6 @@ static inline bool disk_part_scan_enabled(struct gendisk *disk) !(disk->flags & GENHD_FL_NO_PART_SCAN); } -static inline bool disk_has_partitions(struct gendisk *disk) -{ - bool ret = false; - - rcu_read_lock(); - if (rcu_dereference(disk->part_tbl)->len > 1) - ret = true; - rcu_read_unlock(); - - return ret; -} - static inline dev_t disk_devt(struct gendisk *disk) { return MKDEV(disk->major, disk->first_minor); @@ -298,6 +286,7 @@ extern void disk_part_iter_exit(struct disk_part_iter *piter); extern struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector); +bool disk_has_partitions(struct gendisk *disk); /* * Macros to operate on percpu disk statistics: -- cgit v1.2.3-71-gd317 From ba3b01d7a6f4ab9f8a0557044c9a7678f64ae070 Mon Sep 17 00:00:00 2001 From: Megha Dey Date: Mon, 9 Mar 2020 13:09:46 -0700 Subject: iommu/vt-d: Fix debugfs register reads Commit 6825d3ea6cde ("iommu/vt-d: Add debugfs support to show register contents") dumps the register contents for all IOMMU devices. Currently, a 64 bit read(dmar_readq) is done for all the IOMMU registers, even though some of the registers are 32 bits, which is incorrect. Use the correct read function variant (dmar_readl/dmar_readq) while reading the contents of 32/64 bit registers respectively. Signed-off-by: Megha Dey Link: https://lore.kernel.org/r/1583784587-26126-2-git-send-email-megha.dey@linux.intel.com Acked-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu-debugfs.c | 40 +++++++++++++++++++++++-------------- include/linux/intel-iommu.h | 2 ++ 2 files changed, 27 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/intel-iommu-debugfs.c b/drivers/iommu/intel-iommu-debugfs.c index c1257bef553c..0a7791934a16 100644 --- a/drivers/iommu/intel-iommu-debugfs.c +++ b/drivers/iommu/intel-iommu-debugfs.c @@ -33,38 +33,42 @@ struct iommu_regset { #define IOMMU_REGSET_ENTRY(_reg_) \ { DMAR_##_reg_##_REG, __stringify(_reg_) } -static const struct iommu_regset iommu_regs[] = { + +static const struct iommu_regset iommu_regs_32[] = { IOMMU_REGSET_ENTRY(VER), - IOMMU_REGSET_ENTRY(CAP), - IOMMU_REGSET_ENTRY(ECAP), IOMMU_REGSET_ENTRY(GCMD), IOMMU_REGSET_ENTRY(GSTS), - IOMMU_REGSET_ENTRY(RTADDR), - IOMMU_REGSET_ENTRY(CCMD), IOMMU_REGSET_ENTRY(FSTS), IOMMU_REGSET_ENTRY(FECTL), IOMMU_REGSET_ENTRY(FEDATA), IOMMU_REGSET_ENTRY(FEADDR), IOMMU_REGSET_ENTRY(FEUADDR), - IOMMU_REGSET_ENTRY(AFLOG), IOMMU_REGSET_ENTRY(PMEN), IOMMU_REGSET_ENTRY(PLMBASE), IOMMU_REGSET_ENTRY(PLMLIMIT), + IOMMU_REGSET_ENTRY(ICS), + IOMMU_REGSET_ENTRY(PRS), + IOMMU_REGSET_ENTRY(PECTL), + IOMMU_REGSET_ENTRY(PEDATA), + IOMMU_REGSET_ENTRY(PEADDR), + IOMMU_REGSET_ENTRY(PEUADDR), +}; + +static const struct iommu_regset iommu_regs_64[] = { + IOMMU_REGSET_ENTRY(CAP), + IOMMU_REGSET_ENTRY(ECAP), + IOMMU_REGSET_ENTRY(RTADDR), + IOMMU_REGSET_ENTRY(CCMD), + IOMMU_REGSET_ENTRY(AFLOG), IOMMU_REGSET_ENTRY(PHMBASE), IOMMU_REGSET_ENTRY(PHMLIMIT), IOMMU_REGSET_ENTRY(IQH), IOMMU_REGSET_ENTRY(IQT), IOMMU_REGSET_ENTRY(IQA), - IOMMU_REGSET_ENTRY(ICS), IOMMU_REGSET_ENTRY(IRTA), IOMMU_REGSET_ENTRY(PQH), IOMMU_REGSET_ENTRY(PQT), IOMMU_REGSET_ENTRY(PQA), - IOMMU_REGSET_ENTRY(PRS), - IOMMU_REGSET_ENTRY(PECTL), - IOMMU_REGSET_ENTRY(PEDATA), - IOMMU_REGSET_ENTRY(PEADDR), - IOMMU_REGSET_ENTRY(PEUADDR), IOMMU_REGSET_ENTRY(MTRRCAP), IOMMU_REGSET_ENTRY(MTRRDEF), IOMMU_REGSET_ENTRY(MTRR_FIX64K_00000), @@ -127,10 +131,16 @@ static int iommu_regset_show(struct seq_file *m, void *unused) * by adding the offset to the pointer (virtual address). */ raw_spin_lock_irqsave(&iommu->register_lock, flag); - for (i = 0 ; i < ARRAY_SIZE(iommu_regs); i++) { - value = dmar_readq(iommu->reg + iommu_regs[i].offset); + for (i = 0 ; i < ARRAY_SIZE(iommu_regs_32); i++) { + value = dmar_readl(iommu->reg + iommu_regs_32[i].offset); + seq_printf(m, "%-16s\t0x%02x\t\t0x%016llx\n", + iommu_regs_32[i].regs, iommu_regs_32[i].offset, + value); + } + for (i = 0 ; i < ARRAY_SIZE(iommu_regs_64); i++) { + value = dmar_readq(iommu->reg + iommu_regs_64[i].offset); seq_printf(m, "%-16s\t0x%02x\t\t0x%016llx\n", - iommu_regs[i].regs, iommu_regs[i].offset, + iommu_regs_64[i].regs, iommu_regs_64[i].offset, value); } raw_spin_unlock_irqrestore(&iommu->register_lock, flag); diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 4a16b39ae353..980234ae0312 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -123,6 +123,8 @@ #define dmar_readq(a) readq(a) #define dmar_writeq(a,v) writeq(v,a) +#define dmar_readl(a) readl(a) +#define dmar_writel(a, v) writel(v, a) #define DMAR_VER_MAJOR(v) (((v) & 0xf0) >> 4) #define DMAR_VER_MINOR(v) ((v) & 0x0f) -- cgit v1.2.3-71-gd317 From f1388ec4a144f40348321a0915c5535d623e165c Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Sun, 15 Mar 2020 18:17:48 +0100 Subject: netlink: add nl_set_extack_cookie_u32() Similar to existing nl_set_extack_cookie_u64(), add new helper nl_set_extack_cookie_u32() which sets extack cookie to a u32 value. Signed-off-by: Michal Kubecek Signed-off-by: David S. Miller --- include/linux/netlink.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 205fa7b1f07a..4090524c3462 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -119,6 +119,15 @@ static inline void nl_set_extack_cookie_u64(struct netlink_ext_ack *extack, extack->cookie_len = sizeof(__cookie); } +static inline void nl_set_extack_cookie_u32(struct netlink_ext_ack *extack, + u32 cookie) +{ + u32 __cookie = cookie; + + memcpy(extack->cookie, &__cookie, sizeof(__cookie)); + extack->cookie_len = sizeof(__cookie); +} + void netlink_kernel_release(struct sock *sk); int __netlink_change_ngroups(struct sock *sk, unsigned int groups); int netlink_change_ngroups(struct sock *sk, unsigned int groups); -- cgit v1.2.3-71-gd317 From 4022e7af86be2dd62975dedb6b7ea551d108695e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 19 Mar 2020 19:23:18 -0600 Subject: io_uring: make sure openat/openat2 honor rlimit nofile Dmitry reports that a test case shows that io_uring isn't honoring a modified rlimit nofile setting. get_unused_fd_flags() checks the task signal->rlimi[] for the limits. As this isn't easily inheritable, provide a __get_unused_fd_flags() that takes the value instead. Then we can grab it when the request is prepared (from the original task), and pass that in when we do the async part part of the open. Reported-by: Dmitry Kadashev Tested-by: Dmitry Kadashev Acked-by: David S. Miller Signed-off-by: Jens Axboe --- fs/file.c | 7 ++++++- fs/io_uring.c | 5 ++++- include/linux/file.h | 1 + 3 files changed, 11 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/file.c b/fs/file.c index a364e1a9b7e8..c8a4e4c86e55 100644 --- a/fs/file.c +++ b/fs/file.c @@ -540,9 +540,14 @@ static int alloc_fd(unsigned start, unsigned flags) return __alloc_fd(current->files, start, rlimit(RLIMIT_NOFILE), flags); } +int __get_unused_fd_flags(unsigned flags, unsigned long nofile) +{ + return __alloc_fd(current->files, 0, nofile, flags); +} + int get_unused_fd_flags(unsigned flags) { - return __alloc_fd(current->files, 0, rlimit(RLIMIT_NOFILE), flags); + return __get_unused_fd_flags(flags, rlimit(RLIMIT_NOFILE)); } EXPORT_SYMBOL(get_unused_fd_flags); diff --git a/fs/io_uring.c b/fs/io_uring.c index b1fbc4424aa6..fe5ded7c74ef 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -397,6 +397,7 @@ struct io_open { struct filename *filename; struct statx __user *buffer; struct open_how how; + unsigned long nofile; }; struct io_files_update { @@ -2577,6 +2578,7 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return ret; } + req->open.nofile = rlimit(RLIMIT_NOFILE); req->flags |= REQ_F_NEED_CLEANUP; return 0; } @@ -2618,6 +2620,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return ret; } + req->open.nofile = rlimit(RLIMIT_NOFILE); req->flags |= REQ_F_NEED_CLEANUP; return 0; } @@ -2636,7 +2639,7 @@ static int io_openat2(struct io_kiocb *req, struct io_kiocb **nxt, if (ret) goto err; - ret = get_unused_fd_flags(req->open.how.flags); + ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile); if (ret < 0) goto err; diff --git a/include/linux/file.h b/include/linux/file.h index c6c7b24ea9f7..142d102f285e 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -85,6 +85,7 @@ extern int f_dupfd(unsigned int from, struct file *file, unsigned flags); extern int replace_fd(unsigned fd, struct file *file, unsigned flags); extern void set_close_on_exec(unsigned int fd, int flag); extern bool get_close_on_exec(unsigned int fd); +extern int __get_unused_fd_flags(unsigned flags, unsigned long nofile); extern int get_unused_fd_flags(unsigned flags); extern void put_unused_fd(unsigned int fd); -- cgit v1.2.3-71-gd317 From 09952e3e7826119ddd4357c453d54bcc7ef25156 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 19 Mar 2020 20:16:56 -0600 Subject: io_uring: make sure accept honor rlimit nofile Just like commit 4022e7af86be, this fixes the fact that IORING_OP_ACCEPT ends up using get_unused_fd_flags(), which checks current->signal->rlim[] for limits. Add an extra argument to __sys_accept4_file() that allows us to pass in the proper nofile limit, and grab it at request prep time. Acked-by: David S. Miller Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 ++++- include/linux/socket.h | 3 ++- net/socket.c | 8 +++++--- 3 files changed, 11 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index fe5ded7c74ef..3affd96a98ba 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -343,6 +343,7 @@ struct io_accept { struct sockaddr __user *addr; int __user *addr_len; int flags; + unsigned long nofile; }; struct io_sync { @@ -3324,6 +3325,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); accept->flags = READ_ONCE(sqe->accept_flags); + accept->nofile = rlimit(RLIMIT_NOFILE); return 0; #else return -EOPNOTSUPP; @@ -3340,7 +3342,8 @@ static int __io_accept(struct io_kiocb *req, struct io_kiocb **nxt, file_flags = force_nonblock ? O_NONBLOCK : 0; ret = __sys_accept4_file(req->file, file_flags, accept->addr, - accept->addr_len, accept->flags); + accept->addr_len, accept->flags, + accept->nofile); if (ret == -EAGAIN && force_nonblock) return -EAGAIN; if (ret == -ERESTARTSYS) diff --git a/include/linux/socket.h b/include/linux/socket.h index 2d2313403101..15f3412d481e 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -401,7 +401,8 @@ extern int __sys_sendto(int fd, void __user *buff, size_t len, int addr_len); extern int __sys_accept4_file(struct file *file, unsigned file_flags, struct sockaddr __user *upeer_sockaddr, - int __user *upeer_addrlen, int flags); + int __user *upeer_addrlen, int flags, + unsigned long nofile); extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags); extern int __sys_socket(int family, int type, int protocol); diff --git a/net/socket.c b/net/socket.c index b79a05de7c6e..2eecf1517f76 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1707,7 +1707,8 @@ SYSCALL_DEFINE2(listen, int, fd, int, backlog) int __sys_accept4_file(struct file *file, unsigned file_flags, struct sockaddr __user *upeer_sockaddr, - int __user *upeer_addrlen, int flags) + int __user *upeer_addrlen, int flags, + unsigned long nofile) { struct socket *sock, *newsock; struct file *newfile; @@ -1738,7 +1739,7 @@ int __sys_accept4_file(struct file *file, unsigned file_flags, */ __module_get(newsock->ops->owner); - newfd = get_unused_fd_flags(flags); + newfd = __get_unused_fd_flags(flags, nofile); if (unlikely(newfd < 0)) { err = newfd; sock_release(newsock); @@ -1807,7 +1808,8 @@ int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, f = fdget(fd); if (f.file) { ret = __sys_accept4_file(f.file, 0, upeer_sockaddr, - upeer_addrlen, flags); + upeer_addrlen, flags, + rlimit(RLIMIT_NOFILE)); if (f.flags) fput(f.file); } -- cgit v1.2.3-71-gd317 From d72520ad004a8ce18a6ba6cde317f0081b27365a Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Sat, 21 Mar 2020 18:22:17 -0700 Subject: page-flags: fix a crash at SetPageError(THP_SWAP) Commit bd4c82c22c36 ("mm, THP, swap: delay splitting THP after swapped out") supported writing THP to a swap device but forgot to upgrade an older commit df8c94d13c7e ("page-flags: define behavior of FS/IO-related flags on compound pages") which could trigger a crash during THP swapping out with DEBUG_VM_PGFLAGS=y, kernel BUG at include/linux/page-flags.h:317! page dumped because: VM_BUG_ON_PAGE(1 && PageCompound(page)) page:fffff3b2ec3a8000 refcount:512 mapcount:0 mapping:000000009eb0338c index:0x7f6e58200 head:fffff3b2ec3a8000 order:9 compound_mapcount:0 compound_pincount:0 anon flags: 0x45fffe0000d8454(uptodate|lru|workingset|owner_priv_1|writeback|head|reclaim|swapbacked) end_swap_bio_write() SetPageError(page) VM_BUG_ON_PAGE(1 && PageCompound(page)) bio_endio+0x297/0x560 dec_pending+0x218/0x430 [dm_mod] clone_endio+0xe4/0x2c0 [dm_mod] bio_endio+0x297/0x560 blk_update_request+0x201/0x920 scsi_end_request+0x6b/0x4b0 scsi_io_completion+0x509/0x7e0 scsi_finish_command+0x1ed/0x2a0 scsi_softirq_done+0x1c9/0x1d0 __blk_mqnterrupt+0xf/0x20 Fix by checking PF_NO_TAIL in those places instead. Fixes: bd4c82c22c36 ("mm, THP, swap: delay splitting THP after swapped out") Signed-off-by: Qian Cai Signed-off-by: Andrew Morton Reviewed-by: David Hildenbrand Acked-by: "Huang, Ying" Acked-by: Rafael Aquini Cc: Link: http://lkml.kernel.org/r/20200310235846.1319-1-cai@lca.pw Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 1bf83c8fcaa7..77de28bfefb0 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -311,7 +311,7 @@ static inline int TestClearPage##uname(struct page *page) { return 0; } __PAGEFLAG(Locked, locked, PF_NO_TAIL) PAGEFLAG(Waiters, waiters, PF_ONLY_HEAD) __CLEARPAGEFLAG(Waiters, waiters, PF_ONLY_HEAD) -PAGEFLAG(Error, error, PF_NO_COMPOUND) TESTCLEARFLAG(Error, error, PF_NO_COMPOUND) +PAGEFLAG(Error, error, PF_NO_TAIL) TESTCLEARFLAG(Error, error, PF_NO_TAIL) PAGEFLAG(Referenced, referenced, PF_HEAD) TESTCLEARFLAG(Referenced, referenced, PF_HEAD) __SETPAGEFLAG(Referenced, referenced, PF_HEAD) -- cgit v1.2.3-71-gd317 From 763802b53a427ed3cbd419dbba255c414fdd9e7c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Sat, 21 Mar 2020 18:22:41 -0700 Subject: x86/mm: split vmalloc_sync_all() Commit 3f8fd02b1bf1 ("mm/vmalloc: Sync unmappings in __purge_vmap_area_lazy()") introduced a call to vmalloc_sync_all() in the vunmap() code-path. While this change was necessary to maintain correctness on x86-32-pae kernels, it also adds additional cycles for architectures that don't need it. Specifically on x86-64 with CONFIG_VMAP_STACK=y some people reported severe performance regressions in micro-benchmarks because it now also calls the x86-64 implementation of vmalloc_sync_all() on vunmap(). But the vmalloc_sync_all() implementation on x86-64 is only needed for newly created mappings. To avoid the unnecessary work on x86-64 and to gain the performance back, split up vmalloc_sync_all() into two functions: * vmalloc_sync_mappings(), and * vmalloc_sync_unmappings() Most call-sites to vmalloc_sync_all() only care about new mappings being synchronized. The only exception is the new call-site added in the above mentioned commit. Shile Zhang directed us to a report of an 80% regression in reaim throughput. Fixes: 3f8fd02b1bf1 ("mm/vmalloc: Sync unmappings in __purge_vmap_area_lazy()") Reported-by: kernel test robot Reported-by: Shile Zhang Signed-off-by: Joerg Roedel Signed-off-by: Andrew Morton Tested-by: Borislav Petkov Acked-by: Rafael J. Wysocki [GHES] Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Link: http://lkml.kernel.org/r/20191009124418.8286-1-joro@8bytes.org Link: https://lists.01.org/hyperkitty/list/lkp@lists.01.org/thread/4D3JPPHBNOSPFK2KEPC6KGKS6J25AIDB/ Link: http://lkml.kernel.org/r/20191113095530.228959-1-shile.zhang@linux.alibaba.com Signed-off-by: Linus Torvalds --- arch/x86/mm/fault.c | 26 ++++++++++++++++++++++++-- drivers/acpi/apei/ghes.c | 2 +- include/linux/vmalloc.h | 5 +++-- kernel/notifier.c | 2 +- mm/nommu.c | 10 +++++++--- mm/vmalloc.c | 11 +++++++---- 6 files changed, 43 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index fa4ea09593ab..629fdf13f846 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -190,7 +190,7 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) return pmd_k; } -void vmalloc_sync_all(void) +static void vmalloc_sync(void) { unsigned long address; @@ -217,6 +217,16 @@ void vmalloc_sync_all(void) } } +void vmalloc_sync_mappings(void) +{ + vmalloc_sync(); +} + +void vmalloc_sync_unmappings(void) +{ + vmalloc_sync(); +} + /* * 32-bit: * @@ -319,11 +329,23 @@ out: #else /* CONFIG_X86_64: */ -void vmalloc_sync_all(void) +void vmalloc_sync_mappings(void) { + /* + * 64-bit mappings might allocate new p4d/pud pages + * that need to be propagated to all tasks' PGDs. + */ sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END); } +void vmalloc_sync_unmappings(void) +{ + /* + * Unmappings never allocate or free p4d/pud pages. + * No work is required here. + */ +} + /* * 64-bit: * diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 103acbbfcf9a..24c9642e8fc7 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -171,7 +171,7 @@ int ghes_estatus_pool_init(int num_ghes) * New allocation must be visible in all pgd before it can be found by * an NMI allocating from the pool. */ - vmalloc_sync_all(); + vmalloc_sync_mappings(); rc = gen_pool_add(ghes_estatus_pool, addr, PAGE_ALIGN(len), -1); if (rc) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index ec3813236699..0507a162ccd0 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -141,8 +141,9 @@ extern int remap_vmalloc_range_partial(struct vm_area_struct *vma, extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, unsigned long pgoff); -void vmalloc_sync_all(void); - +void vmalloc_sync_mappings(void); +void vmalloc_sync_unmappings(void); + /* * Lowlevel-APIs (not for driver use!) */ diff --git a/kernel/notifier.c b/kernel/notifier.c index 63d7501ac638..5989bbb93039 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -519,7 +519,7 @@ NOKPROBE_SYMBOL(notify_die); int register_die_notifier(struct notifier_block *nb) { - vmalloc_sync_all(); + vmalloc_sync_mappings(); return atomic_notifier_chain_register(&die_chain, nb); } EXPORT_SYMBOL_GPL(register_die_notifier); diff --git a/mm/nommu.c b/mm/nommu.c index bd2b4e5ef144..318df4e236c9 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -370,10 +370,14 @@ void vm_unmap_aliases(void) EXPORT_SYMBOL_GPL(vm_unmap_aliases); /* - * Implement a stub for vmalloc_sync_all() if the architecture chose not to - * have one. + * Implement a stub for vmalloc_sync_[un]mapping() if the architecture + * chose not to have one. */ -void __weak vmalloc_sync_all(void) +void __weak vmalloc_sync_mappings(void) +{ +} + +void __weak vmalloc_sync_unmappings(void) { } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 1f46c3b86f9f..6b8eeb0ecee5 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1295,7 +1295,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) * First make sure the mappings are removed from all page-tables * before they are freed. */ - vmalloc_sync_all(); + vmalloc_sync_unmappings(); /* * TODO: to calculate a flush range without looping. @@ -3128,16 +3128,19 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, EXPORT_SYMBOL(remap_vmalloc_range); /* - * Implement a stub for vmalloc_sync_all() if the architecture chose not to - * have one. + * Implement stubs for vmalloc_sync_[un]mappings () if the architecture chose + * not to have one. * * The purpose of this function is to make sure the vmalloc area * mappings are identical in all page-tables in the system. */ -void __weak vmalloc_sync_all(void) +void __weak vmalloc_sync_mappings(void) { } +void __weak vmalloc_sync_unmappings(void) +{ +} static int f(pte_t *pte, unsigned long addr, void *data) { -- cgit v1.2.3-71-gd317 From 55b474c41e586a5c21c7ab81ff474eb6bacb4322 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Sat, 21 Mar 2020 00:46:50 +0100 Subject: netlink: check for null extack in cookie helpers Unlike NL_SET_ERR_* macros, nl_set_extack_cookie_u64() and nl_set_extack_cookie_u32() helpers do not check extack argument for null and neither do their callers, as syzbot recently discovered for ethnl_parse_header(). Instead of fixing the callers and leaving the trap in place, add check of null extack to both helpers to make them consistent with NL_SET_ERR_* macros. v2: drop incorrect second Fixes tag Fixes: 2363d73a2f3e ("ethtool: reject unrecognized request flags") Reported-by: syzbot+258a9089477493cea67b@syzkaller.appspotmail.com Signed-off-by: Michal Kubecek Signed-off-by: David S. Miller --- include/linux/netlink.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 4090524c3462..60739d0cbf93 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -115,6 +115,8 @@ static inline void nl_set_extack_cookie_u64(struct netlink_ext_ack *extack, { u64 __cookie = cookie; + if (!extack) + return; memcpy(extack->cookie, &__cookie, sizeof(__cookie)); extack->cookie_len = sizeof(__cookie); } @@ -124,6 +126,8 @@ static inline void nl_set_extack_cookie_u32(struct netlink_ext_ack *extack, { u32 __cookie = cookie; + if (!extack) + return; memcpy(extack->cookie, &__cookie, sizeof(__cookie)); extack->cookie_len = sizeof(__cookie); } -- cgit v1.2.3-71-gd317 From e80f40cbe4dd51371818e967d40da8fe305db5e4 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 24 Mar 2020 11:45:34 +0200 Subject: net: dsa: tag_8021q: replace dsa_8021q_remove_header with __skb_vlan_pop Not only did this wheel did not need reinventing, but there is also an issue with it: It doesn't remove the VLAN header in a way that preserves the L2 payload checksum when that is being provided by the DSA master hw. It should recalculate checksum both for the push, before removing the header, and for the pull afterwards. But the current implementation is quite dizzying, with pulls followed immediately afterwards by pushes, the memmove is done before the push, etc. This makes a DSA master with RX checksumming offload to print stack traces with the infamous 'hw csum failure' message. So remove the dsa_8021q_remove_header function and replace it with something that actually works with inet checksumming. Fixes: d461933638ae ("net: dsa: tag_8021q: Create helper function for removing VLAN header") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/dsa/8021q.h | 7 ------- net/dsa/tag_8021q.c | 43 ------------------------------------------- net/dsa/tag_sja1105.c | 19 +++++++++---------- 3 files changed, 9 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 0aa803c451a3..c620d9139c28 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -28,8 +28,6 @@ int dsa_8021q_rx_switch_id(u16 vid); int dsa_8021q_rx_source_port(u16 vid); -struct sk_buff *dsa_8021q_remove_header(struct sk_buff *skb); - #else int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int index, @@ -64,11 +62,6 @@ int dsa_8021q_rx_source_port(u16 vid) return 0; } -struct sk_buff *dsa_8021q_remove_header(struct sk_buff *skb) -{ - return NULL; -} - #endif /* IS_ENABLED(CONFIG_NET_DSA_TAG_8021Q) */ #endif /* _NET_DSA_8021Q_H */ diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 2fb6c26294b5..b97ad93d1c1a 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -298,47 +298,4 @@ struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, } EXPORT_SYMBOL_GPL(dsa_8021q_xmit); -/* In the DSA packet_type handler, skb->data points in the middle of the VLAN - * tag, after tpid and before tci. This is because so far, ETH_HLEN - * (DMAC, SMAC, EtherType) bytes were pulled. - * There are 2 bytes of VLAN tag left in skb->data, and upper - * layers expect the 'real' EtherType to be consumed as well. - * Coincidentally, a VLAN header is also of the same size as - * the number of bytes that need to be pulled. - * - * skb_mac_header skb->data - * | | - * v v - * | | | | | | | | | | | | | | | | | | | - * +-----------------------+-----------------------+-------+-------+-------+ - * | Destination MAC | Source MAC | TPID | TCI | EType | - * +-----------------------+-----------------------+-------+-------+-------+ - * ^ | | - * |<--VLAN_HLEN-->to <---VLAN_HLEN---> - * from | - * >>>>>>> v - * >>>>>>> | | | | | | | | | | | | | | | - * >>>>>>> +-----------------------+-----------------------+-------+ - * >>>>>>> | Destination MAC | Source MAC | EType | - * +-----------------------+-----------------------+-------+ - * ^ ^ - * (now part of | | - * skb->head) skb_mac_header skb->data - */ -struct sk_buff *dsa_8021q_remove_header(struct sk_buff *skb) -{ - u8 *from = skb_mac_header(skb); - u8 *dest = from + VLAN_HLEN; - - memmove(dest, from, ETH_HLEN - VLAN_HLEN); - skb_pull(skb, VLAN_HLEN); - skb_push(skb, ETH_HLEN); - skb_reset_mac_header(skb); - skb_reset_mac_len(skb); - skb_pull_rcsum(skb, ETH_HLEN); - - return skb; -} -EXPORT_SYMBOL_GPL(dsa_8021q_remove_header); - MODULE_LICENSE("GPL v2"); diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 5366ea430349..d553bf36bd41 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -250,14 +250,14 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, { struct sja1105_meta meta = {0}; int source_port, switch_id; - struct vlan_ethhdr *hdr; + struct ethhdr *hdr; u16 tpid, vid, tci; bool is_link_local; bool is_tagged; bool is_meta; - hdr = vlan_eth_hdr(skb); - tpid = ntohs(hdr->h_vlan_proto); + hdr = eth_hdr(skb); + tpid = ntohs(hdr->h_proto); is_tagged = (tpid == ETH_P_SJA1105); is_link_local = sja1105_is_link_local(skb); is_meta = sja1105_is_meta_frame(skb); @@ -266,7 +266,12 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, if (is_tagged) { /* Normal traffic path. */ - tci = ntohs(hdr->h_vlan_TCI); + skb_push_rcsum(skb, ETH_HLEN); + __skb_vlan_pop(skb, &tci); + skb_pull_rcsum(skb, ETH_HLEN); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + vid = tci & VLAN_VID_MASK; source_port = dsa_8021q_rx_source_port(vid); switch_id = dsa_8021q_rx_switch_id(vid); @@ -295,12 +300,6 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, return NULL; } - /* Delete/overwrite fake VLAN header, DSA expects to not find - * it there, see dsa_switch_rcv: skb_push(skb, ETH_HLEN). - */ - if (is_tagged) - skb = dsa_8021q_remove_header(skb); - return sja1105_rcv_meta_state_machine(skb, &meta, is_link_local, is_meta); } -- cgit v1.2.3-71-gd317 From 2c64605b590edadb3fb46d1ec6badb49e940b479 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 25 Mar 2020 13:47:18 +0100 Subject: net: Fix CONFIG_NET_CLS_ACT=n and CONFIG_NFT_FWD_NETDEV={y, m} build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit net/netfilter/nft_fwd_netdev.c: In function ‘nft_fwd_netdev_eval’: net/netfilter/nft_fwd_netdev.c:32:10: error: ‘struct sk_buff’ has no member named ‘tc_redirected’ pkt->skb->tc_redirected = 1; ^~ net/netfilter/nft_fwd_netdev.c:33:10: error: ‘struct sk_buff’ has no member named ‘tc_from_ingress’ pkt->skb->tc_from_ingress = 1; ^~ To avoid a direct dependency with tc actions from netfilter, wrap the redirect bits around CONFIG_NET_REDIRECT and move helpers to include/linux/skbuff.h. Turn on this toggle from the ifb driver, the only existing client of these bits in the tree. This patch adds skb_set_redirected() that sets on the redirected bit on the skbuff, it specifies if the packet was redirect from ingress and resets the timestamp (timestamp reset was originally missing in the netfilter bugfix). Fixes: bcfabee1afd99484 ("netfilter: nft_fwd_netdev: allow to redirect to ifb via ingress") Reported-by: noreply@ellerman.id.au Reported-by: Geert Uytterhoeven Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- drivers/net/Kconfig | 1 + drivers/net/ifb.c | 6 +++--- drivers/net/wireguard/queueing.h | 2 +- include/linux/skbuff.h | 36 ++++++++++++++++++++++++++++++++---- include/net/sch_generic.h | 16 ---------------- net/Kconfig | 3 +++ net/core/dev.c | 4 ++-- net/core/pktgen.c | 2 +- net/netfilter/nft_fwd_netdev.c | 5 ++--- net/sched/act_mirred.c | 6 ++---- 10 files changed, 47 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 25a8f9387d5a..db8884ad6d40 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -149,6 +149,7 @@ config NET_FC config IFB tristate "Intermediate Functional Block support" depends on NET_CLS_ACT + select NET_REDIRECT ---help--- This is an intermediate driver that allows sharing of resources. diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c index 242b9b0943f8..7fe306e76281 100644 --- a/drivers/net/ifb.c +++ b/drivers/net/ifb.c @@ -75,7 +75,7 @@ static void ifb_ri_tasklet(unsigned long _txp) } while ((skb = __skb_dequeue(&txp->tq)) != NULL) { - skb->tc_redirected = 0; + skb->redirected = 0; skb->tc_skip_classify = 1; u64_stats_update_begin(&txp->tsync); @@ -96,7 +96,7 @@ static void ifb_ri_tasklet(unsigned long _txp) rcu_read_unlock(); skb->skb_iif = txp->dev->ifindex; - if (!skb->tc_from_ingress) { + if (!skb->from_ingress) { dev_queue_xmit(skb); } else { skb_pull_rcsum(skb, skb->mac_len); @@ -243,7 +243,7 @@ static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev) txp->rx_bytes += skb->len; u64_stats_update_end(&txp->rsync); - if (!skb->tc_redirected || !skb->skb_iif) { + if (!skb->redirected || !skb->skb_iif) { dev_kfree_skb(skb); dev->stats.rx_dropped++; return NETDEV_TX_OK; diff --git a/drivers/net/wireguard/queueing.h b/drivers/net/wireguard/queueing.h index cf1e0e2376d8..3432232afe06 100644 --- a/drivers/net/wireguard/queueing.h +++ b/drivers/net/wireguard/queueing.h @@ -100,8 +100,8 @@ static inline void wg_reset_packet(struct sk_buff *skb) skb->dev = NULL; #ifdef CONFIG_NET_SCHED skb->tc_index = 0; - skb_reset_tc(skb); #endif + skb_reset_redirect(skb); skb->hdr_len = skb_headroom(skb); skb_reset_mac_header(skb); skb_reset_network_header(skb); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5b50278c4bc8..e59620234415 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -645,8 +645,8 @@ typedef unsigned char *sk_buff_data_t; * @offload_l3_fwd_mark: Packet was L3-forwarded in hardware * @tc_skip_classify: do not classify packet. set by IFB device * @tc_at_ingress: used within tc_classify to distinguish in/egress - * @tc_redirected: packet was redirected by a tc action - * @tc_from_ingress: if tc_redirected, tc_at_ingress at time of redirect + * @redirected: packet was redirected by packet classifier + * @from_ingress: packet was redirected from the ingress path * @peeked: this packet has been seen already, so stats have been * done for it, don't do them again * @nf_trace: netfilter packet trace flag @@ -848,8 +848,10 @@ struct sk_buff { #ifdef CONFIG_NET_CLS_ACT __u8 tc_skip_classify:1; __u8 tc_at_ingress:1; - __u8 tc_redirected:1; - __u8 tc_from_ingress:1; +#endif +#ifdef CONFIG_NET_REDIRECT + __u8 redirected:1; + __u8 from_ingress:1; #endif #ifdef CONFIG_TLS_DEVICE __u8 decrypted:1; @@ -4579,5 +4581,31 @@ static inline __wsum lco_csum(struct sk_buff *skb) return csum_partial(l4_hdr, csum_start - l4_hdr, partial); } +static inline bool skb_is_redirected(const struct sk_buff *skb) +{ +#ifdef CONFIG_NET_REDIRECT + return skb->redirected; +#else + return false; +#endif +} + +static inline void skb_set_redirected(struct sk_buff *skb, bool from_ingress) +{ +#ifdef CONFIG_NET_REDIRECT + skb->redirected = 1; + skb->from_ingress = from_ingress; + if (skb->from_ingress) + skb->tstamp = 0; +#endif +} + +static inline void skb_reset_redirect(struct sk_buff *skb) +{ +#ifdef CONFIG_NET_REDIRECT + skb->redirected = 0; +#endif +} + #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 151208704ed2..c30f914867e6 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -675,22 +675,6 @@ void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab); int skb_do_redirect(struct sk_buff *); -static inline void skb_reset_tc(struct sk_buff *skb) -{ -#ifdef CONFIG_NET_CLS_ACT - skb->tc_redirected = 0; -#endif -} - -static inline bool skb_is_tc_redirected(const struct sk_buff *skb) -{ -#ifdef CONFIG_NET_CLS_ACT - return skb->tc_redirected; -#else - return false; -#endif -} - static inline bool skb_at_tc_ingress(const struct sk_buff *skb) { #ifdef CONFIG_NET_CLS_ACT diff --git a/net/Kconfig b/net/Kconfig index 2eeb0e55f7c9..df8d8c9bd021 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -52,6 +52,9 @@ config NET_INGRESS config NET_EGRESS bool +config NET_REDIRECT + bool + config SKB_EXTENSIONS bool diff --git a/net/core/dev.c b/net/core/dev.c index 402a986659cf..500bba8874b0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4516,7 +4516,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, /* Reinjected packets coming from act_mirred or similar should * not get XDP generic processing. */ - if (skb_is_tc_redirected(skb)) + if (skb_is_redirected(skb)) return XDP_PASS; /* XDP packets must be linear and must have sufficient headroom @@ -5063,7 +5063,7 @@ skip_taps: goto out; } #endif - skb_reset_tc(skb); + skb_reset_redirect(skb); skip_classify: if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) goto drop; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index acc849df60b5..d0641bba6b81 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3362,7 +3362,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) /* skb was 'freed' by stack, so clean few * bits and reuse it */ - skb_reset_tc(skb); + skb_reset_redirect(skb); } while (--burst > 0); goto out; /* Skips xmit_mode M_START_XMIT */ } else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) { diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index 74f050ba6bad..3087e23297db 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -28,9 +28,8 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr, struct nft_fwd_netdev *priv = nft_expr_priv(expr); int oif = regs->data[priv->sreg_dev]; - /* These are used by ifb only. */ - pkt->skb->tc_redirected = 1; - pkt->skb->tc_from_ingress = 1; + /* This is used by ifb only. */ + skb_set_redirected(pkt->skb, true); nf_fwd_netdev_egress(pkt, oif); regs->verdict.code = NF_STOLEN; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 1ad300e6dbc0..83dd82fc9f40 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -284,10 +284,8 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, /* mirror is always swallowed */ if (is_redirect) { - skb2->tc_redirected = 1; - skb2->tc_from_ingress = skb2->tc_at_ingress; - if (skb2->tc_from_ingress) - skb2->tstamp = 0; + skb_set_redirected(skb2, skb2->tc_at_ingress); + /* let's the caller reinsert the packet, if possible */ if (use_reinsert) { res->ingress = want_ingress; -- cgit v1.2.3-71-gd317