From e4e8276a4f652be2c7bb783a0155d4adb85f5d7d Mon Sep 17 00:00:00 2001
From: Vignesh Raghavendra <vigneshr@ti.com>
Date: Tue, 4 Feb 2020 18:18:15 +0530
Subject: spi: spi-omap2-mcspi: Handle DMA size restriction on AM65x

On AM654, McSPI can only support 4K - 1 bytes per transfer when DMA is
enabled. Therefore populate master->max_transfer_size callback to
inform client drivers of this restriction when DMA channels are
available.

Signed-off-by: Vignesh Raghavendra <vigneshr@ti.com>
Link: https://lore.kernel.org/r/20200204124816.16735-2-vigneshr@ti.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/platform_data/spi-omap2-mcspi.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/spi-omap2-mcspi.h b/include/linux/platform_data/spi-omap2-mcspi.h
index 0bf9fddb8306..3b400b1919a9 100644
--- a/include/linux/platform_data/spi-omap2-mcspi.h
+++ b/include/linux/platform_data/spi-omap2-mcspi.h
@@ -11,6 +11,7 @@ struct omap2_mcspi_platform_config {
 	unsigned short	num_cs;
 	unsigned int regs_offset;
 	unsigned int pin_dir:1;
+	size_t max_xfer_len;
 };
 
 struct omap2_mcspi_device_config {
-- 
cgit v1.2.3-71-gd317


From ae91c92565494a37c30ce9a691c87890f800d826 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Fri, 22 Nov 2019 11:44:53 +0100
Subject: debugfs: remove return value of debugfs_create_regset32()

No one checks the return value of debugfs_create_regset32(), as it's not
needed, so make the return value void, so that no one tries to do so in
the future.

Link: https://lore.kernel.org/r/20191122104453.GA2017837@kroah.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/filesystems/debugfs.txt |  6 +++---
 fs/debugfs/file.c                     | 17 ++++-------------
 include/linux/debugfs.h               | 13 ++++++-------
 3 files changed, 13 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/debugfs.txt b/Documentation/filesystems/debugfs.txt
index dc497b96fa4f..55336a47a110 100644
--- a/Documentation/filesystems/debugfs.txt
+++ b/Documentation/filesystems/debugfs.txt
@@ -164,9 +164,9 @@ file.
 	void __iomem *base;
     };
 
-    struct dentry *debugfs_create_regset32(const char *name, umode_t mode,
-				     struct dentry *parent,
-				     struct debugfs_regset32 *regset);
+    debugfs_create_regset32(const char *name, umode_t mode,
+			    struct dentry *parent,
+			    struct debugfs_regset32 *regset);
 
     void debugfs_print_regs32(struct seq_file *s, struct debugfs_reg32 *regs,
 			 int nregs, void __iomem *base, char *prefix);
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 634b09d18b77..db987b5110a9 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -1090,21 +1090,12 @@ static const struct file_operations fops_regset32 = {
  * This function creates a file in debugfs with the given name that reports
  * the names and values of a set of 32-bit registers. If the @mode variable
  * is so set it can be read from. Writing is not supported.
- *
- * This function will return a pointer to a dentry if it succeeds.  This
- * pointer must be passed to the debugfs_remove() function when the file is
- * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.)  If an error occurs, ERR_PTR(-ERROR) will be
- * returned.
- *
- * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will
- * be returned.
  */
-struct dentry *debugfs_create_regset32(const char *name, umode_t mode,
-				       struct dentry *parent,
-				       struct debugfs_regset32 *regset)
+void debugfs_create_regset32(const char *name, umode_t mode,
+			     struct dentry *parent,
+			     struct debugfs_regset32 *regset)
 {
-	return debugfs_create_file(name, mode, parent, regset, &fops_regset32);
+	debugfs_create_file(name, mode, parent, regset, &fops_regset32);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_regset32);
 
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index 3d013de64f70..43efcc49f061 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -127,9 +127,9 @@ struct dentry *debugfs_create_blob(const char *name, umode_t mode,
 				  struct dentry *parent,
 				  struct debugfs_blob_wrapper *blob);
 
-struct dentry *debugfs_create_regset32(const char *name, umode_t mode,
-				     struct dentry *parent,
-				     struct debugfs_regset32 *regset);
+void debugfs_create_regset32(const char *name, umode_t mode,
+			     struct dentry *parent,
+			     struct debugfs_regset32 *regset);
 
 void debugfs_print_regs32(struct seq_file *s, const struct debugfs_reg32 *regs,
 			  int nregs, void __iomem *base, char *prefix);
@@ -304,11 +304,10 @@ static inline struct dentry *debugfs_create_blob(const char *name, umode_t mode,
 	return ERR_PTR(-ENODEV);
 }
 
-static inline struct dentry *debugfs_create_regset32(const char *name,
-				   umode_t mode, struct dentry *parent,
-				   struct debugfs_regset32 *regset)
+static inline void debugfs_create_regset32(const char *name, umode_t mode,
+					   struct dentry *parent,
+					   struct debugfs_regset32 *regset)
 {
-	return ERR_PTR(-ENODEV);
 }
 
 static inline void debugfs_print_regs32(struct seq_file *s, const struct debugfs_reg32 *regs,
-- 
cgit v1.2.3-71-gd317


From 84a4062632462c4320704fcdf8e99e89e94c0aba Mon Sep 17 00:00:00 2001
From: Johan Korsnes <jkorsnes@cisco.com>
Date: Fri, 17 Jan 2020 13:08:36 +0100
Subject: HID: core: increase HID report buffer size to 8KiB

We have a HID touch device that reports its opens and shorts test
results in HID buffers of size 8184 bytes. The maximum size of the HID
buffer is currently set to 4096 bytes, causing probe of this device to
fail. With this patch we increase the maximum size of the HID buffer to
8192 bytes, making device probe and acquisition of said buffers succeed.

Signed-off-by: Johan Korsnes <jkorsnes@cisco.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Armando Visconti <armando.visconti@st.com>
Cc: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/linux/hid.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hid.h b/include/linux/hid.h
index cd41f209043f..875f71132b14 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -492,7 +492,7 @@ struct hid_report_enum {
 };
 
 #define HID_MIN_BUFFER_SIZE	64		/* make sure there is at least a packet size of space */
-#define HID_MAX_BUFFER_SIZE	4096		/* 4kb */
+#define HID_MAX_BUFFER_SIZE	8192		/* 8kb */
 #define HID_CONTROL_FIFO_SIZE	256		/* to init devices with >100 reports */
 #define HID_OUTPUT_FIFO_SIZE	64
 
-- 
cgit v1.2.3-71-gd317


From d970a325561da5e611596cbb06475db3755ce823 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 13 Feb 2020 18:22:55 +0100
Subject: KVM: x86: fix missing prototypes

Reported with "make W=1" due to -Wmissing-prototypes.

Reported-by: Qian Cai <cai@lca.pw>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index e89eb67356cb..7944ad6ac10b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -889,6 +889,8 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu);
 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu);
 bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu);
+int kvm_arch_post_init_vm(struct kvm *kvm);
+void kvm_arch_pre_destroy_vm(struct kvm *kvm);
 
 #ifndef __KVM_HAVE_ARCH_VM_ALLOC
 /*
-- 
cgit v1.2.3-71-gd317


From 85c46b78da58398be1c5166f55063c0512decd39 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Thu, 20 Feb 2020 21:18:42 +0900
Subject: bootconfig: Add bootconfig magic word for indicating bootconfig
 explicitly

Add bootconfig magic word to the end of bootconfig on initrd
image for indicating explicitly the bootconfig is there.
Also tools/bootconfig treats wrong size or wrong checksum or
parse error as an error, because if there is a bootconfig magic
word, there must be a bootconfig.

The bootconfig magic word is "#BOOTCONFIG\n", 12 bytes word.
Thus the block image of the initrd file with bootconfig is
as follows.

[Initrd][bootconfig][size][csum][#BOOTCONFIG\n]

Link: http://lkml.kernel.org/r/158220112263.26565.3944814205960612841.stgit@devnote2

Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 Documentation/admin-guide/bootconfig.rst | 10 +++++---
 include/linux/bootconfig.h               |  3 +++
 init/Kconfig                             |  2 +-
 init/main.c                              |  6 ++++-
 tools/bootconfig/main.c                  | 43 ++++++++++++++++++++++++--------
 tools/bootconfig/test-bootconfig.sh      |  2 +-
 6 files changed, 49 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/bootconfig.rst b/Documentation/admin-guide/bootconfig.rst
index b342a6796392..5e7609936507 100644
--- a/Documentation/admin-guide/bootconfig.rst
+++ b/Documentation/admin-guide/bootconfig.rst
@@ -102,9 +102,13 @@ Boot Kernel With a Boot Config
 ==============================
 
 Since the boot configuration file is loaded with initrd, it will be added
-to the end of the initrd (initramfs) image file. The Linux kernel decodes
-the last part of the initrd image in memory to get the boot configuration
-data.
+to the end of the initrd (initramfs) image file with size, checksum and
+12-byte magic word as below.
+
+[initrd][bootconfig][size(u32)][checksum(u32)][#BOOTCONFIG\n]
+
+The Linux kernel decodes the last part of the initrd image in memory to
+get the boot configuration data.
 Because of this "piggyback" method, there is no need to change or
 update the boot loader and the kernel image itself.
 
diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h
index 7e18c939663e..d11e183fcb54 100644
--- a/include/linux/bootconfig.h
+++ b/include/linux/bootconfig.h
@@ -10,6 +10,9 @@
 #include <linux/kernel.h>
 #include <linux/types.h>
 
+#define BOOTCONFIG_MAGIC	"#BOOTCONFIG\n"
+#define BOOTCONFIG_MAGIC_LEN	12
+
 /* XBC tree node */
 struct xbc_node {
 	u16 next;
diff --git a/init/Kconfig b/init/Kconfig
index f586878410d2..a84e7aa89a29 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1222,7 +1222,7 @@ config BOOT_CONFIG
 	  Extra boot config allows system admin to pass a config file as
 	  complemental extension of kernel cmdline when booting.
 	  The boot config file must be attached at the end of initramfs
-	  with checksum and size.
+	  with checksum, size and magic word.
 	  See <file:Documentation/admin-guide/bootconfig.rst> for details.
 
 	  If unsure, say Y.
diff --git a/init/main.c b/init/main.c
index d96cc5f65022..2fe8dec93e68 100644
--- a/init/main.c
+++ b/init/main.c
@@ -374,7 +374,11 @@ static void __init setup_boot_config(const char *cmdline)
 	if (!initrd_end)
 		goto not_found;
 
-	hdr = (u32 *)(initrd_end - 8);
+	data = (char *)initrd_end - BOOTCONFIG_MAGIC_LEN;
+	if (memcmp(data, BOOTCONFIG_MAGIC, BOOTCONFIG_MAGIC_LEN))
+		goto not_found;
+
+	hdr = (u32 *)(data - 8);
 	size = hdr[0];
 	csum = hdr[1];
 
diff --git a/tools/bootconfig/main.c b/tools/bootconfig/main.c
index e18eeb070562..742271f019a9 100644
--- a/tools/bootconfig/main.c
+++ b/tools/bootconfig/main.c
@@ -131,15 +131,26 @@ int load_xbc_from_initrd(int fd, char **buf)
 	struct stat stat;
 	int ret;
 	u32 size = 0, csum = 0, rcsum;
+	char magic[BOOTCONFIG_MAGIC_LEN];
 
 	ret = fstat(fd, &stat);
 	if (ret < 0)
 		return -errno;
 
-	if (stat.st_size < 8)
+	if (stat.st_size < 8 + BOOTCONFIG_MAGIC_LEN)
 		return 0;
 
-	if (lseek(fd, -8, SEEK_END) < 0) {
+	if (lseek(fd, -BOOTCONFIG_MAGIC_LEN, SEEK_END) < 0) {
+		pr_err("Failed to lseek: %d\n", -errno);
+		return -errno;
+	}
+	if (read(fd, magic, BOOTCONFIG_MAGIC_LEN) < 0)
+		return -errno;
+	/* Check the bootconfig magic bytes */
+	if (memcmp(magic, BOOTCONFIG_MAGIC, BOOTCONFIG_MAGIC_LEN) != 0)
+		return 0;
+
+	if (lseek(fd, -(8 + BOOTCONFIG_MAGIC_LEN), SEEK_END) < 0) {
 		pr_err("Failed to lseek: %d\n", -errno);
 		return -errno;
 	}
@@ -150,11 +161,14 @@ int load_xbc_from_initrd(int fd, char **buf)
 	if (read(fd, &csum, sizeof(u32)) < 0)
 		return -errno;
 
-	/* Wrong size, maybe no boot config here */
-	if (stat.st_size < size + 8)
-		return 0;
+	/* Wrong size error  */
+	if (stat.st_size < size + 8 + BOOTCONFIG_MAGIC_LEN) {
+		pr_err("bootconfig size is too big\n");
+		return -E2BIG;
+	}
 
-	if (lseek(fd, stat.st_size - 8 - size, SEEK_SET) < 0) {
+	if (lseek(fd, stat.st_size - (size + 8 + BOOTCONFIG_MAGIC_LEN),
+		  SEEK_SET) < 0) {
 		pr_err("Failed to lseek: %d\n", -errno);
 		return -errno;
 	}
@@ -163,17 +177,17 @@ int load_xbc_from_initrd(int fd, char **buf)
 	if (ret < 0)
 		return ret;
 
-	/* Wrong Checksum, maybe no boot config here */
+	/* Wrong Checksum */
 	rcsum = checksum((unsigned char *)*buf, size);
 	if (csum != rcsum) {
 		pr_err("checksum error: %d != %d\n", csum, rcsum);
-		return 0;
+		return -EINVAL;
 	}
 
 	ret = xbc_init(*buf);
-	/* Wrong data, maybe no boot config here */
+	/* Wrong data */
 	if (ret < 0)
-		return 0;
+		return ret;
 
 	return size;
 }
@@ -226,7 +240,8 @@ int delete_xbc(const char *path)
 	} else if (size > 0) {
 		ret = fstat(fd, &stat);
 		if (!ret)
-			ret = ftruncate(fd, stat.st_size - size - 8);
+			ret = ftruncate(fd, stat.st_size
+					- size - 8 - BOOTCONFIG_MAGIC_LEN);
 		if (ret)
 			ret = -errno;
 	} /* Ignore if there is no boot config in initrd */
@@ -295,6 +310,12 @@ int apply_xbc(const char *path, const char *xbc_path)
 		pr_err("Failed to apply a boot config: %d\n", ret);
 		return ret;
 	}
+	/* Write a magic word of the bootconfig */
+	ret = write(fd, BOOTCONFIG_MAGIC, BOOTCONFIG_MAGIC_LEN);
+	if (ret < 0) {
+		pr_err("Failed to apply a boot config magic: %d\n", ret);
+		return ret;
+	}
 	close(fd);
 	free(data);
 
diff --git a/tools/bootconfig/test-bootconfig.sh b/tools/bootconfig/test-bootconfig.sh
index 1de06de328e2..adafb7c50940 100755
--- a/tools/bootconfig/test-bootconfig.sh
+++ b/tools/bootconfig/test-bootconfig.sh
@@ -49,7 +49,7 @@ xpass $BOOTCONF -a $TEMPCONF $INITRD
 new_size=$(stat -c %s $INITRD)
 
 echo "File size check"
-xpass test $new_size -eq $(expr $bconf_size + $initrd_size + 9)
+xpass test $new_size -eq $(expr $bconf_size + $initrd_size + 9 + 12)
 
 echo "Apply command repeat test"
 xpass $BOOTCONF -a $TEMPCONF $INITRD
-- 
cgit v1.2.3-71-gd317


From f66ee0410b1c3481ee75e5db9b34547b4d582465 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@netfilter.org>
Date: Tue, 11 Feb 2020 23:20:43 +0100
Subject: netfilter: ipset: Fix "INFO: rcu detected stall in hash_xxx" reports

In the case of huge hash:* types of sets, due to the single spinlock of
a set the processing of the whole set under spinlock protection could take
too long.

There were four places where the whole hash table of the set was processed
from bucket to bucket under holding the spinlock:

- During resizing a set, the original set was locked to exclude kernel side
  add/del element operations (userspace add/del is excluded by the
  nfnetlink mutex). The original set is actually just read during the
  resize, so the spinlocking is replaced with rcu locking of regions.
  However, thus there can be parallel kernel side add/del of entries.
  In order not to loose those operations a backlog is added and replayed
  after the successful resize.
- Garbage collection of timed out entries was also protected by the spinlock.
  In order not to lock too long, region locking is introduced and a single
  region is processed in one gc go. Also, the simple timer based gc running
  is replaced with a workqueue based solution. The internal book-keeping
  (number of elements, size of extensions) is moved to region level due to
  the region locking.
- Adding elements: when the max number of the elements is reached, the gc
  was called to evict the timed out entries. The new approach is that the gc
  is called just for the matching region, assuming that if the region
  (proportionally) seems to be full, then the whole set does. We could scan
  the other regions to check every entry under rcu locking, but for huge
  sets it'd mean a slowdown at adding elements.
- Listing the set header data: when the set was defined with timeout
  support, the garbage collector was called to clean up timed out entries
  to get the correct element numbers and set size values. Now the set is
  scanned to check non-timed out entries, without actually calling the gc
  for the whole set.

Thanks to Florian Westphal for helping me to solve the SOFTIRQ-safe ->
SOFTIRQ-unsafe lock order issues during working on the patch.

Reported-by: syzbot+4b0e9d4ff3cf117837e5@syzkaller.appspotmail.com
Reported-by: syzbot+c27b8d5010f45c666ed1@syzkaller.appspotmail.com
Reported-by: syzbot+68a806795ac89df3aa1c@syzkaller.appspotmail.com
Fixes: 23c42a403a9c ("netfilter: ipset: Introduction of new commands and protocol version 7")
Signed-off-by: Jozsef Kadlecsik <kadlec@netfilter.org>
---
 include/linux/netfilter/ipset/ip_set.h |  11 +-
 net/netfilter/ipset/ip_set_core.c      |  34 +-
 net/netfilter/ipset/ip_set_hash_gen.h  | 633 +++++++++++++++++++++++----------
 3 files changed, 472 insertions(+), 206 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 908d38dbcb91..5448c8b443db 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -121,6 +121,7 @@ struct ip_set_ext {
 	u32 timeout;
 	u8 packets_op;
 	u8 bytes_op;
+	bool target;
 };
 
 struct ip_set;
@@ -187,6 +188,14 @@ struct ip_set_type_variant {
 	/* Return true if "b" set is the same as "a"
 	 * according to the create set parameters */
 	bool (*same_set)(const struct ip_set *a, const struct ip_set *b);
+	/* Region-locking is used */
+	bool region_lock;
+};
+
+struct ip_set_region {
+	spinlock_t lock;	/* Region lock */
+	size_t ext_size;	/* Size of the dynamic extensions */
+	u32 elements;		/* Number of elements vs timeout */
 };
 
 /* The core set type structure */
@@ -501,7 +510,7 @@ ip_set_init_skbinfo(struct ip_set_skbinfo *skbinfo,
 }
 
 #define IP_SET_INIT_KEXT(skb, opt, set)			\
-	{ .bytes = (skb)->len, .packets = 1,		\
+	{ .bytes = (skb)->len, .packets = 1, .target = true,\
 	  .timeout = ip_set_adt_opt_timeout(opt, set) }
 
 #define IP_SET_INIT_UEXT(set)				\
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 69c107f9ba8d..8dd17589217d 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -723,6 +723,20 @@ ip_set_rcu_get(struct net *net, ip_set_id_t index)
 	return set;
 }
 
+static inline void
+ip_set_lock(struct ip_set *set)
+{
+	if (!set->variant->region_lock)
+		spin_lock_bh(&set->lock);
+}
+
+static inline void
+ip_set_unlock(struct ip_set *set)
+{
+	if (!set->variant->region_lock)
+		spin_unlock_bh(&set->lock);
+}
+
 int
 ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
 	    const struct xt_action_param *par, struct ip_set_adt_opt *opt)
@@ -744,9 +758,9 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
 	if (ret == -EAGAIN) {
 		/* Type requests element to be completed */
 		pr_debug("element must be completed, ADD is triggered\n");
-		spin_lock_bh(&set->lock);
+		ip_set_lock(set);
 		set->variant->kadt(set, skb, par, IPSET_ADD, opt);
-		spin_unlock_bh(&set->lock);
+		ip_set_unlock(set);
 		ret = 1;
 	} else {
 		/* --return-nomatch: invert matched element */
@@ -775,9 +789,9 @@ ip_set_add(ip_set_id_t index, const struct sk_buff *skb,
 	    !(opt->family == set->family || set->family == NFPROTO_UNSPEC))
 		return -IPSET_ERR_TYPE_MISMATCH;
 
-	spin_lock_bh(&set->lock);
+	ip_set_lock(set);
 	ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt);
-	spin_unlock_bh(&set->lock);
+	ip_set_unlock(set);
 
 	return ret;
 }
@@ -797,9 +811,9 @@ ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
 	    !(opt->family == set->family || set->family == NFPROTO_UNSPEC))
 		return -IPSET_ERR_TYPE_MISMATCH;
 
-	spin_lock_bh(&set->lock);
+	ip_set_lock(set);
 	ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt);
-	spin_unlock_bh(&set->lock);
+	ip_set_unlock(set);
 
 	return ret;
 }
@@ -1264,9 +1278,9 @@ ip_set_flush_set(struct ip_set *set)
 {
 	pr_debug("set: %s\n",  set->name);
 
-	spin_lock_bh(&set->lock);
+	ip_set_lock(set);
 	set->variant->flush(set);
-	spin_unlock_bh(&set->lock);
+	ip_set_unlock(set);
 }
 
 static int ip_set_flush(struct net *net, struct sock *ctnl, struct sk_buff *skb,
@@ -1713,9 +1727,9 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
 	bool eexist = flags & IPSET_FLAG_EXIST, retried = false;
 
 	do {
-		spin_lock_bh(&set->lock);
+		ip_set_lock(set);
 		ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried);
-		spin_unlock_bh(&set->lock);
+		ip_set_unlock(set);
 		retried = true;
 	} while (ret == -EAGAIN &&
 		 set->variant->resize &&
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 7480ce55b5c8..71e93eac0831 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -7,13 +7,21 @@
 #include <linux/rcupdate.h>
 #include <linux/jhash.h>
 #include <linux/types.h>
+#include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/ipset/ip_set.h>
 
-#define __ipset_dereference_protected(p, c)	rcu_dereference_protected(p, c)
-#define ipset_dereference_protected(p, set) \
-	__ipset_dereference_protected(p, lockdep_is_held(&(set)->lock))
-
-#define rcu_dereference_bh_nfnl(p)	rcu_dereference_bh_check(p, 1)
+#define __ipset_dereference(p)		\
+	rcu_dereference_protected(p, 1)
+#define ipset_dereference_nfnl(p)	\
+	rcu_dereference_protected(p,	\
+		lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET))
+#define ipset_dereference_set(p, set) 	\
+	rcu_dereference_protected(p,	\
+		lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET) || \
+		lockdep_is_held(&(set)->lock))
+#define ipset_dereference_bh_nfnl(p)	\
+	rcu_dereference_bh_check(p, 	\
+		lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET))
 
 /* Hashing which uses arrays to resolve clashing. The hash table is resized
  * (doubled) when searching becomes too long.
@@ -72,11 +80,35 @@ struct hbucket {
 		__aligned(__alignof__(u64));
 };
 
+/* Region size for locking == 2^HTABLE_REGION_BITS */
+#define HTABLE_REGION_BITS	10
+#define ahash_numof_locks(htable_bits)		\
+	((htable_bits) < HTABLE_REGION_BITS ? 1	\
+		: jhash_size((htable_bits) - HTABLE_REGION_BITS))
+#define ahash_sizeof_regions(htable_bits)		\
+	(ahash_numof_locks(htable_bits) * sizeof(struct ip_set_region))
+#define ahash_region(n, htable_bits)		\
+	((n) % ahash_numof_locks(htable_bits))
+#define ahash_bucket_start(h,  htable_bits)	\
+	((htable_bits) < HTABLE_REGION_BITS ? 0	\
+		: (h) * jhash_size(HTABLE_REGION_BITS))
+#define ahash_bucket_end(h,  htable_bits)	\
+	((htable_bits) < HTABLE_REGION_BITS ? jhash_size(htable_bits)	\
+		: ((h) + 1) * jhash_size(HTABLE_REGION_BITS))
+
+struct htable_gc {
+	struct delayed_work dwork;
+	struct ip_set *set;	/* Set the gc belongs to */
+	u32 region;		/* Last gc run position */
+};
+
 /* The hash table: the table size stored here in order to make resizing easy */
 struct htable {
 	atomic_t ref;		/* References for resizing */
-	atomic_t uref;		/* References for dumping */
+	atomic_t uref;		/* References for dumping and gc */
 	u8 htable_bits;		/* size of hash table == 2^htable_bits */
+	u32 maxelem;		/* Maxelem per region */
+	struct ip_set_region *hregion;	/* Region locks and ext sizes */
 	struct hbucket __rcu *bucket[0]; /* hashtable buckets */
 };
 
@@ -162,6 +194,10 @@ htable_bits(u32 hashsize)
 #define NLEN			0
 #endif /* IP_SET_HASH_WITH_NETS */
 
+#define SET_ELEM_EXPIRED(set, d)	\
+	(SET_WITH_TIMEOUT(set) &&	\
+	 ip_set_timeout_expired(ext_timeout(d, set)))
+
 #endif /* _IP_SET_HASH_GEN_H */
 
 #ifndef MTYPE
@@ -205,10 +241,12 @@ htable_bits(u32 hashsize)
 #undef mtype_test_cidrs
 #undef mtype_test
 #undef mtype_uref
-#undef mtype_expire
 #undef mtype_resize
+#undef mtype_ext_size
+#undef mtype_resize_ad
 #undef mtype_head
 #undef mtype_list
+#undef mtype_gc_do
 #undef mtype_gc
 #undef mtype_gc_init
 #undef mtype_variant
@@ -247,10 +285,12 @@ htable_bits(u32 hashsize)
 #define mtype_test_cidrs	IPSET_TOKEN(MTYPE, _test_cidrs)
 #define mtype_test		IPSET_TOKEN(MTYPE, _test)
 #define mtype_uref		IPSET_TOKEN(MTYPE, _uref)
-#define mtype_expire		IPSET_TOKEN(MTYPE, _expire)
 #define mtype_resize		IPSET_TOKEN(MTYPE, _resize)
+#define mtype_ext_size		IPSET_TOKEN(MTYPE, _ext_size)
+#define mtype_resize_ad		IPSET_TOKEN(MTYPE, _resize_ad)
 #define mtype_head		IPSET_TOKEN(MTYPE, _head)
 #define mtype_list		IPSET_TOKEN(MTYPE, _list)
+#define mtype_gc_do		IPSET_TOKEN(MTYPE, _gc_do)
 #define mtype_gc		IPSET_TOKEN(MTYPE, _gc)
 #define mtype_gc_init		IPSET_TOKEN(MTYPE, _gc_init)
 #define mtype_variant		IPSET_TOKEN(MTYPE, _variant)
@@ -275,8 +315,7 @@ htable_bits(u32 hashsize)
 /* The generic hash structure */
 struct htype {
 	struct htable __rcu *table; /* the hash table */
-	struct timer_list gc;	/* garbage collection when timeout enabled */
-	struct ip_set *set;	/* attached to this ip_set */
+	struct htable_gc gc;	/* gc workqueue */
 	u32 maxelem;		/* max elements in the hash */
 	u32 initval;		/* random jhash init value */
 #ifdef IP_SET_HASH_WITH_MARKMASK
@@ -288,21 +327,33 @@ struct htype {
 #ifdef IP_SET_HASH_WITH_NETMASK
 	u8 netmask;		/* netmask value for subnets to store */
 #endif
+	struct list_head ad;	/* Resize add|del backlist */
 	struct mtype_elem next; /* temporary storage for uadd */
 #ifdef IP_SET_HASH_WITH_NETS
 	struct net_prefixes nets[NLEN]; /* book-keeping of prefixes */
 #endif
 };
 
+/* ADD|DEL entries saved during resize */
+struct mtype_resize_ad {
+	struct list_head list;
+	enum ipset_adt ad;	/* ADD|DEL element */
+	struct mtype_elem d;	/* Element value */
+	struct ip_set_ext ext;	/* Extensions for ADD */
+	struct ip_set_ext mext;	/* Target extensions for ADD */
+	u32 flags;		/* Flags for ADD */
+};
+
 #ifdef IP_SET_HASH_WITH_NETS
 /* Network cidr size book keeping when the hash stores different
  * sized networks. cidr == real cidr + 1 to support /0.
  */
 static void
-mtype_add_cidr(struct htype *h, u8 cidr, u8 n)
+mtype_add_cidr(struct ip_set *set, struct htype *h, u8 cidr, u8 n)
 {
 	int i, j;
 
+	spin_lock_bh(&set->lock);
 	/* Add in increasing prefix order, so larger cidr first */
 	for (i = 0, j = -1; i < NLEN && h->nets[i].cidr[n]; i++) {
 		if (j != -1) {
@@ -311,7 +362,7 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 n)
 			j = i;
 		} else if (h->nets[i].cidr[n] == cidr) {
 			h->nets[CIDR_POS(cidr)].nets[n]++;
-			return;
+			goto unlock;
 		}
 	}
 	if (j != -1) {
@@ -320,24 +371,29 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 n)
 	}
 	h->nets[i].cidr[n] = cidr;
 	h->nets[CIDR_POS(cidr)].nets[n] = 1;
+unlock:
+	spin_unlock_bh(&set->lock);
 }
 
 static void
-mtype_del_cidr(struct htype *h, u8 cidr, u8 n)
+mtype_del_cidr(struct ip_set *set, struct htype *h, u8 cidr, u8 n)
 {
 	u8 i, j, net_end = NLEN - 1;
 
+	spin_lock_bh(&set->lock);
 	for (i = 0; i < NLEN; i++) {
 		if (h->nets[i].cidr[n] != cidr)
 			continue;
 		h->nets[CIDR_POS(cidr)].nets[n]--;
 		if (h->nets[CIDR_POS(cidr)].nets[n] > 0)
-			return;
+			goto unlock;
 		for (j = i; j < net_end && h->nets[j].cidr[n]; j++)
 			h->nets[j].cidr[n] = h->nets[j + 1].cidr[n];
 		h->nets[j].cidr[n] = 0;
-		return;
+		goto unlock;
 	}
+unlock:
+	spin_unlock_bh(&set->lock);
 }
 #endif
 
@@ -345,7 +401,7 @@ mtype_del_cidr(struct htype *h, u8 cidr, u8 n)
 static size_t
 mtype_ahash_memsize(const struct htype *h, const struct htable *t)
 {
-	return sizeof(*h) + sizeof(*t);
+	return sizeof(*h) + sizeof(*t) + ahash_sizeof_regions(t->htable_bits);
 }
 
 /* Get the ith element from the array block n */
@@ -369,24 +425,29 @@ mtype_flush(struct ip_set *set)
 	struct htype *h = set->data;
 	struct htable *t;
 	struct hbucket *n;
-	u32 i;
-
-	t = ipset_dereference_protected(h->table, set);
-	for (i = 0; i < jhash_size(t->htable_bits); i++) {
-		n = __ipset_dereference_protected(hbucket(t, i), 1);
-		if (!n)
-			continue;
-		if (set->extensions & IPSET_EXT_DESTROY)
-			mtype_ext_cleanup(set, n);
-		/* FIXME: use slab cache */
-		rcu_assign_pointer(hbucket(t, i), NULL);
-		kfree_rcu(n, rcu);
+	u32 r, i;
+
+	t = ipset_dereference_nfnl(h->table);
+	for (r = 0; r < ahash_numof_locks(t->htable_bits); r++) {
+		spin_lock_bh(&t->hregion[r].lock);
+		for (i = ahash_bucket_start(r, t->htable_bits);
+		     i < ahash_bucket_end(r, t->htable_bits); i++) {
+			n = __ipset_dereference(hbucket(t, i));
+			if (!n)
+				continue;
+			if (set->extensions & IPSET_EXT_DESTROY)
+				mtype_ext_cleanup(set, n);
+			/* FIXME: use slab cache */
+			rcu_assign_pointer(hbucket(t, i), NULL);
+			kfree_rcu(n, rcu);
+		}
+		t->hregion[r].ext_size = 0;
+		t->hregion[r].elements = 0;
+		spin_unlock_bh(&t->hregion[r].lock);
 	}
 #ifdef IP_SET_HASH_WITH_NETS
 	memset(h->nets, 0, sizeof(h->nets));
 #endif
-	set->elements = 0;
-	set->ext_size = 0;
 }
 
 /* Destroy the hashtable part of the set */
@@ -397,7 +458,7 @@ mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy)
 	u32 i;
 
 	for (i = 0; i < jhash_size(t->htable_bits); i++) {
-		n = __ipset_dereference_protected(hbucket(t, i), 1);
+		n = __ipset_dereference(hbucket(t, i));
 		if (!n)
 			continue;
 		if (set->extensions & IPSET_EXT_DESTROY && ext_destroy)
@@ -406,6 +467,7 @@ mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy)
 		kfree(n);
 	}
 
+	ip_set_free(t->hregion);
 	ip_set_free(t);
 }
 
@@ -414,28 +476,21 @@ static void
 mtype_destroy(struct ip_set *set)
 {
 	struct htype *h = set->data;
+	struct list_head *l, *lt;
 
 	if (SET_WITH_TIMEOUT(set))
-		del_timer_sync(&h->gc);
+		cancel_delayed_work_sync(&h->gc.dwork);
 
-	mtype_ahash_destroy(set,
-			    __ipset_dereference_protected(h->table, 1), true);
+	mtype_ahash_destroy(set, ipset_dereference_nfnl(h->table), true);
+	list_for_each_safe(l, lt, &h->ad) {
+		list_del(l);
+		kfree(l);
+	}
 	kfree(h);
 
 	set->data = NULL;
 }
 
-static void
-mtype_gc_init(struct ip_set *set, void (*gc)(struct timer_list *t))
-{
-	struct htype *h = set->data;
-
-	timer_setup(&h->gc, gc, 0);
-	mod_timer(&h->gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ);
-	pr_debug("gc initialized, run in every %u\n",
-		 IPSET_GC_PERIOD(set->timeout));
-}
-
 static bool
 mtype_same_set(const struct ip_set *a, const struct ip_set *b)
 {
@@ -454,11 +509,9 @@ mtype_same_set(const struct ip_set *a, const struct ip_set *b)
 	       a->extensions == b->extensions;
 }
 
-/* Delete expired elements from the hashtable */
 static void
-mtype_expire(struct ip_set *set, struct htype *h)
+mtype_gc_do(struct ip_set *set, struct htype *h, struct htable *t, u32 r)
 {
-	struct htable *t;
 	struct hbucket *n, *tmp;
 	struct mtype_elem *data;
 	u32 i, j, d;
@@ -466,10 +519,12 @@ mtype_expire(struct ip_set *set, struct htype *h)
 #ifdef IP_SET_HASH_WITH_NETS
 	u8 k;
 #endif
+	u8 htable_bits = t->htable_bits;
 
-	t = ipset_dereference_protected(h->table, set);
-	for (i = 0; i < jhash_size(t->htable_bits); i++) {
-		n = __ipset_dereference_protected(hbucket(t, i), 1);
+	spin_lock_bh(&t->hregion[r].lock);
+	for (i = ahash_bucket_start(r, htable_bits);
+	     i < ahash_bucket_end(r, htable_bits); i++) {
+		n = __ipset_dereference(hbucket(t, i));
 		if (!n)
 			continue;
 		for (j = 0, d = 0; j < n->pos; j++) {
@@ -485,58 +540,100 @@ mtype_expire(struct ip_set *set, struct htype *h)
 			smp_mb__after_atomic();
 #ifdef IP_SET_HASH_WITH_NETS
 			for (k = 0; k < IPSET_NET_COUNT; k++)
-				mtype_del_cidr(h,
+				mtype_del_cidr(set, h,
 					NCIDR_PUT(DCIDR_GET(data->cidr, k)),
 					k);
 #endif
+			t->hregion[r].elements--;
 			ip_set_ext_destroy(set, data);
-			set->elements--;
 			d++;
 		}
 		if (d >= AHASH_INIT_SIZE) {
 			if (d >= n->size) {
+				t->hregion[r].ext_size -=
+					ext_size(n->size, dsize);
 				rcu_assign_pointer(hbucket(t, i), NULL);
 				kfree_rcu(n, rcu);
 				continue;
 			}
 			tmp = kzalloc(sizeof(*tmp) +
-				      (n->size - AHASH_INIT_SIZE) * dsize,
-				      GFP_ATOMIC);
+				(n->size - AHASH_INIT_SIZE) * dsize,
+				GFP_ATOMIC);
 			if (!tmp)
-				/* Still try to delete expired elements */
+				/* Still try to delete expired elements. */
 				continue;
 			tmp->size = n->size - AHASH_INIT_SIZE;
 			for (j = 0, d = 0; j < n->pos; j++) {
 				if (!test_bit(j, n->used))
 					continue;
 				data = ahash_data(n, j, dsize);
-				memcpy(tmp->value + d * dsize, data, dsize);
+				memcpy(tmp->value + d * dsize,
+				       data, dsize);
 				set_bit(d, tmp->used);
 				d++;
 			}
 			tmp->pos = d;
-			set->ext_size -= ext_size(AHASH_INIT_SIZE, dsize);
+			t->hregion[r].ext_size -=
+				ext_size(AHASH_INIT_SIZE, dsize);
 			rcu_assign_pointer(hbucket(t, i), tmp);
 			kfree_rcu(n, rcu);
 		}
 	}
+	spin_unlock_bh(&t->hregion[r].lock);
 }
 
 static void
-mtype_gc(struct timer_list *t)
+mtype_gc(struct work_struct *work)
 {
-	struct htype *h = from_timer(h, t, gc);
-	struct ip_set *set = h->set;
+	struct htable_gc *gc;
+	struct ip_set *set;
+	struct htype *h;
+	struct htable *t;
+	u32 r, numof_locks;
+	unsigned int next_run;
+
+	gc = container_of(work, struct htable_gc, dwork.work);
+	set = gc->set;
+	h = set->data;
 
-	pr_debug("called\n");
 	spin_lock_bh(&set->lock);
-	mtype_expire(set, h);
+	t = ipset_dereference_set(h->table, set);
+	atomic_inc(&t->uref);
+	numof_locks = ahash_numof_locks(t->htable_bits);
+	r = gc->region++;
+	if (r >= numof_locks) {
+		r = gc->region = 0;
+	}
+	next_run = (IPSET_GC_PERIOD(set->timeout) * HZ) / numof_locks;
+	if (next_run < HZ/10)
+		next_run = HZ/10;
 	spin_unlock_bh(&set->lock);
 
-	h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
-	add_timer(&h->gc);
+	mtype_gc_do(set, h, t, r);
+
+	if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) {
+		pr_debug("Table destroy after resize by expire: %p\n", t);
+		mtype_ahash_destroy(set, t, false);
+	}
+
+	queue_delayed_work(system_power_efficient_wq, &gc->dwork, next_run);
+
+}
+
+static void
+mtype_gc_init(struct htable_gc *gc)
+{
+	INIT_DEFERRABLE_WORK(&gc->dwork, mtype_gc);
+	queue_delayed_work(system_power_efficient_wq, &gc->dwork, HZ);
 }
 
+static int
+mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+	  struct ip_set_ext *mext, u32 flags);
+static int
+mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+	  struct ip_set_ext *mext, u32 flags);
+
 /* Resize a hash: create a new hash table with doubling the hashsize
  * and inserting the elements to it. Repeat until we succeed or
  * fail due to memory pressures.
@@ -547,7 +644,7 @@ mtype_resize(struct ip_set *set, bool retried)
 	struct htype *h = set->data;
 	struct htable *t, *orig;
 	u8 htable_bits;
-	size_t extsize, dsize = set->dsize;
+	size_t dsize = set->dsize;
 #ifdef IP_SET_HASH_WITH_NETS
 	u8 flags;
 	struct mtype_elem *tmp;
@@ -555,7 +652,9 @@ mtype_resize(struct ip_set *set, bool retried)
 	struct mtype_elem *data;
 	struct mtype_elem *d;
 	struct hbucket *n, *m;
-	u32 i, j, key;
+	struct list_head *l, *lt;
+	struct mtype_resize_ad *x;
+	u32 i, j, r, nr, key;
 	int ret;
 
 #ifdef IP_SET_HASH_WITH_NETS
@@ -563,10 +662,8 @@ mtype_resize(struct ip_set *set, bool retried)
 	if (!tmp)
 		return -ENOMEM;
 #endif
-	rcu_read_lock_bh();
-	orig = rcu_dereference_bh_nfnl(h->table);
+	orig = ipset_dereference_bh_nfnl(h->table);
 	htable_bits = orig->htable_bits;
-	rcu_read_unlock_bh();
 
 retry:
 	ret = 0;
@@ -583,88 +680,124 @@ retry:
 		ret = -ENOMEM;
 		goto out;
 	}
+	t->hregion = ip_set_alloc(ahash_sizeof_regions(htable_bits));
+	if (!t->hregion) {
+		kfree(t);
+		ret = -ENOMEM;
+		goto out;
+	}
 	t->htable_bits = htable_bits;
+	t->maxelem = h->maxelem / ahash_numof_locks(htable_bits);
+	for (i = 0; i < ahash_numof_locks(htable_bits); i++)
+		spin_lock_init(&t->hregion[i].lock);
 
-	spin_lock_bh(&set->lock);
-	orig = __ipset_dereference_protected(h->table, 1);
-	/* There can't be another parallel resizing, but dumping is possible */
+	/* There can't be another parallel resizing,
+	 * but dumping, gc, kernel side add/del are possible
+	 */
+	orig = ipset_dereference_bh_nfnl(h->table);
 	atomic_set(&orig->ref, 1);
 	atomic_inc(&orig->uref);
-	extsize = 0;
 	pr_debug("attempt to resize set %s from %u to %u, t %p\n",
 		 set->name, orig->htable_bits, htable_bits, orig);
-	for (i = 0; i < jhash_size(orig->htable_bits); i++) {
-		n = __ipset_dereference_protected(hbucket(orig, i), 1);
-		if (!n)
-			continue;
-		for (j = 0; j < n->pos; j++) {
-			if (!test_bit(j, n->used))
+	for (r = 0; r < ahash_numof_locks(orig->htable_bits); r++) {
+		/* Expire may replace a hbucket with another one */
+		rcu_read_lock_bh();
+		for (i = ahash_bucket_start(r, orig->htable_bits);
+		     i < ahash_bucket_end(r, orig->htable_bits); i++) {
+			n = __ipset_dereference(hbucket(orig, i));
+			if (!n)
 				continue;
-			data = ahash_data(n, j, dsize);
+			for (j = 0; j < n->pos; j++) {
+				if (!test_bit(j, n->used))
+					continue;
+				data = ahash_data(n, j, dsize);
+				if (SET_ELEM_EXPIRED(set, data))
+					continue;
 #ifdef IP_SET_HASH_WITH_NETS
-			/* We have readers running parallel with us,
-			 * so the live data cannot be modified.
-			 */
-			flags = 0;
-			memcpy(tmp, data, dsize);
-			data = tmp;
-			mtype_data_reset_flags(data, &flags);
+				/* We have readers running parallel with us,
+				 * so the live data cannot be modified.
+				 */
+				flags = 0;
+				memcpy(tmp, data, dsize);
+				data = tmp;
+				mtype_data_reset_flags(data, &flags);
 #endif
-			key = HKEY(data, h->initval, htable_bits);
-			m = __ipset_dereference_protected(hbucket(t, key), 1);
-			if (!m) {
-				m = kzalloc(sizeof(*m) +
+				key = HKEY(data, h->initval, htable_bits);
+				m = __ipset_dereference(hbucket(t, key));
+				nr = ahash_region(key, htable_bits);
+				if (!m) {
+					m = kzalloc(sizeof(*m) +
 					    AHASH_INIT_SIZE * dsize,
 					    GFP_ATOMIC);
-				if (!m) {
-					ret = -ENOMEM;
-					goto cleanup;
-				}
-				m->size = AHASH_INIT_SIZE;
-				extsize += ext_size(AHASH_INIT_SIZE, dsize);
-				RCU_INIT_POINTER(hbucket(t, key), m);
-			} else if (m->pos >= m->size) {
-				struct hbucket *ht;
-
-				if (m->size >= AHASH_MAX(h)) {
-					ret = -EAGAIN;
-				} else {
-					ht = kzalloc(sizeof(*ht) +
+					if (!m) {
+						ret = -ENOMEM;
+						goto cleanup;
+					}
+					m->size = AHASH_INIT_SIZE;
+					t->hregion[nr].ext_size +=
+						ext_size(AHASH_INIT_SIZE,
+							 dsize);
+					RCU_INIT_POINTER(hbucket(t, key), m);
+				} else if (m->pos >= m->size) {
+					struct hbucket *ht;
+
+					if (m->size >= AHASH_MAX(h)) {
+						ret = -EAGAIN;
+					} else {
+						ht = kzalloc(sizeof(*ht) +
 						(m->size + AHASH_INIT_SIZE)
 						* dsize,
 						GFP_ATOMIC);
-					if (!ht)
-						ret = -ENOMEM;
+						if (!ht)
+							ret = -ENOMEM;
+					}
+					if (ret < 0)
+						goto cleanup;
+					memcpy(ht, m, sizeof(struct hbucket) +
+					       m->size * dsize);
+					ht->size = m->size + AHASH_INIT_SIZE;
+					t->hregion[nr].ext_size +=
+						ext_size(AHASH_INIT_SIZE,
+							 dsize);
+					kfree(m);
+					m = ht;
+					RCU_INIT_POINTER(hbucket(t, key), ht);
 				}
-				if (ret < 0)
-					goto cleanup;
-				memcpy(ht, m, sizeof(struct hbucket) +
-					      m->size * dsize);
-				ht->size = m->size + AHASH_INIT_SIZE;
-				extsize += ext_size(AHASH_INIT_SIZE, dsize);
-				kfree(m);
-				m = ht;
-				RCU_INIT_POINTER(hbucket(t, key), ht);
-			}
-			d = ahash_data(m, m->pos, dsize);
-			memcpy(d, data, dsize);
-			set_bit(m->pos++, m->used);
+				d = ahash_data(m, m->pos, dsize);
+				memcpy(d, data, dsize);
+				set_bit(m->pos++, m->used);
+				t->hregion[nr].elements++;
 #ifdef IP_SET_HASH_WITH_NETS
-			mtype_data_reset_flags(d, &flags);
+				mtype_data_reset_flags(d, &flags);
 #endif
+			}
 		}
+		rcu_read_unlock_bh();
 	}
-	rcu_assign_pointer(h->table, t);
-	set->ext_size = extsize;
 
-	spin_unlock_bh(&set->lock);
+	/* There can't be any other writer. */
+	rcu_assign_pointer(h->table, t);
 
 	/* Give time to other readers of the set */
 	synchronize_rcu();
 
 	pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name,
 		 orig->htable_bits, orig, t->htable_bits, t);
-	/* If there's nobody else dumping the table, destroy it */
+	/* Add/delete elements processed by the SET target during resize.
+	 * Kernel-side add cannot trigger a resize and userspace actions
+	 * are serialized by the mutex.
+	 */
+	list_for_each_safe(l, lt, &h->ad) {
+		x = list_entry(l, struct mtype_resize_ad, list);
+		if (x->ad == IPSET_ADD) {
+			mtype_add(set, &x->d, &x->ext, &x->mext, x->flags);
+		} else {
+			mtype_del(set, &x->d, NULL, NULL, 0);
+		}
+		list_del(l);
+		kfree(l);
+	}
+	/* If there's nobody else using the table, destroy it */
 	if (atomic_dec_and_test(&orig->uref)) {
 		pr_debug("Table destroy by resize %p\n", orig);
 		mtype_ahash_destroy(set, orig, false);
@@ -677,15 +810,44 @@ out:
 	return ret;
 
 cleanup:
+	rcu_read_unlock_bh();
 	atomic_set(&orig->ref, 0);
 	atomic_dec(&orig->uref);
-	spin_unlock_bh(&set->lock);
 	mtype_ahash_destroy(set, t, false);
 	if (ret == -EAGAIN)
 		goto retry;
 	goto out;
 }
 
+/* Get the current number of elements and ext_size in the set  */
+static void
+mtype_ext_size(struct ip_set *set, u32 *elements, size_t *ext_size)
+{
+	struct htype *h = set->data;
+	const struct htable *t;
+	u32 i, j, r;
+	struct hbucket *n;
+	struct mtype_elem *data;
+
+	t = rcu_dereference_bh(h->table);
+	for (r = 0; r < ahash_numof_locks(t->htable_bits); r++) {
+		for (i = ahash_bucket_start(r, t->htable_bits);
+		     i < ahash_bucket_end(r, t->htable_bits); i++) {
+			n = rcu_dereference_bh(hbucket(t, i));
+			if (!n)
+				continue;
+			for (j = 0; j < n->pos; j++) {
+				if (!test_bit(j, n->used))
+					continue;
+				data = ahash_data(n, j, set->dsize);
+				if (!SET_ELEM_EXPIRED(set, data))
+					(*elements)++;
+			}
+		}
+		*ext_size += t->hregion[r].ext_size;
+	}
+}
+
 /* Add an element to a hash and update the internal counters when succeeded,
  * otherwise report the proper error code.
  */
@@ -698,32 +860,49 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	const struct mtype_elem *d = value;
 	struct mtype_elem *data;
 	struct hbucket *n, *old = ERR_PTR(-ENOENT);
-	int i, j = -1;
+	int i, j = -1, ret;
 	bool flag_exist = flags & IPSET_FLAG_EXIST;
 	bool deleted = false, forceadd = false, reuse = false;
-	u32 key, multi = 0;
+	u32 r, key, multi = 0, elements, maxelem;
 
-	if (set->elements >= h->maxelem) {
-		if (SET_WITH_TIMEOUT(set))
-			/* FIXME: when set is full, we slow down here */
-			mtype_expire(set, h);
-		if (set->elements >= h->maxelem && SET_WITH_FORCEADD(set))
+	rcu_read_lock_bh();
+	t = rcu_dereference_bh(h->table);
+	key = HKEY(value, h->initval, t->htable_bits);
+	r = ahash_region(key, t->htable_bits);
+	atomic_inc(&t->uref);
+	elements = t->hregion[r].elements;
+	maxelem = t->maxelem;
+	if (elements >= maxelem) {
+		u32 e;
+		if (SET_WITH_TIMEOUT(set)) {
+			rcu_read_unlock_bh();
+			mtype_gc_do(set, h, t, r);
+			rcu_read_lock_bh();
+		}
+		maxelem = h->maxelem;
+		elements = 0;
+		for (e = 0; e < ahash_numof_locks(t->htable_bits); e++)
+			elements += t->hregion[e].elements;
+		if (elements >= maxelem && SET_WITH_FORCEADD(set))
 			forceadd = true;
 	}
+	rcu_read_unlock_bh();
 
-	t = ipset_dereference_protected(h->table, set);
-	key = HKEY(value, h->initval, t->htable_bits);
-	n = __ipset_dereference_protected(hbucket(t, key), 1);
+	spin_lock_bh(&t->hregion[r].lock);
+	n = rcu_dereference_bh(hbucket(t, key));
 	if (!n) {
-		if (forceadd || set->elements >= h->maxelem)
+		if (forceadd || elements >= maxelem)
 			goto set_full;
 		old = NULL;
 		n = kzalloc(sizeof(*n) + AHASH_INIT_SIZE * set->dsize,
 			    GFP_ATOMIC);
-		if (!n)
-			return -ENOMEM;
+		if (!n) {
+			ret = -ENOMEM;
+			goto unlock;
+		}
 		n->size = AHASH_INIT_SIZE;
-		set->ext_size += ext_size(AHASH_INIT_SIZE, set->dsize);
+		t->hregion[r].ext_size +=
+			ext_size(AHASH_INIT_SIZE, set->dsize);
 		goto copy_elem;
 	}
 	for (i = 0; i < n->pos; i++) {
@@ -737,19 +916,16 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 		}
 		data = ahash_data(n, i, set->dsize);
 		if (mtype_data_equal(data, d, &multi)) {
-			if (flag_exist ||
-			    (SET_WITH_TIMEOUT(set) &&
-			     ip_set_timeout_expired(ext_timeout(data, set)))) {
+			if (flag_exist || SET_ELEM_EXPIRED(set, data)) {
 				/* Just the extensions could be overwritten */
 				j = i;
 				goto overwrite_extensions;
 			}
-			return -IPSET_ERR_EXIST;
+			ret = -IPSET_ERR_EXIST;
+			goto unlock;
 		}
 		/* Reuse first timed out entry */
-		if (SET_WITH_TIMEOUT(set) &&
-		    ip_set_timeout_expired(ext_timeout(data, set)) &&
-		    j == -1) {
+		if (SET_ELEM_EXPIRED(set, data) && j == -1) {
 			j = i;
 			reuse = true;
 		}
@@ -759,16 +935,16 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 		if (!deleted) {
 #ifdef IP_SET_HASH_WITH_NETS
 			for (i = 0; i < IPSET_NET_COUNT; i++)
-				mtype_del_cidr(h,
+				mtype_del_cidr(set, h,
 					NCIDR_PUT(DCIDR_GET(data->cidr, i)),
 					i);
 #endif
 			ip_set_ext_destroy(set, data);
-			set->elements--;
+			t->hregion[r].elements--;
 		}
 		goto copy_data;
 	}
-	if (set->elements >= h->maxelem)
+	if (elements >= maxelem)
 		goto set_full;
 	/* Create a new slot */
 	if (n->pos >= n->size) {
@@ -776,28 +952,32 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 		if (n->size >= AHASH_MAX(h)) {
 			/* Trigger rehashing */
 			mtype_data_next(&h->next, d);
-			return -EAGAIN;
+			ret = -EAGAIN;
+			goto resize;
 		}
 		old = n;
 		n = kzalloc(sizeof(*n) +
 			    (old->size + AHASH_INIT_SIZE) * set->dsize,
 			    GFP_ATOMIC);
-		if (!n)
-			return -ENOMEM;
+		if (!n) {
+			ret = -ENOMEM;
+			goto unlock;
+		}
 		memcpy(n, old, sizeof(struct hbucket) +
 		       old->size * set->dsize);
 		n->size = old->size + AHASH_INIT_SIZE;
-		set->ext_size += ext_size(AHASH_INIT_SIZE, set->dsize);
+		t->hregion[r].ext_size +=
+			ext_size(AHASH_INIT_SIZE, set->dsize);
 	}
 
 copy_elem:
 	j = n->pos++;
 	data = ahash_data(n, j, set->dsize);
 copy_data:
-	set->elements++;
+	t->hregion[r].elements++;
 #ifdef IP_SET_HASH_WITH_NETS
 	for (i = 0; i < IPSET_NET_COUNT; i++)
-		mtype_add_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), i);
+		mtype_add_cidr(set, h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), i);
 #endif
 	memcpy(data, d, sizeof(struct mtype_elem));
 overwrite_extensions:
@@ -820,13 +1000,41 @@ overwrite_extensions:
 		if (old)
 			kfree_rcu(old, rcu);
 	}
+	ret = 0;
+resize:
+	spin_unlock_bh(&t->hregion[r].lock);
+	if (atomic_read(&t->ref) && ext->target) {
+		/* Resize is in process and kernel side add, save values */
+		struct mtype_resize_ad *x;
+
+		x = kzalloc(sizeof(struct mtype_resize_ad), GFP_ATOMIC);
+		if (!x)
+			/* Don't bother */
+			goto out;
+		x->ad = IPSET_ADD;
+		memcpy(&x->d, value, sizeof(struct mtype_elem));
+		memcpy(&x->ext, ext, sizeof(struct ip_set_ext));
+		memcpy(&x->mext, mext, sizeof(struct ip_set_ext));
+		x->flags = flags;
+		spin_lock_bh(&set->lock);
+		list_add_tail(&x->list, &h->ad);
+		spin_unlock_bh(&set->lock);
+	}
+	goto out;
 
-	return 0;
 set_full:
 	if (net_ratelimit())
 		pr_warn("Set %s is full, maxelem %u reached\n",
-			set->name, h->maxelem);
-	return -IPSET_ERR_HASH_FULL;
+			set->name, maxelem);
+	ret = -IPSET_ERR_HASH_FULL;
+unlock:
+	spin_unlock_bh(&t->hregion[r].lock);
+out:
+	if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) {
+		pr_debug("Table destroy after resize by add: %p\n", t);
+		mtype_ahash_destroy(set, t, false);
+	}
+	return ret;
 }
 
 /* Delete an element from the hash and free up space if possible.
@@ -840,13 +1048,23 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	const struct mtype_elem *d = value;
 	struct mtype_elem *data;
 	struct hbucket *n;
-	int i, j, k, ret = -IPSET_ERR_EXIST;
+	struct mtype_resize_ad *x = NULL;
+	int i, j, k, r, ret = -IPSET_ERR_EXIST;
 	u32 key, multi = 0;
 	size_t dsize = set->dsize;
 
-	t = ipset_dereference_protected(h->table, set);
+	/* Userspace add and resize is excluded by the mutex.
+	 * Kernespace add does not trigger resize.
+	 */
+	rcu_read_lock_bh();
+	t = rcu_dereference_bh(h->table);
 	key = HKEY(value, h->initval, t->htable_bits);
-	n = __ipset_dereference_protected(hbucket(t, key), 1);
+	r = ahash_region(key, t->htable_bits);
+	atomic_inc(&t->uref);
+	rcu_read_unlock_bh();
+
+	spin_lock_bh(&t->hregion[r].lock);
+	n = rcu_dereference_bh(hbucket(t, key));
 	if (!n)
 		goto out;
 	for (i = 0, k = 0; i < n->pos; i++) {
@@ -857,8 +1075,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 		data = ahash_data(n, i, dsize);
 		if (!mtype_data_equal(data, d, &multi))
 			continue;
-		if (SET_WITH_TIMEOUT(set) &&
-		    ip_set_timeout_expired(ext_timeout(data, set)))
+		if (SET_ELEM_EXPIRED(set, data))
 			goto out;
 
 		ret = 0;
@@ -866,20 +1083,33 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 		smp_mb__after_atomic();
 		if (i + 1 == n->pos)
 			n->pos--;
-		set->elements--;
+		t->hregion[r].elements--;
 #ifdef IP_SET_HASH_WITH_NETS
 		for (j = 0; j < IPSET_NET_COUNT; j++)
-			mtype_del_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, j)),
-				       j);
+			mtype_del_cidr(set, h,
+				       NCIDR_PUT(DCIDR_GET(d->cidr, j)), j);
 #endif
 		ip_set_ext_destroy(set, data);
 
+		if (atomic_read(&t->ref) && ext->target) {
+			/* Resize is in process and kernel side del,
+			 * save values
+			 */
+			x = kzalloc(sizeof(struct mtype_resize_ad),
+				    GFP_ATOMIC);
+			if (x) {
+				x->ad = IPSET_DEL;
+				memcpy(&x->d, value,
+				       sizeof(struct mtype_elem));
+				x->flags = flags;
+			}
+		}
 		for (; i < n->pos; i++) {
 			if (!test_bit(i, n->used))
 				k++;
 		}
 		if (n->pos == 0 && k == 0) {
-			set->ext_size -= ext_size(n->size, dsize);
+			t->hregion[r].ext_size -= ext_size(n->size, dsize);
 			rcu_assign_pointer(hbucket(t, key), NULL);
 			kfree_rcu(n, rcu);
 		} else if (k >= AHASH_INIT_SIZE) {
@@ -898,7 +1128,8 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 				k++;
 			}
 			tmp->pos = k;
-			set->ext_size -= ext_size(AHASH_INIT_SIZE, dsize);
+			t->hregion[r].ext_size -=
+				ext_size(AHASH_INIT_SIZE, dsize);
 			rcu_assign_pointer(hbucket(t, key), tmp);
 			kfree_rcu(n, rcu);
 		}
@@ -906,6 +1137,16 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	}
 
 out:
+	spin_unlock_bh(&t->hregion[r].lock);
+	if (x) {
+		spin_lock_bh(&set->lock);
+		list_add(&x->list, &h->ad);
+		spin_unlock_bh(&set->lock);
+	}
+	if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) {
+		pr_debug("Table destroy after resize by del: %p\n", t);
+		mtype_ahash_destroy(set, t, false);
+	}
 	return ret;
 }
 
@@ -991,6 +1232,7 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	int i, ret = 0;
 	u32 key, multi = 0;
 
+	rcu_read_lock_bh();
 	t = rcu_dereference_bh(h->table);
 #ifdef IP_SET_HASH_WITH_NETS
 	/* If we test an IP address and not a network address,
@@ -1022,6 +1264,7 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 			goto out;
 	}
 out:
+	rcu_read_unlock_bh();
 	return ret;
 }
 
@@ -1033,23 +1276,14 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
 	const struct htable *t;
 	struct nlattr *nested;
 	size_t memsize;
+	u32 elements = 0;
+	size_t ext_size = 0;
 	u8 htable_bits;
 
-	/* If any members have expired, set->elements will be wrong
-	 * mytype_expire function will update it with the right count.
-	 * we do not hold set->lock here, so grab it first.
-	 * set->elements can still be incorrect in the case of a huge set,
-	 * because elements might time out during the listing.
-	 */
-	if (SET_WITH_TIMEOUT(set)) {
-		spin_lock_bh(&set->lock);
-		mtype_expire(set, h);
-		spin_unlock_bh(&set->lock);
-	}
-
 	rcu_read_lock_bh();
-	t = rcu_dereference_bh_nfnl(h->table);
-	memsize = mtype_ahash_memsize(h, t) + set->ext_size;
+	t = rcu_dereference_bh(h->table);
+	mtype_ext_size(set, &elements, &ext_size);
+	memsize = mtype_ahash_memsize(h, t) + ext_size + set->ext_size;
 	htable_bits = t->htable_bits;
 	rcu_read_unlock_bh();
 
@@ -1071,7 +1305,7 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
 #endif
 	if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) ||
 	    nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) ||
-	    nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(set->elements)))
+	    nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(elements)))
 		goto nla_put_failure;
 	if (unlikely(ip_set_put_flags(skb, set)))
 		goto nla_put_failure;
@@ -1091,15 +1325,15 @@ mtype_uref(struct ip_set *set, struct netlink_callback *cb, bool start)
 
 	if (start) {
 		rcu_read_lock_bh();
-		t = rcu_dereference_bh_nfnl(h->table);
+		t = ipset_dereference_bh_nfnl(h->table);
 		atomic_inc(&t->uref);
 		cb->args[IPSET_CB_PRIVATE] = (unsigned long)t;
 		rcu_read_unlock_bh();
 	} else if (cb->args[IPSET_CB_PRIVATE]) {
 		t = (struct htable *)cb->args[IPSET_CB_PRIVATE];
 		if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) {
-			/* Resizing didn't destroy the hash table */
-			pr_debug("Table destroy by dump: %p\n", t);
+			pr_debug("Table destroy after resize "
+				 " by dump: %p\n", t);
 			mtype_ahash_destroy(set, t, false);
 		}
 		cb->args[IPSET_CB_PRIVATE] = 0;
@@ -1141,8 +1375,7 @@ mtype_list(const struct ip_set *set,
 			if (!test_bit(i, n->used))
 				continue;
 			e = ahash_data(n, i, set->dsize);
-			if (SET_WITH_TIMEOUT(set) &&
-			    ip_set_timeout_expired(ext_timeout(e, set)))
+			if (SET_ELEM_EXPIRED(set, e))
 				continue;
 			pr_debug("list hash %lu hbucket %p i %u, data %p\n",
 				 cb->args[IPSET_CB_ARG0], n, i, e);
@@ -1208,6 +1441,7 @@ static const struct ip_set_type_variant mtype_variant = {
 	.uref	= mtype_uref,
 	.resize	= mtype_resize,
 	.same_set = mtype_same_set,
+	.region_lock = true,
 };
 
 #ifdef IP_SET_EMIT_CREATE
@@ -1226,6 +1460,7 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
 	size_t hsize;
 	struct htype *h;
 	struct htable *t;
+	u32 i;
 
 	pr_debug("Create set %s with family %s\n",
 		 set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6");
@@ -1294,6 +1529,15 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
 		kfree(h);
 		return -ENOMEM;
 	}
+	t->hregion = ip_set_alloc(ahash_sizeof_regions(hbits));
+	if (!t->hregion) {
+		kfree(t);
+		kfree(h);
+		return -ENOMEM;
+	}
+	h->gc.set = set;
+	for (i = 0; i < ahash_numof_locks(hbits); i++)
+		spin_lock_init(&t->hregion[i].lock);
 	h->maxelem = maxelem;
 #ifdef IP_SET_HASH_WITH_NETMASK
 	h->netmask = netmask;
@@ -1304,9 +1548,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
 	get_random_bytes(&h->initval, sizeof(h->initval));
 
 	t->htable_bits = hbits;
+	t->maxelem = h->maxelem / ahash_numof_locks(hbits);
 	RCU_INIT_POINTER(h->table, t);
 
-	h->set = set;
+	INIT_LIST_HEAD(&h->ad);
 	set->data = h;
 #ifndef IP_SET_PROTO_UNDEF
 	if (set->family == NFPROTO_IPV4) {
@@ -1329,12 +1574,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
 #ifndef IP_SET_PROTO_UNDEF
 		if (set->family == NFPROTO_IPV4)
 #endif
-			IPSET_TOKEN(HTYPE, 4_gc_init)(set,
-				IPSET_TOKEN(HTYPE, 4_gc));
+			IPSET_TOKEN(HTYPE, 4_gc_init)(&h->gc);
 #ifndef IP_SET_PROTO_UNDEF
 		else
-			IPSET_TOKEN(HTYPE, 6_gc_init)(set,
-				IPSET_TOKEN(HTYPE, 6_gc));
+			IPSET_TOKEN(HTYPE, 6_gc_init)(&h->gc);
 #endif
 	}
 	pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
-- 
cgit v1.2.3-71-gd317


From c780e86dd48ef6467a1146cf7d0fe1e05a635039 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 6 Feb 2020 15:28:12 +0100
Subject: blktrace: Protect q->blk_trace with RCU

KASAN is reporting that __blk_add_trace() has a use-after-free issue
when accessing q->blk_trace. Indeed the switching of block tracing (and
thus eventual freeing of q->blk_trace) is completely unsynchronized with
the currently running tracing and thus it can happen that the blk_trace
structure is being freed just while __blk_add_trace() works on it.
Protect accesses to q->blk_trace by RCU during tracing and make sure we
wait for the end of RCU grace period when shutting down tracing. Luckily
that is rare enough event that we can afford that. Note that postponing
the freeing of blk_trace to an RCU callback should better be avoided as
it could have unexpected user visible side-effects as debugfs files
would be still existing for a short while block tracing has been shut
down.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=205711
CC: stable@vger.kernel.org
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reported-by: Tristan Madani <tristmd@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h       |   2 +-
 include/linux/blktrace_api.h |  18 +++++--
 kernel/trace/blktrace.c      | 114 +++++++++++++++++++++++++++++++------------
 3 files changed, 97 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 053ea4b51988..10455b2bbbb4 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -524,7 +524,7 @@ struct request_queue {
 	unsigned int		sg_reserved_size;
 	int			node;
 #ifdef CONFIG_BLK_DEV_IO_TRACE
-	struct blk_trace	*blk_trace;
+	struct blk_trace __rcu	*blk_trace;
 	struct mutex		blk_trace_mutex;
 #endif
 	/*
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 7bb2d8de9f30..3b6ff5902edc 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -51,9 +51,13 @@ void __trace_note_message(struct blk_trace *, struct blkcg *blkcg, const char *f
  **/
 #define blk_add_cgroup_trace_msg(q, cg, fmt, ...)			\
 	do {								\
-		struct blk_trace *bt = (q)->blk_trace;			\
+		struct blk_trace *bt;					\
+									\
+		rcu_read_lock();					\
+		bt = rcu_dereference((q)->blk_trace);			\
 		if (unlikely(bt))					\
 			__trace_note_message(bt, cg, fmt, ##__VA_ARGS__);\
+		rcu_read_unlock();					\
 	} while (0)
 #define blk_add_trace_msg(q, fmt, ...)					\
 	blk_add_cgroup_trace_msg(q, NULL, fmt, ##__VA_ARGS__)
@@ -61,10 +65,14 @@ void __trace_note_message(struct blk_trace *, struct blkcg *blkcg, const char *f
 
 static inline bool blk_trace_note_message_enabled(struct request_queue *q)
 {
-	struct blk_trace *bt = q->blk_trace;
-	if (likely(!bt))
-		return false;
-	return bt->act_mask & BLK_TC_NOTIFY;
+	struct blk_trace *bt;
+	bool ret;
+
+	rcu_read_lock();
+	bt = rcu_dereference(q->blk_trace);
+	ret = bt && (bt->act_mask & BLK_TC_NOTIFY);
+	rcu_read_unlock();
+	return ret;
 }
 
 extern void blk_add_driver_data(struct request_queue *q, struct request *rq,
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 0735ae8545d8..4560878f0bac 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -335,6 +335,7 @@ static void put_probe_ref(void)
 
 static void blk_trace_cleanup(struct blk_trace *bt)
 {
+	synchronize_rcu();
 	blk_trace_free(bt);
 	put_probe_ref();
 }
@@ -629,8 +630,10 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
 static int __blk_trace_startstop(struct request_queue *q, int start)
 {
 	int ret;
-	struct blk_trace *bt = q->blk_trace;
+	struct blk_trace *bt;
 
+	bt = rcu_dereference_protected(q->blk_trace,
+				       lockdep_is_held(&q->blk_trace_mutex));
 	if (bt == NULL)
 		return -EINVAL;
 
@@ -740,8 +743,8 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 void blk_trace_shutdown(struct request_queue *q)
 {
 	mutex_lock(&q->blk_trace_mutex);
-
-	if (q->blk_trace) {
+	if (rcu_dereference_protected(q->blk_trace,
+				      lockdep_is_held(&q->blk_trace_mutex))) {
 		__blk_trace_startstop(q, 0);
 		__blk_trace_remove(q);
 	}
@@ -752,8 +755,10 @@ void blk_trace_shutdown(struct request_queue *q)
 #ifdef CONFIG_BLK_CGROUP
 static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
 {
-	struct blk_trace *bt = q->blk_trace;
+	struct blk_trace *bt;
 
+	/* We don't use the 'bt' value here except as an optimization... */
+	bt = rcu_dereference_protected(q->blk_trace, 1);
 	if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
 		return 0;
 
@@ -796,10 +801,14 @@ blk_trace_request_get_cgid(struct request_queue *q, struct request *rq)
 static void blk_add_trace_rq(struct request *rq, int error,
 			     unsigned int nr_bytes, u32 what, u64 cgid)
 {
-	struct blk_trace *bt = rq->q->blk_trace;
+	struct blk_trace *bt;
 
-	if (likely(!bt))
+	rcu_read_lock();
+	bt = rcu_dereference(rq->q->blk_trace);
+	if (likely(!bt)) {
+		rcu_read_unlock();
 		return;
+	}
 
 	if (blk_rq_is_passthrough(rq))
 		what |= BLK_TC_ACT(BLK_TC_PC);
@@ -808,6 +817,7 @@ static void blk_add_trace_rq(struct request *rq, int error,
 
 	__blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
 			rq->cmd_flags, what, error, 0, NULL, cgid);
+	rcu_read_unlock();
 }
 
 static void blk_add_trace_rq_insert(void *ignore,
@@ -853,14 +863,19 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
 static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
 			      u32 what, int error)
 {
-	struct blk_trace *bt = q->blk_trace;
+	struct blk_trace *bt;
 
-	if (likely(!bt))
+	rcu_read_lock();
+	bt = rcu_dereference(q->blk_trace);
+	if (likely(!bt)) {
+		rcu_read_unlock();
 		return;
+	}
 
 	__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
 			bio_op(bio), bio->bi_opf, what, error, 0, NULL,
 			blk_trace_bio_get_cgid(q, bio));
+	rcu_read_unlock();
 }
 
 static void blk_add_trace_bio_bounce(void *ignore,
@@ -905,11 +920,14 @@ static void blk_add_trace_getrq(void *ignore,
 	if (bio)
 		blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
 	else {
-		struct blk_trace *bt = q->blk_trace;
+		struct blk_trace *bt;
 
+		rcu_read_lock();
+		bt = rcu_dereference(q->blk_trace);
 		if (bt)
 			__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0,
 					NULL, 0);
+		rcu_read_unlock();
 	}
 }
 
@@ -921,27 +939,35 @@ static void blk_add_trace_sleeprq(void *ignore,
 	if (bio)
 		blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
 	else {
-		struct blk_trace *bt = q->blk_trace;
+		struct blk_trace *bt;
 
+		rcu_read_lock();
+		bt = rcu_dereference(q->blk_trace);
 		if (bt)
 			__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ,
 					0, 0, NULL, 0);
+		rcu_read_unlock();
 	}
 }
 
 static void blk_add_trace_plug(void *ignore, struct request_queue *q)
 {
-	struct blk_trace *bt = q->blk_trace;
+	struct blk_trace *bt;
 
+	rcu_read_lock();
+	bt = rcu_dereference(q->blk_trace);
 	if (bt)
 		__blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0);
+	rcu_read_unlock();
 }
 
 static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
 				    unsigned int depth, bool explicit)
 {
-	struct blk_trace *bt = q->blk_trace;
+	struct blk_trace *bt;
 
+	rcu_read_lock();
+	bt = rcu_dereference(q->blk_trace);
 	if (bt) {
 		__be64 rpdu = cpu_to_be64(depth);
 		u32 what;
@@ -953,14 +979,17 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
 
 		__blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0);
 	}
+	rcu_read_unlock();
 }
 
 static void blk_add_trace_split(void *ignore,
 				struct request_queue *q, struct bio *bio,
 				unsigned int pdu)
 {
-	struct blk_trace *bt = q->blk_trace;
+	struct blk_trace *bt;
 
+	rcu_read_lock();
+	bt = rcu_dereference(q->blk_trace);
 	if (bt) {
 		__be64 rpdu = cpu_to_be64(pdu);
 
@@ -969,6 +998,7 @@ static void blk_add_trace_split(void *ignore,
 				BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu),
 				&rpdu, blk_trace_bio_get_cgid(q, bio));
 	}
+	rcu_read_unlock();
 }
 
 /**
@@ -988,11 +1018,15 @@ static void blk_add_trace_bio_remap(void *ignore,
 				    struct request_queue *q, struct bio *bio,
 				    dev_t dev, sector_t from)
 {
-	struct blk_trace *bt = q->blk_trace;
+	struct blk_trace *bt;
 	struct blk_io_trace_remap r;
 
-	if (likely(!bt))
+	rcu_read_lock();
+	bt = rcu_dereference(q->blk_trace);
+	if (likely(!bt)) {
+		rcu_read_unlock();
 		return;
+	}
 
 	r.device_from = cpu_to_be32(dev);
 	r.device_to   = cpu_to_be32(bio_dev(bio));
@@ -1001,6 +1035,7 @@ static void blk_add_trace_bio_remap(void *ignore,
 	__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
 			bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status,
 			sizeof(r), &r, blk_trace_bio_get_cgid(q, bio));
+	rcu_read_unlock();
 }
 
 /**
@@ -1021,11 +1056,15 @@ static void blk_add_trace_rq_remap(void *ignore,
 				   struct request *rq, dev_t dev,
 				   sector_t from)
 {
-	struct blk_trace *bt = q->blk_trace;
+	struct blk_trace *bt;
 	struct blk_io_trace_remap r;
 
-	if (likely(!bt))
+	rcu_read_lock();
+	bt = rcu_dereference(q->blk_trace);
+	if (likely(!bt)) {
+		rcu_read_unlock();
 		return;
+	}
 
 	r.device_from = cpu_to_be32(dev);
 	r.device_to   = cpu_to_be32(disk_devt(rq->rq_disk));
@@ -1034,6 +1073,7 @@ static void blk_add_trace_rq_remap(void *ignore,
 	__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
 			rq_data_dir(rq), 0, BLK_TA_REMAP, 0,
 			sizeof(r), &r, blk_trace_request_get_cgid(q, rq));
+	rcu_read_unlock();
 }
 
 /**
@@ -1051,14 +1091,19 @@ void blk_add_driver_data(struct request_queue *q,
 			 struct request *rq,
 			 void *data, size_t len)
 {
-	struct blk_trace *bt = q->blk_trace;
+	struct blk_trace *bt;
 
-	if (likely(!bt))
+	rcu_read_lock();
+	bt = rcu_dereference(q->blk_trace);
+	if (likely(!bt)) {
+		rcu_read_unlock();
 		return;
+	}
 
 	__blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
 				BLK_TA_DRV_DATA, 0, len, data,
 				blk_trace_request_get_cgid(q, rq));
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(blk_add_driver_data);
 
@@ -1597,6 +1642,7 @@ static int blk_trace_remove_queue(struct request_queue *q)
 		return -EINVAL;
 
 	put_probe_ref();
+	synchronize_rcu();
 	blk_trace_free(bt);
 	return 0;
 }
@@ -1758,6 +1804,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 	struct hd_struct *p = dev_to_part(dev);
 	struct request_queue *q;
 	struct block_device *bdev;
+	struct blk_trace *bt;
 	ssize_t ret = -ENXIO;
 
 	bdev = bdget(part_devt(p));
@@ -1770,21 +1817,23 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 
 	mutex_lock(&q->blk_trace_mutex);
 
+	bt = rcu_dereference_protected(q->blk_trace,
+				       lockdep_is_held(&q->blk_trace_mutex));
 	if (attr == &dev_attr_enable) {
-		ret = sprintf(buf, "%u\n", !!q->blk_trace);
+		ret = sprintf(buf, "%u\n", !!bt);
 		goto out_unlock_bdev;
 	}
 
-	if (q->blk_trace == NULL)
+	if (bt == NULL)
 		ret = sprintf(buf, "disabled\n");
 	else if (attr == &dev_attr_act_mask)
-		ret = blk_trace_mask2str(buf, q->blk_trace->act_mask);
+		ret = blk_trace_mask2str(buf, bt->act_mask);
 	else if (attr == &dev_attr_pid)
-		ret = sprintf(buf, "%u\n", q->blk_trace->pid);
+		ret = sprintf(buf, "%u\n", bt->pid);
 	else if (attr == &dev_attr_start_lba)
-		ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba);
+		ret = sprintf(buf, "%llu\n", bt->start_lba);
 	else if (attr == &dev_attr_end_lba)
-		ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba);
+		ret = sprintf(buf, "%llu\n", bt->end_lba);
 
 out_unlock_bdev:
 	mutex_unlock(&q->blk_trace_mutex);
@@ -1801,6 +1850,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 	struct block_device *bdev;
 	struct request_queue *q;
 	struct hd_struct *p;
+	struct blk_trace *bt;
 	u64 value;
 	ssize_t ret = -EINVAL;
 
@@ -1831,8 +1881,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 
 	mutex_lock(&q->blk_trace_mutex);
 
+	bt = rcu_dereference_protected(q->blk_trace,
+				       lockdep_is_held(&q->blk_trace_mutex));
 	if (attr == &dev_attr_enable) {
-		if (!!value == !!q->blk_trace) {
+		if (!!value == !!bt) {
 			ret = 0;
 			goto out_unlock_bdev;
 		}
@@ -1844,18 +1896,18 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 	}
 
 	ret = 0;
-	if (q->blk_trace == NULL)
+	if (bt == NULL)
 		ret = blk_trace_setup_queue(q, bdev);
 
 	if (ret == 0) {
 		if (attr == &dev_attr_act_mask)
-			q->blk_trace->act_mask = value;
+			bt->act_mask = value;
 		else if (attr == &dev_attr_pid)
-			q->blk_trace->pid = value;
+			bt->pid = value;
 		else if (attr == &dev_attr_start_lba)
-			q->blk_trace->start_lba = value;
+			bt->start_lba = value;
 		else if (attr == &dev_attr_end_lba)
-			q->blk_trace->end_lba = value;
+			bt->end_lba = value;
 	}
 
 out_unlock_bdev:
-- 
cgit v1.2.3-71-gd317


From a8e41f6033a0c5633d55d6e35993c9e2005d872f Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Tue, 25 Feb 2020 18:05:35 +0800
Subject: icmp: allow icmpv6_ndo_send to work with CONFIG_IPV6=n

The icmpv6_send function has long had a static inline implementation
with an empty body for CONFIG_IPV6=n, so that code calling it doesn't
need to be ifdef'd. The new icmpv6_ndo_send function, which is intended
for drivers as a drop-in replacement with an identical function
signature, should follow the same pattern. Without this patch, drivers
that used to work with CONFIG_IPV6=n now result in a linker error.

Cc: Chen Zhou <chenzhou10@huawei.com>
Reported-by: Hulk Robot <hulkci@huawei.com>
Fixes: 0b41713b6066 ("icmp: introduce helper for nat'd source address in network device context")
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/icmpv6.h | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/icmpv6.h b/include/linux/icmpv6.h
index 93338fd54af8..33d379602314 100644
--- a/include/linux/icmpv6.h
+++ b/include/linux/icmpv6.h
@@ -22,19 +22,23 @@ extern int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn);
 int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type,
 			       unsigned int data_len);
 
+#if IS_ENABLED(CONFIG_NF_NAT)
+void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info);
+#else
+#define icmpv6_ndo_send icmpv6_send
+#endif
+
 #else
 
 static inline void icmpv6_send(struct sk_buff *skb,
 			       u8 type, u8 code, __u32 info)
 {
-
 }
-#endif
 
-#if IS_ENABLED(CONFIG_NF_NAT)
-void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info);
-#else
-#define icmpv6_ndo_send icmpv6_send
+static inline void icmpv6_ndo_send(struct sk_buff *skb,
+				   u8 type, u8 code, __u32 info)
+{
+}
 #endif
 
 extern int				icmpv6_init(void);
-- 
cgit v1.2.3-71-gd317


From fcd07f9adc7dacc2532695cf9dd2284d49e716ff Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Fri, 28 Feb 2020 09:49:41 +0100
Subject: KVM: let declaration of kvm_get_running_vcpus match implementation

Sparse notices that declaration and implementation do not match:
arch/s390/kvm/../../../virt/kvm/kvm_main.c:4435:17: warning: incorrect type in return expression (different address spaces)
arch/s390/kvm/../../../virt/kvm/kvm_main.c:4435:17:    expected struct kvm_vcpu [noderef] <asn:3> **
arch/s390/kvm/../../../virt/kvm/kvm_main.c:4435:17:    got struct kvm_vcpu *[noderef] <asn:3> *

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7944ad6ac10b..bcb9b2ac0791 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1344,7 +1344,7 @@ static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
 #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
 
 struct kvm_vcpu *kvm_get_running_vcpu(void);
-struct kvm_vcpu __percpu **kvm_get_running_vcpus(void);
+struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void);
 
 #ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
 bool kvm_arch_has_irq_bypass(void);
-- 
cgit v1.2.3-71-gd317


From e959e5405f34aa92d71d0dd162b969c21742061d Mon Sep 17 00:00:00 2001
From: Daniel Wagner <dwagner@suse.de>
Date: Mon, 2 Mar 2020 14:24:08 +0100
Subject: block: Remove used kblockd_schedule_work_on()

Commit ee63cfa7fc19 ("block: add kblockd_schedule_work_on()")
introduced the helper in 2016. Remove it because since then no caller
was added.

Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Daniel Wagner <dwagner@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c       | 6 ------
 include/linux/blkdev.h | 1 -
 2 files changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 089e890ab208..60dc9552ef8d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1663,12 +1663,6 @@ int kblockd_schedule_work(struct work_struct *work)
 }
 EXPORT_SYMBOL(kblockd_schedule_work);
 
-int kblockd_schedule_work_on(int cpu, struct work_struct *work)
-{
-	return queue_work_on(cpu, kblockd_workqueue, work);
-}
-EXPORT_SYMBOL(kblockd_schedule_work_on);
-
 int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
 				unsigned long delay)
 {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 10455b2bbbb4..f629d40c645c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1494,7 +1494,6 @@ static inline void put_dev_sector(Sector p)
 }
 
 int kblockd_schedule_work(struct work_struct *work);
-int kblockd_schedule_work_on(int cpu, struct work_struct *work);
 int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
 
 #define MODULE_ALIAS_BLOCKDEV(major,minor) \
-- 
cgit v1.2.3-71-gd317


From ac338acf514e7b578fa9e3742ec2c292323b4c1a Mon Sep 17 00:00:00 2001
From: Saravana Kannan <saravanak@google.com>
Date: Fri, 21 Feb 2020 00:05:09 -0800
Subject: driver core: Add dev_has_sync_state()

Add an API to check if a device has sync_state support in its driver or
bus.

Signed-off-by: Saravana Kannan <saravanak@google.com>
Link: https://lore.kernel.org/r/20200221080510.197337-3-saravanak@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/device.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index 0cd7c647c16c..fa04dfd22bbc 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -798,6 +798,17 @@ static inline struct device_node *dev_of_node(struct device *dev)
 	return dev->of_node;
 }
 
+static inline bool dev_has_sync_state(struct device *dev)
+{
+	if (!dev)
+		return false;
+	if (dev->driver && dev->driver->sync_state)
+		return true;
+	if (dev->bus && dev->bus->sync_state)
+		return true;
+	return false;
+}
+
 /*
  * High level routines for use by the bus drivers
  */
-- 
cgit v1.2.3-71-gd317


From c87cbc1f007c4b46165f05ceca04e1973cda0b9c Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Thu, 5 Mar 2020 22:28:42 -0800
Subject: mm, hotplug: fix page online with DEBUG_PAGEALLOC compiled but not
 enabled

Commit cd02cf1aceea ("mm/hotplug: fix an imbalance with DEBUG_PAGEALLOC")
fixed memory hotplug with debug_pagealloc enabled, where onlining a page
goes through page freeing, which removes the direct mapping.  Some arches
don't like when the page is not mapped in the first place, so
generic_online_page() maps it first.  This is somewhat wasteful, but
better than special casing page freeing fast paths.

The commit however missed that DEBUG_PAGEALLOC configured doesn't mean
it's actually enabled.  One has to test debug_pagealloc_enabled() since
031bc5743f15 ("mm/debug-pagealloc: make debug-pagealloc boottime
configurable"), or alternatively debug_pagealloc_enabled_static() since
8e57f8acbbd1 ("mm, debug_pagealloc: don't rely on static keys too early"),
but this is not done.

As a result, a s390 kernel with DEBUG_PAGEALLOC configured but not enabled
will crash:

Unable to handle kernel pointer dereference in virtual kernel address space
Failing address: 0000000000000000 TEID: 0000000000000483
Fault in home space mode while using kernel ASCE.
AS:0000001ece13400b R2:000003fff7fd000b R3:000003fff7fcc007 S:000003fff7fd7000 P:000000000000013d
Oops: 0004 ilc:2 [#1] SMP
CPU: 1 PID: 26015 Comm: chmem Kdump: loaded Tainted: GX 5.3.18-5-default #1 SLE15-SP2 (unreleased)
Krnl PSW : 0704e00180000000 0000001ecd281b9e (__kernel_map_pages+0x166/0x188)
R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:2 PM:0 RI:0 EA:3
Krnl GPRS: 0000000000000000 0000000000000800 0000400b00000000 0000000000000100
0000000000000001 0000000000000000 0000000000000002 0000000000000100
0000001ece139230 0000001ecdd98d40 0000400b00000100 0000000000000000
000003ffa17e4000 001fffe0114f7d08 0000001ecd4d93ea 001fffe0114f7b20
Krnl Code: 0000001ecd281b8e: ec17ffff00d8 ahik %r1,%r7,-1
0000001ecd281b94: ec111dbc0355 risbg %r1,%r1,29,188,3
>0000001ecd281b9e: 94fb5006 ni 6(%r5),251
0000001ecd281ba2: 41505008 la %r5,8(%r5)
0000001ecd281ba6: ec51fffc6064 cgrj %r5,%r1,6,1ecd281b9e
0000001ecd281bac: 1a07 ar %r0,%r7
0000001ecd281bae: ec03ff584076 crj %r0,%r3,4,1ecd281a5e
Call Trace:
[<0000001ecd281b9e>] __kernel_map_pages+0x166/0x188
[<0000001ecd4d9516>] online_pages_range+0xf6/0x128
[<0000001ecd2a8186>] walk_system_ram_range+0x7e/0xd8
[<0000001ecda28aae>] online_pages+0x2fe/0x3f0
[<0000001ecd7d02a6>] memory_subsys_online+0x8e/0xc0
[<0000001ecd7add42>] device_online+0x5a/0xc8
[<0000001ecd7d0430>] state_store+0x88/0x118
[<0000001ecd5b9f62>] kernfs_fop_write+0xc2/0x200
[<0000001ecd5064b6>] vfs_write+0x176/0x1e0
[<0000001ecd50676a>] ksys_write+0xa2/0x100
[<0000001ecda315d4>] system_call+0xd8/0x2c8

Fix this by checking debug_pagealloc_enabled_static() before calling
kernel_map_pages(). Backports for kernel before 5.5 should use
debug_pagealloc_enabled() instead. Also add comments.

Fixes: cd02cf1aceea ("mm/hotplug: fix an imbalance with DEBUG_PAGEALLOC")
Reported-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: <stable@vger.kernel.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Qian Cai <cai@lca.pw>
Link: http://lkml.kernel.org/r/20200224094651.18257-1-vbabka@suse.cz
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h  | 4 ++++
 mm/memory_hotplug.c | 8 +++++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 52269e56c514..c54fb96cb1e6 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2715,6 +2715,10 @@ static inline bool debug_pagealloc_enabled_static(void)
 #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_ARCH_HAS_SET_DIRECT_MAP)
 extern void __kernel_map_pages(struct page *page, int numpages, int enable);
 
+/*
+ * When called in DEBUG_PAGEALLOC context, the call should most likely be
+ * guarded by debug_pagealloc_enabled() or debug_pagealloc_enabled_static()
+ */
 static inline void
 kernel_map_pages(struct page *page, int numpages, int enable)
 {
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0a54ffac8c68..19389cdc16a5 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -574,7 +574,13 @@ EXPORT_SYMBOL_GPL(restore_online_page_callback);
 
 void generic_online_page(struct page *page, unsigned int order)
 {
-	kernel_map_pages(page, 1 << order, 1);
+	/*
+	 * Freeing the page with debug_pagealloc enabled will try to unmap it,
+	 * so we should map it first. This is better than introducing a special
+	 * case in page freeing fast path.
+	 */
+	if (debug_pagealloc_enabled_static())
+		kernel_map_pages(page, 1 << order, 1);
 	__free_pages_core(page, order);
 	totalram_pages_add(1UL << order);
 #ifdef CONFIG_HIGHMEM
-- 
cgit v1.2.3-71-gd317