From df5d45aa08f848b79caf395211b222790534ccc7 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Thu, 1 Feb 2018 11:21:58 +0100
Subject: compiler-gcc.h: Introduce __optimize function attribute

Create a new function attribute __optimize, which allows to specify an
optimization level on a per-function basis.

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/compiler-gcc.h | 4 ++++
 include/linux/compiler.h     | 4 ++++
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 2272ded07496..7bba8e28c529 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -196,6 +196,10 @@
 #endif /* __CHECKER__ */
 #endif /* GCC_VERSION >= 40300 */
 
+#if GCC_VERSION >= 40400
+#define __optimize(level)	__attribute__((__optimize__(level)))
+#endif /* GCC_VERSION >= 40400 */
+
 #if GCC_VERSION >= 40500
 
 #ifndef __CHECKER__
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 188ed9f65517..cdc629f20e20 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -271,6 +271,10 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
 
 #endif /* __ASSEMBLY__ */
 
+#ifndef __optimize
+# define __optimize(level)
+#endif
+
 /* Compile time object size, -1 for unknown */
 #ifndef __compiletime_object_size
 # define __compiletime_object_size(obj) -1
-- 
cgit v1.2.3-71-gd317


From d9afaaa4ff7af8b87d4a205e48cb8a6f666d7f01 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Thu, 1 Feb 2018 11:21:59 +0100
Subject: compiler-gcc.h: __nostackprotector needs gcc-4.4 and up
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Gcc versions before 4.4 do not recognize the __optimize__ compiler
attribute:

    warning: ‘__optimize__’ attribute directive ignored

Fixes: 7375ae3a0b79ea07 ("compiler-gcc.h: Introduce __nostackprotector function attribute")
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/compiler-gcc.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 7bba8e28c529..bf09213895f7 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -167,8 +167,6 @@
 
 #if GCC_VERSION >= 40100
 # define __compiletime_object_size(obj) __builtin_object_size(obj, 0)
-
-#define __nostackprotector	__attribute__((__optimize__("no-stack-protector")))
 #endif
 
 #if GCC_VERSION >= 40300
@@ -198,6 +196,7 @@
 
 #if GCC_VERSION >= 40400
 #define __optimize(level)	__attribute__((__optimize__(level)))
+#define __nostackprotector	__optimize("no-stack-protector")
 #endif /* GCC_VERSION >= 40400 */
 
 #if GCC_VERSION >= 40500
-- 
cgit v1.2.3-71-gd317


From 54e02162d4454a99227f520948bf4494c3d972d0 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Sun, 11 Feb 2018 11:28:12 +0800
Subject: ptr_ring: prevent integer overflow when calculating size

Switch to use dividing to prevent integer overflow when size is too
big to calculate allocation size properly.

Reported-by: Eric Biggers <ebiggers3@gmail.com>
Fixes: 6e6e41c31122 ("ptr_ring: fail early if queue occupies more than KMALLOC_MAX_SIZE")
Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ptr_ring.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index b884b7794187..e6335227b844 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -469,7 +469,7 @@ static inline int ptr_ring_consume_batched_bh(struct ptr_ring *r,
  */
 static inline void **__ptr_ring_init_queue_alloc(unsigned int size, gfp_t gfp)
 {
-	if (size * sizeof(void *) > KMALLOC_MAX_SIZE)
+	if (size > KMALLOC_MAX_SIZE / sizeof(void *))
 		return NULL;
 	return kvmalloc_array(size, sizeof(void *), gfp | __GFP_ZERO);
 }
-- 
cgit v1.2.3-71-gd317


From 29d5325a14ab49375476e3a6442ff40a008a8c9a Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 9 Feb 2018 17:38:35 +0200
Subject: ACPI / bus: Rename acpi_get_match_data() to
 acpi_device_get_match_data()

Do the renaming to be consistent with its sibling, i.e.
of_device_get_match_data().

No functional change.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/bus.c      | 4 ++--
 drivers/acpi/property.c | 2 +-
 include/linux/acpi.h    | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index ca4af098b1bf..e6285b5ce0d5 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -830,7 +830,7 @@ const struct acpi_device_id *acpi_match_device(const struct acpi_device_id *ids,
 }
 EXPORT_SYMBOL_GPL(acpi_match_device);
 
-void *acpi_get_match_data(const struct device *dev)
+void *acpi_device_get_match_data(const struct device *dev)
 {
 	const struct acpi_device_id *match;
 
@@ -840,7 +840,7 @@ void *acpi_get_match_data(const struct device *dev)
 
 	return (void *)match->driver_data;
 }
-EXPORT_SYMBOL_GPL(acpi_get_match_data);
+EXPORT_SYMBOL_GPL(acpi_device_get_match_data);
 
 int acpi_match_device_ids(struct acpi_device *device,
 			  const struct acpi_device_id *ids)
diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c
index 466d1503aba0..f9b5fa230a86 100644
--- a/drivers/acpi/property.c
+++ b/drivers/acpi/property.c
@@ -1275,7 +1275,7 @@ static void *
 acpi_fwnode_device_get_match_data(const struct fwnode_handle *fwnode,
 				  const struct device *dev)
 {
-	return acpi_get_match_data(dev);
+	return acpi_device_get_match_data(dev);
 }
 
 #define DECLARE_ACPI_FWNODE_OPS(ops) \
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 64e10746f282..bdf47e0f92e9 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -587,7 +587,7 @@ extern int acpi_nvs_for_each_region(int (*func)(__u64, __u64, void *),
 const struct acpi_device_id *acpi_match_device(const struct acpi_device_id *ids,
 					       const struct device *dev);
 
-void *acpi_get_match_data(const struct device *dev);
+void *acpi_device_get_match_data(const struct device *dev);
 extern bool acpi_driver_match_device(struct device *dev,
 				     const struct device_driver *drv);
 int acpi_device_uevent_modalias(struct device *, struct kobj_uevent_env *);
@@ -766,7 +766,7 @@ static inline const struct acpi_device_id *acpi_match_device(
 	return NULL;
 }
 
-static inline void *acpi_get_match_data(const struct device *dev)
+static inline void *acpi_device_get_match_data(const struct device *dev)
 {
 	return NULL;
 }
-- 
cgit v1.2.3-71-gd317


From 67dcc26d208ca5578f08c3c78cb254418c24e9ec Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 9 Feb 2018 17:38:36 +0200
Subject: device property: Constify device_get_match_data()

Constify device_get_match_data() as OF and ACPI variants return
constant value.

Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/bus.c       | 4 ++--
 drivers/acpi/property.c  | 2 +-
 drivers/base/property.c  | 5 ++---
 drivers/of/property.c    | 4 ++--
 include/linux/acpi.h     | 4 ++--
 include/linux/fwnode.h   | 4 ++--
 include/linux/property.h | 2 +-
 7 files changed, 12 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index e6285b5ce0d5..0dad0bd9327b 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -830,7 +830,7 @@ const struct acpi_device_id *acpi_match_device(const struct acpi_device_id *ids,
 }
 EXPORT_SYMBOL_GPL(acpi_match_device);
 
-void *acpi_device_get_match_data(const struct device *dev)
+const void *acpi_device_get_match_data(const struct device *dev)
 {
 	const struct acpi_device_id *match;
 
@@ -838,7 +838,7 @@ void *acpi_device_get_match_data(const struct device *dev)
 	if (!match)
 		return NULL;
 
-	return (void *)match->driver_data;
+	return (const void *)match->driver_data;
 }
 EXPORT_SYMBOL_GPL(acpi_device_get_match_data);
 
diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c
index f9b5fa230a86..5815356ea6ad 100644
--- a/drivers/acpi/property.c
+++ b/drivers/acpi/property.c
@@ -1271,7 +1271,7 @@ static int acpi_fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode,
 	return 0;
 }
 
-static void *
+static const void *
 acpi_fwnode_device_get_match_data(const struct fwnode_handle *fwnode,
 				  const struct device *dev)
 {
diff --git a/drivers/base/property.c b/drivers/base/property.c
index 302236281d83..8f205f6461ed 100644
--- a/drivers/base/property.c
+++ b/drivers/base/property.c
@@ -1410,9 +1410,8 @@ int fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode,
 }
 EXPORT_SYMBOL(fwnode_graph_parse_endpoint);
 
-void *device_get_match_data(struct device *dev)
+const void *device_get_match_data(struct device *dev)
 {
-	return fwnode_call_ptr_op(dev_fwnode(dev), device_get_match_data,
-				  dev);
+	return fwnode_call_ptr_op(dev_fwnode(dev), device_get_match_data, dev);
 }
 EXPORT_SYMBOL_GPL(device_get_match_data);
diff --git a/drivers/of/property.c b/drivers/of/property.c
index 36ed84e26d9c..f46828e3b082 100644
--- a/drivers/of/property.c
+++ b/drivers/of/property.c
@@ -977,11 +977,11 @@ static int of_fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode,
 	return 0;
 }
 
-static void *
+static const void *
 of_fwnode_device_get_match_data(const struct fwnode_handle *fwnode,
 				const struct device *dev)
 {
-	return (void *)of_device_get_match_data(dev);
+	return of_device_get_match_data(dev);
 }
 
 const struct fwnode_operations of_fwnode_ops = {
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index bdf47e0f92e9..968173ec2726 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -587,7 +587,7 @@ extern int acpi_nvs_for_each_region(int (*func)(__u64, __u64, void *),
 const struct acpi_device_id *acpi_match_device(const struct acpi_device_id *ids,
 					       const struct device *dev);
 
-void *acpi_device_get_match_data(const struct device *dev);
+const void *acpi_device_get_match_data(const struct device *dev);
 extern bool acpi_driver_match_device(struct device *dev,
 				     const struct device_driver *drv);
 int acpi_device_uevent_modalias(struct device *, struct kobj_uevent_env *);
@@ -766,7 +766,7 @@ static inline const struct acpi_device_id *acpi_match_device(
 	return NULL;
 }
 
-static inline void *acpi_device_get_match_data(const struct device *dev)
+static inline const void *acpi_device_get_match_data(const struct device *dev)
 {
 	return NULL;
 }
diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h
index 4fa1a489efe4..4fe8f289b3f6 100644
--- a/include/linux/fwnode.h
+++ b/include/linux/fwnode.h
@@ -73,8 +73,8 @@ struct fwnode_operations {
 	struct fwnode_handle *(*get)(struct fwnode_handle *fwnode);
 	void (*put)(struct fwnode_handle *fwnode);
 	bool (*device_is_available)(const struct fwnode_handle *fwnode);
-	void *(*device_get_match_data)(const struct fwnode_handle *fwnode,
-				       const struct device *dev);
+	const void *(*device_get_match_data)(const struct fwnode_handle *fwnode,
+					     const struct device *dev);
 	bool (*property_present)(const struct fwnode_handle *fwnode,
 				 const char *propname);
 	int (*property_read_int_array)(const struct fwnode_handle *fwnode,
diff --git a/include/linux/property.h b/include/linux/property.h
index 769d372c1edf..2eea4b310fc2 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -283,7 +283,7 @@ bool device_dma_supported(struct device *dev);
 
 enum dev_dma_attr device_get_dma_attr(struct device *dev);
 
-void *device_get_match_data(struct device *dev);
+const void *device_get_match_data(struct device *dev);
 
 int device_get_phy_mode(struct device *dev);
 
-- 
cgit v1.2.3-71-gd317


From d7212cfb05ba802bea4dd6c90d61cfe6366ea224 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 12 Feb 2018 11:34:22 +0100
Subject: PM: cpuidle: Fix cpuidle_poll_state_init() prototype

Commit f85942207516 (x86: PM: Make APM idle driver initialize polling
state) made apm_init() call cpuidle_poll_state_init(), but that only
is defined for CONFIG_CPU_IDLE set, so make the empty stub of it
available for CONFIG_CPU_IDLE unset too to fix the resulting build
issue.

Fixes: f85942207516 (x86: PM: Make APM idle driver initialize polling state)
Cc: 4.14+ <stable@vger.kernel.org> # 4.14+
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/cpuidle.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 8f7788d23b57..a6989e02d0a0 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -225,7 +225,7 @@ static inline void cpuidle_coupled_parallel_barrier(struct cpuidle_device *dev,
 }
 #endif
 
-#ifdef CONFIG_ARCH_HAS_CPU_RELAX
+#if defined(CONFIG_CPU_IDLE) && defined(CONFIG_ARCH_HAS_CPU_RELAX)
 void cpuidle_poll_state_init(struct cpuidle_driver *drv);
 #else
 static inline void cpuidle_poll_state_init(struct cpuidle_driver *drv) {}
-- 
cgit v1.2.3-71-gd317


From ecc2dc55ce79945c2e0a04977706a99dc4848229 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 10 Feb 2018 09:43:49 +0100
Subject: dma-mapping: fix a comment typo

Reported-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/dma-mapping.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 34fe8463d10e..eb9eab4ecd6d 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -578,7 +578,7 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 
 /*
  * This is a hack for the legacy x86 forbid_dac and iommu_sac_force. Please
- * don't use this is new code.
+ * don't use this in new code.
  */
 #ifndef arch_dma_supported
 #define arch_dma_supported(dev, mask)	(1)
-- 
cgit v1.2.3-71-gd317


From 595dd46ebfc10be041a365d0a3fa99df50b6ba73 Mon Sep 17 00:00:00 2001
From: Jia Zhang <zhang.jia@linux.alibaba.com>
Date: Mon, 12 Feb 2018 22:44:53 +0800
Subject: vfs/proc/kcore, x86/mm/kcore: Fix SMAP fault when dumping vsyscall
 user page

Commit:

  df04abfd181a ("fs/proc/kcore.c: Add bounce buffer for ktext data")

... introduced a bounce buffer to work around CONFIG_HARDENED_USERCOPY=y.
However, accessing the vsyscall user page will cause an SMAP fault.

Replace memcpy() with copy_from_user() to fix this bug works, but adding
a common way to handle this sort of user page may be useful for future.

Currently, only vsyscall page requires KCORE_USER.

Signed-off-by: Jia Zhang <zhang.jia@linux.alibaba.com>
Reviewed-by: Jiri Olsa <jolsa@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1518446694-21124-2-git-send-email-zhang.jia@linux.alibaba.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/init_64.c | 3 +--
 fs/proc/kcore.c       | 4 ++++
 include/linux/kcore.h | 1 +
 3 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 1ab42c852069..6aa33d1e198f 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1193,8 +1193,7 @@ void __init mem_init(void)
 	register_page_bootmem_info();
 
 	/* Register memory areas for /proc/kcore */
-	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR,
-			 PAGE_SIZE, KCORE_OTHER);
+	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, PAGE_SIZE, KCORE_USER);
 
 	mem_init_print_info(NULL);
 }
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index e8a93bc8285d..d1e82761de81 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -510,6 +510,10 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 			/* we have to zero-fill user buffer even if no read */
 			if (copy_to_user(buffer, buf, tsz))
 				return -EFAULT;
+		} else if (m->type == KCORE_USER) {
+			/* User page is handled prior to normal kernel page: */
+			if (copy_to_user(buffer, (char *)start, tsz))
+				return -EFAULT;
 		} else {
 			if (kern_addr_valid(start)) {
 				/*
diff --git a/include/linux/kcore.h b/include/linux/kcore.h
index 7ff25a808fef..80db19d3a505 100644
--- a/include/linux/kcore.h
+++ b/include/linux/kcore.h
@@ -10,6 +10,7 @@ enum kcore_type {
 	KCORE_VMALLOC,
 	KCORE_RAM,
 	KCORE_VMEMMAP,
+	KCORE_USER,
 	KCORE_OTHER,
 };
 
-- 
cgit v1.2.3-71-gd317


From 2dd6fd2e999774041397f2a7da2e1d30b3a27c3a Mon Sep 17 00:00:00 2001
From: Tycho Andersen <tycho@tycho.ws>
Date: Thu, 1 Feb 2018 12:41:19 +0100
Subject: locking/semaphore: Update the file path in documentation

While reading this header I noticed that the locking stuff has moved to
kernel/locking/*, so update the path in semaphore.h to point to that.

Signed-off-by: Tycho Andersen <tycho@tycho.ws>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20180201114119.1090-1-tycho@tycho.ws
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/semaphore.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/semaphore.h b/include/linux/semaphore.h
index dc368b8ce215..11c86fbfeb98 100644
--- a/include/linux/semaphore.h
+++ b/include/linux/semaphore.h
@@ -4,7 +4,7 @@
  *
  * Distributed under the terms of the GNU GPL, version 2
  *
- * Please see kernel/semaphore.c for documentation of these functions
+ * Please see kernel/locking/semaphore.c for documentation of these functions
  */
 #ifndef __LINUX_SEMAPHORE_H
 #define __LINUX_SEMAPHORE_H
-- 
cgit v1.2.3-71-gd317


From fd0e786d9d09024f67bd71ec094b110237dc3840 Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Thu, 25 Jan 2018 14:23:48 -0800
Subject: x86/mm, mm/hwpoison: Don't unconditionally unmap kernel 1:1 pages

In the following commit:

  ce0fa3e56ad2 ("x86/mm, mm/hwpoison: Clear PRESENT bit for kernel 1:1 mappings of poison pages")

... we added code to memory_failure() to unmap the page from the
kernel 1:1 virtual address space to avoid speculative access to the
page logging additional errors.

But memory_failure() may not always succeed in taking the page offline,
especially if the page belongs to the kernel.  This can happen if
there are too many corrected errors on a page and either mcelog(8)
or drivers/ras/cec.c asks to take a page offline.

Since we remove the 1:1 mapping early in memory_failure(), we can
end up with the page unmapped, but still in use. On the next access
the kernel crashes :-(

There are also various debug paths that call memory_failure() to simulate
occurrence of an error. Since there is no actual error in memory, we
don't need to map out the page for those cases.

Revert most of the previous attempt and keep the solution local to
arch/x86/kernel/cpu/mcheck/mce.c. Unmap the page only when:

	1) there is a real error
	2) memory_failure() succeeds.

All of this only applies to 64-bit systems. 32-bit kernel doesn't map
all of memory into kernel space. It isn't worth adding the code to unmap
the piece that is mapped because nobody would run a 32-bit kernel on a
machine that has recoverable machine checks.

Signed-off-by: Tony Luck <tony.luck@intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave <dave.hansen@intel.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Robert (Persistent Memory) <elliott@hpe.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Cc: stable@vger.kernel.org #v4.14
Fixes: ce0fa3e56ad2 ("x86/mm, mm/hwpoison: Clear PRESENT bit for kernel 1:1 mappings of poison pages")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/page_64.h            |  4 ----
 arch/x86/kernel/cpu/mcheck/mce-internal.h | 15 +++++++++++++++
 arch/x86/kernel/cpu/mcheck/mce.c          | 17 +++++++++++------
 include/linux/mm_inline.h                 |  6 ------
 mm/memory-failure.c                       |  2 --
 5 files changed, 26 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 4baa6bceb232..d652a3808065 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -52,10 +52,6 @@ static inline void clear_page(void *page)
 
 void copy_page(void *to, void *from);
 
-#ifdef CONFIG_X86_MCE
-#define arch_unmap_kpfn arch_unmap_kpfn
-#endif
-
 #endif	/* !__ASSEMBLY__ */
 
 #ifdef CONFIG_X86_VSYSCALL_EMULATION
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index aa0d5df9dc60..e956eb267061 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -115,4 +115,19 @@ static inline void mce_unregister_injector_chain(struct notifier_block *nb)	{ }
 
 extern struct mca_config mca_cfg;
 
+#ifndef CONFIG_X86_64
+/*
+ * On 32-bit systems it would be difficult to safely unmap a poison page
+ * from the kernel 1:1 map because there are no non-canonical addresses that
+ * we can use to refer to the address without risking a speculative access.
+ * However, this isn't much of an issue because:
+ * 1) Few unmappable pages are in the 1:1 map. Most are in HIGHMEM which
+ *    are only mapped into the kernel as needed
+ * 2) Few people would run a 32-bit kernel on a machine that supports
+ *    recoverable errors because they have too much memory to boot 32-bit.
+ */
+static inline void mce_unmap_kpfn(unsigned long pfn) {}
+#define mce_unmap_kpfn mce_unmap_kpfn
+#endif
+
 #endif /* __X86_MCE_INTERNAL_H__ */
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 75f405ac085c..8ff94d1e2dce 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -105,6 +105,10 @@ static struct irq_work mce_irq_work;
 
 static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
 
+#ifndef mce_unmap_kpfn
+static void mce_unmap_kpfn(unsigned long pfn);
+#endif
+
 /*
  * CPU/chipset specific EDAC code can register a notifier call here to print
  * MCE errors in a human-readable form.
@@ -590,7 +594,8 @@ static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
 
 	if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
 		pfn = mce->addr >> PAGE_SHIFT;
-		memory_failure(pfn, 0);
+		if (!memory_failure(pfn, 0))
+			mce_unmap_kpfn(pfn);
 	}
 
 	return NOTIFY_OK;
@@ -1057,12 +1062,13 @@ static int do_memory_failure(struct mce *m)
 	ret = memory_failure(m->addr >> PAGE_SHIFT, flags);
 	if (ret)
 		pr_err("Memory error not recovered");
+	else
+		mce_unmap_kpfn(m->addr >> PAGE_SHIFT);
 	return ret;
 }
 
-#if defined(arch_unmap_kpfn) && defined(CONFIG_MEMORY_FAILURE)
-
-void arch_unmap_kpfn(unsigned long pfn)
+#ifndef mce_unmap_kpfn
+static void mce_unmap_kpfn(unsigned long pfn)
 {
 	unsigned long decoy_addr;
 
@@ -1073,7 +1079,7 @@ void arch_unmap_kpfn(unsigned long pfn)
 	 * We would like to just call:
 	 *	set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1);
 	 * but doing that would radically increase the odds of a
-	 * speculative access to the posion page because we'd have
+	 * speculative access to the poison page because we'd have
 	 * the virtual address of the kernel 1:1 mapping sitting
 	 * around in registers.
 	 * Instead we get tricky.  We create a non-canonical address
@@ -1098,7 +1104,6 @@ void arch_unmap_kpfn(unsigned long pfn)
 
 	if (set_memory_np(decoy_addr, 1))
 		pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
-
 }
 #endif
 
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index c30b32e3c862..10191c28fc04 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -127,10 +127,4 @@ static __always_inline enum lru_list page_lru(struct page *page)
 
 #define lru_to_page(head) (list_entry((head)->prev, struct page, lru))
 
-#ifdef arch_unmap_kpfn
-extern void arch_unmap_kpfn(unsigned long pfn);
-#else
-static __always_inline void arch_unmap_kpfn(unsigned long pfn) { }
-#endif
-
 #endif
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 4b80ccee4535..8291b75f42c8 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1139,8 +1139,6 @@ int memory_failure(unsigned long pfn, int flags)
 		return 0;
 	}
 
-	arch_unmap_kpfn(pfn);
-
 	orig_head = hpage = compound_head(p);
 	num_poisoned_pages_inc();
 
-- 
cgit v1.2.3-71-gd317


From b1d03c1d12abbfa7de127772f281b309cf1650c3 Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Mon, 12 Feb 2018 16:48:21 +0000
Subject: iommu/vt-d: Clean/document fault status flags

So one could decode them without opening the specification.

Signed-off-by: Dmitry Safonov <dima@arista.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/intel-iommu.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 8dad3dd26eae..ef169d67df92 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -209,12 +209,12 @@
 #define DMA_FECTL_IM (((u32)1) << 31)
 
 /* FSTS_REG */
-#define DMA_FSTS_PPF ((u32)2)
-#define DMA_FSTS_PFO ((u32)1)
-#define DMA_FSTS_IQE (1 << 4)
-#define DMA_FSTS_ICE (1 << 5)
-#define DMA_FSTS_ITE (1 << 6)
-#define DMA_FSTS_PRO (1 << 7)
+#define DMA_FSTS_PFO (1 << 0) /* Primary Fault Overflow */
+#define DMA_FSTS_PPF (1 << 1) /* Primary Pending Fault */
+#define DMA_FSTS_IQE (1 << 4) /* Invalidation Queue Error */
+#define DMA_FSTS_ICE (1 << 5) /* Invalidation Completion Error */
+#define DMA_FSTS_ITE (1 << 6) /* Invalidation Time-out Error */
+#define DMA_FSTS_PRO (1 << 7) /* Page Request Overflow */
 #define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff)
 
 /* FRCD_REG, 32 bits access */
-- 
cgit v1.2.3-71-gd317


From c5611a8751e67595e4e7d3feaff3c900b92094b9 Mon Sep 17 00:00:00 2001
From: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Date: Mon, 5 Feb 2018 05:45:53 -0500
Subject: iommu: Do not return error code for APIs with size_t return type

Currently, iommu_unmap, iommu_unmap_fast and iommu_map_sg return
size_t.  However, some of the return values are error codes (< 0),
which can be misinterpreted as large size. Therefore, returning size 0
instead to signify failure to map/unmap.

Cc: Joerg Roedel <joro@8bytes.org>
Cc: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/amd_iommu.c |  2 +-
 drivers/iommu/iommu.c     |  6 +++---
 include/linux/iommu.h     | 14 +++++++-------
 3 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 74788fdeb773..ecdeb045deef 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -3050,7 +3050,7 @@ static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
 	size_t unmap_size;
 
 	if (domain->mode == PAGE_MODE_NONE)
-		return -EINVAL;
+		return 0;
 
 	mutex_lock(&domain->api_lock);
 	unmap_size = iommu_unmap_page(domain, iova, page_size);
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 69fef991c651..d2aa23202bb9 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1573,10 +1573,10 @@ static size_t __iommu_unmap(struct iommu_domain *domain,
 
 	if (unlikely(ops->unmap == NULL ||
 		     domain->pgsize_bitmap == 0UL))
-		return -ENODEV;
+		return 0;
 
 	if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING)))
-		return -EINVAL;
+		return 0;
 
 	/* find out the minimum page size supported */
 	min_pagesz = 1 << __ffs(domain->pgsize_bitmap);
@@ -1589,7 +1589,7 @@ static size_t __iommu_unmap(struct iommu_domain *domain,
 	if (!IS_ALIGNED(iova | size, min_pagesz)) {
 		pr_err("unaligned: iova 0x%lx size 0x%zx min_pagesz 0x%x\n",
 		       iova, size, min_pagesz);
-		return -EINVAL;
+		return 0;
 	}
 
 	pr_debug("unmap this: iova 0x%lx size 0x%zx\n", iova, size);
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 41b8c5757859..19938ee6eb31 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -465,23 +465,23 @@ static inline int iommu_map(struct iommu_domain *domain, unsigned long iova,
 	return -ENODEV;
 }
 
-static inline int iommu_unmap(struct iommu_domain *domain, unsigned long iova,
-			      size_t size)
+static inline size_t iommu_unmap(struct iommu_domain *domain,
+				 unsigned long iova, size_t size)
 {
-	return -ENODEV;
+	return 0;
 }
 
-static inline int iommu_unmap_fast(struct iommu_domain *domain, unsigned long iova,
-				   int gfp_order)
+static inline size_t iommu_unmap_fast(struct iommu_domain *domain,
+				      unsigned long iova, int gfp_order)
 {
-	return -ENODEV;
+	return 0;
 }
 
 static inline size_t iommu_map_sg(struct iommu_domain *domain,
 				  unsigned long iova, struct scatterlist *sg,
 				  unsigned int nents, int prot)
 {
-	return -ENODEV;
+	return 0;
 }
 
 static inline void iommu_flush_tlb_all(struct iommu_domain *domain)
-- 
cgit v1.2.3-71-gd317


From 8b4282e6b8e239d8ce68ab884c89335cc6fdd7c7 Mon Sep 17 00:00:00 2001
From: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Date: Tue, 13 Feb 2018 15:20:50 +0000
Subject: ACPI/IORT: Add msi address regions reservation helper

On some platforms msi parent address regions have to be excluded from
normal IOVA allocation in that they are detected and decoded in a HW
specific way by system components and so they cannot be considered normal
IOVA address space.

Add a helper function that retrieves ITS address regions - the msi
parent - through IORT device <-> ITS mappings and reserves it so that
these regions will not be translated by IOMMU and will be excluded from
IOVA allocations. The function checks for the smmu model number and
only applies the msi reservation if the platform requires it.

Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Reviewed-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
[For the ITS part]
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/acpi/arm64/iort.c        | 111 +++++++++++++++++++++++++++++++++++++--
 drivers/irqchip/irq-gic-v3-its.c |   3 +-
 include/linux/acpi_iort.h        |   7 ++-
 3 files changed, 116 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index 95255ecfae7c..e2f7bddf5522 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -39,6 +39,7 @@
 struct iort_its_msi_chip {
 	struct list_head	list;
 	struct fwnode_handle	*fw_node;
+	phys_addr_t		base_addr;
 	u32			translation_id;
 };
 
@@ -161,14 +162,16 @@ static LIST_HEAD(iort_msi_chip_list);
 static DEFINE_SPINLOCK(iort_msi_chip_lock);
 
 /**
- * iort_register_domain_token() - register domain token and related ITS ID
- * to the list from where we can get it back later on.
+ * iort_register_domain_token() - register domain token along with related
+ * ITS ID and base address to the list from where we can get it back later on.
  * @trans_id: ITS ID.
+ * @base: ITS base address.
  * @fw_node: Domain token.
  *
  * Returns: 0 on success, -ENOMEM if no memory when allocating list element
  */
-int iort_register_domain_token(int trans_id, struct fwnode_handle *fw_node)
+int iort_register_domain_token(int trans_id, phys_addr_t base,
+			       struct fwnode_handle *fw_node)
 {
 	struct iort_its_msi_chip *its_msi_chip;
 
@@ -178,6 +181,7 @@ int iort_register_domain_token(int trans_id, struct fwnode_handle *fw_node)
 
 	its_msi_chip->fw_node = fw_node;
 	its_msi_chip->translation_id = trans_id;
+	its_msi_chip->base_addr = base;
 
 	spin_lock(&iort_msi_chip_lock);
 	list_add(&its_msi_chip->list, &iort_msi_chip_list);
@@ -581,6 +585,24 @@ int iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id)
 	return -ENODEV;
 }
 
+static int __maybe_unused iort_find_its_base(u32 its_id, phys_addr_t *base)
+{
+	struct iort_its_msi_chip *its_msi_chip;
+	int ret = -ENODEV;
+
+	spin_lock(&iort_msi_chip_lock);
+	list_for_each_entry(its_msi_chip, &iort_msi_chip_list, list) {
+		if (its_msi_chip->translation_id == its_id) {
+			*base = its_msi_chip->base_addr;
+			ret = 0;
+			break;
+		}
+	}
+	spin_unlock(&iort_msi_chip_lock);
+
+	return ret;
+}
+
 /**
  * iort_dev_find_its_id() - Find the ITS identifier for a device
  * @dev: The device.
@@ -766,6 +788,24 @@ static inline bool iort_iommu_driver_enabled(u8 type)
 }
 
 #ifdef CONFIG_IOMMU_API
+static struct acpi_iort_node *iort_get_msi_resv_iommu(struct device *dev)
+{
+	struct acpi_iort_node *iommu;
+	struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+
+	iommu = iort_get_iort_node(fwspec->iommu_fwnode);
+
+	if (iommu && (iommu->type == ACPI_IORT_NODE_SMMU_V3)) {
+		struct acpi_iort_smmu_v3 *smmu;
+
+		smmu = (struct acpi_iort_smmu_v3 *)iommu->node_data;
+		if (smmu->model == ACPI_IORT_SMMU_V3_HISILICON_HI161X)
+			return iommu;
+	}
+
+	return NULL;
+}
+
 static inline const struct iommu_ops *iort_fwspec_iommu_ops(
 				struct iommu_fwspec *fwspec)
 {
@@ -782,6 +822,69 @@ static inline int iort_add_device_replay(const struct iommu_ops *ops,
 
 	return err;
 }
+
+/**
+ * iort_iommu_msi_get_resv_regions - Reserved region driver helper
+ * @dev: Device from iommu_get_resv_regions()
+ * @head: Reserved region list from iommu_get_resv_regions()
+ *
+ * Returns: Number of msi reserved regions on success (0 if platform
+ *          doesn't require the reservation or no associated msi regions),
+ *          appropriate error value otherwise. The ITS interrupt translation
+ *          spaces (ITS_base + SZ_64K, SZ_64K) associated with the device
+ *          are the msi reserved regions.
+ */
+int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)
+{
+	struct acpi_iort_its_group *its;
+	struct acpi_iort_node *iommu_node, *its_node = NULL;
+	int i, resv = 0;
+
+	iommu_node = iort_get_msi_resv_iommu(dev);
+	if (!iommu_node)
+		return 0;
+
+	/*
+	 * Current logic to reserve ITS regions relies on HW topologies
+	 * where a given PCI or named component maps its IDs to only one
+	 * ITS group; if a PCI or named component can map its IDs to
+	 * different ITS groups through IORT mappings this function has
+	 * to be reworked to ensure we reserve regions for all ITS groups
+	 * a given PCI or named component may map IDs to.
+	 */
+
+	for (i = 0; i < dev->iommu_fwspec->num_ids; i++) {
+		its_node = iort_node_map_id(iommu_node,
+					dev->iommu_fwspec->ids[i],
+					NULL, IORT_MSI_TYPE);
+		if (its_node)
+			break;
+	}
+
+	if (!its_node)
+		return 0;
+
+	/* Move to ITS specific data */
+	its = (struct acpi_iort_its_group *)its_node->node_data;
+
+	for (i = 0; i < its->its_count; i++) {
+		phys_addr_t base;
+
+		if (!iort_find_its_base(its->identifiers[i], &base)) {
+			int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO;
+			struct iommu_resv_region *region;
+
+			region = iommu_alloc_resv_region(base + SZ_64K, SZ_64K,
+							 prot, IOMMU_RESV_MSI);
+			if (region) {
+				list_add_tail(&region->list, head);
+				resv++;
+			}
+		}
+	}
+
+	return (resv == its->its_count) ? resv : -ENODEV;
+}
 #else
 static inline const struct iommu_ops *iort_fwspec_iommu_ops(
 				struct iommu_fwspec *fwspec)
@@ -789,6 +892,8 @@ static inline const struct iommu_ops *iort_fwspec_iommu_ops(
 static inline int iort_add_device_replay(const struct iommu_ops *ops,
 					 struct device *dev)
 { return 0; }
+int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)
+{ return 0; }
 #endif
 
 static int iort_iommu_xlate(struct device *dev, struct acpi_iort_node *node,
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 06f025fd5726..ab99d1bd7087 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -3450,7 +3450,8 @@ static int __init gic_acpi_parse_madt_its(struct acpi_subtable_header *header,
 		return -ENOMEM;
 	}
 
-	err = iort_register_domain_token(its_entry->translation_id, dom_handle);
+	err = iort_register_domain_token(its_entry->translation_id, res.start,
+					 dom_handle);
 	if (err) {
 		pr_err("ITS@%pa: Unable to register GICv3 ITS domain token (ITS ID %d) to IORT\n",
 		       &res.start, its_entry->translation_id);
diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h
index 2f7a29242b87..38cd77b39a64 100644
--- a/include/linux/acpi_iort.h
+++ b/include/linux/acpi_iort.h
@@ -26,7 +26,8 @@
 #define IORT_IRQ_MASK(irq)		(irq & 0xffffffffULL)
 #define IORT_IRQ_TRIGGER_MASK(irq)	((irq >> 32) & 0xffffffffULL)
 
-int iort_register_domain_token(int trans_id, struct fwnode_handle *fw_node);
+int iort_register_domain_token(int trans_id, phys_addr_t base,
+			       struct fwnode_handle *fw_node);
 void iort_deregister_domain_token(int trans_id);
 struct fwnode_handle *iort_find_domain_token(int trans_id);
 #ifdef CONFIG_ACPI_IORT
@@ -38,6 +39,7 @@ int iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id);
 /* IOMMU interface */
 void iort_dma_setup(struct device *dev, u64 *dma_addr, u64 *size);
 const struct iommu_ops *iort_iommu_configure(struct device *dev);
+int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head);
 #else
 static inline void acpi_iort_init(void) { }
 static inline u32 iort_msi_map_rid(struct device *dev, u32 req_id)
@@ -52,6 +54,9 @@ static inline void iort_dma_setup(struct device *dev, u64 *dma_addr,
 static inline const struct iommu_ops *iort_iommu_configure(
 				      struct device *dev)
 { return NULL; }
+static inline
+int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)
+{ return 0; }
 #endif
 
 #endif /* __ACPI_IORT_H__ */
-- 
cgit v1.2.3-71-gd317


From 8fa80c503b484ddc1abbd10c7cb2ab81f3824a50 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Mon, 5 Feb 2018 14:16:06 +0000
Subject: nospec: Move array_index_nospec() parameter checking into separate
 macro

For architectures providing their own implementation of
array_index_mask_nospec() in asm/barrier.h, attempting to use WARN_ONCE() to
complain about out-of-range parameters using WARN_ON() results in a mess
of mutually-dependent include files.

Rather than unpick the dependencies, simply have the core code in nospec.h
perform the checking for us.

Signed-off-by: Will Deacon <will.deacon@arm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1517840166-15399-1-git-send-email-will.deacon@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/nospec.h | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nospec.h b/include/linux/nospec.h
index b99bced39ac2..fbc98e2c8228 100644
--- a/include/linux/nospec.h
+++ b/include/linux/nospec.h
@@ -19,20 +19,6 @@
 static inline unsigned long array_index_mask_nospec(unsigned long index,
 						    unsigned long size)
 {
-	/*
-	 * Warn developers about inappropriate array_index_nospec() usage.
-	 *
-	 * Even if the CPU speculates past the WARN_ONCE branch, the
-	 * sign bit of @index is taken into account when generating the
-	 * mask.
-	 *
-	 * This warning is compiled out when the compiler can infer that
-	 * @index and @size are less than LONG_MAX.
-	 */
-	if (WARN_ONCE(index > LONG_MAX || size > LONG_MAX,
-			"array_index_nospec() limited to range of [0, LONG_MAX]\n"))
-		return 0;
-
 	/*
 	 * Always calculate and emit the mask even if the compiler
 	 * thinks the mask is not needed. The compiler does not take
@@ -43,6 +29,26 @@ static inline unsigned long array_index_mask_nospec(unsigned long index,
 }
 #endif
 
+/*
+ * Warn developers about inappropriate array_index_nospec() usage.
+ *
+ * Even if the CPU speculates past the WARN_ONCE branch, the
+ * sign bit of @index is taken into account when generating the
+ * mask.
+ *
+ * This warning is compiled out when the compiler can infer that
+ * @index and @size are less than LONG_MAX.
+ */
+#define array_index_mask_nospec_check(index, size)				\
+({										\
+	if (WARN_ONCE(index > LONG_MAX || size > LONG_MAX,			\
+	    "array_index_nospec() limited to range of [0, LONG_MAX]\n"))	\
+		_mask = 0;							\
+	else									\
+		_mask = array_index_mask_nospec(index, size);			\
+	_mask;									\
+})
+
 /*
  * array_index_nospec - sanitize an array index after a bounds check
  *
@@ -61,7 +67,7 @@ static inline unsigned long array_index_mask_nospec(unsigned long index,
 ({									\
 	typeof(index) _i = (index);					\
 	typeof(size) _s = (size);					\
-	unsigned long _mask = array_index_mask_nospec(_i, _s);		\
+	unsigned long _mask = array_index_mask_nospec_check(_i, _s);	\
 									\
 	BUILD_BUG_ON(sizeof(_i) > sizeof(long));			\
 	BUILD_BUG_ON(sizeof(_s) > sizeof(long));			\
-- 
cgit v1.2.3-71-gd317


From 096392e0714d3a520366ba467e215edf7280acff Mon Sep 17 00:00:00 2001
From: Minwoo Im <minwoo.im.dev@gmail.com>
Date: Thu, 15 Feb 2018 23:53:17 +0900
Subject: block: fix a typo in comment of BLK_MQ_POLL_STATS_BKTS

Update comment typo _consisitent_ to _consistent_ from following commit.
commit 0206319fdfee ("blk-mq: Fix poll_stat for new size-based bucketing.")

Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Minwoo Im <minwoo.im.dev@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4f3df807cf8f..ed63f3b69c12 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -49,7 +49,7 @@ struct blk_stat_callback;
 #define BLKDEV_MIN_RQ	4
 #define BLKDEV_MAX_RQ	128	/* Default maximum */
 
-/* Must be consisitent with blk_mq_poll_stats_bkt() */
+/* Must be consistent with blk_mq_poll_stats_bkt() */
 #define BLK_MQ_POLL_STATS_BKTS 16
 
 /*
-- 
cgit v1.2.3-71-gd317


From d207af2eab3f8668b95ad02b21930481c42806fd Mon Sep 17 00:00:00 2001
From: Michael Kelley <mhkelley@outlook.com>
Date: Wed, 14 Feb 2018 02:54:03 +0000
Subject: cpumask: Make for_each_cpu_wrap() available on UP as well

for_each_cpu_wrap() was originally added in the #else half of a
large "#if NR_CPUS == 1" statement, but was omitted in the #if
half.  This patch adds the missing #if half to prevent compile
errors when NR_CPUS is 1.

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Michael Kelley <mhkelley@outlook.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kys@microsoft.com
Cc: martin.petersen@oracle.com
Cc: mikelley@microsoft.com
Fixes: c743f0a5c50f ("sched/fair, cpumask: Export for_each_cpu_wrap()")
Link: http://lkml.kernel.org/r/SN6PR1901MB2045F087F59450507D4FCC17CBF50@SN6PR1901MB2045.namprd19.prod.outlook.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/cpumask.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index d4a2a7dcd72d..bf53d893ad02 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -170,6 +170,8 @@ static inline unsigned int cpumask_local_spread(unsigned int i, int node)
 	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
 #define for_each_cpu_not(cpu, mask)		\
 	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
+#define for_each_cpu_wrap(cpu, mask, start)	\
+	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)(start))
 #define for_each_cpu_and(cpu, mask, and)	\
 	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)and)
 #else
-- 
cgit v1.2.3-71-gd317


From da27988766e338e4a4fe198170497c0920395d4c Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Fri, 16 Feb 2018 15:52:42 -0500
Subject: skbuff: Fix comment mis-spelling.

'peform' --> 'perform'

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 5ebc0f869720..c1e66bdcf583 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3646,7 +3646,7 @@ static inline bool __skb_checksum_validate_needed(struct sk_buff *skb,
 	return true;
 }
 
-/* For small packets <= CHECKSUM_BREAK peform checksum complete directly
+/* For small packets <= CHECKSUM_BREAK perform checksum complete directly
  * in checksum_init.
  */
 #define CHECKSUM_BREAK 76
-- 
cgit v1.2.3-71-gd317


From 27d4ee03078aba88c5e07dcc4917e8d01d046f38 Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas@wunner.de>
Date: Sun, 11 Feb 2018 10:38:28 +0100
Subject: workqueue: Allow retrieval of current task's work struct

Introduce a helper to retrieve the current task's work struct if it is
a workqueue worker.

This allows us to fix a long-standing deadlock in several DRM drivers
wherein the ->runtime_suspend callback waits for a specific worker to
finish and that worker in turn calls a function which waits for runtime
suspend to finish.  That function is invoked from multiple call sites
and waiting for runtime suspend to finish is the correct thing to do
except if it's executing in the context of the worker.

Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Dave Airlie <airlied@redhat.com>
Cc: Ben Skeggs <bskeggs@redhat.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Acked-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Lyude Paul <lyude@redhat.com>
Signed-off-by: Lukas Wunner <lukas@wunner.de>
Link: https://patchwork.freedesktop.org/patch/msgid/2d8f603074131eb87e588d2b803a71765bd3a2fd.1518338788.git.lukas@wunner.de
---
 include/linux/workqueue.h |  1 +
 kernel/workqueue.c        | 16 ++++++++++++++++
 2 files changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 4a54ef96aff5..bc0cda180c8b 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -465,6 +465,7 @@ extern bool cancel_delayed_work_sync(struct delayed_work *dwork);
 
 extern void workqueue_set_max_active(struct workqueue_struct *wq,
 				     int max_active);
+extern struct work_struct *current_work(void);
 extern bool current_is_workqueue_rescuer(void);
 extern bool workqueue_congested(int cpu, struct workqueue_struct *wq);
 extern unsigned int work_busy(struct work_struct *work);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 43d18cb46308..255c20efdf7b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4167,6 +4167,22 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
 }
 EXPORT_SYMBOL_GPL(workqueue_set_max_active);
 
+/**
+ * current_work - retrieve %current task's work struct
+ *
+ * Determine if %current task is a workqueue worker and what it's working on.
+ * Useful to find out the context that the %current task is running in.
+ *
+ * Return: work struct if %current task is a workqueue worker, %NULL otherwise.
+ */
+struct work_struct *current_work(void)
+{
+	struct worker *worker = current_wq_worker();
+
+	return worker ? worker->current_work : NULL;
+}
+EXPORT_SYMBOL(current_work);
+
 /**
  * current_is_workqueue_rescuer - is %current workqueue rescuer?
  *
-- 
cgit v1.2.3-71-gd317


From 1d91c1d2c80cb70e2e553845e278b87a960c04da Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 16 Feb 2018 13:20:42 -0800
Subject: nospec: Kill array_index_nospec_mask_check()

There are multiple problems with the dynamic sanity checking in
array_index_nospec_mask_check():

* It causes unnecessary overhead in the 32-bit case since integer sized
  @index values will no longer cause the check to be compiled away like
  in the 64-bit case.

* In the 32-bit case it may trigger with user controllable input when
  the expectation is that should only trigger during development of new
  kernel enabling.

* The macro reuses the input parameter in multiple locations which is
  broken if someone passes an expression like 'index++' to
  array_index_nospec().

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Cc: linux-arch@vger.kernel.org
Link: http://lkml.kernel.org/r/151881604278.17395.6605847763178076520.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/nospec.h | 22 +---------------------
 1 file changed, 1 insertion(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nospec.h b/include/linux/nospec.h
index fbc98e2c8228..d6701e34424f 100644
--- a/include/linux/nospec.h
+++ b/include/linux/nospec.h
@@ -29,26 +29,6 @@ static inline unsigned long array_index_mask_nospec(unsigned long index,
 }
 #endif
 
-/*
- * Warn developers about inappropriate array_index_nospec() usage.
- *
- * Even if the CPU speculates past the WARN_ONCE branch, the
- * sign bit of @index is taken into account when generating the
- * mask.
- *
- * This warning is compiled out when the compiler can infer that
- * @index and @size are less than LONG_MAX.
- */
-#define array_index_mask_nospec_check(index, size)				\
-({										\
-	if (WARN_ONCE(index > LONG_MAX || size > LONG_MAX,			\
-	    "array_index_nospec() limited to range of [0, LONG_MAX]\n"))	\
-		_mask = 0;							\
-	else									\
-		_mask = array_index_mask_nospec(index, size);			\
-	_mask;									\
-})
-
 /*
  * array_index_nospec - sanitize an array index after a bounds check
  *
@@ -67,7 +47,7 @@ static inline unsigned long array_index_mask_nospec(unsigned long index,
 ({									\
 	typeof(index) _i = (index);					\
 	typeof(size) _s = (size);					\
-	unsigned long _mask = array_index_mask_nospec_check(_i, _s);	\
+	unsigned long _mask = array_index_mask_nospec(_i, _s);		\
 									\
 	BUILD_BUG_ON(sizeof(_i) > sizeof(long));			\
 	BUILD_BUG_ON(sizeof(_s) > sizeof(long));			\
-- 
cgit v1.2.3-71-gd317


From b98c6a160a057d5686a8c54c79cc6c8c94a7d0c8 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Fri, 16 Feb 2018 13:20:48 -0800
Subject: nospec: Allow index argument to have const-qualified type

The last expression in a statement expression need not be a bare
variable, quoting gcc docs

  The last thing in the compound statement should be an expression
  followed by a semicolon; the value of this subexpression serves as the
  value of the entire construct.

and we already use that in e.g. the min/max macros which end with a
ternary expression.

This way, we can allow index to have const-qualified type, which will in
some cases avoid the need for introducing a local copy of index of
non-const qualified type. That, in turn, can prevent readers not
familiar with the internals of array_index_nospec from wondering about
the seemingly redundant extra variable, and I think that's worthwhile
considering how confusing the whole _nospec business is.

The expression _i&_mask has type unsigned long (since that is the type
of _mask, and the BUILD_BUG_ONs guarantee that _i will get promoted to
that), so in order not to change the type of the whole expression, add
a cast back to typeof(_i).

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Cc: linux-arch@vger.kernel.org
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/151881604837.17395.10812767547837568328.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/nospec.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nospec.h b/include/linux/nospec.h
index d6701e34424f..172a19dc35ab 100644
--- a/include/linux/nospec.h
+++ b/include/linux/nospec.h
@@ -52,7 +52,6 @@ static inline unsigned long array_index_mask_nospec(unsigned long index,
 	BUILD_BUG_ON(sizeof(_i) > sizeof(long));			\
 	BUILD_BUG_ON(sizeof(_s) > sizeof(long));			\
 									\
-	_i &= _mask;							\
-	_i;								\
+	(typeof(_i)) (_i & _mask);					\
 })
 #endif /* _LINUX_NOSPEC_H */
-- 
cgit v1.2.3-71-gd317


From eb6174f6d1be16b19cfa43dac296bfed003ce1a6 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 16 Feb 2018 13:20:54 -0800
Subject: nospec: Include <asm/barrier.h> dependency

The nospec.h header expects the per-architecture header file
<asm/barrier.h> to optionally define array_index_mask_nospec(). Include
that dependency to prevent inadvertent fallback to the default
array_index_mask_nospec() implementation.

The default implementation may not provide a full mitigation
on architectures that perform data value speculation.

Reported-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Cc: linux-arch@vger.kernel.org
Link: http://lkml.kernel.org/r/151881605404.17395.1341935530792574707.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/nospec.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/nospec.h b/include/linux/nospec.h
index 172a19dc35ab..e791ebc65c9c 100644
--- a/include/linux/nospec.h
+++ b/include/linux/nospec.h
@@ -5,6 +5,7 @@
 
 #ifndef _LINUX_NOSPEC_H
 #define _LINUX_NOSPEC_H
+#include <asm/barrier.h>
 
 /**
  * array_index_mask_nospec() - generate a ~0 mask when index < size, 0 otherwise
-- 
cgit v1.2.3-71-gd317


From 87358710c1fb4f1bf96bbe2349975ff9953fc9b2 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Mon, 19 Feb 2018 10:50:57 +0000
Subject: x86/retpoline: Support retpoline builds with Clang

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: arjan.van.de.ven@intel.com
Cc: bp@alien8.de
Cc: dave.hansen@intel.com
Cc: jmattson@google.com
Cc: karahmed@amazon.de
Cc: kvm@vger.kernel.org
Cc: pbonzini@redhat.com
Cc: rkrcmar@redhat.com
Link: http://lkml.kernel.org/r/1519037457-7643-5-git-send-email-dwmw@amazon.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/Makefile              | 5 ++++-
 include/linux/compiler-clang.h | 5 +++++
 include/linux/compiler-gcc.h   | 4 ++++
 include/linux/init.h           | 8 ++++----
 4 files changed, 17 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index fad55160dcb9..dbc7d0ed2eaa 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -232,7 +232,10 @@ KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
 
 # Avoid indirect branches in kernel to deal with Spectre
 ifdef CONFIG_RETPOLINE
-    RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register)
+    RETPOLINE_CFLAGS_GCC := -mindirect-branch=thunk-extern -mindirect-branch-register
+    RETPOLINE_CFLAGS_CLANG := -mretpoline-external-thunk
+
+    RETPOLINE_CFLAGS += $(call cc-option,$(RETPOLINE_CFLAGS_GCC),$(call cc-option,$(RETPOLINE_CFLAGS_CLANG)))
     ifneq ($(RETPOLINE_CFLAGS),)
         KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) -DRETPOLINE
     endif
diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index d02a4df3f473..d3f264a5b04d 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -27,3 +27,8 @@
 #if __has_feature(address_sanitizer)
 #define __SANITIZE_ADDRESS__
 #endif
+
+/* Clang doesn't have a way to turn it off per-function, yet. */
+#ifdef __noretpoline
+#undef __noretpoline
+#endif
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 73bc63e0a1c4..673fbf904fe5 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -93,6 +93,10 @@
 #define __weak		__attribute__((weak))
 #define __alias(symbol)	__attribute__((alias(#symbol)))
 
+#ifdef RETPOLINE
+#define __noretpoline __attribute__((indirect_branch("keep")))
+#endif
+
 /*
  * it doesn't make sense on ARM (currently the only user of __naked)
  * to trace naked functions because then mcount is called without
diff --git a/include/linux/init.h b/include/linux/init.h
index 506a98151131..bc27cf03c41e 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -6,10 +6,10 @@
 #include <linux/types.h>
 
 /* Built-in __init functions needn't be compiled with retpoline */
-#if defined(RETPOLINE) && !defined(MODULE)
-#define __noretpoline __attribute__((indirect_branch("keep")))
+#if defined(__noretpoline) && !defined(MODULE)
+#define __noinitretpoline __noretpoline
 #else
-#define __noretpoline
+#define __noinitretpoline
 #endif
 
 /* These macros are used to mark some functions or 
@@ -47,7 +47,7 @@
 
 /* These are for everybody (although not all archs will actually
    discard it in modules) */
-#define __init		__section(.init.text) __cold  __latent_entropy __noretpoline
+#define __init		__section(.init.text) __cold  __latent_entropy __noinitretpoline
 #define __initdata	__section(.init.data)
 #define __initconst	__section(.init.rodata)
 #define __exitdata	__section(.exit.data)
-- 
cgit v1.2.3-71-gd317


From c0248c96631f38f02d58762fc018e316843acac8 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 5 Feb 2018 16:41:56 +0000
Subject: arm_pmu: kill arm_pmu_platdata

Now that we have no platforms passing platform data to the arm_pmu code,
we can get rid of the platdata and associated hooks, paving the way for
rework of our IRQ handling.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 drivers/perf/arm_pmu.c       | 27 ++++-----------------------
 include/linux/perf/arm_pmu.h | 17 -----------------
 2 files changed, 4 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index 7bc5eee96b31..82b09d1cb42c 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -17,7 +17,6 @@
 #include <linux/export.h>
 #include <linux/kernel.h>
 #include <linux/perf/arm_pmu.h>
-#include <linux/platform_device.h>
 #include <linux/slab.h>
 #include <linux/sched/clock.h>
 #include <linux/spinlock.h>
@@ -320,17 +319,9 @@ validate_group(struct perf_event *event)
 	return 0;
 }
 
-static struct arm_pmu_platdata *armpmu_get_platdata(struct arm_pmu *armpmu)
-{
-	struct platform_device *pdev = armpmu->plat_device;
-
-	return pdev ? dev_get_platdata(&pdev->dev) : NULL;
-}
-
 static irqreturn_t armpmu_dispatch_irq(int irq, void *dev)
 {
 	struct arm_pmu *armpmu;
-	struct arm_pmu_platdata *plat;
 	int ret;
 	u64 start_clock, finish_clock;
 
@@ -342,13 +333,8 @@ static irqreturn_t armpmu_dispatch_irq(int irq, void *dev)
 	 */
 	armpmu = *(void **)dev;
 
-	plat = armpmu_get_platdata(armpmu);
-
 	start_clock = sched_clock();
-	if (plat && plat->handle_irq)
-		ret = plat->handle_irq(irq, armpmu, armpmu->handle_irq);
-	else
-		ret = armpmu->handle_irq(irq, armpmu);
+	ret = armpmu->handle_irq(irq, armpmu);
 	finish_clock = sched_clock();
 
 	perf_sample_event_took(finish_clock - start_clock);
@@ -578,7 +564,6 @@ int armpmu_request_irq(struct arm_pmu *armpmu, int cpu)
 			goto err_out;
 		}
 	} else {
-		struct arm_pmu_platdata *platdata = armpmu_get_platdata(armpmu);
 		unsigned long irq_flags;
 
 		err = irq_force_affinity(irq, cpumask_of(cpu));
@@ -589,13 +574,9 @@ int armpmu_request_irq(struct arm_pmu *armpmu, int cpu)
 			goto err_out;
 		}
 
-		if (platdata && platdata->irq_flags) {
-			irq_flags = platdata->irq_flags;
-		} else {
-			irq_flags = IRQF_PERCPU |
-				    IRQF_NOBALANCING |
-				    IRQF_NO_THREAD;
-		}
+		irq_flags = IRQF_PERCPU |
+			    IRQF_NOBALANCING |
+			    IRQF_NO_THREAD;
 
 		err = request_irq(irq, handler, irq_flags, "arm-pmu",
 				  per_cpu_ptr(&hw_events->percpu_pmu, cpu));
diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
index af0f44effd44..712764b35c6a 100644
--- a/include/linux/perf/arm_pmu.h
+++ b/include/linux/perf/arm_pmu.h
@@ -17,23 +17,6 @@
 #include <linux/sysfs.h>
 #include <asm/cputype.h>
 
-/*
- * struct arm_pmu_platdata - ARM PMU platform data
- *
- * @handle_irq: an optional handler which will be called from the
- *	interrupt and passed the address of the low level handler,
- *	and can be used to implement any platform specific handling
- *	before or after calling it.
- *
- * @irq_flags: if non-zero, these flags will be passed to request_irq
- *             when requesting interrupts for this PMU device.
- */
-struct arm_pmu_platdata {
-	irqreturn_t (*handle_irq)(int irq, void *dev,
-				  irq_handler_t pmu_handler);
-	unsigned long irq_flags;
-};
-
 #ifdef CONFIG_ARM_PMU
 
 /*
-- 
cgit v1.2.3-71-gd317


From d3d5aac206b4e9e569a22fe1811c909dde17587c Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 5 Feb 2018 16:41:57 +0000
Subject: arm_pmu: fold platform helpers into platform code

The armpmu_{request,free}_irqs() helpers are only used by
arm_pmu_platform.c, so let's fold them in and make them static.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 drivers/perf/arm_pmu.c          | 21 ---------------------
 drivers/perf/arm_pmu_platform.c | 21 +++++++++++++++++++++
 include/linux/perf/arm_pmu.h    |  2 --
 3 files changed, 21 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index 82b09d1cb42c..373dfd7d8a1d 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -534,14 +534,6 @@ void armpmu_free_irq(struct arm_pmu *armpmu, int cpu)
 	free_irq(irq, per_cpu_ptr(&hw_events->percpu_pmu, cpu));
 }
 
-void armpmu_free_irqs(struct arm_pmu *armpmu)
-{
-	int cpu;
-
-	for_each_cpu(cpu, &armpmu->supported_cpus)
-		armpmu_free_irq(armpmu, cpu);
-}
-
 int armpmu_request_irq(struct arm_pmu *armpmu, int cpu)
 {
 	int err = 0;
@@ -593,19 +585,6 @@ err_out:
 	return err;
 }
 
-int armpmu_request_irqs(struct arm_pmu *armpmu)
-{
-	int cpu, err;
-
-	for_each_cpu(cpu, &armpmu->supported_cpus) {
-		err = armpmu_request_irq(armpmu, cpu);
-		if (err)
-			break;
-	}
-
-	return err;
-}
-
 static int armpmu_get_cpu_irq(struct arm_pmu *pmu, int cpu)
 {
 	struct pmu_hw_events __percpu *hw_events = pmu->hw_events;
diff --git a/drivers/perf/arm_pmu_platform.c b/drivers/perf/arm_pmu_platform.c
index 46501cc79fd7..244558cfdbce 100644
--- a/drivers/perf/arm_pmu_platform.c
+++ b/drivers/perf/arm_pmu_platform.c
@@ -164,6 +164,27 @@ static int pmu_parse_irqs(struct arm_pmu *pmu)
 	return 0;
 }
 
+static int armpmu_request_irqs(struct arm_pmu *armpmu)
+{
+	int cpu, err;
+
+	for_each_cpu(cpu, &armpmu->supported_cpus) {
+		err = armpmu_request_irq(armpmu, cpu);
+		if (err)
+			break;
+	}
+
+	return err;
+}
+
+static void armpmu_free_irqs(struct arm_pmu *armpmu)
+{
+	int cpu;
+
+	for_each_cpu(cpu, &armpmu->supported_cpus)
+		armpmu_free_irq(armpmu, cpu);
+}
+
 int arm_pmu_device_probe(struct platform_device *pdev,
 			 const struct of_device_id *of_table,
 			 const struct pmu_probe_info *probe_table)
diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
index 712764b35c6a..899bc7ef0881 100644
--- a/include/linux/perf/arm_pmu.h
+++ b/include/linux/perf/arm_pmu.h
@@ -159,8 +159,6 @@ static inline int arm_pmu_acpi_probe(armpmu_init_fn init_fn) { return 0; }
 struct arm_pmu *armpmu_alloc(void);
 void armpmu_free(struct arm_pmu *pmu);
 int armpmu_register(struct arm_pmu *pmu);
-int armpmu_request_irqs(struct arm_pmu *armpmu);
-void armpmu_free_irqs(struct arm_pmu *armpmu);
 int armpmu_request_irq(struct arm_pmu *armpmu, int cpu);
 void armpmu_free_irq(struct arm_pmu *armpmu, int cpu);
 
-- 
cgit v1.2.3-71-gd317


From 0dc1a1851af1d593eee248b94c1277c7c7ccbbce Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 5 Feb 2018 16:41:58 +0000
Subject: arm_pmu: add armpmu_alloc_atomic()

In ACPI systems, we don't know the makeup of CPUs until we hotplug them
on, and thus have to allocate the PMU datastructures at hotplug time.
Thus, we must use GFP_ATOMIC allocations.

Let's add an armpmu_alloc_atomic() that we can use in this case.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 drivers/perf/arm_pmu.c       | 17 ++++++++++++++---
 drivers/perf/arm_pmu_acpi.c  |  2 +-
 include/linux/perf/arm_pmu.h |  1 +
 3 files changed, 16 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index 373dfd7d8a1d..4f73c5e8d623 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -760,18 +760,18 @@ static void cpu_pmu_destroy(struct arm_pmu *cpu_pmu)
 					    &cpu_pmu->node);
 }
 
-struct arm_pmu *armpmu_alloc(void)
+static struct arm_pmu *__armpmu_alloc(gfp_t flags)
 {
 	struct arm_pmu *pmu;
 	int cpu;
 
-	pmu = kzalloc(sizeof(*pmu), GFP_KERNEL);
+	pmu = kzalloc(sizeof(*pmu), flags);
 	if (!pmu) {
 		pr_info("failed to allocate PMU device!\n");
 		goto out;
 	}
 
-	pmu->hw_events = alloc_percpu(struct pmu_hw_events);
+	pmu->hw_events = alloc_percpu_gfp(struct pmu_hw_events, flags);
 	if (!pmu->hw_events) {
 		pr_info("failed to allocate per-cpu PMU data.\n");
 		goto out_free_pmu;
@@ -817,6 +817,17 @@ out:
 	return NULL;
 }
 
+struct arm_pmu *armpmu_alloc(void)
+{
+	return __armpmu_alloc(GFP_KERNEL);
+}
+
+struct arm_pmu *armpmu_alloc_atomic(void)
+{
+	return __armpmu_alloc(GFP_ATOMIC);
+}
+
+
 void armpmu_free(struct arm_pmu *pmu)
 {
 	free_percpu(pmu->hw_events);
diff --git a/drivers/perf/arm_pmu_acpi.c b/drivers/perf/arm_pmu_acpi.c
index 705f1a390e31..30c5f2bbce59 100644
--- a/drivers/perf/arm_pmu_acpi.c
+++ b/drivers/perf/arm_pmu_acpi.c
@@ -127,7 +127,7 @@ static struct arm_pmu *arm_pmu_acpi_find_alloc_pmu(void)
 		return pmu;
 	}
 
-	pmu = armpmu_alloc();
+	pmu = armpmu_alloc_atomic();
 	if (!pmu) {
 		pr_warn("Unable to allocate PMU for CPU%d\n",
 			smp_processor_id());
diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
index 899bc7ef0881..1f8bb83ef42f 100644
--- a/include/linux/perf/arm_pmu.h
+++ b/include/linux/perf/arm_pmu.h
@@ -157,6 +157,7 @@ static inline int arm_pmu_acpi_probe(armpmu_init_fn init_fn) { return 0; }
 
 /* Internal functions only for core arm_pmu code */
 struct arm_pmu *armpmu_alloc(void);
+struct arm_pmu *armpmu_alloc_atomic(void);
 void armpmu_free(struct arm_pmu *pmu);
 int armpmu_register(struct arm_pmu *pmu);
 int armpmu_request_irq(struct arm_pmu *armpmu, int cpu);
-- 
cgit v1.2.3-71-gd317


From 84b4be57ae17f8c0b3c1d8629e10f23910838fd7 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Tue, 12 Dec 2017 16:56:06 +0000
Subject: arm_pmu: note IRQs and PMUs per-cpu

To support ACPI systems, we need to request IRQs before we know the
associated PMU, and thus we need some percpu variable that the IRQ
handler can find the PMU from.

As we're going to request IRQs without the PMU, we can't rely on the
arm_pmu::active_irqs mask, and similarly need to track requested IRQs
with a percpu variable.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
[will: made armpmu_count_irq_users static]
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 drivers/perf/arm_pmu.c       | 69 +++++++++++++++++++++++++++++++++-----------
 include/linux/perf/arm_pmu.h |  1 -
 2 files changed, 52 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index 72118e6f9122..2b2af35db1b6 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -25,6 +25,9 @@
 
 #include <asm/irq_regs.h>
 
+static DEFINE_PER_CPU(struct arm_pmu *, cpu_armpmu);
+static DEFINE_PER_CPU(int, cpu_irq);
+
 static int
 armpmu_map_cache_event(const unsigned (*cache_map)
 				      [PERF_COUNT_HW_CACHE_MAX]
@@ -332,6 +335,8 @@ static irqreturn_t armpmu_dispatch_irq(int irq, void *dev)
 	 * dereference.
 	 */
 	armpmu = *(void **)dev;
+	if (WARN_ON_ONCE(!armpmu))
+		return IRQ_NONE;
 
 	start_clock = sched_clock();
 	ret = armpmu->handle_irq(irq, armpmu);
@@ -517,29 +522,45 @@ int perf_num_counters(void)
 }
 EXPORT_SYMBOL_GPL(perf_num_counters);
 
-void armpmu_free_irq(struct arm_pmu *armpmu, int cpu)
+static int armpmu_count_irq_users(const int irq)
 {
-	struct pmu_hw_events __percpu *hw_events = armpmu->hw_events;
-	int irq = per_cpu(hw_events->irq, cpu);
+	int cpu, count = 0;
 
-	if (!cpumask_test_and_clear_cpu(cpu, &armpmu->active_irqs))
-		return;
+	for_each_possible_cpu(cpu) {
+		if (per_cpu(cpu_irq, cpu) == irq)
+			count++;
+	}
+
+	return count;
+}
 
-	if (irq_is_percpu_devid(irq)) {
-		free_percpu_irq(irq, &hw_events->percpu_pmu);
-		cpumask_clear(&armpmu->active_irqs);
+void armpmu_free_cpu_irq(int irq, int cpu)
+{
+	if (per_cpu(cpu_irq, cpu) == 0)
 		return;
-	}
+	if (WARN_ON(irq != per_cpu(cpu_irq, cpu)))
+		return;
+
+	if (!irq_is_percpu_devid(irq))
+		free_irq(irq, per_cpu_ptr(&cpu_armpmu, cpu));
+	else if (armpmu_count_irq_users(irq) == 1)
+		free_percpu_irq(irq, &cpu_armpmu);
 
-	free_irq(irq, per_cpu_ptr(&hw_events->percpu_pmu, cpu));
+	per_cpu(cpu_irq, cpu) = 0;
 }
 
-int armpmu_request_irq(struct arm_pmu *armpmu, int cpu)
+void armpmu_free_irq(struct arm_pmu *armpmu, int cpu)
 {
-	int err = 0;
 	struct pmu_hw_events __percpu *hw_events = armpmu->hw_events;
-	const irq_handler_t handler = armpmu_dispatch_irq;
 	int irq = per_cpu(hw_events->irq, cpu);
+
+	armpmu_free_cpu_irq(irq, cpu);
+}
+
+int armpmu_request_cpu_irq(int irq, int cpu)
+{
+	int err = 0;
+	const irq_handler_t handler = armpmu_dispatch_irq;
 	if (!irq)
 		return 0;
 
@@ -560,16 +581,16 @@ int armpmu_request_irq(struct arm_pmu *armpmu, int cpu)
 
 		irq_set_status_flags(irq, IRQ_NOAUTOEN);
 		err = request_irq(irq, handler, irq_flags, "arm-pmu",
-				  per_cpu_ptr(&hw_events->percpu_pmu, cpu));
-	} else if (cpumask_empty(&armpmu->active_irqs)) {
+				  per_cpu_ptr(&cpu_armpmu, cpu));
+	} else if (armpmu_count_irq_users(irq) == 0) {
 		err = request_percpu_irq(irq, handler, "arm-pmu",
-					 &hw_events->percpu_pmu);
+					 &cpu_armpmu);
 	}
 
 	if (err)
 		goto err_out;
 
-	cpumask_set_cpu(cpu, &armpmu->active_irqs);
+	per_cpu(cpu_irq, cpu) = irq;
 	return 0;
 
 err_out:
@@ -577,6 +598,16 @@ err_out:
 	return err;
 }
 
+int armpmu_request_irq(struct arm_pmu *armpmu, int cpu)
+{
+	struct pmu_hw_events __percpu *hw_events = armpmu->hw_events;
+	int irq = per_cpu(hw_events->irq, cpu);
+	if (!irq)
+		return 0;
+
+	return armpmu_request_cpu_irq(irq, cpu);
+}
+
 static int armpmu_get_cpu_irq(struct arm_pmu *pmu, int cpu)
 {
 	struct pmu_hw_events __percpu *hw_events = pmu->hw_events;
@@ -599,6 +630,8 @@ static int arm_perf_starting_cpu(unsigned int cpu, struct hlist_node *node)
 	if (pmu->reset)
 		pmu->reset(pmu);
 
+	per_cpu(cpu_armpmu, cpu) = pmu;
+
 	irq = armpmu_get_cpu_irq(pmu, cpu);
 	if (irq) {
 		if (irq_is_percpu_devid(irq))
@@ -626,6 +659,8 @@ static int arm_perf_teardown_cpu(unsigned int cpu, struct hlist_node *node)
 			disable_irq(irq);
 	}
 
+	per_cpu(cpu_armpmu, cpu) = NULL;
+
 	return 0;
 }
 
diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
index 1f8bb83ef42f..feec9e7e85db 100644
--- a/include/linux/perf/arm_pmu.h
+++ b/include/linux/perf/arm_pmu.h
@@ -75,7 +75,6 @@ enum armpmu_attr_groups {
 
 struct arm_pmu {
 	struct pmu	pmu;
-	cpumask_t	active_irqs;
 	cpumask_t	supported_cpus;
 	char		*name;
 	irqreturn_t	(*handle_irq)(int irq_num, void *dev);
-- 
cgit v1.2.3-71-gd317


From 167e61438da0664cab87c825a6c0cb83510d578e Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 9 Oct 2017 17:09:05 +0100
Subject: arm_pmu: acpi: request IRQs up-front

We can't request IRQs in atomic context, so for ACPI systems we'll have
to request them up-front, and later associate them with CPUs.

This patch reorganises the arm_pmu code to do so. As we no longer have
the arm_pmu structure at probe time, a number of prototypes need to be
adjusted, requiring changes to the common arm_pmu code and arm_pmu
platform code.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 drivers/perf/arm_pmu.c          | 22 ++--------------------
 drivers/perf/arm_pmu_acpi.c     | 19 ++++++-------------
 drivers/perf/arm_pmu_platform.c | 15 ++++++++++++---
 include/linux/perf/arm_pmu.h    |  5 +++--
 4 files changed, 23 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index 2b2af35db1b6..0c2ed11c0603 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -534,7 +534,7 @@ static int armpmu_count_irq_users(const int irq)
 	return count;
 }
 
-void armpmu_free_cpu_irq(int irq, int cpu)
+void armpmu_free_irq(int irq, int cpu)
 {
 	if (per_cpu(cpu_irq, cpu) == 0)
 		return;
@@ -549,15 +549,7 @@ void armpmu_free_cpu_irq(int irq, int cpu)
 	per_cpu(cpu_irq, cpu) = 0;
 }
 
-void armpmu_free_irq(struct arm_pmu *armpmu, int cpu)
-{
-	struct pmu_hw_events __percpu *hw_events = armpmu->hw_events;
-	int irq = per_cpu(hw_events->irq, cpu);
-
-	armpmu_free_cpu_irq(irq, cpu);
-}
-
-int armpmu_request_cpu_irq(int irq, int cpu)
+int armpmu_request_irq(int irq, int cpu)
 {
 	int err = 0;
 	const irq_handler_t handler = armpmu_dispatch_irq;
@@ -598,16 +590,6 @@ err_out:
 	return err;
 }
 
-int armpmu_request_irq(struct arm_pmu *armpmu, int cpu)
-{
-	struct pmu_hw_events __percpu *hw_events = armpmu->hw_events;
-	int irq = per_cpu(hw_events->irq, cpu);
-	if (!irq)
-		return 0;
-
-	return armpmu_request_cpu_irq(irq, cpu);
-}
-
 static int armpmu_get_cpu_irq(struct arm_pmu *pmu, int cpu)
 {
 	struct pmu_hw_events __percpu *hw_events = pmu->hw_events;
diff --git a/drivers/perf/arm_pmu_acpi.c b/drivers/perf/arm_pmu_acpi.c
index 09a1a36cff57..0f197516d708 100644
--- a/drivers/perf/arm_pmu_acpi.c
+++ b/drivers/perf/arm_pmu_acpi.c
@@ -89,7 +89,13 @@ static int arm_pmu_acpi_parse_irqs(void)
 			pr_warn("No ACPI PMU IRQ for CPU%d\n", cpu);
 		}
 
+		/*
+		 * Log and request the IRQ so the core arm_pmu code can manage
+		 * it. We'll have to sanity-check IRQs later when we associate
+		 * them with their PMUs.
+		 */
 		per_cpu(pmu_irqs, cpu) = irq;
+		armpmu_request_irq(irq, cpu);
 	}
 
 	return 0;
@@ -204,14 +210,6 @@ static int arm_pmu_acpi_cpu_starting(unsigned int cpu)
 
 	cpumask_set_cpu(cpu, &pmu->supported_cpus);
 
-	/*
-	 * Log and request the IRQ so the core arm_pmu code can manage it.  In
-	 * some situations (e.g. mismatched PPIs), we may fail to request the
-	 * IRQ. However, it may be too late for us to do anything about it.
-	 * The common ARM PMU code will log a warning in this case.
-	 */
-	armpmu_request_irq(pmu, cpu);
-
 	/*
 	 * Ideally, we'd probe the PMU here when we find the first matching
 	 * CPU. We can't do that for several reasons; see the comment in
@@ -281,11 +279,6 @@ static int arm_pmu_acpi_init(void)
 	if (acpi_disabled)
 		return 0;
 
-	/*
-	 * We can't request IRQs yet, since we don't know the cookie value
-	 * until we know which CPUs share the same logical PMU. We'll handle
-	 * that in arm_pmu_acpi_cpu_starting().
-	 */
 	ret = arm_pmu_acpi_parse_irqs();
 	if (ret)
 		return ret;
diff --git a/drivers/perf/arm_pmu_platform.c b/drivers/perf/arm_pmu_platform.c
index 1dc3c1f574e0..7729eda5909d 100644
--- a/drivers/perf/arm_pmu_platform.c
+++ b/drivers/perf/arm_pmu_platform.c
@@ -159,10 +159,15 @@ static int pmu_parse_irqs(struct arm_pmu *pmu)
 
 static int armpmu_request_irqs(struct arm_pmu *armpmu)
 {
+	struct pmu_hw_events __percpu *hw_events = armpmu->hw_events;
 	int cpu, err;
 
 	for_each_cpu(cpu, &armpmu->supported_cpus) {
-		err = armpmu_request_irq(armpmu, cpu);
+		int irq = per_cpu(hw_events->irq, cpu);
+		if (!irq)
+			continue;
+
+		err = armpmu_request_irq(irq, cpu);
 		if (err)
 			break;
 	}
@@ -173,9 +178,13 @@ static int armpmu_request_irqs(struct arm_pmu *armpmu)
 static void armpmu_free_irqs(struct arm_pmu *armpmu)
 {
 	int cpu;
+	struct pmu_hw_events __percpu *hw_events = armpmu->hw_events;
 
-	for_each_cpu(cpu, &armpmu->supported_cpus)
-		armpmu_free_irq(armpmu, cpu);
+	for_each_cpu(cpu, &armpmu->supported_cpus) {
+		int irq = per_cpu(hw_events->irq, cpu);
+
+		armpmu_free_irq(irq, cpu);
+	}
 }
 
 int arm_pmu_device_probe(struct platform_device *pdev,
diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
index feec9e7e85db..40036a57d072 100644
--- a/include/linux/perf/arm_pmu.h
+++ b/include/linux/perf/arm_pmu.h
@@ -14,6 +14,7 @@
 
 #include <linux/interrupt.h>
 #include <linux/perf_event.h>
+#include <linux/platform_device.h>
 #include <linux/sysfs.h>
 #include <asm/cputype.h>
 
@@ -159,8 +160,8 @@ struct arm_pmu *armpmu_alloc(void);
 struct arm_pmu *armpmu_alloc_atomic(void);
 void armpmu_free(struct arm_pmu *pmu);
 int armpmu_register(struct arm_pmu *pmu);
-int armpmu_request_irq(struct arm_pmu *armpmu, int cpu);
-void armpmu_free_irq(struct arm_pmu *armpmu, int cpu);
+int armpmu_request_irq(int irq, int cpu);
+void armpmu_free_irq(int irq, int cpu);
 
 #define ARMV8_PMU_PDEV_NAME "armv8-pmu"
 
-- 
cgit v1.2.3-71-gd317


From 88e77dc6a354095ddaaae715bc0d3b55702fa3db Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 20 Feb 2018 16:01:36 +0100
Subject: locking/mutex: Add comment to __mutex_owner() to deter usage

Attempt to deter usage, this is not a public interface. It is entirely
possible to implement a conformant mutex without having this owner
field (in fact, we used to have that).

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/mutex.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index f25c13423bd4..cb3bbed4e633 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -66,6 +66,11 @@ struct mutex {
 #endif
 };
 
+/*
+ * Internal helper function; C doesn't allow us to hide it :/
+ *
+ * DO NOT USE (outside of mutex code).
+ */
 static inline struct task_struct *__mutex_owner(struct mutex *lock)
 {
 	return (struct task_struct *)(atomic_long_read(&lock->owner) & ~0x07);
-- 
cgit v1.2.3-71-gd317


From 33352244706369ea6736781ae41fe41692eb69bb Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Tue, 20 Feb 2018 11:37:51 -0600
Subject: jump_label: Explicitly disable jump labels in __init code

After initmem has been freed, any jump labels in __init code are
prevented from being written to by the kernel_text_address() check in
__jump_label_update().  However, this check is quite broad.  If
kernel_text_address() were to return false for any other reason, the
jump label write would fail silently with no warning.

For jump labels in module init code, entry->code is set to zero to
indicate that the entry is disabled.  Do the same thing for core kernel
init code.  This makes the behavior more consistent, and will also make
it more straightforward to detect non-init jump label write failures in
the next patch.

Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Jason Baron <jbaron@akamai.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/c52825c73f3a174e8398b6898284ec20d4deb126.1519051220.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/jump_label.h |  3 +++
 init/main.c                |  2 ++
 kernel/jump_label.c        | 16 ++++++++++++++++
 3 files changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index b6a29c126cc4..2168cc6b8b30 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -151,6 +151,7 @@ extern struct jump_entry __start___jump_table[];
 extern struct jump_entry __stop___jump_table[];
 
 extern void jump_label_init(void);
+extern void jump_label_invalidate_init(void);
 extern void jump_label_lock(void);
 extern void jump_label_unlock(void);
 extern void arch_jump_label_transform(struct jump_entry *entry,
@@ -198,6 +199,8 @@ static __always_inline void jump_label_init(void)
 	static_key_initialized = true;
 }
 
+static inline void jump_label_invalidate_init(void) {}
+
 static __always_inline bool static_key_false(struct static_key *key)
 {
 	if (unlikely(static_key_count(key) > 0))
diff --git a/init/main.c b/init/main.c
index a8100b954839..969eaf140ef0 100644
--- a/init/main.c
+++ b/init/main.c
@@ -89,6 +89,7 @@
 #include <linux/io.h>
 #include <linux/cache.h>
 #include <linux/rodata_test.h>
+#include <linux/jump_label.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -1000,6 +1001,7 @@ static int __ref kernel_init(void *unused)
 	/* need to finish all async __init code before freeing the memory */
 	async_synchronize_full();
 	ftrace_free_init_mem();
+	jump_label_invalidate_init();
 	free_initmem();
 	mark_readonly();
 	system_state = SYSTEM_RUNNING;
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index b4517095db6a..b71776576a66 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -16,6 +16,7 @@
 #include <linux/jump_label_ratelimit.h>
 #include <linux/bug.h>
 #include <linux/cpu.h>
+#include <asm/sections.h>
 
 #ifdef HAVE_JUMP_LABEL
 
@@ -417,6 +418,20 @@ void __init jump_label_init(void)
 	cpus_read_unlock();
 }
 
+/* Disable any jump label entries in __init code */
+void __init jump_label_invalidate_init(void)
+{
+	struct jump_entry *iter_start = __start___jump_table;
+	struct jump_entry *iter_stop = __stop___jump_table;
+	struct jump_entry *iter;
+
+	for (iter = iter_start; iter < iter_stop; iter++) {
+		if (iter->code >= (unsigned long)_sinittext &&
+		    iter->code < (unsigned long)_einittext)
+			iter->code = 0;
+	}
+}
+
 #ifdef CONFIG_MODULES
 
 static enum jump_label_type jump_label_init_type(struct jump_entry *entry)
@@ -633,6 +648,7 @@ static void jump_label_del_module(struct module *mod)
 	}
 }
 
+/* Disable any jump label entries in module init code */
 static void jump_label_invalidate_module_init(struct module *mod)
 {
 	struct jump_entry *iter_start = mod->jump_entries;
-- 
cgit v1.2.3-71-gd317


From 9fbcc57aa16424ef84cb54e0d9db3221763de88a Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Tue, 20 Feb 2018 11:37:53 -0600
Subject: extable: Make init_kernel_text() global

Convert init_kernel_text() to a global function and use it in a few
places instead of manually comparing _sinittext and _einittext.

Note that kallsyms.h has a very similar function called
is_kernel_inittext(), but its end check is inclusive.  I'm not sure
whether that's intentional behavior, so I didn't touch it.

Suggested-by: Jason Baron <jbaron@akamai.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/4335d02be8d45ca7d265d2f174251d0b7ee6c5fd.1519051220.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/unwind_orc.c | 3 +--
 include/linux/kernel.h       | 1 +
 kernel/extable.c             | 2 +-
 kernel/jump_label.c          | 4 +---
 4 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 1f9188f5357c..feb28fee6cea 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -5,7 +5,6 @@
 #include <asm/unwind.h>
 #include <asm/orc_types.h>
 #include <asm/orc_lookup.h>
-#include <asm/sections.h>
 
 #define orc_warn(fmt, ...) \
 	printk_deferred_once(KERN_WARNING pr_fmt("WARNING: " fmt), ##__VA_ARGS__)
@@ -148,7 +147,7 @@ static struct orc_entry *orc_find(unsigned long ip)
 	}
 
 	/* vmlinux .init slow lookup: */
-	if (ip >= (unsigned long)_sinittext && ip < (unsigned long)_einittext)
+	if (init_kernel_text(ip))
 		return __orc_find(__start_orc_unwind_ip, __start_orc_unwind,
 				  __stop_orc_unwind_ip - __start_orc_unwind_ip, ip);
 
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index ce51455e2adf..3fd291503576 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -472,6 +472,7 @@ extern bool parse_option_str(const char *str, const char *option);
 extern char *next_arg(char *args, char **param, char **val);
 
 extern int core_kernel_text(unsigned long addr);
+extern int init_kernel_text(unsigned long addr);
 extern int core_kernel_data(unsigned long addr);
 extern int __kernel_text_address(unsigned long addr);
 extern int kernel_text_address(unsigned long addr);
diff --git a/kernel/extable.c b/kernel/extable.c
index a17fdb63dc3e..6a5b61ebc66c 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -64,7 +64,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
 	return e;
 }
 
-static inline int init_kernel_text(unsigned long addr)
+int init_kernel_text(unsigned long addr)
 {
 	if (addr >= (unsigned long)_sinittext &&
 	    addr < (unsigned long)_einittext)
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index b2f0b479191b..52a0a7af8640 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -16,7 +16,6 @@
 #include <linux/jump_label_ratelimit.h>
 #include <linux/bug.h>
 #include <linux/cpu.h>
-#include <asm/sections.h>
 
 #ifdef HAVE_JUMP_LABEL
 
@@ -429,8 +428,7 @@ void __init jump_label_invalidate_init(void)
 	struct jump_entry *iter;
 
 	for (iter = iter_start; iter < iter_stop; iter++) {
-		if (iter->code >= (unsigned long)_sinittext &&
-		    iter->code < (unsigned long)_einittext)
+		if (init_kernel_text(iter->code))
 			iter->code = 0;
 	}
 }
-- 
cgit v1.2.3-71-gd317


From d34bc48f8275b6ce0da44f639d68344891268ee9 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Wed, 21 Feb 2018 14:45:17 -0800
Subject: include/linux/sched/mm.h: re-inline mmdrop()

As Peter points out, Doing a CALL+RET for just the decrement is a bit silly.

Fixes: d70f2a14b72a4bc ("include/linux/sched/mm.h: uninline mmdrop_async(), etc")
Acked-by: Peter Zijlstra (Intel) <peterz@infraded.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched/mm.h | 13 ++++++++++++-
 kernel/fork.c            | 15 ++-------------
 2 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 1149533aa2fa..9806184bb3d5 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -36,7 +36,18 @@ static inline void mmgrab(struct mm_struct *mm)
 	atomic_inc(&mm->mm_count);
 }
 
-extern void mmdrop(struct mm_struct *mm);
+extern void __mmdrop(struct mm_struct *mm);
+
+static inline void mmdrop(struct mm_struct *mm)
+{
+	/*
+	 * The implicit full barrier implied by atomic_dec_and_test() is
+	 * required by the membarrier system call before returning to
+	 * user-space, after storing to rq->curr.
+	 */
+	if (unlikely(atomic_dec_and_test(&mm->mm_count)))
+		__mmdrop(mm);
+}
 
 /**
  * mmget() - Pin the address space associated with a &struct mm_struct.
diff --git a/kernel/fork.c b/kernel/fork.c
index be8aa5b98666..e5d9d405ae4e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -592,7 +592,7 @@ static void check_mm(struct mm_struct *mm)
  * is dropped: either by a lazy thread or by
  * mmput. Free the page directory and the mm.
  */
-static void __mmdrop(struct mm_struct *mm)
+void __mmdrop(struct mm_struct *mm)
 {
 	BUG_ON(mm == &init_mm);
 	mm_free_pgd(mm);
@@ -603,18 +603,7 @@ static void __mmdrop(struct mm_struct *mm)
 	put_user_ns(mm->user_ns);
 	free_mm(mm);
 }
-
-void mmdrop(struct mm_struct *mm)
-{
-	/*
-	 * The implicit full barrier implied by atomic_dec_and_test() is
-	 * required by the membarrier system call before returning to
-	 * user-space, after storing to rq->curr.
-	 */
-	if (unlikely(atomic_dec_and_test(&mm->mm_count)))
-		__mmdrop(mm);
-}
-EXPORT_SYMBOL_GPL(mmdrop);
+EXPORT_SYMBOL_GPL(__mmdrop);
 
 static void mmdrop_async_fn(struct work_struct *work)
 {
-- 
cgit v1.2.3-71-gd317


From 101110f6271ce956a049250c907bc960030577f8 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 21 Feb 2018 14:45:20 -0800
Subject: Kbuild: always define endianess in kconfig.h

Build testing with LTO found a couple of files that get compiled
differently depending on whether asm/byteorder.h gets included early
enough or not.  In particular, include/asm-generic/qrwlock_types.h is
affected by this, but there are probably others as well.

The symptom is a series of LTO link time warnings, including these:

    net/netlabel/netlabel_unlabeled.h:223: error: type of 'netlbl_unlhsh_add' does not match original declaration [-Werror=lto-type-mismatch]
     int netlbl_unlhsh_add(struct net *net,
    net/netlabel/netlabel_unlabeled.c:377: note: 'netlbl_unlhsh_add' was previously declared here

    include/net/ipv6.h:360: error: type of 'ipv6_renew_options_kern' does not match original declaration [-Werror=lto-type-mismatch]
     ipv6_renew_options_kern(struct sock *sk,
    net/ipv6/exthdrs.c:1162: note: 'ipv6_renew_options_kern' was previously declared here

    net/core/dev.c:761: note: 'dev_get_by_name_rcu' was previously declared here
     struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
    net/core/dev.c:761: note: code may be misoptimized unless -fno-strict-aliasing is used

    drivers/gpu/drm/i915/i915_drv.h:3377: error: type of 'i915_gem_object_set_to_wc_domain' does not match original declaration [-Werror=lto-type-mismatch]
     i915_gem_object_set_to_wc_domain(struct drm_i915_gem_object *obj, bool write);
    drivers/gpu/drm/i915/i915_gem.c:3639: note: 'i915_gem_object_set_to_wc_domain' was previously declared here

    include/linux/debugfs.h:92:9: error: type of 'debugfs_attr_read' does not match original declaration [-Werror=lto-type-mismatch]
     ssize_t debugfs_attr_read(struct file *file, char __user *buf,
    fs/debugfs/file.c:318: note: 'debugfs_attr_read' was previously declared here

    include/linux/rwlock_api_smp.h:30: error: type of '_raw_read_unlock' does not match original declaration [-Werror=lto-type-mismatch]
     void __lockfunc _raw_read_unlock(rwlock_t *lock) __releases(lock);
    kernel/locking/spinlock.c:246:26: note: '_raw_read_unlock' was previously declared here

    include/linux/fs.h:3308:5: error: type of 'simple_attr_open' does not match original declaration [-Werror=lto-type-mismatch]
     int simple_attr_open(struct inode *inode, struct file *file,
    fs/libfs.c:795: note: 'simple_attr_open' was previously declared here

All of the above are caused by include/asm-generic/qrwlock_types.h
failing to include asm/byteorder.h after commit e0d02285f16e
("locking/qrwlock: Use 'struct qrwlock' instead of 'struct __qrwlock'")
in linux-4.15.

Similar bugs may or may not exist in older kernels as well, but there is
no easy way to test those with link-time optimizations, and kernels
before 4.14 are harder to fix because they don't have Babu's patch
series

We had similar issues with CONFIG_ symbols in the past and ended up
always including the configuration headers though linux/kconfig.h.  This
works around the issue through that same file, defining either
__BIG_ENDIAN or __LITTLE_ENDIAN depending on CONFIG_CPU_BIG_ENDIAN,
which is now always set on all architectures since commit 4c97a0c8fee3
("arch: define CPU_BIG_ENDIAN for all fixed big endian archs").

Link: http://lkml.kernel.org/r/20180202154104.1522809-2-arnd@arndb.de
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Babu Moger <babu.moger@amd.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Nicolas Pitre <nico@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kconfig.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kconfig.h b/include/linux/kconfig.h
index fec5076eda91..cc8fa109cfa3 100644
--- a/include/linux/kconfig.h
+++ b/include/linux/kconfig.h
@@ -4,6 +4,12 @@
 
 #include <generated/autoconf.h>
 
+#ifdef CONFIG_CPU_BIG_ENDIAN
+#define __BIG_ENDIAN 4321
+#else
+#define __LITTLE_ENDIAN 1234
+#endif
+
 #define __ARG_PLACEHOLDER_1 0,
 #define __take_second_arg(__ignored, val, ...) val
 
-- 
cgit v1.2.3-71-gd317


From c3cc39118c3610eb6ab4711bc624af7fc48a35fe Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Wed, 21 Feb 2018 14:45:24 -0800
Subject: mm: memcontrol: fix NR_WRITEBACK leak in memcg and system stats

After commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
memory.stat reporting"), we observed slowly upward creeping NR_WRITEBACK
counts over the course of several days, both the per-memcg stats as well
as the system counter in e.g.  /proc/meminfo.

The conversion from full per-cpu stat counts to per-cpu cached atomic
stat counts introduced an irq-unsafe RMW operation into the updates.

Most stat updates come from process context, but one notable exception
is the NR_WRITEBACK counter.  While writebacks are issued from process
context, they are retired from (soft)irq context.

When writeback completions interrupt the RMW counter updates of new
writebacks being issued, the decs from the completions are lost.

Since the global updates are routed through the joint lruvec API, both
the memcg counters as well as the system counters are affected.

This patch makes the joint stat and event API irq safe.

Link: http://lkml.kernel.org/r/20180203082353.17284-1-hannes@cmpxchg.org
Fixes: a983b5ebee57 ("mm: memcontrol: fix excessive complexity in memory.stat reporting")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Debugged-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Rik van Riel <riel@surriel.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 882046863581..c46016bb25eb 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -523,9 +523,11 @@ static inline void __mod_memcg_state(struct mem_cgroup *memcg,
 static inline void mod_memcg_state(struct mem_cgroup *memcg,
 				   int idx, int val)
 {
-	preempt_disable();
+	unsigned long flags;
+
+	local_irq_save(flags);
 	__mod_memcg_state(memcg, idx, val);
-	preempt_enable();
+	local_irq_restore(flags);
 }
 
 /**
@@ -606,9 +608,11 @@ static inline void __mod_lruvec_state(struct lruvec *lruvec,
 static inline void mod_lruvec_state(struct lruvec *lruvec,
 				    enum node_stat_item idx, int val)
 {
-	preempt_disable();
+	unsigned long flags;
+
+	local_irq_save(flags);
 	__mod_lruvec_state(lruvec, idx, val);
-	preempt_enable();
+	local_irq_restore(flags);
 }
 
 static inline void __mod_lruvec_page_state(struct page *page,
@@ -630,9 +634,11 @@ static inline void __mod_lruvec_page_state(struct page *page,
 static inline void mod_lruvec_page_state(struct page *page,
 					 enum node_stat_item idx, int val)
 {
-	preempt_disable();
+	unsigned long flags;
+
+	local_irq_save(flags);
 	__mod_lruvec_page_state(page, idx, val);
-	preempt_enable();
+	local_irq_restore(flags);
 }
 
 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
@@ -659,9 +665,11 @@ static inline void __count_memcg_events(struct mem_cgroup *memcg,
 static inline void count_memcg_events(struct mem_cgroup *memcg,
 				      int idx, unsigned long count)
 {
-	preempt_disable();
+	unsigned long flags;
+
+	local_irq_save(flags);
 	__count_memcg_events(memcg, idx, count);
-	preempt_enable();
+	local_irq_restore(flags);
 }
 
 /* idx can be of type enum memcg_event_item or vm_event_item */
-- 
cgit v1.2.3-71-gd317


From 9c4e6b1a7027f102990c0395296015a812525f4d Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeelb@google.com>
Date: Wed, 21 Feb 2018 14:45:28 -0800
Subject: mm, mlock, vmscan: no more skipping pagevecs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a thread mlocks an address space backed either by file pages which
are currently not present in memory or swapped out anon pages (not in
swapcache), a new page is allocated and added to the local pagevec
(lru_add_pvec), I/O is triggered and the thread then sleeps on the page.
On I/O completion, the thread can wake on a different CPU, the mlock
syscall will then sets the PageMlocked() bit of the page but will not be
able to put that page in unevictable LRU as the page is on the pagevec
of a different CPU.  Even on drain, that page will go to evictable LRU
because the PageMlocked() bit is not checked on pagevec drain.

The page will eventually go to right LRU on reclaim but the LRU stats
will remain skewed for a long time.

This patch puts all the pages, even unevictable, to the pagevecs and on
the drain, the pages will be added on their LRUs correctly by checking
their evictability.  This resolves the mlocked pages on pagevec of other
CPUs issue because when those pagevecs will be drained, the mlocked file
pages will go to unevictable LRU.  Also this makes the race with munlock
easier to resolve because the pagevec drains happen in LRU lock.

However there is still one place which makes a page evictable and does
PageLRU check on that page without LRU lock and needs special attention.
TestClearPageMlocked() and isolate_lru_page() in clear_page_mlock().

	#0: __pagevec_lru_add_fn	#1: clear_page_mlock

	SetPageLRU()			if (!TestClearPageMlocked())
					  return
	smp_mb() // <--required
					// inside does PageLRU
	if (!PageMlocked())		if (isolate_lru_page())
	  move to evictable LRU		  putback_lru_page()
	else
	  move to unevictable LRU

In '#1', TestClearPageMlocked() provides full memory barrier semantics
and thus the PageLRU check (inside isolate_lru_page) can not be
reordered before it.

In '#0', without explicit memory barrier, the PageMlocked() check can be
reordered before SetPageLRU().  If that happens, '#0' can put a page in
unevictable LRU and '#1' might have just cleared the Mlocked bit of that
page but fails to isolate as PageLRU fails as '#0' still hasn't set
PageLRU bit of that page.  That page will be stranded on the unevictable
LRU.

There is one (good) side effect though.  Without this patch, the pages
allocated for System V shared memory segment are added to evictable LRUs
even after shmctl(SHM_LOCK) on that segment.  This patch will correctly
put such pages to unevictable LRU.

Link: http://lkml.kernel.org/r/20171121211241.18877-1-shakeelb@google.com
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Greg Thelen <gthelen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Shaohua Li <shli@fb.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Hugh Dickins <hughd@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h |  2 --
 mm/mlock.c           |  6 ++++
 mm/swap.c            | 82 ++++++++++++++++++++++++++++++----------------------
 mm/vmscan.c          | 59 +------------------------------------
 4 files changed, 54 insertions(+), 95 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7b6a59f722a3..a1a3f4ed94ce 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -337,8 +337,6 @@ extern void deactivate_file_page(struct page *page);
 extern void mark_page_lazyfree(struct page *page);
 extern void swap_setup(void);
 
-extern void add_page_to_unevictable_list(struct page *page);
-
 extern void lru_cache_add_active_or_unevictable(struct page *page,
 						struct vm_area_struct *vma);
 
diff --git a/mm/mlock.c b/mm/mlock.c
index 79398200e423..74e5a6547c3d 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -64,6 +64,12 @@ void clear_page_mlock(struct page *page)
 	mod_zone_page_state(page_zone(page), NR_MLOCK,
 			    -hpage_nr_pages(page));
 	count_vm_event(UNEVICTABLE_PGCLEARED);
+	/*
+	 * The previous TestClearPageMlocked() corresponds to the smp_mb()
+	 * in __pagevec_lru_add_fn().
+	 *
+	 * See __pagevec_lru_add_fn for more explanation.
+	 */
 	if (!isolate_lru_page(page)) {
 		putback_lru_page(page);
 	} else {
diff --git a/mm/swap.c b/mm/swap.c
index 567a7b96e41d..2d337710218f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -445,30 +445,6 @@ void lru_cache_add(struct page *page)
 	__lru_cache_add(page);
 }
 
-/**
- * add_page_to_unevictable_list - add a page to the unevictable list
- * @page:  the page to be added to the unevictable list
- *
- * Add page directly to its zone's unevictable list.  To avoid races with
- * tasks that might be making the page evictable, through eg. munlock,
- * munmap or exit, while it's not on the lru, we want to add the page
- * while it's locked or otherwise "invisible" to other tasks.  This is
- * difficult to do when using the pagevec cache, so bypass that.
- */
-void add_page_to_unevictable_list(struct page *page)
-{
-	struct pglist_data *pgdat = page_pgdat(page);
-	struct lruvec *lruvec;
-
-	spin_lock_irq(&pgdat->lru_lock);
-	lruvec = mem_cgroup_page_lruvec(page, pgdat);
-	ClearPageActive(page);
-	SetPageUnevictable(page);
-	SetPageLRU(page);
-	add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
-	spin_unlock_irq(&pgdat->lru_lock);
-}
-
 /**
  * lru_cache_add_active_or_unevictable
  * @page:  the page to be added to LRU
@@ -484,13 +460,9 @@ void lru_cache_add_active_or_unevictable(struct page *page,
 {
 	VM_BUG_ON_PAGE(PageLRU(page), page);
 
-	if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
+	if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
 		SetPageActive(page);
-		lru_cache_add(page);
-		return;
-	}
-
-	if (!TestSetPageMlocked(page)) {
+	else if (!TestSetPageMlocked(page)) {
 		/*
 		 * We use the irq-unsafe __mod_zone_page_stat because this
 		 * counter is not modified from interrupt context, and the pte
@@ -500,7 +472,7 @@ void lru_cache_add_active_or_unevictable(struct page *page,
 				    hpage_nr_pages(page));
 		count_vm_event(UNEVICTABLE_PGMLOCKED);
 	}
-	add_page_to_unevictable_list(page);
+	lru_cache_add(page);
 }
 
 /*
@@ -886,15 +858,55 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
 static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
 				 void *arg)
 {
-	int file = page_is_file_cache(page);
-	int active = PageActive(page);
-	enum lru_list lru = page_lru(page);
+	enum lru_list lru;
+	int was_unevictable = TestClearPageUnevictable(page);
 
 	VM_BUG_ON_PAGE(PageLRU(page), page);
 
 	SetPageLRU(page);
+	/*
+	 * Page becomes evictable in two ways:
+	 * 1) Within LRU lock [munlock_vma_pages() and __munlock_pagevec()].
+	 * 2) Before acquiring LRU lock to put the page to correct LRU and then
+	 *   a) do PageLRU check with lock [check_move_unevictable_pages]
+	 *   b) do PageLRU check before lock [clear_page_mlock]
+	 *
+	 * (1) & (2a) are ok as LRU lock will serialize them. For (2b), we need
+	 * following strict ordering:
+	 *
+	 * #0: __pagevec_lru_add_fn		#1: clear_page_mlock
+	 *
+	 * SetPageLRU()				TestClearPageMlocked()
+	 * smp_mb() // explicit ordering	// above provides strict
+	 *					// ordering
+	 * PageMlocked()			PageLRU()
+	 *
+	 *
+	 * if '#1' does not observe setting of PG_lru by '#0' and fails
+	 * isolation, the explicit barrier will make sure that page_evictable
+	 * check will put the page in correct LRU. Without smp_mb(), SetPageLRU
+	 * can be reordered after PageMlocked check and can make '#1' to fail
+	 * the isolation of the page whose Mlocked bit is cleared (#0 is also
+	 * looking at the same page) and the evictable page will be stranded
+	 * in an unevictable LRU.
+	 */
+	smp_mb();
+
+	if (page_evictable(page)) {
+		lru = page_lru(page);
+		update_page_reclaim_stat(lruvec, page_is_file_cache(page),
+					 PageActive(page));
+		if (was_unevictable)
+			count_vm_event(UNEVICTABLE_PGRESCUED);
+	} else {
+		lru = LRU_UNEVICTABLE;
+		ClearPageActive(page);
+		SetPageUnevictable(page);
+		if (!was_unevictable)
+			count_vm_event(UNEVICTABLE_PGCULLED);
+	}
+
 	add_page_to_lru_list(page, lruvec, lru);
-	update_page_reclaim_stat(lruvec, file, active);
 	trace_mm_lru_insertion(page, lru);
 }
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 444749669187..bee53495a829 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -769,64 +769,7 @@ int remove_mapping(struct address_space *mapping, struct page *page)
  */
 void putback_lru_page(struct page *page)
 {
-	bool is_unevictable;
-	int was_unevictable = PageUnevictable(page);
-
-	VM_BUG_ON_PAGE(PageLRU(page), page);
-
-redo:
-	ClearPageUnevictable(page);
-
-	if (page_evictable(page)) {
-		/*
-		 * For evictable pages, we can use the cache.
-		 * In event of a race, worst case is we end up with an
-		 * unevictable page on [in]active list.
-		 * We know how to handle that.
-		 */
-		is_unevictable = false;
-		lru_cache_add(page);
-	} else {
-		/*
-		 * Put unevictable pages directly on zone's unevictable
-		 * list.
-		 */
-		is_unevictable = true;
-		add_page_to_unevictable_list(page);
-		/*
-		 * When racing with an mlock or AS_UNEVICTABLE clearing
-		 * (page is unlocked) make sure that if the other thread
-		 * does not observe our setting of PG_lru and fails
-		 * isolation/check_move_unevictable_pages,
-		 * we see PG_mlocked/AS_UNEVICTABLE cleared below and move
-		 * the page back to the evictable list.
-		 *
-		 * The other side is TestClearPageMlocked() or shmem_lock().
-		 */
-		smp_mb();
-	}
-
-	/*
-	 * page's status can change while we move it among lru. If an evictable
-	 * page is on unevictable list, it never be freed. To avoid that,
-	 * check after we added it to the list, again.
-	 */
-	if (is_unevictable && page_evictable(page)) {
-		if (!isolate_lru_page(page)) {
-			put_page(page);
-			goto redo;
-		}
-		/* This means someone else dropped this page from LRU
-		 * So, it will be freed or putback to LRU again. There is
-		 * nothing to do here.
-		 */
-	}
-
-	if (was_unevictable && !is_unevictable)
-		count_vm_event(UNEVICTABLE_PGRESCUED);
-	else if (!was_unevictable && is_unevictable)
-		count_vm_event(UNEVICTABLE_PGCULLED);
-
+	lru_cache_add(page);
 	put_page(page);		/* drop ref from isolate */
 }
 
-- 
cgit v1.2.3-71-gd317


From 173a3efd3edb2ef6ef07471397c5f542a360e9c1 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 21 Feb 2018 14:45:54 -0800
Subject: bug.h: work around GCC PR82365 in BUG()

Looking at functions with large stack frames across all architectures
led me discovering that BUG() suffers from the same problem as
fortify_panic(), which I've added a workaround for already.

In short, variables that go out of scope by calling a noreturn function
or __builtin_unreachable() keep using stack space in functions
afterwards.

A workaround that was identified is to insert an empty assembler
statement just before calling the function that doesn't return.  I'm
adding a macro "barrier_before_unreachable()" to document this, and
insert calls to that in all instances of BUG() that currently suffer
from this problem.

The files that saw the largest change from this had these frame sizes
before, and much less with my patch:

  fs/ext4/inode.c:82:1: warning: the frame size of 1672 bytes is larger than 800 bytes [-Wframe-larger-than=]
  fs/ext4/namei.c:434:1: warning: the frame size of 904 bytes is larger than 800 bytes [-Wframe-larger-than=]
  fs/ext4/super.c:2279:1: warning: the frame size of 1160 bytes is larger than 800 bytes [-Wframe-larger-than=]
  fs/ext4/xattr.c:146:1: warning: the frame size of 1168 bytes is larger than 800 bytes [-Wframe-larger-than=]
  fs/f2fs/inode.c:152:1: warning: the frame size of 1424 bytes is larger than 800 bytes [-Wframe-larger-than=]
  net/netfilter/ipvs/ip_vs_core.c:1195:1: warning: the frame size of 1068 bytes is larger than 800 bytes [-Wframe-larger-than=]
  net/netfilter/ipvs/ip_vs_core.c:395:1: warning: the frame size of 1084 bytes is larger than 800 bytes [-Wframe-larger-than=]
  net/netfilter/ipvs/ip_vs_ftp.c:298:1: warning: the frame size of 928 bytes is larger than 800 bytes [-Wframe-larger-than=]
  net/netfilter/ipvs/ip_vs_ftp.c:418:1: warning: the frame size of 908 bytes is larger than 800 bytes [-Wframe-larger-than=]
  net/netfilter/ipvs/ip_vs_lblcr.c:718:1: warning: the frame size of 960 bytes is larger than 800 bytes [-Wframe-larger-than=]
  drivers/net/xen-netback/netback.c:1500:1: warning: the frame size of 1088 bytes is larger than 800 bytes [-Wframe-larger-than=]

In case of ARC and CRIS, it turns out that the BUG() implementation
actually does return (or at least the compiler thinks it does),
resulting in lots of warnings about uninitialized variable use and
leaving noreturn functions, such as:

  block/cfq-iosched.c: In function 'cfq_async_queue_prio':
  block/cfq-iosched.c:3804:1: error: control reaches end of non-void function [-Werror=return-type]
  include/linux/dmaengine.h: In function 'dma_maxpq':
  include/linux/dmaengine.h:1123:1: error: control reaches end of non-void function [-Werror=return-type]

This makes them call __builtin_trap() instead, which should normally
dump the stack and kill the current process, like some of the other
architectures already do.

I tried adding barrier_before_unreachable() to panic() and
fortify_panic() as well, but that had very little effect, so I'm not
submitting that patch.

Vineet said:

: For ARC, it is double win.
:
: 1. Fixes 3 -Wreturn-type warnings
:
: | ../net/core/ethtool.c:311:1: warning: control reaches end of non-void function
: [-Wreturn-type]
: | ../kernel/sched/core.c:3246:1: warning: control reaches end of non-void function
: [-Wreturn-type]
: | ../include/linux/sunrpc/svc_xprt.h:180:1: warning: control reaches end of
: non-void function [-Wreturn-type]
:
: 2.  bloat-o-meter reports code size improvements as gcc elides the
:    generated code for stack return.

Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82365
Link: http://lkml.kernel.org/r/20171219114112.939391-1-arnd@arndb.de
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Vineet Gupta <vgupta@synopsys.com>	[arch/arc]
Tested-by: Vineet Gupta <vgupta@synopsys.com>	[arch/arc]
Cc: Mikael Starvik <starvik@axis.com>
Cc: Jesper Nilsson <jesper.nilsson@axis.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Christopher Li <sparse@chrisli.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arc/include/asm/bug.h            |  3 ++-
 arch/cris/include/arch-v10/arch/bug.h | 11 +++++++++--
 arch/ia64/include/asm/bug.h           |  6 +++++-
 arch/m68k/include/asm/bug.h           |  3 +++
 arch/sparc/include/asm/bug.h          |  6 +++++-
 include/asm-generic/bug.h             |  1 +
 include/linux/compiler-gcc.h          | 15 ++++++++++++++-
 include/linux/compiler.h              |  5 +++++
 8 files changed, 44 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arc/include/asm/bug.h b/arch/arc/include/asm/bug.h
index ea022d47896c..21ec82466d62 100644
--- a/arch/arc/include/asm/bug.h
+++ b/arch/arc/include/asm/bug.h
@@ -23,7 +23,8 @@ void die(const char *str, struct pt_regs *regs, unsigned long address);
 
 #define BUG()	do {								\
 	pr_warn("BUG: failure at %s:%d/%s()!\n", __FILE__, __LINE__, __func__); \
-	dump_stack();								\
+	barrier_before_unreachable();						\
+	__builtin_trap();							\
 } while (0)
 
 #define HAVE_ARCH_BUG
diff --git a/arch/cris/include/arch-v10/arch/bug.h b/arch/cris/include/arch-v10/arch/bug.h
index 905afeacfedf..06da9d49152a 100644
--- a/arch/cris/include/arch-v10/arch/bug.h
+++ b/arch/cris/include/arch-v10/arch/bug.h
@@ -44,18 +44,25 @@ struct bug_frame {
  * not be used like this with newer versions of gcc.
  */
 #define BUG()								\
+do {									\
 	__asm__ __volatile__ ("clear.d [" __stringify(BUG_MAGIC) "]\n\t"\
 			      "movu.w " __stringify(__LINE__) ",$r0\n\t"\
 			      "jump 0f\n\t"				\
 			      ".section .rodata\n"			\
 			      "0:\t.string \"" __FILE__ "\"\n\t"	\
-			      ".previous")
+			      ".previous");				\
+	unreachable();							\
+} while (0)
 #endif
 
 #else
 
 /* This just causes an oops. */
-#define BUG() (*(int *)0 = 0)
+#define BUG()								\
+do {									\
+	barrier_before_unreachable();					\
+	__builtin_trap();						\
+} while (0)
 
 #endif
 
diff --git a/arch/ia64/include/asm/bug.h b/arch/ia64/include/asm/bug.h
index bd3eeb8d1cfa..66b37a532765 100644
--- a/arch/ia64/include/asm/bug.h
+++ b/arch/ia64/include/asm/bug.h
@@ -4,7 +4,11 @@
 
 #ifdef CONFIG_BUG
 #define ia64_abort()	__builtin_trap()
-#define BUG() do { printk("kernel BUG at %s:%d!\n", __FILE__, __LINE__); ia64_abort(); } while (0)
+#define BUG() do {						\
+	printk("kernel BUG at %s:%d!\n", __FILE__, __LINE__);	\
+	barrier_before_unreachable();				\
+	ia64_abort();						\
+} while (0)
 
 /* should this BUG be made generic? */
 #define HAVE_ARCH_BUG
diff --git a/arch/m68k/include/asm/bug.h b/arch/m68k/include/asm/bug.h
index b7e2bf1ba4a6..275dca1435bf 100644
--- a/arch/m68k/include/asm/bug.h
+++ b/arch/m68k/include/asm/bug.h
@@ -8,16 +8,19 @@
 #ifndef CONFIG_SUN3
 #define BUG() do { \
 	pr_crit("kernel BUG at %s:%d!\n", __FILE__, __LINE__); \
+	barrier_before_unreachable(); \
 	__builtin_trap(); \
 } while (0)
 #else
 #define BUG() do { \
 	pr_crit("kernel BUG at %s:%d!\n", __FILE__, __LINE__); \
+	barrier_before_unreachable(); \
 	panic("BUG!"); \
 } while (0)
 #endif
 #else
 #define BUG() do { \
+	barrier_before_unreachable(); \
 	__builtin_trap(); \
 } while (0)
 #endif
diff --git a/arch/sparc/include/asm/bug.h b/arch/sparc/include/asm/bug.h
index 6f17528356b2..ea53e418f6c0 100644
--- a/arch/sparc/include/asm/bug.h
+++ b/arch/sparc/include/asm/bug.h
@@ -9,10 +9,14 @@
 void do_BUG(const char *file, int line);
 #define BUG() do {					\
 	do_BUG(__FILE__, __LINE__);			\
+	barrier_before_unreachable();			\
 	__builtin_trap();				\
 } while (0)
 #else
-#define BUG()		__builtin_trap()
+#define BUG() do {					\
+	barrier_before_unreachable();			\
+	__builtin_trap();				\
+} while (0)
 #endif
 
 #define HAVE_ARCH_BUG
diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index 963b755d19b0..a7613e1b0c87 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -52,6 +52,7 @@ struct bug_entry {
 #ifndef HAVE_ARCH_BUG
 #define BUG() do { \
 	printk("BUG: failure at %s:%d/%s()!\n", __FILE__, __LINE__, __func__); \
+	barrier_before_unreachable(); \
 	panic("BUG!"); \
 } while (0)
 #endif
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 73bc63e0a1c4..901c1ccb3374 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -207,6 +207,15 @@
 #endif
 #endif
 
+/*
+ * calling noreturn functions, __builtin_unreachable() and __builtin_trap()
+ * confuse the stack allocation in gcc, leading to overly large stack
+ * frames, see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82365
+ *
+ * Adding an empty inline assembly before it works around the problem
+ */
+#define barrier_before_unreachable() asm volatile("")
+
 /*
  * Mark a position in code as unreachable.  This can be used to
  * suppress control flow warnings after asm blocks that transfer
@@ -217,7 +226,11 @@
  * unreleased.  Really, we need to have autoconf for the kernel.
  */
 #define unreachable() \
-	do { annotate_unreachable(); __builtin_unreachable(); } while (0)
+	do {					\
+		annotate_unreachable();		\
+		barrier_before_unreachable();	\
+		__builtin_unreachable();	\
+	} while (0)
 
 /* Mark a function definition as prohibited from being cloned. */
 #define __noclone	__attribute__((__noclone__, __optimize__("no-tracer")))
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index e835fc0423ec..ab4711c63601 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -86,6 +86,11 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
 # define barrier_data(ptr) barrier()
 #endif
 
+/* workaround for GCC PR82365 if needed */
+#ifndef barrier_before_unreachable
+# define barrier_before_unreachable() do { } while (0)
+#endif
+
 /* Unreachable code */
 #ifdef CONFIG_STACK_VALIDATION
 /*
-- 
cgit v1.2.3-71-gd317


From 28128c61e08eaeced9cc8ec0e6b5d677b5b94690 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 22 Feb 2018 09:41:40 -0800
Subject: kconfig.h: Include compiler types to avoid missed struct attributes

The header files for some structures could get included in such a way
that struct attributes (specifically __randomize_layout from path.h) would
be parsed as variable names instead of attributes. This could lead to
some instances of a structure being unrandomized, causing nasty GPFs, etc.

This patch makes sure the compiler_types.h header is included in
kconfig.h so that we've always got types and struct attributes defined,
since kconfig.h is included from the compiler command line.

Reported-by: Patrick McLean <chutzpah@gentoo.org>
Root-caused-by: Maciej S. Szmigiero <mail@maciej.szmigiero.name>
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Tested-by: Maciej S. Szmigiero <mail@maciej.szmigiero.name>
Fixes: 3859a271a003 ("randstruct: Mark various structs for randomization")
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kconfig.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kconfig.h b/include/linux/kconfig.h
index fec5076eda91..c5fd4ee776ba 100644
--- a/include/linux/kconfig.h
+++ b/include/linux/kconfig.h
@@ -64,4 +64,7 @@
  */
 #define IS_ENABLED(option) __or(IS_BUILTIN(option), IS_MODULE(option))
 
+/* Make sure we always have all types and struct attributes defined. */
+#include <linux/compiler_types.h>
+
 #endif /* __LINUX_KCONFIG_H */
-- 
cgit v1.2.3-71-gd317


From bef3efbeb897b56867e271cdbc5f8adaacaeb9cd Mon Sep 17 00:00:00 2001
From: "Luck, Tony" <tony.luck@intel.com>
Date: Thu, 22 Feb 2018 09:15:06 -0800
Subject: efivarfs: Limit the rate for non-root to read files

Each read from a file in efivarfs results in two calls to EFI
(one to get the file size, another to get the actual data).

On X86 these EFI calls result in broadcast system management
interrupts (SMI) which affect performance of the whole system.
A malicious user can loop performing reads from efivarfs bringing
the system to its knees.

Linus suggested per-user rate limit to solve this.

So we add a ratelimit structure to "user_struct" and initialize
it for the root user for no limit. When allocating user_struct for
other users we set the limit to 100 per second. This could be used
for other places that want to limit the rate of some detrimental
user action.

In efivarfs if the limit is exceeded when reading, we take an
interruptible nap for 50ms and check the rate limit again.

Signed-off-by: Tony Luck <tony.luck@intel.com>
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/efivarfs/file.c         | 6 ++++++
 include/linux/sched/user.h | 4 ++++
 kernel/user.c              | 3 +++
 3 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index 5f22e74bbade..8e568428c88b 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -8,6 +8,7 @@
  */
 
 #include <linux/efi.h>
+#include <linux/delay.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/mount.h>
@@ -74,6 +75,11 @@ static ssize_t efivarfs_file_read(struct file *file, char __user *userbuf,
 	ssize_t size = 0;
 	int err;
 
+	while (!__ratelimit(&file->f_cred->user->ratelimit)) {
+		if (!msleep_interruptible(50))
+			return -EINTR;
+	}
+
 	err = efivar_entry_size(var, &datasize);
 
 	/*
diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h
index 0dcf4e480ef7..96fe289c4c6e 100644
--- a/include/linux/sched/user.h
+++ b/include/linux/sched/user.h
@@ -4,6 +4,7 @@
 
 #include <linux/uidgid.h>
 #include <linux/atomic.h>
+#include <linux/ratelimit.h>
 
 struct key;
 
@@ -41,6 +42,9 @@ struct user_struct {
     defined(CONFIG_NET)
 	atomic_long_t locked_vm;
 #endif
+
+	/* Miscellaneous per-user rate limit */
+	struct ratelimit_state ratelimit;
 };
 
 extern int uids_sysfs_init(void);
diff --git a/kernel/user.c b/kernel/user.c
index 9a20acce460d..36288d840675 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -101,6 +101,7 @@ struct user_struct root_user = {
 	.sigpending	= ATOMIC_INIT(0),
 	.locked_shm     = 0,
 	.uid		= GLOBAL_ROOT_UID,
+	.ratelimit	= RATELIMIT_STATE_INIT(root_user.ratelimit, 0, 0),
 };
 
 /*
@@ -191,6 +192,8 @@ struct user_struct *alloc_uid(kuid_t uid)
 
 		new->uid = uid;
 		atomic_set(&new->__count, 1);
+		ratelimit_state_init(&new->ratelimit, HZ, 100);
+		ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE);
 
 		/*
 		 * Before adding this, check whether we raced
-- 
cgit v1.2.3-71-gd317


From 076467490b8176eb96eddc548a14d4135c7b5852 Mon Sep 17 00:00:00 2001
From: Sebastian Ott <sebott@linux.vnet.ibm.com>
Date: Thu, 22 Feb 2018 13:05:41 +0100
Subject: kvm: fix warning for CONFIG_HAVE_KVM_EVENTFD builds

Move the kvm_arch_irq_routing_update() prototype outside of
ifdef CONFIG_HAVE_KVM_EVENTFD guards to fix the following sparse warning:

arch/s390/kvm/../../../virt/kvm/irqchip.c:171:28: warning: symbol 'kvm_arch_irq_routing_update' was not declared. Should it be static?

Signed-off-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ac0062b74aed..84b9c50693f2 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1105,7 +1105,6 @@ static inline void kvm_irq_routing_update(struct kvm *kvm)
 {
 }
 #endif
-void kvm_arch_irq_routing_update(struct kvm *kvm);
 
 static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 {
@@ -1114,6 +1113,8 @@ static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 
 #endif /* CONFIG_HAVE_KVM_EVENTFD */
 
+void kvm_arch_irq_routing_update(struct kvm *kvm);
+
 static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
 {
 	/*
-- 
cgit v1.2.3-71-gd317


From f75e4924f0152be747bf04c9d16bb23fd8baf5f9 Mon Sep 17 00:00:00 2001
From: Sebastian Ott <sebott@linux.vnet.ibm.com>
Date: Thu, 22 Feb 2018 13:04:39 +0100
Subject: kvm: fix warning for non-x86 builds

Fix the following sparse warning by moving the prototype
of kvm_arch_mmu_notifier_invalidate_range() to linux/kvm_host.h .

  CHECK   arch/s390/kvm/../../../virt/kvm/kvm_main.c
arch/s390/kvm/../../../virt/kvm/kvm_main.c:138:13: warning: symbol 'kvm_arch_mmu_notifier_invalidate_range' was not declared. Should it be static?

Signed-off-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 3 ---
 include/linux/kvm_host.h        | 3 +++
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index dd6f57a54a26..0a9e330b34f0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1464,7 +1464,4 @@ static inline int kvm_cpu_get_apicid(int mps_cpu)
 #define put_smstate(type, buf, offset, val)                      \
 	*(type *)((buf) + (offset) - 0x7e00) = val
 
-void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
-		unsigned long start, unsigned long end);
-
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 84b9c50693f2..6930c63126c7 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1273,4 +1273,7 @@ static inline long kvm_arch_vcpu_async_ioctl(struct file *filp,
 }
 #endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */
 
+void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
+		unsigned long start, unsigned long end);
+
 #endif
-- 
cgit v1.2.3-71-gd317


From 3079c22ea815775837a4f389ce2f7e1e7b202e09 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 26 Feb 2018 13:01:38 +0100
Subject: genhd: Rename get_disk() to get_disk_and_module()

Rename get_disk() to get_disk_and_module() to make sure what the
function does. It's not a great name but at least it is now clear that
put_disk() is not it's counterpart.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/genhd.c           | 10 ++++------
 drivers/block/amiflop.c |  2 +-
 drivers/block/ataflop.c |  2 +-
 drivers/block/brd.c     |  2 +-
 drivers/block/floppy.c  |  2 +-
 drivers/block/loop.c    |  2 +-
 drivers/block/swim.c    |  2 +-
 drivers/block/z2ram.c   |  2 +-
 drivers/ide/ide-probe.c |  2 +-
 include/linux/genhd.h   |  2 +-
 10 files changed, 13 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/block/genhd.c b/block/genhd.c
index 5098bffe6ba6..21b2843b27d0 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -547,7 +547,7 @@ static int exact_lock(dev_t devt, void *data)
 {
 	struct gendisk *p = data;
 
-	if (!get_disk(p))
+	if (!get_disk_and_module(p))
 		return -1;
 	return 0;
 }
@@ -809,7 +809,7 @@ struct gendisk *get_gendisk(dev_t devt, int *partno)
 
 		spin_lock_bh(&ext_devt_lock);
 		part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
-		if (part && get_disk(part_to_disk(part))) {
+		if (part && get_disk_and_module(part_to_disk(part))) {
 			*partno = part->partno;
 			disk = part_to_disk(part);
 		}
@@ -1456,7 +1456,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id)
 }
 EXPORT_SYMBOL(__alloc_disk_node);
 
-struct kobject *get_disk(struct gendisk *disk)
+struct kobject *get_disk_and_module(struct gendisk *disk)
 {
 	struct module *owner;
 	struct kobject *kobj;
@@ -1474,15 +1474,13 @@ struct kobject *get_disk(struct gendisk *disk)
 	return kobj;
 
 }
-
-EXPORT_SYMBOL(get_disk);
+EXPORT_SYMBOL(get_disk_and_module);
 
 void put_disk(struct gendisk *disk)
 {
 	if (disk)
 		kobject_put(&disk_to_dev(disk)->kobj);
 }
-
 EXPORT_SYMBOL(put_disk);
 
 static void set_disk_ro_uevent(struct gendisk *gd, int ro)
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index e5aa62fcf5a8..3aaf6af3ec23 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -1758,7 +1758,7 @@ static struct kobject *floppy_find(dev_t dev, int *part, void *data)
 	if (unit[drive].type->code == FD_NODRIVE)
 		return NULL;
 	*part = 0;
-	return get_disk(unit[drive].gendisk);
+	return get_disk_and_module(unit[drive].gendisk);
 }
 
 static int __init amiga_floppy_probe(struct platform_device *pdev)
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index 8bc3b9fd8dd2..dfb2c2622e5a 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -1917,7 +1917,7 @@ static struct kobject *floppy_find(dev_t dev, int *part, void *data)
 	if (drive >= FD_MAX_UNITS || type > NUM_DISK_MINORS)
 		return NULL;
 	*part = 0;
-	return get_disk(unit[drive].disk);
+	return get_disk_and_module(unit[drive].disk);
 }
 
 static int __init atari_floppy_init (void)
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 8028a3a7e7fd..deea78e485da 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -456,7 +456,7 @@ static struct kobject *brd_probe(dev_t dev, int *part, void *data)
 
 	mutex_lock(&brd_devices_mutex);
 	brd = brd_init_one(MINOR(dev) / max_part, &new);
-	kobj = brd ? get_disk(brd->brd_disk) : NULL;
+	kobj = brd ? get_disk_and_module(brd->brd_disk) : NULL;
 	mutex_unlock(&brd_devices_mutex);
 
 	if (new)
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index eae484acfbbc..8ec7235fc93b 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -4505,7 +4505,7 @@ static struct kobject *floppy_find(dev_t dev, int *part, void *data)
 	if (((*part >> 2) & 0x1f) >= ARRAY_SIZE(floppy_type))
 		return NULL;
 	*part = 0;
-	return get_disk(disks[drive]);
+	return get_disk_and_module(disks[drive]);
 }
 
 static int __init do_floppy_init(void)
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index d5fe720cf149..87855b5123a6 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1922,7 +1922,7 @@ static struct kobject *loop_probe(dev_t dev, int *part, void *data)
 	if (err < 0)
 		kobj = NULL;
 	else
-		kobj = get_disk(lo->lo_disk);
+		kobj = get_disk_and_module(lo->lo_disk);
 	mutex_unlock(&loop_index_mutex);
 
 	*part = 0;
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index 84434d3ea19b..64e066eba72e 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -799,7 +799,7 @@ static struct kobject *floppy_find(dev_t dev, int *part, void *data)
 		return NULL;
 
 	*part = 0;
-	return get_disk(swd->unit[drive].disk);
+	return get_disk_and_module(swd->unit[drive].disk);
 }
 
 static int swim_add_floppy(struct swim_priv *swd, enum drive_location location)
diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c
index 41c95c9b2ab4..8f9130ab5887 100644
--- a/drivers/block/z2ram.c
+++ b/drivers/block/z2ram.c
@@ -332,7 +332,7 @@ static const struct block_device_operations z2_fops =
 static struct kobject *z2_find(dev_t dev, int *part, void *data)
 {
 	*part = 0;
-	return get_disk(z2ram_gendisk);
+	return get_disk_and_module(z2ram_gendisk);
 }
 
 static struct request_queue *z2_queue;
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index 17fd55af4d92..caa20eb5f26b 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -928,7 +928,7 @@ static int exact_lock(dev_t dev, void *data)
 {
 	struct gendisk *p = data;
 
-	if (!get_disk(p))
+	if (!get_disk_and_module(p))
 		return -1;
 	return 0;
 }
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 5e3531027b51..8e11b9321e55 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -600,7 +600,7 @@ extern void delete_partition(struct gendisk *, int);
 extern void printk_all_partitions(void);
 
 extern struct gendisk *__alloc_disk_node(int minors, int node_id);
-extern struct kobject *get_disk(struct gendisk *disk);
+extern struct kobject *get_disk_and_module(struct gendisk *disk);
 extern void put_disk(struct gendisk *disk);
 extern void blk_register_region(dev_t devt, unsigned long range,
 			struct module *module,
-- 
cgit v1.2.3-71-gd317


From 9df6c29912315186fef1c79cc15b758ace84175b Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 26 Feb 2018 13:01:39 +0100
Subject: genhd: Add helper put_disk_and_module()

Add a proper counterpart to get_disk_and_module() -
put_disk_and_module(). Currently it is opencoded in several places.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c    | 11 ++---------
 block/genhd.c         | 20 ++++++++++++++++----
 fs/block_dev.c        | 19 +++++--------------
 include/linux/genhd.h |  1 +
 4 files changed, 24 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 4117524ca45b..c2033a232a44 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -812,7 +812,6 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 	struct gendisk *disk;
 	struct request_queue *q;
 	struct blkcg_gq *blkg;
-	struct module *owner;
 	unsigned int major, minor;
 	int key_len, part, ret;
 	char *body;
@@ -904,9 +903,7 @@ fail_unlock:
 	spin_unlock_irq(q->queue_lock);
 	rcu_read_unlock();
 fail:
-	owner = disk->fops->owner;
-	put_disk(disk);
-	module_put(owner);
+	put_disk_and_module(disk);
 	/*
 	 * If queue was bypassing, we should retry.  Do so after a
 	 * short msleep().  It isn't strictly necessary but queue
@@ -931,13 +928,9 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep);
 void blkg_conf_finish(struct blkg_conf_ctx *ctx)
 	__releases(ctx->disk->queue->queue_lock) __releases(rcu)
 {
-	struct module *owner;
-
 	spin_unlock_irq(ctx->disk->queue->queue_lock);
 	rcu_read_unlock();
-	owner = ctx->disk->fops->owner;
-	put_disk(ctx->disk);
-	module_put(owner);
+	put_disk_and_module(ctx->disk);
 }
 EXPORT_SYMBOL_GPL(blkg_conf_finish);
 
diff --git a/block/genhd.c b/block/genhd.c
index 21b2843b27d0..4c0590434591 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -817,10 +817,7 @@ struct gendisk *get_gendisk(dev_t devt, int *partno)
 	}
 
 	if (disk && unlikely(disk->flags & GENHD_FL_HIDDEN)) {
-		struct module *owner = disk->fops->owner;
-
-		put_disk(disk);
-		module_put(owner);
+		put_disk_and_module(disk);
 		disk = NULL;
 	}
 	return disk;
@@ -1483,6 +1480,21 @@ void put_disk(struct gendisk *disk)
 }
 EXPORT_SYMBOL(put_disk);
 
+/*
+ * This is a counterpart of get_disk_and_module() and thus also of
+ * get_gendisk().
+ */
+void put_disk_and_module(struct gendisk *disk)
+{
+	if (disk) {
+		struct module *owner = disk->fops->owner;
+
+		put_disk(disk);
+		module_put(owner);
+	}
+}
+EXPORT_SYMBOL(put_disk_and_module);
+
 static void set_disk_ro_uevent(struct gendisk *gd, int ro)
 {
 	char event[] = "DISK_RO=1";
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 4a181fcb5175..1dbbf847911a 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1111,8 +1111,7 @@ static struct block_device *bd_start_claiming(struct block_device *bdev,
 	else
 		whole = bdgrab(bdev);
 
-	module_put(disk->fops->owner);
-	put_disk(disk);
+	put_disk_and_module(disk);
 	if (!whole)
 		return ERR_PTR(-ENOMEM);
 
@@ -1407,7 +1406,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
 static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 {
 	struct gendisk *disk;
-	struct module *owner;
 	int ret;
 	int partno;
 	int perm = 0;
@@ -1433,7 +1431,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	disk = get_gendisk(bdev->bd_dev, &partno);
 	if (!disk)
 		goto out;
-	owner = disk->fops->owner;
 
 	disk_block_events(disk);
 	mutex_lock_nested(&bdev->bd_mutex, for_part);
@@ -1463,8 +1460,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 					bdev->bd_queue = NULL;
 					mutex_unlock(&bdev->bd_mutex);
 					disk_unblock_events(disk);
-					put_disk(disk);
-					module_put(owner);
+					put_disk_and_module(disk);
 					goto restart;
 				}
 			}
@@ -1525,8 +1521,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 				goto out_unlock_bdev;
 		}
 		/* only one opener holds refs to the module and disk */
-		put_disk(disk);
-		module_put(owner);
+		put_disk_and_module(disk);
 	}
 	bdev->bd_openers++;
 	if (for_part)
@@ -1546,8 +1541,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
  out_unlock_bdev:
 	mutex_unlock(&bdev->bd_mutex);
 	disk_unblock_events(disk);
-	put_disk(disk);
-	module_put(owner);
+	put_disk_and_module(disk);
  out:
 	bdput(bdev);
 
@@ -1770,8 +1764,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 			disk->fops->release(disk, mode);
 	}
 	if (!bdev->bd_openers) {
-		struct module *owner = disk->fops->owner;
-
 		disk_put_part(bdev->bd_part);
 		bdev->bd_part = NULL;
 		bdev->bd_disk = NULL;
@@ -1779,8 +1771,7 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 			victim = bdev->bd_contains;
 		bdev->bd_contains = NULL;
 
-		put_disk(disk);
-		module_put(owner);
+		put_disk_and_module(disk);
 	}
 	mutex_unlock(&bdev->bd_mutex);
 	bdput(bdev);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 8e11b9321e55..7f5906fe1b70 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -602,6 +602,7 @@ extern void printk_all_partitions(void);
 extern struct gendisk *__alloc_disk_node(int minors, int node_id);
 extern struct kobject *get_disk_and_module(struct gendisk *disk);
 extern void put_disk(struct gendisk *disk);
+extern void put_disk_and_module(struct gendisk *disk);
 extern void blk_register_region(dev_t devt, unsigned long range,
 			struct module *module,
 			struct kobject *(*probe)(dev_t, int *, void *),
-- 
cgit v1.2.3-71-gd317


From 56c0908c855afbb2bdda17c15d2879949a091ad3 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 26 Feb 2018 13:01:41 +0100
Subject: genhd: Fix BUG in blkdev_open()

When two blkdev_open() calls for a partition race with device removal
and recreation, we can hit BUG_ON(!bd_may_claim(bdev, whole, holder)) in
blkdev_open(). The race can happen as follows:

CPU0				CPU1			CPU2
							del_gendisk()
							  bdev_unhash_inode(part1);

blkdev_open(part1, O_EXCL)	blkdev_open(part1, O_EXCL)
  bdev = bd_acquire()		  bdev = bd_acquire()
  blkdev_get(bdev)
    bd_start_claiming(bdev)
      - finds old inode 'whole'
      bd_prepare_to_claim() -> 0
							  bdev_unhash_inode(whole);
							<device removed>
							<new device under same
							 number created>
				  blkdev_get(bdev);
				    bd_start_claiming(bdev)
				      - finds new inode 'whole'
				      bd_prepare_to_claim()
					- this also succeeds as we have
					  different 'whole' here...
					- bad things happen now as we
					  have two exclusive openers of
					  the same bdev

The problem here is that block device opens can see various intermediate
states while gendisk is shutting down and then being recreated.

We fix the problem by introducing new lookup_sem in gendisk that
synchronizes gendisk deletion with get_gendisk() and furthermore by
making sure that get_gendisk() does not return gendisk that is being (or
has been) deleted. This makes sure that once we ever manage to look up
newly created bdev inode, we are also guaranteed that following
get_gendisk() will either return failure (and we fail open) or it
returns gendisk for the new device and following bdget_disk() will
return new bdev inode (i.e., blkdev_open() follows the path as if it is
completely run after new device is created).

Reported-and-analyzed-by: Hou Tao <houtao1@huawei.com>
Tested-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/genhd.c         | 21 ++++++++++++++++++++-
 include/linux/genhd.h |  1 +
 2 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/block/genhd.c b/block/genhd.c
index 4c0590434591..9656f9e9f99e 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -717,6 +717,11 @@ void del_gendisk(struct gendisk *disk)
 	blk_integrity_del(disk);
 	disk_del_events(disk);
 
+	/*
+	 * Block lookups of the disk until all bdevs are unhashed and the
+	 * disk is marked as dead (GENHD_FL_UP cleared).
+	 */
+	down_write(&disk->lookup_sem);
 	/* invalidate stuff */
 	disk_part_iter_init(&piter, disk,
 			     DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
@@ -731,6 +736,7 @@ void del_gendisk(struct gendisk *disk)
 	bdev_unhash_inode(disk_devt(disk));
 	set_capacity(disk, 0);
 	disk->flags &= ~GENHD_FL_UP;
+	up_write(&disk->lookup_sem);
 
 	if (!(disk->flags & GENHD_FL_HIDDEN))
 		sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
@@ -816,9 +822,21 @@ struct gendisk *get_gendisk(dev_t devt, int *partno)
 		spin_unlock_bh(&ext_devt_lock);
 	}
 
-	if (disk && unlikely(disk->flags & GENHD_FL_HIDDEN)) {
+	if (!disk)
+		return NULL;
+
+	/*
+	 * Synchronize with del_gendisk() to not return disk that is being
+	 * destroyed.
+	 */
+	down_read(&disk->lookup_sem);
+	if (unlikely((disk->flags & GENHD_FL_HIDDEN) ||
+		     !(disk->flags & GENHD_FL_UP))) {
+		up_read(&disk->lookup_sem);
 		put_disk_and_module(disk);
 		disk = NULL;
+	} else {
+		up_read(&disk->lookup_sem);
 	}
 	return disk;
 }
@@ -1418,6 +1436,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id)
 			kfree(disk);
 			return NULL;
 		}
+		init_rwsem(&disk->lookup_sem);
 		disk->node_id = node_id;
 		if (disk_expand_part_tbl(disk, 0)) {
 			free_part_stats(&disk->part0);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 7f5906fe1b70..c826b0b5232a 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -198,6 +198,7 @@ struct gendisk {
 	void *private_data;
 
 	int flags;
+	struct rw_semaphore lookup_sem;
 	struct kobject *slave_dir;
 
 	struct timer_rand_state *random;
-- 
cgit v1.2.3-71-gd317


From 230f5a8969d8345fc9bbe3683f068246cf1be4b8 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 21 Feb 2018 17:08:01 -0800
Subject: dax: fix vma_is_fsdax() helper

Gerd reports that ->i_mode may contain other bits besides S_IFCHR. Use
S_ISCHR() instead. Otherwise, get_user_pages_longterm() may fail on
device-dax instances when those are meant to be explicitly allowed.

Fixes: 2bb6d2837083 ("mm: introduce get_user_pages_longterm")
Cc: <stable@vger.kernel.org>
Reported-by: Gerd Rausch <gerd.rausch@oracle.com>
Acked-by: Jane Chu <jane.chu@oracle.com>
Reported-by: Haozhong Zhang <haozhong.zhang@intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2a815560fda0..79c413985305 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3198,7 +3198,7 @@ static inline bool vma_is_fsdax(struct vm_area_struct *vma)
 	if (!vma_is_dax(vma))
 		return false;
 	inode = file_inode(vma->vm_file);
-	if (inode->i_mode == S_IFCHR)
+	if (S_ISCHR(inode->i_mode))
 		return false; /* device-dax */
 	return true;
 }
-- 
cgit v1.2.3-71-gd317


From 9c2c2e62df3fa30fb13fbeb7512a4eede729383b Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Tue, 27 Feb 2018 01:56:06 +0100
Subject: net: phy: Restore phy_resume() locking assumption

commit f5e64032a799 ("net: phy: fix resume handling") changes the
locking semantics for phy_resume() such that the caller now needs to
hold the phy mutex. Not all call sites were adopted to this new
semantic, resulting in warnings from the added
WARN_ON(!mutex_is_locked(&phydev->lock)).  Rather than change the
semantics, add a __phy_resume() and restore the old behavior of
phy_resume().

Reported-by: Heiner Kallweit <hkallweit1@gmail.com>
Fixes: f5e64032a799 ("net: phy: fix resume handling")
Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c        |  2 +-
 drivers/net/phy/phy_device.c | 18 +++++++++++++-----
 include/linux/phy.h          |  1 +
 3 files changed, 15 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index e3e29c2b028b..a6f924fee584 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -819,7 +819,7 @@ void phy_start(struct phy_device *phydev)
 		break;
 	case PHY_HALTED:
 		/* if phy was suspended, bring the physical link up again */
-		phy_resume(phydev);
+		__phy_resume(phydev);
 
 		/* make sure interrupts are re-enabled for the PHY */
 		if (phy_interrupt_is_valid(phydev)) {
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index d39ae77707ef..478405e544cc 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -135,9 +135,7 @@ static int mdio_bus_phy_resume(struct device *dev)
 	if (!mdio_bus_phy_may_suspend(phydev))
 		goto no_resume;
 
-	mutex_lock(&phydev->lock);
 	ret = phy_resume(phydev);
-	mutex_unlock(&phydev->lock);
 	if (ret < 0)
 		return ret;
 
@@ -1041,9 +1039,7 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
 	if (err)
 		goto error;
 
-	mutex_lock(&phydev->lock);
 	phy_resume(phydev);
-	mutex_unlock(&phydev->lock);
 	phy_led_triggers_register(phydev);
 
 	return err;
@@ -1172,7 +1168,7 @@ int phy_suspend(struct phy_device *phydev)
 }
 EXPORT_SYMBOL(phy_suspend);
 
-int phy_resume(struct phy_device *phydev)
+int __phy_resume(struct phy_device *phydev)
 {
 	struct phy_driver *phydrv = to_phy_driver(phydev->mdio.dev.driver);
 	int ret = 0;
@@ -1189,6 +1185,18 @@ int phy_resume(struct phy_device *phydev)
 
 	return ret;
 }
+EXPORT_SYMBOL(__phy_resume);
+
+int phy_resume(struct phy_device *phydev)
+{
+	int ret;
+
+	mutex_lock(&phydev->lock);
+	ret = __phy_resume(phydev);
+	mutex_unlock(&phydev->lock);
+
+	return ret;
+}
 EXPORT_SYMBOL(phy_resume);
 
 int phy_loopback(struct phy_device *phydev, bool enable)
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 5a0c3e53e7c2..d7069539f351 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -924,6 +924,7 @@ void phy_device_remove(struct phy_device *phydev);
 int phy_init_hw(struct phy_device *phydev);
 int phy_suspend(struct phy_device *phydev);
 int phy_resume(struct phy_device *phydev);
+int __phy_resume(struct phy_device *phydev);
 int phy_loopback(struct phy_device *phydev, bool enable);
 struct phy_device *phy_attach(struct net_device *dev, const char *bus_id,
 			      phy_interface_t interface);
-- 
cgit v1.2.3-71-gd317


From 28b0f8a6962a24ed21737578f3b1b07424635c9e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 13 Feb 2018 07:38:08 -0800
Subject: tty: make n_tty_read() always abort if hangup is in progress

A tty is hung up by __tty_hangup() setting file->f_op to
hung_up_tty_fops, which is skipped on ttys whose write operation isn't
tty_write().  This means that, for example, /dev/console whose write
op is redirected_tty_write() is never actually marked hung up.

Because n_tty_read() uses the hung up status to decide whether to
abort the waiting readers, the lack of hung-up marking can lead to the
following scenario.

 1. A session contains two processes.  The leader and its child.  The
    child ignores SIGHUP.

 2. The leader exits and starts disassociating from the controlling
    terminal (/dev/console).

 3. __tty_hangup() skips setting f_op to hung_up_tty_fops.

 4. SIGHUP is delivered and ignored.

 5. tty_ldisc_hangup() is invoked.  It wakes up the waits which should
    clear the read lockers of tty->ldisc_sem.

 6. The reader wakes up but because tty_hung_up_p() is false, it
    doesn't abort and goes back to sleep while read-holding
    tty->ldisc_sem.

 7. The leader progresses to tty_ldisc_lock() in tty_ldisc_hangup()
    and is now stuck in D sleep indefinitely waiting for
    tty->ldisc_sem.

The following is Alan's explanation on why some ttys aren't hung up.

 http://lkml.kernel.org/r/20171101170908.6ad08580@alans-desktop

 1. It broke the serial consoles because they would hang up and close
    down the hardware. With tty_port that *should* be fixable properly
    for any cases remaining.

 2. The console layer was (and still is) completely broken and doens't
    refcount properly. So if you turn on console hangups it breaks (as
    indeed does freeing consoles and half a dozen other things).

As neither can be fixed quickly, this patch works around the problem
by introducing a new flag, TTY_HUPPING, which is used solely to tell
n_tty_read() that hang-up is in progress for the console and the
readers should be aborted regardless of the hung-up status of the
device.

The following is a sample hung task warning caused by this issue.

  INFO: task agetty:2662 blocked for more than 120 seconds.
        Not tainted 4.11.3-dbg-tty-lockup-02478-gfd6c7ee-dirty #28
  "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
      0  2662      1 0x00000086
  Call Trace:
   __schedule+0x267/0x890
   schedule+0x36/0x80
   schedule_timeout+0x23c/0x2e0
   ldsem_down_write+0xce/0x1f6
   tty_ldisc_lock+0x16/0x30
   tty_ldisc_hangup+0xb3/0x1b0
   __tty_hangup+0x300/0x410
   disassociate_ctty+0x6c/0x290
   do_exit+0x7ef/0xb00
   do_group_exit+0x3f/0xa0
   get_signal+0x1b3/0x5d0
   do_signal+0x28/0x660
   exit_to_usermode_loop+0x46/0x86
   do_syscall_64+0x9c/0xb0
   entry_SYSCALL64_slow_path+0x25/0x25

The following is the repro.  Run "$PROG /dev/console".  The parent
process hangs in D state.

  #include <sys/types.h>
  #include <sys/stat.h>
  #include <sys/wait.h>
  #include <sys/ioctl.h>
  #include <fcntl.h>
  #include <unistd.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <errno.h>
  #include <signal.h>
  #include <time.h>
  #include <termios.h>

  int main(int argc, char **argv)
  {
	  struct sigaction sact = { .sa_handler = SIG_IGN };
	  struct timespec ts1s = { .tv_sec = 1 };
	  pid_t pid;
	  int fd;

	  if (argc < 2) {
		  fprintf(stderr, "test-hung-tty /dev/$TTY\n");
		  return 1;
	  }

	  /* fork a child to ensure that it isn't already the session leader */
	  pid = fork();
	  if (pid < 0) {
		  perror("fork");
		  return 1;
	  }

	  if (pid > 0) {
		  /* top parent, wait for everyone */
		  while (waitpid(-1, NULL, 0) >= 0)
			  ;
		  if (errno != ECHILD)
			  perror("waitpid");
		  return 0;
	  }

	  /* new session, start a new session and set the controlling tty */
	  if (setsid() < 0) {
		  perror("setsid");
		  return 1;
	  }

	  fd = open(argv[1], O_RDWR);
	  if (fd < 0) {
		  perror("open");
		  return 1;
	  }

	  if (ioctl(fd, TIOCSCTTY, 1) < 0) {
		  perror("ioctl");
		  return 1;
	  }

	  /* fork a child, sleep a bit and exit */
	  pid = fork();
	  if (pid < 0) {
		  perror("fork");
		  return 1;
	  }

	  if (pid > 0) {
		  nanosleep(&ts1s, NULL);
		  printf("Session leader exiting\n");
		  exit(0);
	  }

	  /*
	   * The child ignores SIGHUP and keeps reading from the controlling
	   * tty.  Because SIGHUP is ignored, the child doesn't get killed on
	   * parent exit and the bug in n_tty makes the read(2) block the
	   * parent's control terminal hangup attempt.  The parent ends up in
	   * D sleep until the child is explicitly killed.
	   */
	  sigaction(SIGHUP, &sact, NULL);
	  printf("Child reading tty\n");
	  while (1) {
		  char buf[1024];

		  if (read(fd, buf, sizeof(buf)) < 0) {
			  perror("read");
			  return 1;
		  }
	  }

	  return 0;
  }

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Alan Cox <alan@llwyncelyn.cymru>
Cc: stable@vger.kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/n_tty.c  | 6 ++++++
 drivers/tty/tty_io.c | 9 +++++++++
 include/linux/tty.h  | 1 +
 3 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c
index 5c0e59e8fe46..cbe98bc2b998 100644
--- a/drivers/tty/n_tty.c
+++ b/drivers/tty/n_tty.c
@@ -2180,6 +2180,12 @@ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file,
 				}
 				if (tty_hung_up_p(file))
 					break;
+				/*
+				 * Abort readers for ttys which never actually
+				 * get hung up.  See __tty_hangup().
+				 */
+				if (test_bit(TTY_HUPPING, &tty->flags))
+					break;
 				if (!timeout)
 					break;
 				if (file->f_flags & O_NONBLOCK) {
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index eb9133b472f4..63114ea35ec1 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -586,6 +586,14 @@ static void __tty_hangup(struct tty_struct *tty, int exit_session)
 		return;
 	}
 
+	/*
+	 * Some console devices aren't actually hung up for technical and
+	 * historical reasons, which can lead to indefinite interruptible
+	 * sleep in n_tty_read().  The following explicitly tells
+	 * n_tty_read() to abort readers.
+	 */
+	set_bit(TTY_HUPPING, &tty->flags);
+
 	/* inuse_filps is protected by the single tty lock,
 	   this really needs to change if we want to flush the
 	   workqueue with the lock held */
@@ -640,6 +648,7 @@ static void __tty_hangup(struct tty_struct *tty, int exit_session)
 	 * from the ldisc side, which is now guaranteed.
 	 */
 	set_bit(TTY_HUPPED, &tty->flags);
+	clear_bit(TTY_HUPPING, &tty->flags);
 	tty_unlock(tty);
 
 	if (f)
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 0a6c71e0ad01..47f8af22f216 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -364,6 +364,7 @@ struct tty_file_private {
 #define TTY_PTY_LOCK 		16	/* pty private */
 #define TTY_NO_WRITE_SPLIT 	17	/* Preserve write boundaries to driver */
 #define TTY_HUPPED 		18	/* Post driver->hangup() */
+#define TTY_HUPPING		19	/* Hangup in progress */
 #define TTY_LDISC_HALTED	22	/* Line discipline is halted */
 
 /* Values for tty->flow_change */
-- 
cgit v1.2.3-71-gd317


From 9c0fb1e313aaf4e8edec22433c8b22dd308e466c Mon Sep 17 00:00:00 2001
From: Jiufei Xue <jiufei.xue@linux.alibaba.com>
Date: Tue, 27 Feb 2018 20:10:18 +0800
Subject: block: display the correct diskname for bio

bio_devname use __bdevname to display the device name, and can
only show the major and minor of the part0,
Fix this by using disk_name to display the correct name.

Fixes: 74d46992e0d9 ("block: replace bi_bdev with a gendisk pointer and partitions index")
Reviewed-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jiufei Xue <jiufei.xue@linux.alibaba.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/partition-generic.c | 6 ++++++
 include/linux/bio.h       | 4 +---
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/block/partition-generic.c b/block/partition-generic.c
index 91622db9aedf..08dabcd8b6ae 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -51,6 +51,12 @@ const char *bdevname(struct block_device *bdev, char *buf)
 
 EXPORT_SYMBOL(bdevname);
 
+const char *bio_devname(struct bio *bio, char *buf)
+{
+	return disk_name(bio->bi_disk, bio->bi_partno, buf);
+}
+EXPORT_SYMBOL(bio_devname);
+
 /*
  * There's very little reason to use this, you should really
  * have a struct block_device just about everywhere and use
diff --git a/include/linux/bio.h b/include/linux/bio.h
index d0eb659fa733..ce547a25e8ae 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -511,6 +511,7 @@ void zero_fill_bio(struct bio *bio);
 extern struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *);
 extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int);
 extern unsigned int bvec_nr_vecs(unsigned short idx);
+extern const char *bio_devname(struct bio *bio, char *buffer);
 
 #define bio_set_dev(bio, bdev) 			\
 do {						\
@@ -529,9 +530,6 @@ do {						\
 #define bio_dev(bio) \
 	disk_devt((bio)->bi_disk)
 
-#define bio_devname(bio, buf) \
-	__bdevname(bio_dev(bio), (buf))
-
 #ifdef CONFIG_BLK_CGROUP
 int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css);
 void bio_disassociate_task(struct bio *bio);
-- 
cgit v1.2.3-71-gd317


From fde9fc766e96c494b82931b1d270a9a751be07c0 Mon Sep 17 00:00:00 2001
From: Matt Redfearn <matt.redfearn@mips.com>
Date: Mon, 19 Feb 2018 16:55:06 +0000
Subject: signals: Move put_compat_sigset to compat.h to silence hardened
 usercopy

Since commit afcc90f8621e ("usercopy: WARN() on slab cache usercopy
region violations"), MIPS systems booting with a compat root filesystem
emit a warning when copying compat siginfo to userspace:

WARNING: CPU: 0 PID: 953 at mm/usercopy.c:81 usercopy_warn+0x98/0xe8
Bad or missing usercopy whitelist? Kernel memory exposure attempt
detected from SLAB object 'task_struct' (offset 1432, size 16)!
Modules linked in:
CPU: 0 PID: 953 Comm: S01logging Not tainted 4.16.0-rc2 #10
Stack : ffffffff808c0000 0000000000000000 0000000000000001 65ac85163f3bdc4a
	65ac85163f3bdc4a 0000000000000000 90000000ff667ab8 ffffffff808c0000
	00000000000003f8 ffffffff808d0000 00000000000000d1 0000000000000000
	000000000000003c 0000000000000000 ffffffff808c8ca8 ffffffff808d0000
	ffffffff808d0000 ffffffff80810000 fffffc0000000000 ffffffff80785c30
	0000000000000009 0000000000000051 90000000ff667eb0 90000000ff667db0
	000000007fe0d938 0000000000000018 ffffffff80449958 0000000020052798
	ffffffff808c0000 90000000ff664000 90000000ff667ab0 00000000100c0000
	ffffffff80698810 0000000000000000 0000000000000000 0000000000000000
	0000000000000000 0000000000000000 ffffffff8010d02c 65ac85163f3bdc4a
	...
Call Trace:
[<ffffffff8010d02c>] show_stack+0x9c/0x130
[<ffffffff80698810>] dump_stack+0x90/0xd0
[<ffffffff80137b78>] __warn+0x100/0x118
[<ffffffff80137bdc>] warn_slowpath_fmt+0x4c/0x70
[<ffffffff8021e4a8>] usercopy_warn+0x98/0xe8
[<ffffffff8021e68c>] __check_object_size+0xfc/0x250
[<ffffffff801bbfb8>] put_compat_sigset+0x30/0x88
[<ffffffff8011af24>] setup_rt_frame_n32+0xc4/0x160
[<ffffffff8010b8b4>] do_signal+0x19c/0x230
[<ffffffff8010c408>] do_notify_resume+0x60/0x78
[<ffffffff80106f50>] work_notifysig+0x10/0x18
---[ end trace 88fffbf69147f48a ]---

Commit 5905429ad856 ("fork: Provide usercopy whitelisting for
task_struct") noted that:

"While the blocked and saved_sigmask fields of task_struct are copied to
userspace (via sigmask_to_save() and setup_rt_frame()), it is always
copied with a static length (i.e. sizeof(sigset_t))."

However, this is not true in the case of compat signals, whose sigset
is copied by put_compat_sigset and receives size as an argument.

At most call sites, put_compat_sigset is copying a sigset from the
current task_struct. This triggers a warning when
CONFIG_HARDENED_USERCOPY is active. However, by marking this function as
static inline, the warning can be avoided because in all of these cases
the size is constant at compile time, which is allowed. The only site
where this is not the case is handling the rt_sigpending syscall, but
there the copy is being made from a stack local variable so does not
trigger the warning.

Move put_compat_sigset to compat.h, and mark it static inline. This
fixes the WARN on MIPS.

Fixes: afcc90f8621e ("usercopy: WARN() on slab cache usercopy region violations")
Signed-off-by: Matt Redfearn <matt.redfearn@mips.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: "Dmitry V . Levin" <ldv@altlinux.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: kernel-hardening@lists.openwall.com
Cc: linux-mips@linux-mips.org
Patchwork: https://patchwork.linux-mips.org/patch/18639/
Signed-off-by: James Hogan <jhogan@kernel.org>
---
 include/linux/compat.h | 26 ++++++++++++++++++++++++--
 kernel/compat.c        | 19 -------------------
 2 files changed, 24 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index 8a9643857c4a..c4139c7a0de0 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -17,6 +17,7 @@
 #include <linux/if.h>
 #include <linux/fs.h>
 #include <linux/aio_abi.h>	/* for aio_context_t */
+#include <linux/uaccess.h>
 #include <linux/unistd.h>
 
 #include <asm/compat.h>
@@ -550,8 +551,29 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
 asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp);
 
 extern int get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat);
-extern int put_compat_sigset(compat_sigset_t __user *compat,
-			     const sigset_t *set, unsigned int size);
+
+/*
+ * Defined inline such that size can be compile time constant, which avoids
+ * CONFIG_HARDENED_USERCOPY complaining about copies from task_struct
+ */
+static inline int
+put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set,
+		  unsigned int size)
+{
+	/* size <= sizeof(compat_sigset_t) <= sizeof(sigset_t) */
+#ifdef __BIG_ENDIAN
+	compat_sigset_t v;
+	switch (_NSIG_WORDS) {
+	case 4: v.sig[7] = (set->sig[3] >> 32); v.sig[6] = set->sig[3];
+	case 3: v.sig[5] = (set->sig[2] >> 32); v.sig[4] = set->sig[2];
+	case 2: v.sig[3] = (set->sig[1] >> 32); v.sig[2] = set->sig[1];
+	case 1: v.sig[1] = (set->sig[0] >> 32); v.sig[0] = set->sig[0];
+	}
+	return copy_to_user(compat, &v, size) ? -EFAULT : 0;
+#else
+	return copy_to_user(compat, set, size) ? -EFAULT : 0;
+#endif
+}
 
 asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
 		compat_ulong_t maxnode, const compat_ulong_t __user *old_nodes,
diff --git a/kernel/compat.c b/kernel/compat.c
index 3247fe761f60..3f5fa8902e7d 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -488,25 +488,6 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat)
 }
 EXPORT_SYMBOL_GPL(get_compat_sigset);
 
-int
-put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set,
-		  unsigned int size)
-{
-	/* size <= sizeof(compat_sigset_t) <= sizeof(sigset_t) */
-#ifdef __BIG_ENDIAN
-	compat_sigset_t v;
-	switch (_NSIG_WORDS) {
-	case 4: v.sig[7] = (set->sig[3] >> 32); v.sig[6] = set->sig[3];
-	case 3: v.sig[5] = (set->sig[2] >> 32); v.sig[4] = set->sig[2];
-	case 2: v.sig[3] = (set->sig[1] >> 32); v.sig[2] = set->sig[1];
-	case 1: v.sig[1] = (set->sig[0] >> 32); v.sig[0] = set->sig[0];
-	}
-	return copy_to_user(compat, &v, size) ? -EFAULT : 0;
-#else
-	return copy_to_user(compat, set, size) ? -EFAULT : 0;
-#endif
-}
-
 #ifdef CONFIG_NUMA
 COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages,
 		       compat_uptr_t __user *, pages32,
-- 
cgit v1.2.3-71-gd317


From d02f51cbcf12b09ab945873e35046045875eed9a Mon Sep 17 00:00:00 2001
From: Daniel Axtens <dja@axtens.net>
Date: Sat, 3 Mar 2018 03:03:46 +0100
Subject: bpf: fix bpf_skb_adjust_net/bpf_skb_proto_xlat to deal with gso sctp
 skbs

SCTP GSO skbs have a gso_size of GSO_BY_FRAGS, so any sort of
unconditionally mangling of that will result in nonsense value
and would corrupt the skb later on.

Therefore, i) add two helpers skb_increase_gso_size() and
skb_decrease_gso_size() that would throw a one time warning and
bail out for such skbs and ii) refuse and return early with an
error in those BPF helpers that are affected. We do need to bail
out as early as possible from there before any changes on the
skb have been performed.

Fixes: 6578171a7ff0 ("bpf: add bpf_skb_change_proto helper")
Co-authored-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Axtens <dja@axtens.net>
Cc: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 Documentation/networking/segmentation-offloads.txt | 11 +++-
 include/linux/skbuff.h                             | 22 ++++++++
 net/core/filter.c                                  | 60 +++++++++++++++-------
 3 files changed, 73 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/segmentation-offloads.txt b/Documentation/networking/segmentation-offloads.txt
index d47480b61ac6..23a8dd91a9ec 100644
--- a/Documentation/networking/segmentation-offloads.txt
+++ b/Documentation/networking/segmentation-offloads.txt
@@ -153,8 +153,15 @@ To signal this, gso_size is set to the special value GSO_BY_FRAGS.
 
 Therefore, any code in the core networking stack must be aware of the
 possibility that gso_size will be GSO_BY_FRAGS and handle that case
-appropriately. (For size checks, the skb_gso_validate_*_len family of
-helpers do this automatically.)
+appropriately.
+
+There are a couple of helpers to make this easier:
+
+ - For size checks, the skb_gso_validate_*_len family of helpers correctly
+   considers GSO_BY_FRAGS.
+
+ - For manipulating packets, skb_increase_gso_size and skb_decrease_gso_size
+   will check for GSO_BY_FRAGS and WARN if asked to manipulate these skbs.
 
 This also affects drivers with the NETIF_F_FRAGLIST & NETIF_F_GSO_SCTP bits
 set. Note also that NETIF_F_GSO_SCTP is included in NETIF_F_GSO_SOFTWARE.
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c1e66bdcf583..8c67c33f40c9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4038,6 +4038,12 @@ static inline bool skb_is_gso_v6(const struct sk_buff *skb)
 	return skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6;
 }
 
+/* Note: Should be called only if skb_is_gso(skb) is true */
+static inline bool skb_is_gso_sctp(const struct sk_buff *skb)
+{
+	return skb_shinfo(skb)->gso_type & SKB_GSO_SCTP;
+}
+
 static inline void skb_gso_reset(struct sk_buff *skb)
 {
 	skb_shinfo(skb)->gso_size = 0;
@@ -4045,6 +4051,22 @@ static inline void skb_gso_reset(struct sk_buff *skb)
 	skb_shinfo(skb)->gso_type = 0;
 }
 
+static inline void skb_increase_gso_size(struct skb_shared_info *shinfo,
+					 u16 increment)
+{
+	if (WARN_ON_ONCE(shinfo->gso_size == GSO_BY_FRAGS))
+		return;
+	shinfo->gso_size += increment;
+}
+
+static inline void skb_decrease_gso_size(struct skb_shared_info *shinfo,
+					 u16 decrement)
+{
+	if (WARN_ON_ONCE(shinfo->gso_size == GSO_BY_FRAGS))
+		return;
+	shinfo->gso_size -= decrement;
+}
+
 void __skb_warn_lro_forwarding(const struct sk_buff *skb);
 
 static inline bool skb_warn_if_lro(const struct sk_buff *skb)
diff --git a/net/core/filter.c b/net/core/filter.c
index 0c121adbdbaa..48aa7c7320db 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2087,6 +2087,10 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
 	u32 off = skb_mac_header_len(skb);
 	int ret;
 
+	/* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */
+	if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb)))
+		return -ENOTSUPP;
+
 	ret = skb_cow(skb, len_diff);
 	if (unlikely(ret < 0))
 		return ret;
@@ -2096,19 +2100,21 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
 		return ret;
 
 	if (skb_is_gso(skb)) {
+		struct skb_shared_info *shinfo = skb_shinfo(skb);
+
 		/* SKB_GSO_TCPV4 needs to be changed into
 		 * SKB_GSO_TCPV6.
 		 */
-		if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
-			skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV4;
-			skb_shinfo(skb)->gso_type |=  SKB_GSO_TCPV6;
+		if (shinfo->gso_type & SKB_GSO_TCPV4) {
+			shinfo->gso_type &= ~SKB_GSO_TCPV4;
+			shinfo->gso_type |=  SKB_GSO_TCPV6;
 		}
 
 		/* Due to IPv6 header, MSS needs to be downgraded. */
-		skb_shinfo(skb)->gso_size -= len_diff;
+		skb_decrease_gso_size(shinfo, len_diff);
 		/* Header must be checked, and gso_segs recomputed. */
-		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
-		skb_shinfo(skb)->gso_segs = 0;
+		shinfo->gso_type |= SKB_GSO_DODGY;
+		shinfo->gso_segs = 0;
 	}
 
 	skb->protocol = htons(ETH_P_IPV6);
@@ -2123,6 +2129,10 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
 	u32 off = skb_mac_header_len(skb);
 	int ret;
 
+	/* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */
+	if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb)))
+		return -ENOTSUPP;
+
 	ret = skb_unclone(skb, GFP_ATOMIC);
 	if (unlikely(ret < 0))
 		return ret;
@@ -2132,19 +2142,21 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
 		return ret;
 
 	if (skb_is_gso(skb)) {
+		struct skb_shared_info *shinfo = skb_shinfo(skb);
+
 		/* SKB_GSO_TCPV6 needs to be changed into
 		 * SKB_GSO_TCPV4.
 		 */
-		if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) {
-			skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV6;
-			skb_shinfo(skb)->gso_type |=  SKB_GSO_TCPV4;
+		if (shinfo->gso_type & SKB_GSO_TCPV6) {
+			shinfo->gso_type &= ~SKB_GSO_TCPV6;
+			shinfo->gso_type |=  SKB_GSO_TCPV4;
 		}
 
 		/* Due to IPv4 header, MSS can be upgraded. */
-		skb_shinfo(skb)->gso_size += len_diff;
+		skb_increase_gso_size(shinfo, len_diff);
 		/* Header must be checked, and gso_segs recomputed. */
-		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
-		skb_shinfo(skb)->gso_segs = 0;
+		shinfo->gso_type |= SKB_GSO_DODGY;
+		shinfo->gso_segs = 0;
 	}
 
 	skb->protocol = htons(ETH_P_IP);
@@ -2243,6 +2255,10 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff)
 	u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb);
 	int ret;
 
+	/* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */
+	if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb)))
+		return -ENOTSUPP;
+
 	ret = skb_cow(skb, len_diff);
 	if (unlikely(ret < 0))
 		return ret;
@@ -2252,11 +2268,13 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff)
 		return ret;
 
 	if (skb_is_gso(skb)) {
+		struct skb_shared_info *shinfo = skb_shinfo(skb);
+
 		/* Due to header grow, MSS needs to be downgraded. */
-		skb_shinfo(skb)->gso_size -= len_diff;
+		skb_decrease_gso_size(shinfo, len_diff);
 		/* Header must be checked, and gso_segs recomputed. */
-		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
-		skb_shinfo(skb)->gso_segs = 0;
+		shinfo->gso_type |= SKB_GSO_DODGY;
+		shinfo->gso_segs = 0;
 	}
 
 	return 0;
@@ -2267,6 +2285,10 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff)
 	u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb);
 	int ret;
 
+	/* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */
+	if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb)))
+		return -ENOTSUPP;
+
 	ret = skb_unclone(skb, GFP_ATOMIC);
 	if (unlikely(ret < 0))
 		return ret;
@@ -2276,11 +2298,13 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff)
 		return ret;
 
 	if (skb_is_gso(skb)) {
+		struct skb_shared_info *shinfo = skb_shinfo(skb);
+
 		/* Due to header shrink, MSS can be upgraded. */
-		skb_shinfo(skb)->gso_size += len_diff;
+		skb_increase_gso_size(shinfo, len_diff);
 		/* Header must be checked, and gso_segs recomputed. */
-		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
-		skb_shinfo(skb)->gso_segs = 0;
+		shinfo->gso_type |= SKB_GSO_DODGY;
+		shinfo->gso_segs = 0;
 	}
 
 	return 0;
-- 
cgit v1.2.3-71-gd317


From 779b7931b27bfa80bac46d0115d229259aef580b Mon Sep 17 00:00:00 2001
From: Daniel Axtens <dja@axtens.net>
Date: Thu, 1 Mar 2018 17:13:37 +1100
Subject: net: rename skb_gso_validate_mtu -> skb_gso_validate_network_len

If you take a GSO skb, and split it into packets, will the network
length (L3 headers + L4 headers + payload) of those packets be small
enough to fit within a given MTU?

skb_gso_validate_mtu gives you the answer to that question. However,
we recently added to add a way to validate the MAC length of a split GSO
skb (L2+L3+L4+payload), and the names get confusing, so rename
skb_gso_validate_mtu to skb_gso_validate_network_len

Signed-off-by: Daniel Axtens <dja@axtens.net>
Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h                  |  2 +-
 net/core/skbuff.c                       | 11 ++++++-----
 net/ipv4/ip_forward.c                   |  2 +-
 net/ipv4/ip_output.c                    |  2 +-
 net/ipv4/netfilter/nf_flow_table_ipv4.c |  2 +-
 net/ipv6/ip6_output.c                   |  2 +-
 net/ipv6/netfilter/nf_flow_table_ipv6.c |  2 +-
 net/mpls/af_mpls.c                      |  2 +-
 net/xfrm/xfrm_device.c                  |  2 +-
 9 files changed, 14 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c1e66bdcf583..a057dd1a75c7 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3286,7 +3286,7 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len);
 int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen);
 void skb_scrub_packet(struct sk_buff *skb, bool xnet);
 unsigned int skb_gso_transport_seglen(const struct sk_buff *skb);
-bool skb_gso_validate_mtu(const struct sk_buff *skb, unsigned int mtu);
+bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu);
 bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len);
 struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features);
 struct sk_buff *skb_vlan_untag(struct sk_buff *skb);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 09bd89c90a71..b63767008824 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4955,19 +4955,20 @@ static inline bool skb_gso_size_check(const struct sk_buff *skb,
 }
 
 /**
- * skb_gso_validate_mtu - Return in case such skb fits a given MTU
+ * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU?
  *
  * @skb: GSO skb
  * @mtu: MTU to validate against
  *
- * skb_gso_validate_mtu validates if a given skb will fit a wanted MTU
- * once split.
+ * skb_gso_validate_network_len validates if a given skb will fit a
+ * wanted MTU once split. It considers L3 headers, L4 headers, and the
+ * payload.
  */
-bool skb_gso_validate_mtu(const struct sk_buff *skb, unsigned int mtu)
+bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu)
 {
 	return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu);
 }
-EXPORT_SYMBOL_GPL(skb_gso_validate_mtu);
+EXPORT_SYMBOL_GPL(skb_gso_validate_network_len);
 
 /**
  * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length?
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 2dd21c3281a1..b54b948b0596 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -55,7 +55,7 @@ static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
 	if (skb->ignore_df)
 		return false;
 
-	if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
+	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
 		return false;
 
 	return true;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index e8e675be60ec..66340ab750e6 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -248,7 +248,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
 
 	/* common case: seglen is <= mtu
 	 */
-	if (skb_gso_validate_mtu(skb, mtu))
+	if (skb_gso_validate_network_len(skb, mtu))
 		return ip_finish_output2(net, sk, skb);
 
 	/* Slowpath -  GSO segment length exceeds the egress MTU.
diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c
index 282b9cc4fe82..0cd46bffa469 100644
--- a/net/ipv4/netfilter/nf_flow_table_ipv4.c
+++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c
@@ -186,7 +186,7 @@ static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
 	if ((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0)
 		return false;
 
-	if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
+	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
 		return false;
 
 	return true;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 997c7f19ad62..a8a919520090 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -412,7 +412,7 @@ static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 	if (skb->ignore_df)
 		return false;
 
-	if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
+	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
 		return false;
 
 	return true;
diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c
index d346705d6ee6..207cb35569b1 100644
--- a/net/ipv6/netfilter/nf_flow_table_ipv6.c
+++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c
@@ -178,7 +178,7 @@ static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
 	if (skb->len <= mtu)
 		return false;
 
-	if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
+	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
 		return false;
 
 	return true;
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index e545a3c9365f..7a4de6d618b1 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -122,7 +122,7 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 	if (skb->len <= mtu)
 		return false;
 
-	if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
+	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
 		return false;
 
 	return true;
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 8e70291e586a..e87d6c4dd5b6 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -217,7 +217,7 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
 		if (skb->len <= mtu)
 			goto ok;
 
-		if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
+		if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
 			goto ok;
 	}
 
-- 
cgit v1.2.3-71-gd317


From a4a77718ee4053a44aa40fe67247c1afb5ce2f1e Mon Sep 17 00:00:00 2001
From: Daniel Axtens <dja@axtens.net>
Date: Thu, 1 Mar 2018 17:13:40 +1100
Subject: net: make skb_gso_*_seglen functions private

They're very hard to use properly as they do not consider the
GSO_BY_FRAGS case. Code should use skb_gso_validate_network_len
and skb_gso_validate_mac_len as they do consider this case.

Make the seglen functions static, which stops people using them
outside of skbuff.c

Signed-off-by: Daniel Axtens <dja@axtens.net>
Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 33 ---------------------------------
 net/core/skbuff.c      | 37 +++++++++++++++++++++++++++++++++++--
 2 files changed, 35 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a057dd1a75c7..ddf77cf4ff2d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3285,7 +3285,6 @@ int skb_zerocopy(struct sk_buff *to, struct sk_buff *from,
 void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len);
 int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen);
 void skb_scrub_packet(struct sk_buff *skb, bool xnet);
-unsigned int skb_gso_transport_seglen(const struct sk_buff *skb);
 bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu);
 bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len);
 struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features);
@@ -4104,38 +4103,6 @@ static inline bool skb_head_is_locked(const struct sk_buff *skb)
 	return !skb->head_frag || skb_cloned(skb);
 }
 
-/**
- * skb_gso_network_seglen - Return length of individual segments of a gso packet
- *
- * @skb: GSO skb
- *
- * skb_gso_network_seglen is used to determine the real size of the
- * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP).
- *
- * The MAC/L2 header is not accounted for.
- */
-static inline unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
-{
-	unsigned int hdr_len = skb_transport_header(skb) -
-			       skb_network_header(skb);
-	return hdr_len + skb_gso_transport_seglen(skb);
-}
-
-/**
- * skb_gso_mac_seglen - Return length of individual segments of a gso packet
- *
- * @skb: GSO skb
- *
- * skb_gso_mac_seglen is used to determine the real size of the
- * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4
- * headers (TCP/UDP).
- */
-static inline unsigned int skb_gso_mac_seglen(const struct sk_buff *skb)
-{
-	unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
-	return hdr_len + skb_gso_transport_seglen(skb);
-}
-
 /* Local Checksum Offload.
  * Compute outer checksum based on the assumption that the
  * inner checksum will be offloaded later.
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b63767008824..0bb0d8877954 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4891,7 +4891,7 @@ EXPORT_SYMBOL_GPL(skb_scrub_packet);
  *
  * The MAC/L2 or network (IP, IPv6) headers are not accounted for.
  */
-unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
+static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
 {
 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
 	unsigned int thlen = 0;
@@ -4913,7 +4913,40 @@ unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
 	 */
 	return thlen + shinfo->gso_size;
 }
-EXPORT_SYMBOL_GPL(skb_gso_transport_seglen);
+
+/**
+ * skb_gso_network_seglen - Return length of individual segments of a gso packet
+ *
+ * @skb: GSO skb
+ *
+ * skb_gso_network_seglen is used to determine the real size of the
+ * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP).
+ *
+ * The MAC/L2 header is not accounted for.
+ */
+static unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
+{
+	unsigned int hdr_len = skb_transport_header(skb) -
+			       skb_network_header(skb);
+
+	return hdr_len + skb_gso_transport_seglen(skb);
+}
+
+/**
+ * skb_gso_mac_seglen - Return length of individual segments of a gso packet
+ *
+ * @skb: GSO skb
+ *
+ * skb_gso_mac_seglen is used to determine the real size of the
+ * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4
+ * headers (TCP/UDP).
+ */
+static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb)
+{
+	unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
+
+	return hdr_len + skb_gso_transport_seglen(skb);
+}
 
 /**
  * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS
-- 
cgit v1.2.3-71-gd317


From a6f1086e29e93621a6394b94b8c0e4a4e490f38b Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Mon, 5 Mar 2018 15:22:30 -0800
Subject: PCI: Move of_irq_parse_and_map_pci() declaration under OF_IRQ

Since commit 4670d610d592 ("PCI: Move OF-related PCI functions into
PCI core"), sparc:allmodconfig fails to build with the following error.

  pcie-cadence-host.c:(.text+0x4c4): undefined reference to `of_irq_parse_and_map_pci'
  pcie-cadence-host.c:(.text+0x4c8): undefined reference to `of_irq_parse_and_map_pci'

of_irq_parse_and_map_pci() is now only available if OF_IRQ is enabled.
Make its declaration and its dummy function dependent on OF_IRQ to solve
the problem.

Fixes: 4670d610d592 ("PCI: Move OF-related PCI functions into PCI core")
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Rob Herring <robh@kernel.org>
---
 include/linux/of_pci.h | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/of_pci.h b/include/linux/of_pci.h
index 88865e0ebf4d..091033a6b836 100644
--- a/include/linux/of_pci.h
+++ b/include/linux/of_pci.h
@@ -13,7 +13,6 @@ struct device_node;
 struct device_node *of_pci_find_child_device(struct device_node *parent,
 					     unsigned int devfn);
 int of_pci_get_devfn(struct device_node *np);
-int of_irq_parse_and_map_pci(const struct pci_dev *dev, u8 slot, u8 pin);
 int of_pci_parse_bus_range(struct device_node *node, struct resource *res);
 int of_get_pci_domain_nr(struct device_node *node);
 int of_pci_get_max_link_speed(struct device_node *node);
@@ -33,12 +32,6 @@ static inline int of_pci_get_devfn(struct device_node *np)
 	return -EINVAL;
 }
 
-static inline int
-of_irq_parse_and_map_pci(const struct pci_dev *dev, u8 slot, u8 pin)
-{
-	return 0;
-}
-
 static inline int
 of_pci_parse_bus_range(struct device_node *node, struct resource *res)
 {
@@ -67,6 +60,16 @@ of_pci_get_max_link_speed(struct device_node *node)
 static inline void of_pci_check_probe_only(void) { }
 #endif
 
+#if IS_ENABLED(CONFIG_OF_IRQ)
+int of_irq_parse_and_map_pci(const struct pci_dev *dev, u8 slot, u8 pin);
+#else
+static inline int
+of_irq_parse_and_map_pci(const struct pci_dev *dev, u8 slot, u8 pin)
+{
+	return 0;
+}
+#endif
+
 #if defined(CONFIG_OF_ADDRESS)
 int of_pci_get_host_bridge_resources(struct device_node *dev,
 			unsigned char busno, unsigned char bus_max,
-- 
cgit v1.2.3-71-gd317


From 859d880cf544dbe095ce97534ef04cd88ba2f2b4 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 6 Mar 2018 00:20:25 -0600
Subject: signal: Correct the offset of si_pkey in struct siginfo

The change moving addr_lsb into the _sigfault union failed to take
into account that _sigfault._addr_bnd._lower being a pointer forced
the entire union to have pointer alignment.  In practice this only
mattered for the offset of si_pkey which is why this has taken so long
to discover.

To correct this change _dummy_pkey and _dummy_bnd to have pointer type.

Reported-by: kernel test robot <shun.hao@intel.com>
Fixes: b68a68d3dcc1 ("signal: Move addr_lsb into the _sigfault union for clarity")
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/compat.h             | 4 ++--
 include/uapi/asm-generic/siginfo.h | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index 8a9643857c4a..e16d07eb08cf 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -229,13 +229,13 @@ typedef struct compat_siginfo {
 				short int _addr_lsb;	/* Valid LSB of the reported address. */
 				/* used when si_code=SEGV_BNDERR */
 				struct {
-					short _dummy_bnd;
+					compat_uptr_t _dummy_bnd;
 					compat_uptr_t _lower;
 					compat_uptr_t _upper;
 				} _addr_bnd;
 				/* used when si_code=SEGV_PKUERR */
 				struct {
-					short _dummy_pkey;
+					compat_uptr_t _dummy_pkey;
 					u32 _pkey;
 				} _addr_pkey;
 			};
diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
index 85dc965afd89..99c902e460c2 100644
--- a/include/uapi/asm-generic/siginfo.h
+++ b/include/uapi/asm-generic/siginfo.h
@@ -102,13 +102,13 @@ typedef struct siginfo {
 				short _addr_lsb; /* LSB of the reported address */
 				/* used when si_code=SEGV_BNDERR */
 				struct {
-					short _dummy_bnd;
+					void *_dummy_bnd;
 					void __user *_lower;
 					void __user *_upper;
 				} _addr_bnd;
 				/* used when si_code=SEGV_PKUERR */
 				struct {
-					short _dummy_pkey;
+					void *_dummy_pkey;
 					__u32 _pkey;
 				} _addr_pkey;
 			};
-- 
cgit v1.2.3-71-gd317


From cb88a0588717ba6c756cb5972d75766b273a6817 Mon Sep 17 00:00:00 2001
From: Danilo Krummrich <danilokrummrich@dk-develop.de>
Date: Tue, 6 Mar 2018 09:38:49 +0100
Subject: usb: quirks: add control message delay for 1b1c:1b20

Corsair Strafe RGB keyboard does not respond to usb control messages
sometimes and hence generates timeouts.

Commit de3af5bf259d ("usb: quirks: add delay init quirk for Corsair
Strafe RGB keyboard") tried to fix those timeouts by adding
USB_QUIRK_DELAY_INIT.

Unfortunately, even with this quirk timeouts of usb_control_msg()
can still be seen, but with a lower frequency (approx. 1 out of 15):

[   29.103520] usb 1-8: string descriptor 0 read error: -110
[   34.363097] usb 1-8: can't set config #1, error -110

Adding further delays to different locations where usb control
messages are issued just moves the timeouts to other locations,
e.g.:

[   35.400533] usbhid 1-8:1.0: can't add hid device: -110
[   35.401014] usbhid: probe of 1-8:1.0 failed with error -110

The only way to reliably avoid those issues is having a pause after
each usb control message. In approx. 200 boot cycles no more timeouts
were seen.

Addionaly, keep USB_QUIRK_DELAY_INIT as it turned out to be necessary
to have the delay in hub_port_connect() after hub_port_init().

The overall boot time seems not to be influenced by these additional
delays, even on fast machines and lightweight distributions.

Fixes: de3af5bf259d ("usb: quirks: add delay init quirk for Corsair Strafe RGB keyboard")
Cc: stable@vger.kernel.org
Signed-off-by: Danilo Krummrich <danilokrummrich@dk-develop.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/message.c | 4 ++++
 drivers/usb/core/quirks.c  | 3 ++-
 include/linux/usb/quirks.h | 3 +++
 3 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c
index c64cf6c4a83d..0c11d40a12bc 100644
--- a/drivers/usb/core/message.c
+++ b/drivers/usb/core/message.c
@@ -151,6 +151,10 @@ int usb_control_msg(struct usb_device *dev, unsigned int pipe, __u8 request,
 
 	ret = usb_internal_control_msg(dev, pipe, dr, data, size, timeout);
 
+	/* Linger a bit, prior to the next control message. */
+	if (dev->quirks & USB_QUIRK_DELAY_CTRL_MSG)
+		msleep(200);
+
 	kfree(dr);
 
 	return ret;
diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c
index f4a548471f0f..54b019e267c5 100644
--- a/drivers/usb/core/quirks.c
+++ b/drivers/usb/core/quirks.c
@@ -230,7 +230,8 @@ static const struct usb_device_id usb_quirk_list[] = {
 	{ USB_DEVICE(0x1b1c, 0x1b13), .driver_info = USB_QUIRK_DELAY_INIT },
 
 	/* Corsair Strafe RGB */
-	{ USB_DEVICE(0x1b1c, 0x1b20), .driver_info = USB_QUIRK_DELAY_INIT },
+	{ USB_DEVICE(0x1b1c, 0x1b20), .driver_info = USB_QUIRK_DELAY_INIT |
+	  USB_QUIRK_DELAY_CTRL_MSG },
 
 	/* Corsair K70 LUX */
 	{ USB_DEVICE(0x1b1c, 0x1b36), .driver_info = USB_QUIRK_DELAY_INIT },
diff --git a/include/linux/usb/quirks.h b/include/linux/usb/quirks.h
index f1fcec2fd5f8..b7a99ce56bc9 100644
--- a/include/linux/usb/quirks.h
+++ b/include/linux/usb/quirks.h
@@ -63,4 +63,7 @@
  */
 #define USB_QUIRK_DISCONNECT_SUSPEND		BIT(12)
 
+/* Device needs a pause after every control message. */
+#define USB_QUIRK_DELAY_CTRL_MSG		BIT(13)
+
 #endif /* __LINUX_USB_QUIRKS_H */
-- 
cgit v1.2.3-71-gd317


From d3dcf8eb615537526bd42ff27a081d46d337816e Mon Sep 17 00:00:00 2001
From: Paul Blakey <paulb@mellanox.com>
Date: Sun, 4 Mar 2018 17:29:48 +0200
Subject: rhashtable: Fix rhlist duplicates insertion

When inserting duplicate objects (those with the same key),
current rhlist implementation messes up the chain pointers by
updating the bucket pointer instead of prev next pointer to the
newly inserted node. This causes missing elements on removal and
travesal.

Fix that by properly updating pprev pointer to point to
the correct rhash_head next pointer.

Issue: 1241076
Change-Id: I86b2c140bcb4aeb10b70a72a267ff590bb2b17e7
Fixes: ca26893f05e8 ('rhashtable: Add rhlist interface')
Signed-off-by: Paul Blakey <paulb@mellanox.com>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rhashtable.h | 4 +++-
 lib/rhashtable.c           | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index c9df2527e0cd..668a21f04b09 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -766,8 +766,10 @@ slow_path:
 		if (!key ||
 		    (params.obj_cmpfn ?
 		     params.obj_cmpfn(&arg, rht_obj(ht, head)) :
-		     rhashtable_compare(&arg, rht_obj(ht, head))))
+		     rhashtable_compare(&arg, rht_obj(ht, head)))) {
+			pprev = &head->next;
 			continue;
+		}
 
 		data = rht_obj(ht, head);
 
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 3825c30aaa36..47de025b6245 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -506,8 +506,10 @@ static void *rhashtable_lookup_one(struct rhashtable *ht,
 		if (!key ||
 		    (ht->p.obj_cmpfn ?
 		     ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) :
-		     rhashtable_compare(&arg, rht_obj(ht, head))))
+		     rhashtable_compare(&arg, rht_obj(ht, head)))) {
+			pprev = &head->next;
 			continue;
+		}
 
 		if (!ht->rhlist)
 			return rht_obj(ht, head);
-- 
cgit v1.2.3-71-gd317


From 2695578b896aea472b2c0dcbe9d92daa71738484 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 5 Mar 2018 11:41:13 -0800
Subject: net: usbnet: fix potential deadlock on 32bit hosts

Marek reported a LOCKDEP issue occurring on 32bit host,
that we tracked down to the fact that usbnet could either
run from soft or hard irqs.

This patch adds u64_stats_update_begin_irqsave() and
u64_stats_update_end_irqrestore() helpers to solve this case.

[   17.768040] ================================
[   17.772239] WARNING: inconsistent lock state
[   17.776511] 4.16.0-rc3-next-20180227-00007-g876c53a7493c #453 Not tainted
[   17.783329] --------------------------------
[   17.787580] inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage.
[   17.793607] swapper/0/0 [HC0[0]:SC1[1]:HE1:SE0] takes:
[   17.798751]  (&syncp->seq#5){?.-.}, at: [<9b22e5f0>]
asix_rx_fixup_internal+0x188/0x288
[   17.806790] {IN-HARDIRQ-W} state was registered at:
[   17.811677]   tx_complete+0x100/0x208
[   17.815319]   __usb_hcd_giveback_urb+0x60/0xf0
[   17.819770]   xhci_giveback_urb_in_irq+0xa8/0x240
[   17.824469]   xhci_td_cleanup+0xf4/0x16c
[   17.828367]   xhci_irq+0xe74/0x2240
[   17.831827]   usb_hcd_irq+0x24/0x38
[   17.835343]   __handle_irq_event_percpu+0x98/0x510
[   17.840111]   handle_irq_event_percpu+0x1c/0x58
[   17.844623]   handle_irq_event+0x38/0x5c
[   17.848519]   handle_fasteoi_irq+0xa4/0x138
[   17.852681]   generic_handle_irq+0x18/0x28
[   17.856760]   __handle_domain_irq+0x6c/0xe4
[   17.860941]   gic_handle_irq+0x54/0xa0
[   17.864666]   __irq_svc+0x70/0xb0
[   17.867964]   arch_cpu_idle+0x20/0x3c
[   17.871578]   arch_cpu_idle+0x20/0x3c
[   17.875190]   do_idle+0x144/0x218
[   17.878468]   cpu_startup_entry+0x18/0x1c
[   17.882454]   start_kernel+0x394/0x400
[   17.886177] irq event stamp: 161912
[   17.889616] hardirqs last  enabled at (161912): [<7bedfacf>]
__netdev_alloc_skb+0xcc/0x140
[   17.897893] hardirqs last disabled at (161911): [<d58261d0>]
__netdev_alloc_skb+0x94/0x140
[   17.904903] exynos5-hsi2c 12ca0000.i2c: tx timeout
[   17.906116] softirqs last  enabled at (161904): [<387102ff>]
irq_enter+0x78/0x80
[   17.906123] softirqs last disabled at (161905): [<cf4c628e>]
irq_exit+0x134/0x158
[   17.925722].
[   17.925722] other info that might help us debug this:
[   17.933435]  Possible unsafe locking scenario:
[   17.933435].
[   17.940331]        CPU0
[   17.942488]        ----
[   17.944894]   lock(&syncp->seq#5);
[   17.948274]   <Interrupt>
[   17.950847]     lock(&syncp->seq#5);
[   17.954386].
[   17.954386]  *** DEADLOCK ***
[   17.954386].
[   17.962422] no locks held by swapper/0/0.

Fixes: c8b5d129ee29 ("net: usbnet: support 64bit stats")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Greg Ungerer <gerg@linux-m68k.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/usbnet.c       | 10 ++++++----
 include/linux/u64_stats_sync.h | 22 ++++++++++++++++++++++
 2 files changed, 28 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index 8a22ff67b026..d9eea8cfe6cb 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -315,6 +315,7 @@ static void __usbnet_status_stop_force(struct usbnet *dev)
 void usbnet_skb_return (struct usbnet *dev, struct sk_buff *skb)
 {
 	struct pcpu_sw_netstats *stats64 = this_cpu_ptr(dev->stats64);
+	unsigned long flags;
 	int	status;
 
 	if (test_bit(EVENT_RX_PAUSED, &dev->flags)) {
@@ -326,10 +327,10 @@ void usbnet_skb_return (struct usbnet *dev, struct sk_buff *skb)
 	if (skb->protocol == 0)
 		skb->protocol = eth_type_trans (skb, dev->net);
 
-	u64_stats_update_begin(&stats64->syncp);
+	flags = u64_stats_update_begin_irqsave(&stats64->syncp);
 	stats64->rx_packets++;
 	stats64->rx_bytes += skb->len;
-	u64_stats_update_end(&stats64->syncp);
+	u64_stats_update_end_irqrestore(&stats64->syncp, flags);
 
 	netif_dbg(dev, rx_status, dev->net, "< rx, len %zu, type 0x%x\n",
 		  skb->len + sizeof (struct ethhdr), skb->protocol);
@@ -1248,11 +1249,12 @@ static void tx_complete (struct urb *urb)
 
 	if (urb->status == 0) {
 		struct pcpu_sw_netstats *stats64 = this_cpu_ptr(dev->stats64);
+		unsigned long flags;
 
-		u64_stats_update_begin(&stats64->syncp);
+		flags = u64_stats_update_begin_irqsave(&stats64->syncp);
 		stats64->tx_packets += entry->packets;
 		stats64->tx_bytes += entry->length;
-		u64_stats_update_end(&stats64->syncp);
+		u64_stats_update_end_irqrestore(&stats64->syncp, flags);
 	} else {
 		dev->net->stats.tx_errors++;
 
diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h
index 5bdbd9f49395..07ee0f84a46c 100644
--- a/include/linux/u64_stats_sync.h
+++ b/include/linux/u64_stats_sync.h
@@ -90,6 +90,28 @@ static inline void u64_stats_update_end(struct u64_stats_sync *syncp)
 #endif
 }
 
+static inline unsigned long
+u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp)
+{
+	unsigned long flags = 0;
+
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+	local_irq_save(flags);
+	write_seqcount_begin(&syncp->seq);
+#endif
+	return flags;
+}
+
+static inline void
+u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp,
+				unsigned long flags)
+{
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+	write_seqcount_end(&syncp->seq);
+	local_irq_restore(flags);
+#endif
+}
+
 static inline void u64_stats_update_begin_raw(struct u64_stats_sync *syncp)
 {
 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
-- 
cgit v1.2.3-71-gd317


From 3a4030761ea88ff439030ca98e3094b9900e96b7 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Fri, 9 Mar 2018 14:50:34 +0800
Subject: vhost_net: examine pointer types during un-producing

After commit fc72d1d54dd9 ("tuntap: XDP transmission"), we can
actually queueing XDP pointers in the pointer ring, so we should
examine the pointer type before freeing the pointer.

Fixes: fc72d1d54dd9 ("tuntap: XDP transmission")
Reported-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c      | 3 ++-
 drivers/vhost/net.c    | 2 +-
 include/linux/if_tun.h | 4 ++++
 3 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 7433bb2e4451..28cfa642e39a 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -655,7 +655,7 @@ static struct tun_struct *tun_enable_queue(struct tun_file *tfile)
 	return tun;
 }
 
-static void tun_ptr_free(void *ptr)
+void tun_ptr_free(void *ptr)
 {
 	if (!ptr)
 		return;
@@ -667,6 +667,7 @@ static void tun_ptr_free(void *ptr)
 		__skb_array_destroy_skb(ptr);
 	}
 }
+EXPORT_SYMBOL_GPL(tun_ptr_free);
 
 static void tun_queue_purge(struct tun_file *tfile)
 {
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index efb93063fda1..8139bc70ad7d 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -170,7 +170,7 @@ static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq)
 	if (nvq->rx_ring && !vhost_net_buf_is_empty(rxq)) {
 		ptr_ring_unconsume(nvq->rx_ring, rxq->queue + rxq->head,
 				   vhost_net_buf_get_size(rxq),
-				   __skb_array_destroy_skb);
+				   tun_ptr_free);
 		rxq->head = rxq->tail = 0;
 	}
 }
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index c5b0a75a7812..fd00170b494f 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -25,6 +25,7 @@ struct ptr_ring *tun_get_tx_ring(struct file *file);
 bool tun_is_xdp_buff(void *ptr);
 void *tun_xdp_to_ptr(void *ptr);
 void *tun_ptr_to_xdp(void *ptr);
+void tun_ptr_free(void *ptr);
 #else
 #include <linux/err.h>
 #include <linux/errno.h>
@@ -50,5 +51,8 @@ static inline void *tun_ptr_to_xdp(void *ptr)
 {
 	return NULL;
 }
+static inline void tun_ptr_free(void *ptr)
+{
+}
 #endif /* CONFIG_TUN */
 #endif /* __IF_TUN_H */
-- 
cgit v1.2.3-71-gd317


From b1d0a5d0cba4597c0394997b2d5fced3e3841b4e Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sat, 10 Mar 2018 01:15:45 +0100
Subject: netfilter: x_tables: add and use xt_check_proc_name

recent and hashlimit both create /proc files, but only check that
name is 0 terminated.

This can trigger WARN() from procfs when name is "" or "/".
Add helper for this and then use it for both.

Cc: Eric Dumazet <eric.dumazet@gmail.com>
Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Reported-by: <syzbot+0502b00edac2a0680b61@syzkaller.appspotmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h |  2 ++
 net/netfilter/x_tables.c           | 30 ++++++++++++++++++++++++++++++
 net/netfilter/xt_hashlimit.c       | 16 ++++++++++------
 net/netfilter/xt_recent.c          |  6 +++---
 4 files changed, 45 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 1313b35c3ab7..14529511c4b8 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -285,6 +285,8 @@ unsigned int *xt_alloc_entry_offsets(unsigned int size);
 bool xt_find_jump_offset(const unsigned int *offsets,
 			 unsigned int target, unsigned int size);
 
+int xt_check_proc_name(const char *name, unsigned int size);
+
 int xt_check_match(struct xt_mtchk_param *, unsigned int size, u_int8_t proto,
 		   bool inv_proto);
 int xt_check_target(struct xt_tgchk_param *, unsigned int size, u_int8_t proto,
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index fa1655aff8d3..4aa01c90e9d1 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -423,6 +423,36 @@ textify_hooks(char *buf, size_t size, unsigned int mask, uint8_t nfproto)
 	return buf;
 }
 
+/**
+ * xt_check_proc_name - check that name is suitable for /proc file creation
+ *
+ * @name: file name candidate
+ * @size: length of buffer
+ *
+ * some x_tables modules wish to create a file in /proc.
+ * This function makes sure that the name is suitable for this
+ * purpose, it checks that name is NUL terminated and isn't a 'special'
+ * name, like "..".
+ *
+ * returns negative number on error or 0 if name is useable.
+ */
+int xt_check_proc_name(const char *name, unsigned int size)
+{
+	if (name[0] == '\0')
+		return -EINVAL;
+
+	if (strnlen(name, size) == size)
+		return -ENAMETOOLONG;
+
+	if (strcmp(name, ".") == 0 ||
+	    strcmp(name, "..") == 0 ||
+	    strchr(name, '/'))
+		return -EINVAL;
+
+	return 0;
+}
+EXPORT_SYMBOL(xt_check_proc_name);
+
 int xt_check_match(struct xt_mtchk_param *par,
 		   unsigned int size, u_int8_t proto, bool inv_proto)
 {
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 66f5aca62a08..3360f13dc208 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -917,8 +917,9 @@ static int hashlimit_mt_check_v1(const struct xt_mtchk_param *par)
 	struct hashlimit_cfg3 cfg = {};
 	int ret;
 
-	if (info->name[sizeof(info->name) - 1] != '\0')
-		return -EINVAL;
+	ret = xt_check_proc_name(info->name, sizeof(info->name));
+	if (ret)
+		return ret;
 
 	ret = cfg_copy(&cfg, (void *)&info->cfg, 1);
 
@@ -935,8 +936,9 @@ static int hashlimit_mt_check_v2(const struct xt_mtchk_param *par)
 	struct hashlimit_cfg3 cfg = {};
 	int ret;
 
-	if (info->name[sizeof(info->name) - 1] != '\0')
-		return -EINVAL;
+	ret = xt_check_proc_name(info->name, sizeof(info->name));
+	if (ret)
+		return ret;
 
 	ret = cfg_copy(&cfg, (void *)&info->cfg, 2);
 
@@ -950,9 +952,11 @@ static int hashlimit_mt_check_v2(const struct xt_mtchk_param *par)
 static int hashlimit_mt_check(const struct xt_mtchk_param *par)
 {
 	struct xt_hashlimit_mtinfo3 *info = par->matchinfo;
+	int ret;
 
-	if (info->name[sizeof(info->name) - 1] != '\0')
-		return -EINVAL;
+	ret = xt_check_proc_name(info->name, sizeof(info->name));
+	if (ret)
+		return ret;
 
 	return hashlimit_mt_check_common(par, &info->hinfo, &info->cfg,
 					 info->name, 3);
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index 6d232d18faff..81ee1d6543b2 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -361,9 +361,9 @@ static int recent_mt_check(const struct xt_mtchk_param *par,
 				    info->hit_count, XT_RECENT_MAX_NSTAMPS - 1);
 		return -EINVAL;
 	}
-	if (info->name[0] == '\0' ||
-	    strnlen(info->name, XT_RECENT_NAME_LEN) == XT_RECENT_NAME_LEN)
-		return -EINVAL;
+	ret = xt_check_proc_name(info->name, sizeof(info->name));
+	if (ret)
+		return ret;
 
 	if (ip_pkt_list_tot && info->hit_count < ip_pkt_list_tot)
 		nstamp_mask = roundup_pow_of_two(ip_pkt_list_tot) - 1;
-- 
cgit v1.2.3-71-gd317


From a2c054a896b8ac794ddcfc7c92e2dc7ec4ed4ed5 Mon Sep 17 00:00:00 2001
From: Brad Mouring <brad.mouring@ni.com>
Date: Thu, 8 Mar 2018 16:23:03 -0600
Subject: net: phy: Tell caller result of phy_change()

In 664fcf123a30e (net: phy: Threaded interrupts allow some simplification)
the phy_interrupt system was changed to use a traditional threaded
interrupt scheme instead of a workqueue approach.

With this change, the phy status check moved into phy_change, which
did not report back to the caller whether or not the interrupt was
handled. This means that, in the case of a shared phy interrupt,
only the first phydev's interrupt registers are checked (since
phy_interrupt() would always return IRQ_HANDLED). This leads to
interrupt storms when it is a secondary device that's actually the
interrupt source.

Signed-off-by: Brad Mouring <brad.mouring@ni.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c | 145 +++++++++++++++++++++++++-------------------------
 include/linux/phy.h   |   1 -
 2 files changed, 72 insertions(+), 74 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index a6f924fee584..9aabfa1a455a 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -617,6 +617,77 @@ static void phy_error(struct phy_device *phydev)
 	phy_trigger_machine(phydev, false);
 }
 
+/**
+ * phy_disable_interrupts - Disable the PHY interrupts from the PHY side
+ * @phydev: target phy_device struct
+ */
+static int phy_disable_interrupts(struct phy_device *phydev)
+{
+	int err;
+
+	/* Disable PHY interrupts */
+	err = phy_config_interrupt(phydev, PHY_INTERRUPT_DISABLED);
+	if (err)
+		goto phy_err;
+
+	/* Clear the interrupt */
+	err = phy_clear_interrupt(phydev);
+	if (err)
+		goto phy_err;
+
+	return 0;
+
+phy_err:
+	phy_error(phydev);
+
+	return err;
+}
+
+/**
+ * phy_change - Called by the phy_interrupt to handle PHY changes
+ * @phydev: phy_device struct that interrupted
+ */
+static irqreturn_t phy_change(struct phy_device *phydev)
+{
+	if (phy_interrupt_is_valid(phydev)) {
+		if (phydev->drv->did_interrupt &&
+		    !phydev->drv->did_interrupt(phydev))
+			return IRQ_NONE;
+
+		if (phydev->state == PHY_HALTED)
+			if (phy_disable_interrupts(phydev))
+				goto phy_err;
+	}
+
+	mutex_lock(&phydev->lock);
+	if ((PHY_RUNNING == phydev->state) || (PHY_NOLINK == phydev->state))
+		phydev->state = PHY_CHANGELINK;
+	mutex_unlock(&phydev->lock);
+
+	/* reschedule state queue work to run as soon as possible */
+	phy_trigger_machine(phydev, true);
+
+	if (phy_interrupt_is_valid(phydev) && phy_clear_interrupt(phydev))
+		goto phy_err;
+	return IRQ_HANDLED;
+
+phy_err:
+	phy_error(phydev);
+	return IRQ_NONE;
+}
+
+/**
+ * phy_change_work - Scheduled by the phy_mac_interrupt to handle PHY changes
+ * @work: work_struct that describes the work to be done
+ */
+void phy_change_work(struct work_struct *work)
+{
+	struct phy_device *phydev =
+		container_of(work, struct phy_device, phy_queue);
+
+	phy_change(phydev);
+}
+
 /**
  * phy_interrupt - PHY interrupt handler
  * @irq: interrupt line
@@ -632,9 +703,7 @@ static irqreturn_t phy_interrupt(int irq, void *phy_dat)
 	if (PHY_HALTED == phydev->state)
 		return IRQ_NONE;		/* It can't be ours.  */
 
-	phy_change(phydev);
-
-	return IRQ_HANDLED;
+	return phy_change(phydev);
 }
 
 /**
@@ -651,32 +720,6 @@ static int phy_enable_interrupts(struct phy_device *phydev)
 	return phy_config_interrupt(phydev, PHY_INTERRUPT_ENABLED);
 }
 
-/**
- * phy_disable_interrupts - Disable the PHY interrupts from the PHY side
- * @phydev: target phy_device struct
- */
-static int phy_disable_interrupts(struct phy_device *phydev)
-{
-	int err;
-
-	/* Disable PHY interrupts */
-	err = phy_config_interrupt(phydev, PHY_INTERRUPT_DISABLED);
-	if (err)
-		goto phy_err;
-
-	/* Clear the interrupt */
-	err = phy_clear_interrupt(phydev);
-	if (err)
-		goto phy_err;
-
-	return 0;
-
-phy_err:
-	phy_error(phydev);
-
-	return err;
-}
-
 /**
  * phy_start_interrupts - request and enable interrupts for a PHY device
  * @phydev: target phy_device struct
@@ -719,50 +762,6 @@ int phy_stop_interrupts(struct phy_device *phydev)
 }
 EXPORT_SYMBOL(phy_stop_interrupts);
 
-/**
- * phy_change - Called by the phy_interrupt to handle PHY changes
- * @phydev: phy_device struct that interrupted
- */
-void phy_change(struct phy_device *phydev)
-{
-	if (phy_interrupt_is_valid(phydev)) {
-		if (phydev->drv->did_interrupt &&
-		    !phydev->drv->did_interrupt(phydev))
-			return;
-
-		if (phydev->state == PHY_HALTED)
-			if (phy_disable_interrupts(phydev))
-				goto phy_err;
-	}
-
-	mutex_lock(&phydev->lock);
-	if ((PHY_RUNNING == phydev->state) || (PHY_NOLINK == phydev->state))
-		phydev->state = PHY_CHANGELINK;
-	mutex_unlock(&phydev->lock);
-
-	/* reschedule state queue work to run as soon as possible */
-	phy_trigger_machine(phydev, true);
-
-	if (phy_interrupt_is_valid(phydev) && phy_clear_interrupt(phydev))
-		goto phy_err;
-	return;
-
-phy_err:
-	phy_error(phydev);
-}
-
-/**
- * phy_change_work - Scheduled by the phy_mac_interrupt to handle PHY changes
- * @work: work_struct that describes the work to be done
- */
-void phy_change_work(struct work_struct *work)
-{
-	struct phy_device *phydev =
-		container_of(work, struct phy_device, phy_queue);
-
-	phy_change(phydev);
-}
-
 /**
  * phy_stop - Bring down the PHY link, and stop checking the status
  * @phydev: target phy_device struct
diff --git a/include/linux/phy.h b/include/linux/phy.h
index d7069539f351..b260fb336b25 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1012,7 +1012,6 @@ int phy_driver_register(struct phy_driver *new_driver, struct module *owner);
 int phy_drivers_register(struct phy_driver *new_driver, int n,
 			 struct module *owner);
 void phy_state_machine(struct work_struct *work);
-void phy_change(struct phy_device *phydev);
 void phy_change_work(struct work_struct *work);
 void phy_mac_interrupt(struct phy_device *phydev);
 void phy_start_machine(struct phy_device *phydev);
-- 
cgit v1.2.3-71-gd317


From bf2ae2e4bf9360e07c0cdfa166bcdc0afd92f4ce Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sat, 10 Mar 2018 18:57:50 +0800
Subject: sock_diag: request _diag module only when the family or proto has
 been registered

Now when using 'ss' in iproute, kernel would try to load all _diag
modules, which also causes corresponding family and proto modules
to be loaded as well due to module dependencies.

Like after running 'ss', sctp, dccp, af_packet (if it works as a module)
would be loaded.

For example:

  $ lsmod|grep sctp
  $ ss
  $ lsmod|grep sctp
  sctp_diag              16384  0
  sctp                  323584  5 sctp_diag
  inet_diag              24576  4 raw_diag,tcp_diag,sctp_diag,udp_diag
  libcrc32c              16384  3 nf_conntrack,nf_nat,sctp

As these family and proto modules are loaded unintentionally, it
could cause some problems, like:

- Some debug tools use 'ss' to collect the socket info, which loads all
  those diag and family and protocol modules. It's noisy for identifying
  issues.

- Users usually expect to drop sctp init packet silently when they
  have no sense of sctp protocol instead of sending abort back.

- It wastes resources (especially with multiple netns), and SCTP module
  can't be unloaded once it's loaded.

...

In short, it's really inappropriate to have these family and proto
modules loaded unexpectedly when just doing debugging with inet_diag.

This patch is to introduce sock_load_diag_module() where it loads
the _diag module only when it's corresponding family or proto has
been already registered.

Note that we can't just load _diag module without the family or
proto loaded, as some symbols used in _diag module are from the
family or proto module.

v1->v2:
  - move inet proto check to inet_diag to avoid a compiling err.
v2->v3:
  - define sock_load_diag_module in sock.c and export one symbol
    only.
  - improve the changelog.

Reported-by: Sabrina Dubroca <sd@queasysnail.net>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Phil Sutter <phil@nwl.cc>
Acked-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/net.h  |  1 +
 include/net/sock.h   |  1 +
 net/core/sock.c      | 21 +++++++++++++++++++++
 net/core/sock_diag.c | 12 ++++--------
 net/ipv4/inet_diag.c |  3 +--
 net/socket.c         |  5 +++++
 6 files changed, 33 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/net.h b/include/linux/net.h
index 91216b16feb7..2a0391eea05c 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -222,6 +222,7 @@ enum {
 int sock_wake_async(struct socket_wq *sk_wq, int how, int band);
 int sock_register(const struct net_proto_family *fam);
 void sock_unregister(int family);
+bool sock_is_registered(int family);
 int __sock_create(struct net *net, int family, int type, int proto,
 		  struct socket **res, int kern);
 int sock_create(int family, int type, int proto, struct socket **res);
diff --git a/include/net/sock.h b/include/net/sock.h
index 169c92afcafa..ae23f3b389ca 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1137,6 +1137,7 @@ struct proto {
 
 int proto_register(struct proto *prot, int alloc_slab);
 void proto_unregister(struct proto *prot);
+int sock_load_diag_module(int family, int protocol);
 
 #ifdef SOCK_REFCNT_DEBUG
 static inline void sk_refcnt_debug_inc(struct sock *sk)
diff --git a/net/core/sock.c b/net/core/sock.c
index c501499a04fe..85b0b64e7f9d 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3261,6 +3261,27 @@ void proto_unregister(struct proto *prot)
 }
 EXPORT_SYMBOL(proto_unregister);
 
+int sock_load_diag_module(int family, int protocol)
+{
+	if (!protocol) {
+		if (!sock_is_registered(family))
+			return -ENOENT;
+
+		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
+				      NETLINK_SOCK_DIAG, family);
+	}
+
+#ifdef CONFIG_INET
+	if (family == AF_INET &&
+	    !rcu_access_pointer(inet_protos[protocol]))
+		return -ENOENT;
+#endif
+
+	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
+			      NETLINK_SOCK_DIAG, family, protocol);
+}
+EXPORT_SYMBOL(sock_load_diag_module);
+
 #ifdef CONFIG_PROC_FS
 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
 	__acquires(proto_list_mutex)
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index 146b50e30659..c37b5be7c5e4 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -220,8 +220,7 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh)
 		return -EINVAL;
 
 	if (sock_diag_handlers[req->sdiag_family] == NULL)
-		request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
-				NETLINK_SOCK_DIAG, req->sdiag_family);
+		sock_load_diag_module(req->sdiag_family, 0);
 
 	mutex_lock(&sock_diag_table_mutex);
 	hndl = sock_diag_handlers[req->sdiag_family];
@@ -247,8 +246,7 @@ static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 	case TCPDIAG_GETSOCK:
 	case DCCPDIAG_GETSOCK:
 		if (inet_rcv_compat == NULL)
-			request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
-					NETLINK_SOCK_DIAG, AF_INET);
+			sock_load_diag_module(AF_INET, 0);
 
 		mutex_lock(&sock_diag_table_mutex);
 		if (inet_rcv_compat != NULL)
@@ -281,14 +279,12 @@ static int sock_diag_bind(struct net *net, int group)
 	case SKNLGRP_INET_TCP_DESTROY:
 	case SKNLGRP_INET_UDP_DESTROY:
 		if (!sock_diag_handlers[AF_INET])
-			request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
-				       NETLINK_SOCK_DIAG, AF_INET);
+			sock_load_diag_module(AF_INET, 0);
 		break;
 	case SKNLGRP_INET6_TCP_DESTROY:
 	case SKNLGRP_INET6_UDP_DESTROY:
 		if (!sock_diag_handlers[AF_INET6])
-			request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
-				       NETLINK_SOCK_DIAG, AF_INET6);
+			sock_load_diag_module(AF_INET6, 0);
 		break;
 	}
 	return 0;
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index a383f299ce24..4e5bc4b2f14e 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -53,8 +53,7 @@ static DEFINE_MUTEX(inet_diag_table_mutex);
 static const struct inet_diag_handler *inet_diag_lock_handler(int proto)
 {
 	if (!inet_diag_table[proto])
-		request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
-			       NETLINK_SOCK_DIAG, AF_INET, proto);
+		sock_load_diag_module(AF_INET, proto);
 
 	mutex_lock(&inet_diag_table_mutex);
 	if (!inet_diag_table[proto])
diff --git a/net/socket.c b/net/socket.c
index a93c99b518ca..08847c3b8c39 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2587,6 +2587,11 @@ void sock_unregister(int family)
 }
 EXPORT_SYMBOL(sock_unregister);
 
+bool sock_is_registered(int family)
+{
+	return family < NPROTO && rcu_access_pointer(net_families[family]);
+}
+
 static int __init sock_init(void)
 {
 	int err;
-- 
cgit v1.2.3-71-gd317


From c2b37f76485f073f020e60b5954b6dc4e55f693c Mon Sep 17 00:00:00 2001
From: Boris Pismenny <borisp@mellanox.com>
Date: Thu, 8 Mar 2018 15:51:41 +0200
Subject: IB/mlx5: Fix integer overflows in mlx5_ib_create_srq

This patch validates user provided input to prevent integer overflow due
to integer manipulation in the mlx5_ib_create_srq function.

Cc: syzkaller <syzkaller@googlegroups.com>
Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters")
Signed-off-by: Boris Pismenny <borisp@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/mlx5/srq.c | 15 +++++++++------
 include/linux/mlx5/driver.h      |  4 ++--
 2 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c
index 6d5fadad9090..3c7522d025f2 100644
--- a/drivers/infiniband/hw/mlx5/srq.c
+++ b/drivers/infiniband/hw/mlx5/srq.c
@@ -241,8 +241,8 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
 	struct mlx5_ib_srq *srq;
-	int desc_size;
-	int buf_size;
+	size_t desc_size;
+	size_t buf_size;
 	int err;
 	struct mlx5_srq_attr in = {0};
 	__u32 max_srq_wqes = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
@@ -266,15 +266,18 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
 
 	desc_size = sizeof(struct mlx5_wqe_srq_next_seg) +
 		    srq->msrq.max_gs * sizeof(struct mlx5_wqe_data_seg);
+	if (desc_size == 0 || srq->msrq.max_gs > desc_size)
+		return ERR_PTR(-EINVAL);
 	desc_size = roundup_pow_of_two(desc_size);
-	desc_size = max_t(int, 32, desc_size);
+	desc_size = max_t(size_t, 32, desc_size);
+	if (desc_size < sizeof(struct mlx5_wqe_srq_next_seg))
+		return ERR_PTR(-EINVAL);
 	srq->msrq.max_avail_gather = (desc_size - sizeof(struct mlx5_wqe_srq_next_seg)) /
 		sizeof(struct mlx5_wqe_data_seg);
 	srq->msrq.wqe_shift = ilog2(desc_size);
 	buf_size = srq->msrq.max * desc_size;
-	mlx5_ib_dbg(dev, "desc_size 0x%x, req wr 0x%x, srq size 0x%x, max_gs 0x%x, max_avail_gather 0x%x\n",
-		    desc_size, init_attr->attr.max_wr, srq->msrq.max, srq->msrq.max_gs,
-		    srq->msrq.max_avail_gather);
+	if (buf_size < desc_size)
+		return ERR_PTR(-EINVAL);
 	in.type = init_attr->srq_type;
 
 	if (pd->uobject)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 6ed79a8a8318..9d3a03364e6e 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -453,8 +453,8 @@ struct mlx5_core_srq {
 	struct mlx5_core_rsc_common	common; /* must be first */
 	u32		srqn;
 	int		max;
-	int		max_gs;
-	int		max_avail_gather;
+	size_t		max_gs;
+	size_t		max_avail_gather;
 	int		wqe_shift;
 	void (*event)	(struct mlx5_core_srq *, enum mlx5_event);
 
-- 
cgit v1.2.3-71-gd317


From 6417250d3f894e66a68ba1cd93676143f2376a6f Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Tue, 6 Mar 2018 19:34:42 -0800
Subject: workqueue: remove unused cancel_work()

Found this by accident.
There are no usages of bare cancel_work() in current kernel source.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h | 1 -
 kernel/workqueue.c        | 8 --------
 2 files changed, 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index bc0cda180c8b..0c3301421c57 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -456,7 +456,6 @@ extern int schedule_on_each_cpu(work_func_t func);
 int execute_in_process_context(work_func_t fn, struct execute_work *);
 
 extern bool flush_work(struct work_struct *work);
-extern bool cancel_work(struct work_struct *work);
 extern bool cancel_work_sync(struct work_struct *work);
 
 extern bool flush_delayed_work(struct delayed_work *dwork);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ccd1080dd6e7..6ec6ba65127b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3018,14 +3018,6 @@ static bool __cancel_work(struct work_struct *work, bool is_dwork)
 	return ret;
 }
 
-/*
- * See cancel_delayed_work()
- */
-bool cancel_work(struct work_struct *work)
-{
-	return __cancel_work(work, false);
-}
-
 /**
  * cancel_delayed_work - cancel a delayed work
  * @dwork: delayed_work to cancel
-- 
cgit v1.2.3-71-gd317


From 4dcb31d4649df36297296b819437709f5407059c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 14 Mar 2018 09:04:16 -0700
Subject: net: use skb_to_full_sk() in skb_update_prio()

Andrei Vagin reported a KASAN: slab-out-of-bounds error in
skb_update_prio()

Since SYNACK might be attached to a request socket, we need to
get back to the listener socket.
Since this listener is manipulated without locks, add const
qualifiers to sock_cgroup_prioidx() so that the const can also
be used in skb_update_prio()

Also add the const qualifier to sock_cgroup_classid() for consistency.

Fixes: ca6fb0651883 ("tcp: attach SYNACK messages to request sockets instead of listener")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/cgroup-defs.h |  4 ++--
 net/core/dev.c              | 22 +++++++++++++++-------
 2 files changed, 17 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 9f242b876fde..f8e76d01a5ad 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -755,13 +755,13 @@ struct sock_cgroup_data {
  * updaters and return part of the previous pointer as the prioidx or
  * classid.  Such races are short-lived and the result isn't critical.
  */
-static inline u16 sock_cgroup_prioidx(struct sock_cgroup_data *skcd)
+static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd)
 {
 	/* fallback to 1 which is always the ID of the root cgroup */
 	return (skcd->is_data & 1) ? skcd->prioidx : 1;
 }
 
-static inline u32 sock_cgroup_classid(struct sock_cgroup_data *skcd)
+static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd)
 {
 	/* fallback to 0 which is the unconfigured default classid */
 	return (skcd->is_data & 1) ? skcd->classid : 0;
diff --git a/net/core/dev.c b/net/core/dev.c
index 2cedf520cb28..12be20535714 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3278,15 +3278,23 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
 static void skb_update_prio(struct sk_buff *skb)
 {
-	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
+	const struct netprio_map *map;
+	const struct sock *sk;
+	unsigned int prioidx;
 
-	if (!skb->priority && skb->sk && map) {
-		unsigned int prioidx =
-			sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
+	if (skb->priority)
+		return;
+	map = rcu_dereference_bh(skb->dev->priomap);
+	if (!map)
+		return;
+	sk = skb_to_full_sk(skb);
+	if (!sk)
+		return;
 
-		if (prioidx < map->priomap_len)
-			skb->priority = map->priomap[prioidx];
-	}
+	prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
+
+	if (prioidx < map->priomap_len)
+		skb->priority = map->priomap[prioidx];
 }
 #else
 #define skb_update_prio(skb)
-- 
cgit v1.2.3-71-gd317


From 16ca6a607d84bef0129698d8d808f501afd08d43 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Tue, 6 Mar 2018 21:48:01 +0000
Subject: KVM: arm/arm64: vgic: Don't populate multiple LRs with the same
 vintid

The vgic code is trying to be clever when injecting GICv2 SGIs,
and will happily populate LRs with the same interrupt number if
they come from multiple vcpus (after all, they are distinct
interrupt sources).

Unfortunately, this is against the letter of the architecture,
and the GICv2 architecture spec says "Each valid interrupt stored
in the List registers must have a unique VirtualID for that
virtual CPU interface.". GICv3 has similar (although slightly
ambiguous) restrictions.

This results in guests locking up when using GICv2-on-GICv3, for
example. The obvious fix is to stop trying so hard, and inject
a single vcpu per SGI per guest entry. After all, pending SGIs
with multiple source vcpus are pretty rare, and are mostly seen
in scenario where the physical CPUs are severely overcomitted.

But as we now only inject a single instance of a multi-source SGI per
vcpu entry, we may delay those interrupts for longer than strictly
necessary, and run the risk of injecting lower priority interrupts
in the meantime.

In order to address this, we adopt a three stage strategy:
- If we encounter a multi-source SGI in the AP list while computing
  its depth, we force the list to be sorted
- When populating the LRs, we prevent the injection of any interrupt
  of lower priority than that of the first multi-source SGI we've
  injected.
- Finally, the injection of a multi-source SGI triggers the request
  of a maintenance interrupt when there will be no pending interrupt
  in the LRs (HCR_NPIE).

At the point where the last pending interrupt in the LRs switches
from Pending to Active, the maintenance interrupt will be delivered,
allowing us to add the remaining SGIs using the same process.

Cc: stable@vger.kernel.org
Fixes: 0919e84c0fc1 ("KVM: arm/arm64: vgic-new: Add IRQ sync/flush framework")
Acked-by: Christoffer Dall <cdall@kernel.org>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 include/linux/irqchip/arm-gic-v3.h |  1 +
 include/linux/irqchip/arm-gic.h    |  1 +
 virt/kvm/arm/vgic/vgic-v2.c        |  9 +++++-
 virt/kvm/arm/vgic/vgic-v3.c        |  9 +++++-
 virt/kvm/arm/vgic/vgic.c           | 61 +++++++++++++++++++++++++++++---------
 virt/kvm/arm/vgic/vgic.h           |  2 ++
 6 files changed, 67 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index c00c4c33e432..b26eccc78fb1 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -503,6 +503,7 @@
 
 #define ICH_HCR_EN			(1 << 0)
 #define ICH_HCR_UIE			(1 << 1)
+#define ICH_HCR_NPIE			(1 << 3)
 #define ICH_HCR_TC			(1 << 10)
 #define ICH_HCR_TALL0			(1 << 11)
 #define ICH_HCR_TALL1			(1 << 12)
diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h
index d3453ee072fc..68d8b1f73682 100644
--- a/include/linux/irqchip/arm-gic.h
+++ b/include/linux/irqchip/arm-gic.h
@@ -84,6 +84,7 @@
 
 #define GICH_HCR_EN			(1 << 0)
 #define GICH_HCR_UIE			(1 << 1)
+#define GICH_HCR_NPIE			(1 << 3)
 
 #define GICH_LR_VIRTUALID		(0x3ff << 0)
 #define GICH_LR_PHYSID_CPUID_SHIFT	(10)
diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c
index e9d840a75e7b..29556f71b691 100644
--- a/virt/kvm/arm/vgic/vgic-v2.c
+++ b/virt/kvm/arm/vgic/vgic-v2.c
@@ -37,6 +37,13 @@ void vgic_v2_init_lrs(void)
 		vgic_v2_write_lr(i, 0);
 }
 
+void vgic_v2_set_npie(struct kvm_vcpu *vcpu)
+{
+	struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2;
+
+	cpuif->vgic_hcr |= GICH_HCR_NPIE;
+}
+
 void vgic_v2_set_underflow(struct kvm_vcpu *vcpu)
 {
 	struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2;
@@ -64,7 +71,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
 	int lr;
 	unsigned long flags;
 
-	cpuif->vgic_hcr &= ~GICH_HCR_UIE;
+	cpuif->vgic_hcr &= ~(GICH_HCR_UIE | GICH_HCR_NPIE);
 
 	for (lr = 0; lr < vgic_cpu->used_lrs; lr++) {
 		u32 val = cpuif->vgic_lr[lr];
diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c
index 6b329414e57a..0ff2006f3781 100644
--- a/virt/kvm/arm/vgic/vgic-v3.c
+++ b/virt/kvm/arm/vgic/vgic-v3.c
@@ -26,6 +26,13 @@ static bool group1_trap;
 static bool common_trap;
 static bool gicv4_enable;
 
+void vgic_v3_set_npie(struct kvm_vcpu *vcpu)
+{
+	struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3;
+
+	cpuif->vgic_hcr |= ICH_HCR_NPIE;
+}
+
 void vgic_v3_set_underflow(struct kvm_vcpu *vcpu)
 {
 	struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3;
@@ -47,7 +54,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
 	int lr;
 	unsigned long flags;
 
-	cpuif->vgic_hcr &= ~ICH_HCR_UIE;
+	cpuif->vgic_hcr &= ~(ICH_HCR_UIE | ICH_HCR_NPIE);
 
 	for (lr = 0; lr < vgic_cpu->used_lrs; lr++) {
 		u64 val = cpuif->vgic_lr[lr];
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
index 0001858a2c23..8201899126f6 100644
--- a/virt/kvm/arm/vgic/vgic.c
+++ b/virt/kvm/arm/vgic/vgic.c
@@ -710,22 +710,37 @@ static inline void vgic_set_underflow(struct kvm_vcpu *vcpu)
 		vgic_v3_set_underflow(vcpu);
 }
 
+static inline void vgic_set_npie(struct kvm_vcpu *vcpu)
+{
+	if (kvm_vgic_global_state.type == VGIC_V2)
+		vgic_v2_set_npie(vcpu);
+	else
+		vgic_v3_set_npie(vcpu);
+}
+
 /* Requires the ap_list_lock to be held. */
-static int compute_ap_list_depth(struct kvm_vcpu *vcpu)
+static int compute_ap_list_depth(struct kvm_vcpu *vcpu,
+				 bool *multi_sgi)
 {
 	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
 	struct vgic_irq *irq;
 	int count = 0;
 
+	*multi_sgi = false;
+
 	DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&vgic_cpu->ap_list_lock));
 
 	list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
 		spin_lock(&irq->irq_lock);
 		/* GICv2 SGIs can count for more than one... */
-		if (vgic_irq_is_sgi(irq->intid) && irq->source)
-			count += hweight8(irq->source);
-		else
+		if (vgic_irq_is_sgi(irq->intid) && irq->source) {
+			int w = hweight8(irq->source);
+
+			count += w;
+			*multi_sgi |= (w > 1);
+		} else {
 			count++;
+		}
 		spin_unlock(&irq->irq_lock);
 	}
 	return count;
@@ -736,28 +751,43 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu)
 {
 	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
 	struct vgic_irq *irq;
-	int count = 0;
+	int count;
+	bool npie = false;
+	bool multi_sgi;
+	u8 prio = 0xff;
 
 	DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&vgic_cpu->ap_list_lock));
 
-	if (compute_ap_list_depth(vcpu) > kvm_vgic_global_state.nr_lr)
+	count = compute_ap_list_depth(vcpu, &multi_sgi);
+	if (count > kvm_vgic_global_state.nr_lr || multi_sgi)
 		vgic_sort_ap_list(vcpu);
 
+	count = 0;
+
 	list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
 		spin_lock(&irq->irq_lock);
 
-		if (unlikely(vgic_target_oracle(irq) != vcpu))
-			goto next;
-
 		/*
-		 * If we get an SGI with multiple sources, try to get
-		 * them in all at once.
+		 * If we have multi-SGIs in the pipeline, we need to
+		 * guarantee that they are all seen before any IRQ of
+		 * lower priority. In that case, we need to filter out
+		 * these interrupts by exiting early. This is easy as
+		 * the AP list has been sorted already.
 		 */
-		do {
+		if (multi_sgi && irq->priority > prio) {
+			spin_unlock(&irq->irq_lock);
+			break;
+		}
+
+		if (likely(vgic_target_oracle(irq) == vcpu)) {
 			vgic_populate_lr(vcpu, irq, count++);
-		} while (irq->source && count < kvm_vgic_global_state.nr_lr);
 
-next:
+			if (irq->source) {
+				npie = true;
+				prio = irq->priority;
+			}
+		}
+
 		spin_unlock(&irq->irq_lock);
 
 		if (count == kvm_vgic_global_state.nr_lr) {
@@ -768,6 +798,9 @@ next:
 		}
 	}
 
+	if (npie)
+		vgic_set_npie(vcpu);
+
 	vcpu->arch.vgic_cpu.used_lrs = count;
 
 	/* Nuke remaining LRs */
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
index 5b11859a1a1e..f5b8519e5546 100644
--- a/virt/kvm/arm/vgic/vgic.h
+++ b/virt/kvm/arm/vgic/vgic.h
@@ -160,6 +160,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu);
 void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
 void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr);
 void vgic_v2_set_underflow(struct kvm_vcpu *vcpu);
+void vgic_v2_set_npie(struct kvm_vcpu *vcpu);
 int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr);
 int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
 			 int offset, u32 *val);
@@ -189,6 +190,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu);
 void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
 void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr);
 void vgic_v3_set_underflow(struct kvm_vcpu *vcpu);
+void vgic_v3_set_npie(struct kvm_vcpu *vcpu);
 void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
 void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
 void vgic_v3_enable(struct kvm_vcpu *vcpu);
-- 
cgit v1.2.3-71-gd317


From 95dd77580ccd66a0da96e6d4696945b8cea39431 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 14 Mar 2018 18:20:29 -0500
Subject: fs: Teach path_connected to handle nfs filesystems with multiple
 roots.

On nfsv2 and nfsv3 the nfs server can export subsets of the same
filesystem and report the same filesystem identifier, so that the nfs
client can know they are the same filesystem.  The subsets can be from
disjoint directory trees.  The nfsv2 and nfsv3 filesystems provides no
way to find the common root of all directory trees exported form the
server with the same filesystem identifier.

The practical result is that in struct super s_root for nfs s_root is
not necessarily the root of the filesystem.  The nfs mount code sets
s_root to the root of the first subset of the nfs filesystem that the
kernel mounts.

This effects the dcache invalidation code in generic_shutdown_super
currently called shrunk_dcache_for_umount and that code for years
has gone through an additional list of dentries that might be dentry
trees that need to be freed to accomodate nfs.

When I wrote path_connected I did not realize nfs was so special, and
it's hueristic for avoiding calling is_subdir can fail.

The practical case where this fails is when there is a move of a
directory from the subtree exposed by one nfs mount to the subtree
exposed by another nfs mount.  This move can happen either locally or
remotely.  With the remote case requiring that the move directory be cached
before the move and that after the move someone walks the path
to where the move directory now exists and in so doing causes the
already cached directory to be moved in the dcache through the magic
of d_splice_alias.

If someone whose working directory is in the move directory or a
subdirectory and now starts calling .. from the initial mount of nfs
(where s_root == mnt_root), then path_connected as a heuristic will
not bother with the is_subdir check.  As s_root really is not the root
of the nfs filesystem this heuristic is wrong, and the path may
actually not be connected and path_connected can fail.

The is_subdir function might be cheap enough that we can call it
unconditionally.  Verifying that will take some benchmarking and
the result may not be the same on all kernels this fix needs
to be backported to.  So I am avoiding that for now.

Filesystems with snapshots such as nilfs and btrfs do something
similar.  But as the directory tree of the snapshots are disjoint
from one another and from the main directory tree rename won't move
things between them and this problem will not occur.

Cc: stable@vger.kernel.org
Reported-by: Al Viro <viro@ZenIV.linux.org.uk>
Fixes: 397d425dc26d ("vfs: Test for and handle paths that are unreachable from their mnt_root")
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c         | 5 +++--
 fs/nfs/super.c     | 2 ++
 include/linux/fs.h | 1 +
 3 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index 921ae32dbc80..cafa365eeb70 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -559,9 +559,10 @@ static int __nd_alloc_stack(struct nameidata *nd)
 static bool path_connected(const struct path *path)
 {
 	struct vfsmount *mnt = path->mnt;
+	struct super_block *sb = mnt->mnt_sb;
 
-	/* Only bind mounts can have disconnected paths */
-	if (mnt->mnt_root == mnt->mnt_sb->s_root)
+	/* Bind mounts and multi-root filesystems can have disconnected paths */
+	if (!(sb->s_iflags & SB_I_MULTIROOT) && (mnt->mnt_root == sb->s_root))
 		return true;
 
 	return is_subdir(path->dentry, mnt->mnt_root);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 29bacdc56f6a..5e470e233c83 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2631,6 +2631,8 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,
 		/* initial superblock/root creation */
 		mount_info->fill_super(s, mount_info);
 		nfs_get_cache_cookie(s, mount_info->parsed, mount_info->cloned);
+		if (!(server->flags & NFS_MOUNT_UNSHARED))
+			s->s_iflags |= SB_I_MULTIROOT;
 	}
 
 	mntroot = nfs_get_root(s, mount_info->mntfh, dev_name);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2a815560fda0..0430e03febaa 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1317,6 +1317,7 @@ extern int send_sigurg(struct fown_struct *fown);
 #define SB_I_CGROUPWB	0x00000001	/* cgroup-aware writeback enabled */
 #define SB_I_NOEXEC	0x00000002	/* Ignore executables on this fs */
 #define SB_I_NODEV	0x00000004	/* Ignore devices on this fs */
+#define SB_I_MULTIROOT	0x00000008	/* Multiple roots to the dentry tree */
 
 /* sb->s_iflags to limit user namespace mounts */
 #define SB_I_USERNS_VISIBLE		0x00000010 /* fstype already mounted */
-- 
cgit v1.2.3-71-gd317


From cbe7128c4b92e2004984f477fd38dfa81662f02e Mon Sep 17 00:00:00 2001
From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Date: Tue, 13 Mar 2018 14:51:28 +0900
Subject: vlan: Fix out of order vlan headers with reorder header off

With reorder header off, received packets are untagged in skb_vlan_untag()
called from within __netif_receive_skb_core(), and later the tag will be
inserted back in vlan_do_receive().

This caused out of order vlan headers when we create a vlan device on top
of another vlan device, because vlan_do_receive() inserts a tag as the
outermost vlan tag. E.g. the outer tag is first removed in skb_vlan_untag()
and inserted back in vlan_do_receive(), then the inner tag is next removed
and inserted back as the outermost tag.

This patch fixes the behaviour by inserting the inner tag at the right
position.

Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_vlan.h | 66 ++++++++++++++++++++++++++++++++++++++++---------
 net/8021q/vlan_core.c   |  4 +--
 2 files changed, 57 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 5e6a2d4dc366..c4a1cff9c768 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -300,30 +300,34 @@ static inline bool vlan_hw_offload_capable(netdev_features_t features,
 }
 
 /**
- * __vlan_insert_tag - regular VLAN tag inserting
+ * __vlan_insert_inner_tag - inner VLAN tag inserting
  * @skb: skbuff to tag
  * @vlan_proto: VLAN encapsulation protocol
  * @vlan_tci: VLAN TCI to insert
+ * @mac_len: MAC header length including outer vlan headers
  *
- * Inserts the VLAN tag into @skb as part of the payload
+ * Inserts the VLAN tag into @skb as part of the payload at offset mac_len
  * Returns error if skb_cow_head failes.
  *
  * Does not change skb->protocol so this function can be used during receive.
  */
-static inline int __vlan_insert_tag(struct sk_buff *skb,
-				    __be16 vlan_proto, u16 vlan_tci)
+static inline int __vlan_insert_inner_tag(struct sk_buff *skb,
+					  __be16 vlan_proto, u16 vlan_tci,
+					  unsigned int mac_len)
 {
 	struct vlan_ethhdr *veth;
 
 	if (skb_cow_head(skb, VLAN_HLEN) < 0)
 		return -ENOMEM;
 
-	veth = skb_push(skb, VLAN_HLEN);
+	skb_push(skb, VLAN_HLEN);
 
-	/* Move the mac addresses to the beginning of the new header. */
-	memmove(skb->data, skb->data + VLAN_HLEN, 2 * ETH_ALEN);
+	/* Move the mac header sans proto to the beginning of the new header. */
+	memmove(skb->data, skb->data + VLAN_HLEN, mac_len - ETH_TLEN);
 	skb->mac_header -= VLAN_HLEN;
 
+	veth = (struct vlan_ethhdr *)(skb->data + mac_len - ETH_HLEN);
+
 	/* first, the ethernet type */
 	veth->h_vlan_proto = vlan_proto;
 
@@ -334,12 +338,30 @@ static inline int __vlan_insert_tag(struct sk_buff *skb,
 }
 
 /**
- * vlan_insert_tag - regular VLAN tag inserting
+ * __vlan_insert_tag - regular VLAN tag inserting
  * @skb: skbuff to tag
  * @vlan_proto: VLAN encapsulation protocol
  * @vlan_tci: VLAN TCI to insert
  *
  * Inserts the VLAN tag into @skb as part of the payload
+ * Returns error if skb_cow_head failes.
+ *
+ * Does not change skb->protocol so this function can be used during receive.
+ */
+static inline int __vlan_insert_tag(struct sk_buff *skb,
+				    __be16 vlan_proto, u16 vlan_tci)
+{
+	return __vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, ETH_HLEN);
+}
+
+/**
+ * vlan_insert_inner_tag - inner VLAN tag inserting
+ * @skb: skbuff to tag
+ * @vlan_proto: VLAN encapsulation protocol
+ * @vlan_tci: VLAN TCI to insert
+ * @mac_len: MAC header length including outer vlan headers
+ *
+ * Inserts the VLAN tag into @skb as part of the payload at offset mac_len
  * Returns a VLAN tagged skb. If a new skb is created, @skb is freed.
  *
  * Following the skb_unshare() example, in case of error, the calling function
@@ -347,12 +369,14 @@ static inline int __vlan_insert_tag(struct sk_buff *skb,
  *
  * Does not change skb->protocol so this function can be used during receive.
  */
-static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb,
-					      __be16 vlan_proto, u16 vlan_tci)
+static inline struct sk_buff *vlan_insert_inner_tag(struct sk_buff *skb,
+						    __be16 vlan_proto,
+						    u16 vlan_tci,
+						    unsigned int mac_len)
 {
 	int err;
 
-	err = __vlan_insert_tag(skb, vlan_proto, vlan_tci);
+	err = __vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, mac_len);
 	if (err) {
 		dev_kfree_skb_any(skb);
 		return NULL;
@@ -360,6 +384,26 @@ static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb,
 	return skb;
 }
 
+/**
+ * vlan_insert_tag - regular VLAN tag inserting
+ * @skb: skbuff to tag
+ * @vlan_proto: VLAN encapsulation protocol
+ * @vlan_tci: VLAN TCI to insert
+ *
+ * Inserts the VLAN tag into @skb as part of the payload
+ * Returns a VLAN tagged skb. If a new skb is created, @skb is freed.
+ *
+ * Following the skb_unshare() example, in case of error, the calling function
+ * doesn't have to worry about freeing the original skb.
+ *
+ * Does not change skb->protocol so this function can be used during receive.
+ */
+static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb,
+					      __be16 vlan_proto, u16 vlan_tci)
+{
+	return vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, ETH_HLEN);
+}
+
 /**
  * vlan_insert_tag_set_proto - regular VLAN tag inserting
  * @skb: skbuff to tag
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 64aa9f755e1d..45c9bf5ff3a0 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -48,8 +48,8 @@ bool vlan_do_receive(struct sk_buff **skbp)
 		 * original position later
 		 */
 		skb_push(skb, offset);
-		skb = *skbp = vlan_insert_tag(skb, skb->vlan_proto,
-					      skb->vlan_tci);
+		skb = *skbp = vlan_insert_inner_tag(skb, skb->vlan_proto,
+						    skb->vlan_tci, skb->mac_len);
 		if (!skb)
 			return false;
 		skb_pull(skb, offset + VLAN_HLEN);
-- 
cgit v1.2.3-71-gd317


From b3a5d111994450909158929560906f2c1c6c1d85 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 14 Mar 2018 12:45:12 -0700
Subject: percpu_ref: Update doc to dissuade users from depending on internal
 RCU grace periods

percpu_ref internally uses sched-RCU to implement the percpu -> atomic
mode switching and the documentation suggested that this could be
depended upon.  This doesn't seem like a good idea.

* percpu_ref uses sched-RCU which has different grace periods regular
  RCU.  Users may combine percpu_ref with regular RCU usage and
  incorrectly believe that regular RCU grace periods are performed by
  percpu_ref.  This can lead to, for example, use-after-free due to
  premature freeing.

* percpu_ref has a grace period when switching from percpu to atomic
  mode.  It doesn't have one between the last put and release.  This
  distinction is subtle and can lead to surprising bugs.

* percpu_ref allows starting in and switching to atomic mode manually
  for debugging and other purposes.  This means that there may not be
  any grace periods from kill to release.

This patch makes it clear that the grace periods are percpu_ref's
internal implementation detail and can't be depended upon by the
users.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/percpu-refcount.h | 18 ++++++++++++------
 lib/percpu-refcount.c           |  2 ++
 2 files changed, 14 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index 864d167a1073..009cdf3d65b6 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -30,10 +30,14 @@
  * calls io_destroy() or the process exits.
  *
  * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it
- * calls percpu_ref_kill(), then hlist_del_rcu() and synchronize_rcu() to remove
- * the kioctx from the proccess's list of kioctxs - after that, there can't be
- * any new users of the kioctx (from lookup_ioctx()) and it's then safe to drop
- * the initial ref with percpu_ref_put().
+ * removes the kioctx from the proccess's table of kioctxs and kills percpu_ref.
+ * After that, there can't be any new users of the kioctx (from lookup_ioctx())
+ * and it's then safe to drop the initial ref with percpu_ref_put().
+ *
+ * Note that the free path, free_ioctx(), needs to go through explicit call_rcu()
+ * to synchronize with RCU protected lookup_ioctx().  percpu_ref operations don't
+ * imply RCU grace periods of any kind and if a user wants to combine percpu_ref
+ * with RCU protection, it must be done explicitly.
  *
  * Code that does a two stage shutdown like this often needs some kind of
  * explicit synchronization to ensure the initial refcount can only be dropped
@@ -113,8 +117,10 @@ void percpu_ref_reinit(struct percpu_ref *ref);
  * Must be used to drop the initial ref on a percpu refcount; must be called
  * precisely once before shutdown.
  *
- * Puts @ref in non percpu mode, then does a call_rcu() before gathering up the
- * percpu counters and dropping the initial ref.
+ * Switches @ref into atomic mode before gathering up the percpu counters
+ * and dropping the initial ref.
+ *
+ * There are no implied RCU grace periods between kill and release.
  */
 static inline void percpu_ref_kill(struct percpu_ref *ref)
 {
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index 30e7dd88148b..9f96fa7bc000 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -322,6 +322,8 @@ EXPORT_SYMBOL_GPL(percpu_ref_switch_to_percpu);
  * This function normally doesn't block and can be called from any context
  * but it may block if @confirm_kill is specified and @ref is in the
  * process of switching to atomic mode by percpu_ref_switch_to_atomic().
+ *
+ * There are no implied RCU grace periods between kill and release.
  */
 void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
 				 percpu_ref_func_t *confirm_kill)
-- 
cgit v1.2.3-71-gd317


From 578ae447e7e5d78c90ac40a06406c1741f79ba96 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 19 Mar 2018 13:18:57 -0500
Subject: jump_label: Disable jump labels in __exit code

With the following commit:

  333522447063 ("jump_label: Explicitly disable jump labels in __init code")

... we explicitly disabled jump labels in __init code, so they could be
detected and not warned about in the following commit:

  dc1dd184c2f0 ("jump_label: Warn on failed jump_label patching attempt")

In-kernel __exit code has the same issue.  It's never used, so it's
freed along with the rest of initmem.  But jump label entries in __exit
code aren't explicitly disabled, so we get the following warning when
enabling pr_debug() in __exit code:

  can't patch jump_label at dmi_sysfs_exit+0x0/0x2d
  WARNING: CPU: 0 PID: 22572 at kernel/jump_label.c:376 __jump_label_update+0x9d/0xb0

Fix the warning by disabling all jump labels in initmem (which includes
both __init and __exit code).

Reported-and-tested-by: Li Wang <liwang@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Jason Baron <jbaron@akamai.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: dc1dd184c2f0 ("jump_label: Warn on failed jump_label patching attempt")
Link: http://lkml.kernel.org/r/7121e6e595374f06616c505b6e690e275c0054d1.1521483452.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/jump_label.h | 4 ++--
 init/main.c                | 2 +-
 kernel/jump_label.c        | 7 ++++---
 3 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index 2168cc6b8b30..b46b541c67c4 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -151,7 +151,7 @@ extern struct jump_entry __start___jump_table[];
 extern struct jump_entry __stop___jump_table[];
 
 extern void jump_label_init(void);
-extern void jump_label_invalidate_init(void);
+extern void jump_label_invalidate_initmem(void);
 extern void jump_label_lock(void);
 extern void jump_label_unlock(void);
 extern void arch_jump_label_transform(struct jump_entry *entry,
@@ -199,7 +199,7 @@ static __always_inline void jump_label_init(void)
 	static_key_initialized = true;
 }
 
-static inline void jump_label_invalidate_init(void) {}
+static inline void jump_label_invalidate_initmem(void) {}
 
 static __always_inline bool static_key_false(struct static_key *key)
 {
diff --git a/init/main.c b/init/main.c
index 969eaf140ef0..21efbf6ace93 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1001,7 +1001,7 @@ static int __ref kernel_init(void *unused)
 	/* need to finish all async __init code before freeing the memory */
 	async_synchronize_full();
 	ftrace_free_init_mem();
-	jump_label_invalidate_init();
+	jump_label_invalidate_initmem();
 	free_initmem();
 	mark_readonly();
 	system_state = SYSTEM_RUNNING;
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index e7214093dcd1..01ebdf1f9f40 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -16,6 +16,7 @@
 #include <linux/jump_label_ratelimit.h>
 #include <linux/bug.h>
 #include <linux/cpu.h>
+#include <asm/sections.h>
 
 #ifdef HAVE_JUMP_LABEL
 
@@ -421,15 +422,15 @@ void __init jump_label_init(void)
 	cpus_read_unlock();
 }
 
-/* Disable any jump label entries in __init code */
-void __init jump_label_invalidate_init(void)
+/* Disable any jump label entries in __init/__exit code */
+void __init jump_label_invalidate_initmem(void)
 {
 	struct jump_entry *iter_start = __start___jump_table;
 	struct jump_entry *iter_stop = __stop___jump_table;
 	struct jump_entry *iter;
 
 	for (iter = iter_start; iter < iter_stop; iter++) {
-		if (init_kernel_text(iter->code))
+		if (init_section_contains((void *)(unsigned long)iter->code, 1))
 			iter->code = 0;
 	}
 }
-- 
cgit v1.2.3-71-gd317


From 6b00c35138b404be98b85f4a703be594cbed501c Mon Sep 17 00:00:00 2001
From: Jagdish Gediya <jagdish.gediya@nxp.com>
Date: Thu, 22 Mar 2018 01:08:10 +0530
Subject: mtd: nand: fsl_ifc: Read ECCSTAT0 and ECCSTAT1 registers for IFC 2.0

Due to missing information in Hardware manual, current
implementation doesn't read ECCSTAT0 and ECCSTAT1 registers
for IFC 2.0.

Add support to read ECCSTAT0 and ECCSTAT1 registers during
ecccheck for IFC 2.0.

Fixes: 656441478ed5 ("mtd: nand: ifc: Fix location of eccstat registers for IFC V1.0")
Cc: stable@vger.kernel.org # v3.18+
Signed-off-by: Jagdish Gediya <jagdish.gediya@nxp.com>
Reviewed-by: Prabhakar Kushwaha <prabhakar.kushwaha@nxp.com>
Signed-off-by: Boris Brezillon <boris.brezillon@bootlin.com>
---
 drivers/mtd/nand/fsl_ifc_nand.c | 6 +-----
 include/linux/fsl_ifc.h         | 6 +-----
 2 files changed, 2 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/fsl_ifc_nand.c b/drivers/mtd/nand/fsl_ifc_nand.c
index fae01b04abc7..5a9c2f0020c2 100644
--- a/drivers/mtd/nand/fsl_ifc_nand.c
+++ b/drivers/mtd/nand/fsl_ifc_nand.c
@@ -227,11 +227,7 @@ static void fsl_ifc_run_command(struct mtd_info *mtd)
 		int sector_end = sector_start + chip->ecc.steps - 1;
 		__be32 *eccstat_regs;
 
-		if (ctrl->version >= FSL_IFC_VERSION_2_0_0)
-			eccstat_regs = ifc->ifc_nand.v2_nand_eccstat;
-		else
-			eccstat_regs = ifc->ifc_nand.v1_nand_eccstat;
-
+		eccstat_regs = ifc->ifc_nand.nand_eccstat;
 		eccstat = ifc_in32(&eccstat_regs[sector_start / 4]);
 
 		for (i = sector_start; i <= sector_end; i++) {
diff --git a/include/linux/fsl_ifc.h b/include/linux/fsl_ifc.h
index c332f0a45607..3fdfede2f0f3 100644
--- a/include/linux/fsl_ifc.h
+++ b/include/linux/fsl_ifc.h
@@ -734,11 +734,7 @@ struct fsl_ifc_nand {
 	u32 res19[0x10];
 	__be32 nand_fsr;
 	u32 res20;
-	/* The V1 nand_eccstat is actually 4 words that overlaps the
-	 * V2 nand_eccstat.
-	 */
-	__be32 v1_nand_eccstat[2];
-	__be32 v2_nand_eccstat[6];
+	__be32 nand_eccstat[8];
 	u32 res21[0x1c];
 	__be32 nanndcr;
 	u32 res22[0x2];
-- 
cgit v1.2.3-71-gd317


From 5df7af85ecd88e8b5f1f31d6456c3cf38a8bbdda Mon Sep 17 00:00:00 2001
From: Kevin Hao <haokexin@gmail.com>
Date: Tue, 20 Mar 2018 09:44:52 +0800
Subject: net: phy: Add general dummy stubs for MMD register access

For some phy devices, even though they don't support the MMD extended
register access, it does have some side effect if we are trying to
read/write the MMD registers via indirect method. So introduce general
dummy stubs for MMD register access which these devices can use to avoid
such side effect.

Fixes: b6b5e8a69118 ("gianfar: Disable EEE autoneg by default")
Signed-off-by: Kevin Hao <haokexin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy_device.c | 17 +++++++++++++++++
 include/linux/phy.h          |  4 ++++
 2 files changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index fe16f5894092..74664a6c0cdc 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1673,6 +1673,23 @@ int genphy_config_init(struct phy_device *phydev)
 }
 EXPORT_SYMBOL(genphy_config_init);
 
+/* This is used for the phy device which doesn't support the MMD extended
+ * register access, but it does have side effect when we are trying to access
+ * the MMD register via indirect method.
+ */
+int genphy_read_mmd_unsupported(struct phy_device *phdev, int devad, u16 regnum)
+{
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(genphy_read_mmd_unsupported);
+
+int genphy_write_mmd_unsupported(struct phy_device *phdev, int devnum,
+				 u16 regnum, u16 val)
+{
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(genphy_write_mmd_unsupported);
+
 int genphy_suspend(struct phy_device *phydev)
 {
 	return phy_set_bits(phydev, MII_BMCR, BMCR_PDOWN);
diff --git a/include/linux/phy.h b/include/linux/phy.h
index b260fb336b25..7c4c2379e010 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -984,6 +984,10 @@ static inline int genphy_no_soft_reset(struct phy_device *phydev)
 {
 	return 0;
 }
+int genphy_read_mmd_unsupported(struct phy_device *phdev, int devad,
+				u16 regnum);
+int genphy_write_mmd_unsupported(struct phy_device *phdev, int devnum,
+				 u16 regnum, u16 val);
 
 /* Clause 45 PHY */
 int genphy_c45_restart_aneg(struct phy_device *phydev);
-- 
cgit v1.2.3-71-gd317


From f59f1caf72ba00d519c793c3deb32cd3be32edc2 Mon Sep 17 00:00:00 2001
From: Daniel Vacek <neelx@redhat.com>
Date: Thu, 22 Mar 2018 16:17:38 -0700
Subject: Revert "mm: page_alloc: skip over regions of invalid pfns where
 possible"

This reverts commit b92df1de5d28 ("mm: page_alloc: skip over regions of
invalid pfns where possible").  The commit is meant to be a boot init
speed up skipping the loop in memmap_init_zone() for invalid pfns.

But given some specific memory mapping on x86_64 (or more generally
theoretically anywhere but on arm with CONFIG_HAVE_ARCH_PFN_VALID) the
implementation also skips valid pfns which is plain wrong and causes
'kernel BUG at mm/page_alloc.c:1389!'

  crash> log | grep -e BUG -e RIP -e Call.Trace -e move_freepages_block -e rmqueue -e freelist -A1
  kernel BUG at mm/page_alloc.c:1389!
  invalid opcode: 0000 [#1] SMP
  --
  RIP: 0010: move_freepages+0x15e/0x160
  --
  Call Trace:
    move_freepages_block+0x73/0x80
    __rmqueue+0x263/0x460
    get_page_from_freelist+0x7e1/0x9e0
    __alloc_pages_nodemask+0x176/0x420
  --

  crash> page_init_bug -v | grep RAM
  <struct resource 0xffff88067fffd2f8>          1000 -        9bfff       System RAM (620.00 KiB)
  <struct resource 0xffff88067fffd3a0>        100000 -     430bffff       System RAM (  1.05 GiB = 1071.75 MiB = 1097472.00 KiB)
  <struct resource 0xffff88067fffd410>      4b0c8000 -     4bf9cfff       System RAM ( 14.83 MiB = 15188.00 KiB)
  <struct resource 0xffff88067fffd480>      4bfac000 -     646b1fff       System RAM (391.02 MiB = 400408.00 KiB)
  <struct resource 0xffff88067fffd560>      7b788000 -     7b7fffff       System RAM (480.00 KiB)
  <struct resource 0xffff88067fffd640>     100000000 -    67fffffff       System RAM ( 22.00 GiB)

  crash> page_init_bug | head -6
  <struct resource 0xffff88067fffd560>      7b788000 -     7b7fffff       System RAM (480.00 KiB)
  <struct page 0xffffea0001ede200>   1fffff00000000  0 <struct pglist_data 0xffff88047ffd9000> 1 <struct zone 0xffff88047ffd9800> DMA32          4096    1048575
  <struct page 0xffffea0001ede200>       505736 505344 <struct page 0xffffea0001ed8000> 505855 <struct page 0xffffea0001edffc0>
  <struct page 0xffffea0001ed8000>                0  0 <struct pglist_data 0xffff88047ffd9000> 0 <struct zone 0xffff88047ffd9000> DMA               1       4095
  <struct page 0xffffea0001edffc0>   1fffff00000400  0 <struct pglist_data 0xffff88047ffd9000> 1 <struct zone 0xffff88047ffd9800> DMA32          4096    1048575
  BUG, zones differ!

  crash> kmem -p 77fff000 78000000 7b5ff000 7b600000 7b787000 7b788000
        PAGE        PHYSICAL      MAPPING       INDEX CNT FLAGS
  ffffea0001e00000  78000000                0        0  0 0
  ffffea0001ed7fc0  7b5ff000                0        0  0 0
  ffffea0001ed8000  7b600000                0        0  0 0       <<<<
  ffffea0001ede1c0  7b787000                0        0  0 0
  ffffea0001ede200  7b788000                0        0  1 1fffff00000000

Link: http://lkml.kernel.org/r/20180316143855.29838-1-neelx@redhat.com
Fixes: b92df1de5d28 ("mm: page_alloc: skip over regions of invalid pfns where possible")
Signed-off-by: Daniel Vacek <neelx@redhat.com>
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Pavel Tatashin <pasha.tatashin@oracle.com>
Cc: Paul Burton <paul.burton@imgtec.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memblock.h |  1 -
 mm/memblock.c            | 28 ----------------------------
 mm/page_alloc.c          | 11 +----------
 3 files changed, 1 insertion(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 8be5077efb5f..f92ea7783652 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -187,7 +187,6 @@ int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
 			    unsigned long  *end_pfn);
 void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
 			  unsigned long *out_end_pfn, int *out_nid);
-unsigned long memblock_next_valid_pfn(unsigned long pfn, unsigned long max_pfn);
 
 /**
  * for_each_mem_pfn_range - early memory pfn range iterator
diff --git a/mm/memblock.c b/mm/memblock.c
index b6ba6b7adadc..48376bd33274 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1101,34 +1101,6 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
 		*out_nid = r->nid;
 }
 
-unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn,
-						      unsigned long max_pfn)
-{
-	struct memblock_type *type = &memblock.memory;
-	unsigned int right = type->cnt;
-	unsigned int mid, left = 0;
-	phys_addr_t addr = PFN_PHYS(++pfn);
-
-	do {
-		mid = (right + left) / 2;
-
-		if (addr < type->regions[mid].base)
-			right = mid;
-		else if (addr >= (type->regions[mid].base +
-				  type->regions[mid].size))
-			left = mid + 1;
-		else {
-			/* addr is within the region, so pfn is valid */
-			return pfn;
-		}
-	} while (left < right);
-
-	if (right == type->cnt)
-		return -1UL;
-	else
-		return PHYS_PFN(type->regions[right].base);
-}
-
 /**
  * memblock_set_node - set node ID on memblock regions
  * @base: base of area to set node ID for
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 010dee0f089e..1741dd23e7c1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5356,17 +5356,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		if (context != MEMMAP_EARLY)
 			goto not_early;
 
-		if (!early_pfn_valid(pfn)) {
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-			/*
-			 * Skip to the pfn preceding the next valid one (or
-			 * end_pfn), such that we hit a valid pfn (or end_pfn)
-			 * on our next iteration of the loop.
-			 */
-			pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1;
-#endif
+		if (!early_pfn_valid(pfn))
 			continue;
-		}
 		if (!early_pfn_in_nid(pfn, nid))
 			continue;
 		if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
-- 
cgit v1.2.3-71-gd317