From 26594678d00f94c62f2e43162bd6d10fd0b74917 Mon Sep 17 00:00:00 2001
From: Leandro Ribeiro
Date: Wed, 9 Jun 2021 20:00:38 -0300
Subject: drm/doc: document how userspace should find out CRTC index

In this patch we add a section to document what userspace should do to
find out the CRTC index. This is important as there may be many places
in the documentation that need this, so it's better to just point to
this section and avoid repetition.

Signed-off-by: Leandro Ribeiro
Reviewed-by: Pekka Paalanen
Signed-off-by: Daniel Vetter
Link: https://patchwork.freedesktop.org/patch/msgid/20210609230039.73307-2-leandro.ribeiro@collabora.com
---
 include/uapi/drm/drm.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/drm/drm.h b/include/uapi/drm/drm.h
index d043752a74cf..e1f49dd241f7 100644
--- a/include/uapi/drm/drm.h
+++ b/include/uapi/drm/drm.h
@@ -635,8 +635,8 @@ struct drm_gem_open {
 /**
  * DRM_CAP_VBLANK_HIGH_CRTC
  *
- * If set to 1, the kernel supports specifying a CRTC index in the high bits of
- * &drm_wait_vblank_request.type.
+ * If set to 1, the kernel supports specifying a :ref:`CRTC index`
+ * in the high bits of &drm_wait_vblank_request.type.
  *
  * Starting kernel version 2.6.39, this capability is always set to 1.
  */
--
cgit v1.2.3-71-gd317


From 7a7a933edd6c3a6d5d64e08093f2d564104cefcd Mon Sep 17 00:00:00 2001
From: Martin Krastev
Date: Wed, 9 Jun 2021 13:23:00 -0400
Subject: drm/vmwgfx: Introduce VMware mks-guest-stats

The VMware mks-guest-stats mechanism allows the collection of
performance stats from guest userland GL contexts, as well as from
vmwgfx kernelspace, via a set of sw-defined performance counters. The
userspace performance counters are (de)registered with the
vmware-vmx-stats hypervisor via new ioctls. The vmwgfx kernelspace
counters are controlled at build-time via a new config
DRM_VMWGFX_MKSSTATS.

* Add vmw_mksstat_{add|remove|reset}_ioctl controlling the tracking of
  mks-guest-stats in guest winsys contexts
* Add DRM_VMWGFX_MKSSTATS config to drivers/gpu/drm/vmwgfx/Kconfig
  controlling the instrumentation of vmwgfx for kernelspace
  mks-guest-stats counters
* Instrument vmwgfx vmw_execbuf_ioctl to collect mks-guest-stats
  according to DRM_VMWGFX_MKSSTATS

Signed-off-by: Martin Krastev
Reviewed-by: Zack Rusin
Signed-off-by: Zack Rusin
Link: https://patchwork.freedesktop.org/patch/msgid/20210609172307.131929-3-zackr@vmware.com
---
 drivers/gpu/drm/vmwgfx/Kconfig | 7 +
 drivers/gpu/drm/vmwgfx/device_include/svga_types.h | 92 +++-
 .../gpu/drm/vmwgfx/device_include/vm_basic_types.h | 22 -
 drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 20 +
 drivers/gpu/drm/vmwgfx/vmwgfx_drv.h | 28 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c | 13 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_mksstat.h | 144 +++++
 drivers/gpu/drm/vmwgfx/vmwgfx_msg.c | 579 +++++++++++++++++++++
 include/uapi/drm/vmwgfx_drm.h | 41 ++
 9 files changed, 919 insertions(+), 27 deletions(-)
 delete mode 100644 drivers/gpu/drm/vmwgfx/device_include/vm_basic_types.h
 create mode 100644 drivers/gpu/drm/vmwgfx/vmwgfx_mksstat.h

(limited to 'include/uapi')

diff --git a/drivers/gpu/drm/vmwgfx/Kconfig b/drivers/gpu/drm/vmwgfx/Kconfig
index 0060ef842b5a..a9052fae0bbc 100644
--- a/drivers/gpu/drm/vmwgfx/Kconfig
+++ b/drivers/gpu/drm/vmwgfx/Kconfig
@@ -22,3 +22,10 @@ config DRM_VMWGFX_FBCON
 	  Choose this option if you are shipping a new vmwgfx
 	  userspace driver that supports using the kernel driver.
+config DRM_VMWGFX_MKSSTATS + bool "Enable mksGuestStats instrumentation of vmwgfx by default" + depends on DRM_VMWGFX + default n + help + Choose this option to instrument the kernel driver for mksGuestStats. + diff --git a/drivers/gpu/drm/vmwgfx/device_include/svga_types.h b/drivers/gpu/drm/vmwgfx/device_include/svga_types.h index beddccee40f6..f5f79b114fac 100644 --- a/drivers/gpu/drm/vmwgfx/device_include/svga_types.h +++ b/drivers/gpu/drm/vmwgfx/device_include/svga_types.h @@ -23,9 +23,11 @@ * SOFTWARE. * **********************************************************/ -#ifndef _VM_BASIC_TYPES_H_ -#define _VM_BASIC_TYPES_H_ +#ifndef _SVGA_TYPES_H_ +#define _SVGA_TYPES_H_ #include +#include +#include typedef u32 uint32; typedef s32 int32; @@ -48,4 +50,90 @@ typedef bool Bool; #define CONST64U(x) x##ULL +/* + * MKS Guest Stats types + */ + +typedef struct MKSGuestStatCounter { + atomic64_t count; +} MKSGuestStatCounter; + +typedef struct MKSGuestStatCounterTime { + MKSGuestStatCounter counter; + atomic64_t selfCycles; + atomic64_t totalCycles; +} MKSGuestStatCounterTime; + +/* + * Flags for MKSGuestStatInfoEntry::flags below + */ + +#define MKS_GUEST_STAT_FLAG_NONE 0 +#define MKS_GUEST_STAT_FLAG_TIME (1U << 0) + +typedef __attribute__((aligned(32))) struct MKSGuestStatInfoEntry { + union { + const char *s; + uint64 u; + } name; + union { + const char *s; + uint64 u; + } description; + uint64 flags; + union { + MKSGuestStatCounter *counter; + MKSGuestStatCounterTime *counterTime; + uint64 u; + } stat; +} MKSGuestStatInfoEntry; + +#define INVALID_PPN64 ((PPN64)0x000fffffffffffffULL) +#define vmw_num_pages(size) (PAGE_ALIGN(size) >> PAGE_SHIFT) + +#define MKS_GUEST_STAT_INSTANCE_DESC_LENGTH 1024 +#define MKS_GUEST_STAT_INSTANCE_MAX_STATS 4096 +#define MKS_GUEST_STAT_INSTANCE_MAX_STAT_PPNS \ + (vmw_num_pages(MKS_GUEST_STAT_INSTANCE_MAX_STATS * \ + sizeof(MKSGuestStatCounterTime))) +#define MKS_GUEST_STAT_INSTANCE_MAX_INFO_PPNS \ + (vmw_num_pages(MKS_GUEST_STAT_INSTANCE_MAX_STATS * \ + sizeof(MKSGuestStatInfoEntry))) +#define MKS_GUEST_STAT_AVERAGE_NAME_LENGTH 40 +#define MKS_GUEST_STAT_INSTANCE_MAX_STRS_PPNS \ + (vmw_num_pages(MKS_GUEST_STAT_INSTANCE_MAX_STATS * \ + MKS_GUEST_STAT_AVERAGE_NAME_LENGTH)) + +/* + * The MKSGuestStatInstanceDescriptor is used as main interface to + * communicate guest stats back to the host code. The guest must + * allocate an instance of this structure at the start of a page and + * provide the physical address to the host. From there the host code + * can walk this structure to find other (pinned) pages containing the + * stats data. + * + * Since the MKSGuestStatInfoEntry structures contain userlevel + * pointers, the InstanceDescriptor also contains pointers to the + * begining of these sections allowing the host side code to correctly + * interpret the pointers. + * + * Because the host side code never acknowledges anything back to the + * guest there is no strict requirement to maintain compatability + * across releases. If the interface changes the host might not be + * able to log stats, but the guest will continue to run normally. + */ + +typedef struct MKSGuestStatInstanceDescriptor { + uint64 reservedMBZ; /* must be zero for now. */ + uint64 statStartVA; /* VA of the start of the stats section. */ + uint64 strsStartVA; /* VA of the start of the strings section. */ + uint64 statLength; /* length of the stats section in bytes. */ + uint64 infoLength; /* length of the info entry section in bytes. 
*/ + uint64 strsLength; /* length of the strings section in bytes. */ + PPN64 statPPNs[MKS_GUEST_STAT_INSTANCE_MAX_STAT_PPNS]; /* stat counters */ + PPN64 infoPPNs[MKS_GUEST_STAT_INSTANCE_MAX_INFO_PPNS]; /* stat info */ + PPN64 strsPPNs[MKS_GUEST_STAT_INSTANCE_MAX_STRS_PPNS]; /* strings */ + char description[MKS_GUEST_STAT_INSTANCE_DESC_LENGTH]; +} MKSGuestStatInstanceDescriptor; + #endif diff --git a/drivers/gpu/drm/vmwgfx/device_include/vm_basic_types.h b/drivers/gpu/drm/vmwgfx/device_include/vm_basic_types.h deleted file mode 100644 index 3a195e8106b3..000000000000 --- a/drivers/gpu/drm/vmwgfx/device_include/vm_basic_types.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _VM_BASIC_TYPES_H_ -#define _VM_BASIC_TYPES_H_ -#include - -typedef u32 uint32; -typedef s32 int32; -typedef u64 uint64; -typedef u16 uint16; -typedef s16 int16; -typedef u8 uint8; -typedef s8 int8; - -typedef uint64 PA; -typedef uint32 PPN; -typedef uint64 PPN64; - -typedef bool Bool; - -#define MAX_UINT32 U32_MAX - -#endif diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c index 3e438de0f157..b9f18151663a 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c @@ -43,6 +43,7 @@ #include "vmwgfx_binding.h" #include "vmwgfx_devcaps.h" #include "vmwgfx_drv.h" +#include "vmwgfx_mksstat.h" #define VMWGFX_DRIVER_DESC "Linux drm driver for VMware graphics devices" @@ -148,6 +149,14 @@ #define DRM_IOCTL_VMW_MSG \ DRM_IOWR(DRM_COMMAND_BASE + DRM_VMW_MSG, \ struct drm_vmw_msg_arg) +#define DRM_IOCTL_VMW_MKSSTAT_RESET \ + DRM_IO(DRM_COMMAND_BASE + DRM_VMW_MKSSTAT_RESET) +#define DRM_IOCTL_VMW_MKSSTAT_ADD \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VMW_MKSSTAT_ADD, \ + struct drm_vmw_mksstat_add_arg) +#define DRM_IOCTL_VMW_MKSSTAT_REMOVE \ + DRM_IOW(DRM_COMMAND_BASE + DRM_VMW_MKSSTAT_REMOVE, \ + struct drm_vmw_mksstat_remove_arg) /* * The core DRM version of this macro doesn't account for @@ -244,6 +253,15 @@ static const struct drm_ioctl_desc vmw_ioctls[] = { VMW_IOCTL_DEF(VMW_MSG, vmw_msg_ioctl, DRM_RENDER_ALLOW), + VMW_IOCTL_DEF(VMW_MKSSTAT_RESET, + vmw_mksstat_reset_ioctl, + DRM_RENDER_ALLOW), + VMW_IOCTL_DEF(VMW_MKSSTAT_ADD, + vmw_mksstat_add_ioctl, + DRM_RENDER_ALLOW), + VMW_IOCTL_DEF(VMW_MKSSTAT_REMOVE, + vmw_mksstat_remove_ioctl, + DRM_RENDER_ALLOW), }; static const struct pci_device_id vmw_pci_id_list[] = { @@ -1137,6 +1155,8 @@ static void vmw_driver_unload(struct drm_device *dev) for (i = vmw_res_context; i < vmw_res_max; ++i) idr_destroy(&dev_priv->res_idr[i]); + vmw_mksstat_remove_all(dev_priv); + pci_release_regions(pdev); } diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h index 4c2afe9c0505..0d8699a43491 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 OR MIT */ /************************************************************************** * - * Copyright 2009-2015 VMware, Inc., Palo Alto, CA., USA + * Copyright 2009-2021 VMware, Inc., Palo Alto, CA., USA * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the @@ -91,6 +91,9 @@ #define VMW_RES_FENCE ttm_driver_type3 #define VMW_RES_SHADER ttm_driver_type4 +#define MKSSTAT_CAPACITY_LOG2 5U +#define MKSSTAT_CAPACITY (1U << MKSSTAT_CAPACITY_LOG2) + struct vmw_fpriv { struct ttm_object_file *tfile; bool gb_aware; /* user-space is guest-backed aware */ @@ -630,6 
+633,18 @@ struct vmw_private { struct vmw_validation_mem vvm; uint32 *devcaps; + + /* + * mksGuestStat instance-descriptor and pid arrays + */ + struct page *mksstat_user_pages[MKSSTAT_CAPACITY]; + atomic_t mksstat_user_pids[MKSSTAT_CAPACITY]; + +#if IS_ENABLED(CONFIG_DRM_VMWGFX_MKSSTATS) + struct page *mksstat_kern_pages[MKSSTAT_CAPACITY]; + u8 mksstat_kern_top_timer[MKSSTAT_CAPACITY]; + atomic_t mksstat_kern_pids[MKSSTAT_CAPACITY]; +#endif }; static inline struct vmw_surface *vmw_res_to_srf(struct vmw_resource *res) @@ -1503,6 +1518,17 @@ __printf(1, 2) int vmw_host_printf(const char *fmt, ...); int vmw_msg_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); +/* Host mksGuestStats -vmwgfx_msg.c: */ +int vmw_mksstat_get_kern_slot(pid_t pid, struct vmw_private *dev_priv); + +int vmw_mksstat_reset_ioctl(struct drm_device *dev, void *data, + struct drm_file *file_priv); +int vmw_mksstat_add_ioctl(struct drm_device *dev, void *data, + struct drm_file *file_priv); +int vmw_mksstat_remove_ioctl(struct drm_device *dev, void *data, + struct drm_file *file_priv); +int vmw_mksstat_remove_all(struct vmw_private *dev_priv); + /* VMW logging */ /** diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c index a2b8464b3f56..cd2621c8db38 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c @@ -32,6 +32,7 @@ #include #include "vmwgfx_so.h" #include "vmwgfx_binding.h" +#include "vmwgfx_mksstat.h" #define VMW_RES_HT_ORDER 12 @@ -4406,6 +4407,9 @@ int vmw_execbuf_ioctl(struct drm_device *dev, void *data, int ret; struct dma_fence *in_fence = NULL; + MKS_STAT_TIME_DECL(MKSSTAT_KERN_EXECBUF); + MKS_STAT_TIME_PUSH(MKSSTAT_KERN_EXECBUF); + /* * Extend the ioctl argument while maintaining backwards compatibility: * We take different code paths depending on the value of arg->version. 
@@ -4415,7 +4419,8 @@ int vmw_execbuf_ioctl(struct drm_device *dev, void *data, if (unlikely(arg->version > DRM_VMW_EXECBUF_VERSION || arg->version == 0)) { VMW_DEBUG_USER("Incorrect execbuf version.\n"); - return -EINVAL; + ret = -EINVAL; + goto mksstats_out; } switch (arg->version) { @@ -4435,7 +4440,8 @@ int vmw_execbuf_ioctl(struct drm_device *dev, void *data, if (!in_fence) { VMW_DEBUG_USER("Cannot get imported fence\n"); - return -EINVAL; + ret = -EINVAL; + goto mksstats_out; } ret = vmw_wait_dma_fence(dev_priv->fman, in_fence); @@ -4458,5 +4464,8 @@ int vmw_execbuf_ioctl(struct drm_device *dev, void *data, out: if (in_fence) dma_fence_put(in_fence); + +mksstats_out: + MKS_STAT_TIME_POP(MKSSTAT_KERN_EXECBUF); return ret; } diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_mksstat.h b/drivers/gpu/drm/vmwgfx/vmwgfx_mksstat.h new file mode 100644 index 000000000000..0509f55f07b4 --- /dev/null +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_mksstat.h @@ -0,0 +1,144 @@ +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ +/************************************************************************** + * + * Copyright 2021 VMware, Inc., Palo Alto, CA., USA + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef _VMWGFX_MKSSTAT_H_ +#define _VMWGFX_MKSSTAT_H_ + +#include + +/* Reservation marker for mksstat pid's */ +#define MKSSTAT_PID_RESERVED -1 + +#if IS_ENABLED(CONFIG_DRM_VMWGFX_MKSSTATS) +/* + * Kernel-internal mksGuestStat counters. The order of this enum dictates the + * order of instantiation of these counters in the mksGuestStat pages. + */ + +typedef enum { + MKSSTAT_KERN_EXECBUF, /* vmw_execbuf_ioctl */ + + MKSSTAT_KERN_COUNT /* Reserved entry; always last */ +} mksstat_kern_stats_t; + +/** + * vmw_mksstat_get_kern_pstat: Computes the address of the MKSGuestStatCounterTime + * array from the address of the base page. + * + * @page_addr: Pointer to the base page. + * Return: Pointer to the MKSGuestStatCounterTime array. + */ + +static inline void *vmw_mksstat_get_kern_pstat(void *page_addr) +{ + return page_addr + PAGE_SIZE * 1; +} + +/** + * vmw_mksstat_get_kern_pinfo: Computes the address of the MKSGuestStatInfoEntry + * array from the address of the base page. + * + * @page_addr: Pointer to the base page. + * Return: Pointer to the MKSGuestStatInfoEntry array. 
+ */ + +static inline void *vmw_mksstat_get_kern_pinfo(void *page_addr) +{ + return page_addr + PAGE_SIZE * 2; +} + +/** + * vmw_mksstat_get_kern_pstrs: Computes the address of the mksGuestStat strings + * sequence from the address of the base page. + * + * @page_addr: Pointer to the base page. + * Return: Pointer to the mksGuestStat strings sequence. + */ + +static inline void *vmw_mksstat_get_kern_pstrs(void *page_addr) +{ + return page_addr + PAGE_SIZE * 3; +} + +/* + * MKS_STAT_TIME_DECL/PUSH/POP macros to be used in timer-counted routines. + */ + +struct mksstat_timer_t { +/* mutable */ mksstat_kern_stats_t old_top; + const u64 t0; + const int slot; +}; + +#define MKS_STAT_TIME_DECL(kern_cntr) \ + struct mksstat_timer_t _##kern_cntr = { \ + .t0 = rdtsc(), \ + .slot = vmw_mksstat_get_kern_slot(current->pid, dev_priv) \ + } + +#define MKS_STAT_TIME_PUSH(kern_cntr) \ + do { \ + if (_##kern_cntr.slot >= 0) { \ + _##kern_cntr.old_top = dev_priv->mksstat_kern_top_timer[_##kern_cntr.slot]; \ + dev_priv->mksstat_kern_top_timer[_##kern_cntr.slot] = kern_cntr; \ + } \ + } while (0) + +#define MKS_STAT_TIME_POP(kern_cntr) \ + do { \ + if (_##kern_cntr.slot >= 0) { \ + const pid_t pid = atomic_cmpxchg(&dev_priv->mksstat_kern_pids[_##kern_cntr.slot], current->pid, MKSSTAT_PID_RESERVED); \ + dev_priv->mksstat_kern_top_timer[_##kern_cntr.slot] = _##kern_cntr.old_top; \ + \ + if (pid == current->pid) { \ + const u64 dt = rdtsc() - _##kern_cntr.t0; \ + MKSGuestStatCounterTime *pstat; \ + \ + BUG_ON(!dev_priv->mksstat_kern_pages[_##kern_cntr.slot]); \ + \ + pstat = vmw_mksstat_get_kern_pstat(page_address(dev_priv->mksstat_kern_pages[_##kern_cntr.slot])); \ + \ + atomic64_inc(&pstat[kern_cntr].counter.count); \ + atomic64_add(dt, &pstat[kern_cntr].selfCycles); \ + atomic64_add(dt, &pstat[kern_cntr].totalCycles); \ + \ + if (_##kern_cntr.old_top != MKSSTAT_KERN_COUNT) \ + atomic64_sub(dt, &pstat[_##kern_cntr.old_top].selfCycles); \ + \ + atomic_set(&dev_priv->mksstat_kern_pids[_##kern_cntr.slot], current->pid); \ + } \ + } \ + } while (0) + +#else +#define MKS_STAT_TIME_DECL(kern_cntr) +#define MKS_STAT_TIME_PUSH(kern_cntr) +#define MKS_STAT_TIME_POP(kern_cntr) + +#endif /* IS_ENABLED(CONFIG_DRM_VMWGFX_MKSSTATS */ + +#endif diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_msg.c b/drivers/gpu/drm/vmwgfx/vmwgfx_msg.c index 3d08f5700bdb..4218fe00e3b1 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_msg.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_msg.c @@ -31,10 +31,12 @@ #include #include +#include #include "vmwgfx_drv.h" #include "vmwgfx_msg_x86.h" #include "vmwgfx_msg_arm64.h" +#include "vmwgfx_mksstat.h" #define MESSAGE_STATUS_SUCCESS 0x0001 #define MESSAGE_STATUS_DORECV 0x0002 @@ -56,6 +58,11 @@ #define VMW_PORT_CMD_RECVSIZE (MSG_TYPE_RECVSIZE << 16 | VMW_PORT_CMD_MSG) #define VMW_PORT_CMD_RECVSTATUS (MSG_TYPE_RECVSTATUS << 16 | VMW_PORT_CMD_MSG) +#define VMW_PORT_CMD_MKS_GUEST_STATS 85 +#define VMW_PORT_CMD_MKSGS_RESET (0 << 16 | VMW_PORT_CMD_MKS_GUEST_STATS) +#define VMW_PORT_CMD_MKSGS_ADD_PPN (1 << 16 | VMW_PORT_CMD_MKS_GUEST_STATS) +#define VMW_PORT_CMD_MKSGS_REMOVE_PPN (2 << 16 | VMW_PORT_CMD_MKS_GUEST_STATS) + #define HIGH_WORD(X) ((X & 0xFFFF0000) >> 16) #define MAX_USER_MSG_LENGTH PAGE_SIZE @@ -612,3 +619,575 @@ out_open: return -EINVAL; } + +/** + * reset_ppn_array: Resets a PPN64 array to INVALID_PPN64 content + * + * @arr: Array to reset. + * @size: Array length. 
+ */ +static inline void reset_ppn_array(PPN64 *arr, size_t size) +{ + size_t i; + + BUG_ON(!arr || size == 0); + + for (i = 0; i < size; ++i) + arr[i] = INVALID_PPN64; +} + +/** + * hypervisor_ppn_reset_all: Removes all mksGuestStat instance descriptors from + * the hypervisor. All related pages should be subsequently unpinned or freed. + * + */ +static inline void hypervisor_ppn_reset_all(void) +{ + unsigned long eax, ebx, ecx, edx, si = 0, di = 0; + + VMW_PORT(VMW_PORT_CMD_MKSGS_RESET, + 0, si, di, + 0, + VMW_HYPERVISOR_MAGIC, + eax, ebx, ecx, edx, si, di); +} + +/** + * hypervisor_ppn_add: Adds a single mksGuestStat instance descriptor to the + * hypervisor. Any related userspace pages should be pinned in advance. + * + * @pfn: Physical page number of the instance descriptor + */ +static inline void hypervisor_ppn_add(PPN64 pfn) +{ + unsigned long eax, ebx, ecx, edx, si = 0, di = 0; + + VMW_PORT(VMW_PORT_CMD_MKSGS_ADD_PPN, + pfn, si, di, + 0, + VMW_HYPERVISOR_MAGIC, + eax, ebx, ecx, edx, si, di); +} + +/** + * hypervisor_ppn_remove: Removes a single mksGuestStat instance descriptor from + * the hypervisor. All related pages should be subsequently unpinned or freed. + * + * @pfn: Physical page number of the instance descriptor + */ +static inline void hypervisor_ppn_remove(PPN64 pfn) +{ + unsigned long eax, ebx, ecx, edx, si = 0, di = 0; + + VMW_PORT(VMW_PORT_CMD_MKSGS_REMOVE_PPN, + pfn, si, di, + 0, + VMW_HYPERVISOR_MAGIC, + eax, ebx, ecx, edx, si, di); +} + +#if IS_ENABLED(CONFIG_DRM_VMWGFX_MKSSTATS) + +/* Order of the total number of pages used for kernel-internal mksGuestStat; at least 2 */ +#define MKSSTAT_KERNEL_PAGES_ORDER 2 +/* Header to the text description of mksGuestStat instance descriptor */ +#define MKSSTAT_KERNEL_DESCRIPTION "vmwgfx" + +/* Kernel mksGuestStats counter names and desciptions; same order as enum mksstat_kern_stats_t */ +static const char* const mksstat_kern_name_desc[MKSSTAT_KERN_COUNT][2] = +{ + { "vmw_execbuf_ioctl", "vmw_execbuf_ioctl" }, +}; + +/** + * mksstat_init_record: Initializes an MKSGuestStatCounter-based record + * for the respective mksGuestStat index. + * + * @stat_idx: Index of the MKSGuestStatCounter-based mksGuestStat record. + * @pstat: Pointer to array of MKSGuestStatCounterTime. + * @pinfo: Pointer to array of MKSGuestStatInfoEntry. + * @pstrs: Pointer to current end of the name/description sequence. + * Return: Pointer to the new end of the names/description sequence. + */ + +static inline char *mksstat_init_record(mksstat_kern_stats_t stat_idx, + MKSGuestStatCounterTime *pstat, MKSGuestStatInfoEntry *pinfo, char *pstrs) +{ + char *const pstrd = pstrs + strlen(mksstat_kern_name_desc[stat_idx][0]) + 1; + strcpy(pstrs, mksstat_kern_name_desc[stat_idx][0]); + strcpy(pstrd, mksstat_kern_name_desc[stat_idx][1]); + + pinfo[stat_idx].name.s = pstrs; + pinfo[stat_idx].description.s = pstrd; + pinfo[stat_idx].flags = MKS_GUEST_STAT_FLAG_NONE; + pinfo[stat_idx].stat.counter = (MKSGuestStatCounter *)&pstat[stat_idx]; + + return pstrd + strlen(mksstat_kern_name_desc[stat_idx][1]) + 1; +} + +/** + * mksstat_init_record_time: Initializes an MKSGuestStatCounterTime-based record + * for the respective mksGuestStat index. + * + * @stat_idx: Index of the MKSGuestStatCounterTime-based mksGuestStat record. + * @pstat: Pointer to array of MKSGuestStatCounterTime. + * @pinfo: Pointer to array of MKSGuestStatInfoEntry. + * @pstrs: Pointer to current end of the name/description sequence. + * Return: Pointer to the new end of the names/description sequence. 
+ */ + +static inline char *mksstat_init_record_time(mksstat_kern_stats_t stat_idx, + MKSGuestStatCounterTime *pstat, MKSGuestStatInfoEntry *pinfo, char *pstrs) +{ + char *const pstrd = pstrs + strlen(mksstat_kern_name_desc[stat_idx][0]) + 1; + strcpy(pstrs, mksstat_kern_name_desc[stat_idx][0]); + strcpy(pstrd, mksstat_kern_name_desc[stat_idx][1]); + + pinfo[stat_idx].name.s = pstrs; + pinfo[stat_idx].description.s = pstrd; + pinfo[stat_idx].flags = MKS_GUEST_STAT_FLAG_TIME; + pinfo[stat_idx].stat.counterTime = &pstat[stat_idx]; + + return pstrd + strlen(mksstat_kern_name_desc[stat_idx][1]) + 1; +} + +/** + * mksstat_init_kern_id: Creates a single mksGuestStat instance descriptor and + * kernel-internal counters. Adds PFN mapping to the hypervisor. + * + * Create a single mksGuestStat instance descriptor and corresponding structures + * for all kernel-internal counters. The corresponding PFNs are mapped with the + * hypervisor. + * + * @ppage: Output pointer to page containing the instance descriptor. + * Return: Zero on success, negative error code on error. + */ + +static int mksstat_init_kern_id(struct page **ppage) +{ + MKSGuestStatInstanceDescriptor *pdesc; + MKSGuestStatCounterTime *pstat; + MKSGuestStatInfoEntry *pinfo; + char *pstrs, *pstrs_acc; + + /* Allocate pages for the kernel-internal instance descriptor */ + struct page *page = alloc_pages(GFP_KERNEL | __GFP_ZERO, MKSSTAT_KERNEL_PAGES_ORDER); + + if (!page) + return -ENOMEM; + + pdesc = page_address(page); + pstat = vmw_mksstat_get_kern_pstat(pdesc); + pinfo = vmw_mksstat_get_kern_pinfo(pdesc); + pstrs = vmw_mksstat_get_kern_pstrs(pdesc); + + /* Set up all kernel-internal counters and corresponding structures */ + pstrs_acc = pstrs; + pstrs_acc = mksstat_init_record_time(MKSSTAT_KERN_EXECBUF, pstat, pinfo, pstrs_acc); + + /* Add new counters above, in their order of appearance in mksstat_kern_stats_t */ + + BUG_ON(pstrs_acc - pstrs > PAGE_SIZE); + + /* Set up the kernel-internal instance descriptor */ + pdesc->reservedMBZ = 0; + pdesc->statStartVA = (uintptr_t)pstat; + pdesc->strsStartVA = (uintptr_t)pstrs; + pdesc->statLength = sizeof(*pstat) * MKSSTAT_KERN_COUNT; + pdesc->infoLength = sizeof(*pinfo) * MKSSTAT_KERN_COUNT; + pdesc->strsLength = pstrs_acc - pstrs; + snprintf(pdesc->description, ARRAY_SIZE(pdesc->description) - 1, "%s pid=%d", + MKSSTAT_KERNEL_DESCRIPTION, current->pid); + + pdesc->statPPNs[0] = page_to_pfn(virt_to_page(pstat)); + reset_ppn_array(pdesc->statPPNs + 1, ARRAY_SIZE(pdesc->statPPNs) - 1); + + pdesc->infoPPNs[0] = page_to_pfn(virt_to_page(pinfo)); + reset_ppn_array(pdesc->infoPPNs + 1, ARRAY_SIZE(pdesc->infoPPNs) - 1); + + pdesc->strsPPNs[0] = page_to_pfn(virt_to_page(pstrs)); + reset_ppn_array(pdesc->strsPPNs + 1, ARRAY_SIZE(pdesc->strsPPNs) - 1); + + *ppage = page; + + hypervisor_ppn_add((PPN64)page_to_pfn(page)); + + return 0; +} + +/** + * vmw_mksstat_get_kern_slot: Acquires a slot for a single kernel-internal + * mksGuestStat instance descriptor. + * + * Find a slot for a single kernel-internal mksGuestStat instance descriptor. + * In case no such was already present, allocate a new one and set up a kernel- + * internal mksGuestStat instance descriptor for the former. + * + * @pid: Process for which a slot is sought. + * @dev_priv: Identifies the drm private device. + * Return: Non-negative slot on success, negative error code on error. 
+ */ + +int vmw_mksstat_get_kern_slot(pid_t pid, struct vmw_private *dev_priv) +{ + const size_t base = (u32)hash_32(pid, MKSSTAT_CAPACITY_LOG2); + size_t i; + + for (i = 0; i < ARRAY_SIZE(dev_priv->mksstat_kern_pids); ++i) { + const size_t slot = (i + base) % ARRAY_SIZE(dev_priv->mksstat_kern_pids); + + /* Check if an instance descriptor for this pid is already present */ + if (pid == (pid_t)atomic_read(&dev_priv->mksstat_kern_pids[slot])) + return (int)slot; + + /* Set up a new instance descriptor for this pid */ + if (!atomic_cmpxchg(&dev_priv->mksstat_kern_pids[slot], 0, MKSSTAT_PID_RESERVED)) { + const int ret = mksstat_init_kern_id(&dev_priv->mksstat_kern_pages[slot]); + + if (!ret) { + /* Reset top-timer tracking for this slot */ + dev_priv->mksstat_kern_top_timer[slot] = MKSSTAT_KERN_COUNT; + + atomic_set(&dev_priv->mksstat_kern_pids[slot], pid); + return (int)slot; + } + + atomic_set(&dev_priv->mksstat_kern_pids[slot], 0); + return ret; + } + } + + return -ENOSPC; +} + +#endif + +/** + * vmw_mksstat_cleanup_descriptor: Frees a single userspace-originating + * mksGuestStat instance-descriptor page and unpins all related user pages. + * + * Unpin all user pages realated to this instance descriptor and free + * the instance-descriptor page itself. + * + * @page: Page of the instance descriptor. + */ + +static void vmw_mksstat_cleanup_descriptor(struct page *page) +{ + MKSGuestStatInstanceDescriptor *pdesc = page_address(page); + size_t i; + + for (i = 0; i < ARRAY_SIZE(pdesc->statPPNs) && pdesc->statPPNs[i] != INVALID_PPN64; ++i) + unpin_user_page(pfn_to_page(pdesc->statPPNs[i])); + + for (i = 0; i < ARRAY_SIZE(pdesc->infoPPNs) && pdesc->infoPPNs[i] != INVALID_PPN64; ++i) + unpin_user_page(pfn_to_page(pdesc->infoPPNs[i])); + + for (i = 0; i < ARRAY_SIZE(pdesc->strsPPNs) && pdesc->strsPPNs[i] != INVALID_PPN64; ++i) + unpin_user_page(pfn_to_page(pdesc->strsPPNs[i])); + + __free_page(page); +} + +/** + * vmw_mksstat_remove_all: Resets all mksGuestStat instance descriptors + * from the hypervisor. + * + * Discard all hypervisor PFN mappings, containing active mksGuestState instance + * descriptors, unpin the related userspace pages and free the related kernel pages. + * + * @dev_priv: Identifies the drm private device. + * Return: Zero on success, negative error code on error. 
+ */ + +int vmw_mksstat_remove_all(struct vmw_private *dev_priv) +{ + int ret = 0; + size_t i; + + /* Discard all PFN mappings with the hypervisor */ + hypervisor_ppn_reset_all(); + + /* Discard all userspace-originating instance descriptors and unpin all related pages */ + for (i = 0; i < ARRAY_SIZE(dev_priv->mksstat_user_pids); ++i) { + const pid_t pid0 = (pid_t)atomic_read(&dev_priv->mksstat_user_pids[i]); + + if (!pid0) + continue; + + if (pid0 != MKSSTAT_PID_RESERVED) { + const pid_t pid1 = atomic_cmpxchg(&dev_priv->mksstat_user_pids[i], pid0, MKSSTAT_PID_RESERVED); + + if (!pid1) + continue; + + if (pid1 == pid0) { + struct page *const page = dev_priv->mksstat_user_pages[i]; + + BUG_ON(!page); + + dev_priv->mksstat_user_pages[i] = NULL; + atomic_set(&dev_priv->mksstat_user_pids[i], 0); + + vmw_mksstat_cleanup_descriptor(page); + continue; + } + } + + ret = -EAGAIN; + } + +#if IS_ENABLED(CONFIG_DRM_VMWGFX_MKSSTATS) + /* Discard all kernel-internal instance descriptors and free all related pages */ + for (i = 0; i < ARRAY_SIZE(dev_priv->mksstat_kern_pids); ++i) { + const pid_t pid0 = (pid_t)atomic_read(&dev_priv->mksstat_kern_pids[i]); + + if (!pid0) + continue; + + if (pid0 != MKSSTAT_PID_RESERVED) { + const pid_t pid1 = atomic_cmpxchg(&dev_priv->mksstat_kern_pids[i], pid0, MKSSTAT_PID_RESERVED); + + if (!pid1) + continue; + + if (pid1 == pid0) { + struct page *const page = dev_priv->mksstat_kern_pages[i]; + + BUG_ON(!page); + + dev_priv->mksstat_kern_pages[i] = NULL; + atomic_set(&dev_priv->mksstat_kern_pids[i], 0); + + __free_pages(page, MKSSTAT_KERNEL_PAGES_ORDER); + continue; + } + } + + ret = -EAGAIN; + } + +#endif + return ret; +} + +/** + * vmw_mksstat_reset_ioctl: Resets all mksGuestStat instance descriptors + * from the hypervisor. + * + * Discard all hypervisor PFN mappings, containing active mksGuestStat instance + * descriptors, unpin the related userspace pages and free the related kernel pages. + * + * @dev: Identifies the drm device. + * @data: Pointer to the ioctl argument. + * @file_priv: Identifies the caller; unused. + * Return: Zero on success, negative error code on error. + */ + +int vmw_mksstat_reset_ioctl(struct drm_device *dev, void *data, + struct drm_file *file_priv) +{ + struct vmw_private *const dev_priv = vmw_priv(dev); + return vmw_mksstat_remove_all(dev_priv); +} + +/** + * vmw_mksstat_add_ioctl: Creates a single userspace-originating mksGuestStat + * instance descriptor and registers that with the hypervisor. + * + * Create a hypervisor PFN mapping, containing a single mksGuestStat instance + * descriptor and pin the corresponding userspace pages. + * + * @dev: Identifies the drm device. + * @data: Pointer to the ioctl argument. + * @file_priv: Identifies the caller; unused. + * Return: Zero on success, negative error code on error. 
+ */ + +int vmw_mksstat_add_ioctl(struct drm_device *dev, void *data, + struct drm_file *file_priv) +{ + struct drm_vmw_mksstat_add_arg *arg = + (struct drm_vmw_mksstat_add_arg *) data; + + struct vmw_private *const dev_priv = vmw_priv(dev); + + struct page *page; + MKSGuestStatInstanceDescriptor *pdesc; + const size_t num_pages_stat = vmw_num_pages(arg->stat_len); + const size_t num_pages_info = vmw_num_pages(arg->info_len); + const size_t num_pages_strs = vmw_num_pages(arg->strs_len); + long desc_len; + long nr_pinned_stat; + long nr_pinned_info; + long nr_pinned_strs; + struct page *pages_stat[ARRAY_SIZE(pdesc->statPPNs)]; + struct page *pages_info[ARRAY_SIZE(pdesc->infoPPNs)]; + struct page *pages_strs[ARRAY_SIZE(pdesc->strsPPNs)]; + size_t i, slot; + + arg->id = -1; + + if (!arg->stat || !arg->info || !arg->strs) + return -EINVAL; + + if (!arg->stat_len || !arg->info_len || !arg->strs_len) + return -EINVAL; + + if (!arg->description) + return -EINVAL; + + if (num_pages_stat > ARRAY_SIZE(pdesc->statPPNs) || + num_pages_info > ARRAY_SIZE(pdesc->infoPPNs) || + num_pages_strs > ARRAY_SIZE(pdesc->strsPPNs)) + return -EINVAL; + + /* Find an available slot in the mksGuestStats user array and reserve it */ + for (slot = 0; slot < ARRAY_SIZE(dev_priv->mksstat_user_pids); ++slot) + if (!atomic_cmpxchg(&dev_priv->mksstat_user_pids[slot], 0, MKSSTAT_PID_RESERVED)) + break; + + if (slot == ARRAY_SIZE(dev_priv->mksstat_user_pids)) + return -ENOSPC; + + BUG_ON(dev_priv->mksstat_user_pages[slot]); + + /* Allocate a page for the instance descriptor */ + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + + if (!page) { + atomic_set(&dev_priv->mksstat_user_pids[slot], 0); + return -ENOMEM; + } + + /* Set up the instance descriptor */ + pdesc = page_address(page); + + pdesc->reservedMBZ = 0; + pdesc->statStartVA = arg->stat; + pdesc->strsStartVA = arg->strs; + pdesc->statLength = arg->stat_len; + pdesc->infoLength = arg->info_len; + pdesc->strsLength = arg->strs_len; + desc_len = strncpy_from_user(pdesc->description, u64_to_user_ptr(arg->description), + ARRAY_SIZE(pdesc->description) - 1); + + if (desc_len < 0) { + atomic_set(&dev_priv->mksstat_user_pids[slot], 0); + return -EFAULT; + } + + reset_ppn_array(pdesc->statPPNs, ARRAY_SIZE(pdesc->statPPNs)); + reset_ppn_array(pdesc->infoPPNs, ARRAY_SIZE(pdesc->infoPPNs)); + reset_ppn_array(pdesc->strsPPNs, ARRAY_SIZE(pdesc->strsPPNs)); + + /* Pin mksGuestStat user pages and store those in the instance descriptor */ + nr_pinned_stat = pin_user_pages(arg->stat, num_pages_stat, FOLL_LONGTERM, pages_stat, NULL); + if (num_pages_stat != nr_pinned_stat) + goto err_pin_stat; + + for (i = 0; i < num_pages_stat; ++i) + pdesc->statPPNs[i] = page_to_pfn(pages_stat[i]); + + nr_pinned_info = pin_user_pages(arg->info, num_pages_info, FOLL_LONGTERM, pages_info, NULL); + if (num_pages_info != nr_pinned_info) + goto err_pin_info; + + for (i = 0; i < num_pages_info; ++i) + pdesc->infoPPNs[i] = page_to_pfn(pages_info[i]); + + nr_pinned_strs = pin_user_pages(arg->strs, num_pages_strs, FOLL_LONGTERM, pages_strs, NULL); + if (num_pages_strs != nr_pinned_strs) + goto err_pin_strs; + + for (i = 0; i < num_pages_strs; ++i) + pdesc->strsPPNs[i] = page_to_pfn(pages_strs[i]); + + /* Send the descriptor to the host via a hypervisor call. The mksGuestStat + pages will remain in use until the user requests a matching remove stats + or a stats reset occurs. 
*/ + hypervisor_ppn_add((PPN64)page_to_pfn(page)); + + dev_priv->mksstat_user_pages[slot] = page; + atomic_set(&dev_priv->mksstat_user_pids[slot], current->pid); + + arg->id = slot; + + DRM_DEV_INFO(dev->dev, "pid=%d arg.description='%.*s' id=%lu\n", current->pid, (int)desc_len, pdesc->description, slot); + + return 0; + +err_pin_strs: + if (nr_pinned_strs > 0) + unpin_user_pages(pages_strs, nr_pinned_strs); + +err_pin_info: + if (nr_pinned_info > 0) + unpin_user_pages(pages_info, nr_pinned_info); + +err_pin_stat: + if (nr_pinned_stat > 0) + unpin_user_pages(pages_stat, nr_pinned_stat); + + atomic_set(&dev_priv->mksstat_user_pids[slot], 0); + __free_page(page); + return -ENOMEM; +} + +/** + * vmw_mksstat_remove_ioctl: Removes a single userspace-originating mksGuestStat + * instance descriptor from the hypervisor. + * + * Discard a hypervisor PFN mapping, containing a single mksGuestStat instance + * descriptor and unpin the corresponding userspace pages. + * + * @dev: Identifies the drm device. + * @data: Pointer to the ioctl argument. + * @file_priv: Identifies the caller; unused. + * Return: Zero on success, negative error code on error. + */ + +int vmw_mksstat_remove_ioctl(struct drm_device *dev, void *data, + struct drm_file *file_priv) +{ + struct drm_vmw_mksstat_remove_arg *arg = + (struct drm_vmw_mksstat_remove_arg *) data; + + struct vmw_private *const dev_priv = vmw_priv(dev); + + const size_t slot = arg->id; + pid_t pid0; + + if (slot >= ARRAY_SIZE(dev_priv->mksstat_user_pids)) + return -EINVAL; + + DRM_DEV_INFO(dev->dev, "pid=%d arg.id=%lu\n", current->pid, slot); + + pid0 = atomic_read(&dev_priv->mksstat_user_pids[slot]); + + if (!pid0) + return 0; + + if (pid0 != MKSSTAT_PID_RESERVED) { + const pid_t pid1 = atomic_cmpxchg(&dev_priv->mksstat_user_pids[slot], pid0, MKSSTAT_PID_RESERVED); + + if (!pid1) + return 0; + + if (pid1 == pid0) { + struct page *const page = dev_priv->mksstat_user_pages[slot]; + + BUG_ON(!page); + + dev_priv->mksstat_user_pages[slot] = NULL; + atomic_set(&dev_priv->mksstat_user_pids[slot], 0); + + hypervisor_ppn_remove((PPN64)page_to_pfn(page)); + + vmw_mksstat_cleanup_descriptor(page); + return 0; + } + } + + return -EAGAIN; +} diff --git a/include/uapi/drm/vmwgfx_drm.h b/include/uapi/drm/vmwgfx_drm.h index 02e917507479..9078775feb51 100644 --- a/include/uapi/drm/vmwgfx_drm.h +++ b/include/uapi/drm/vmwgfx_drm.h @@ -72,6 +72,9 @@ extern "C" { #define DRM_VMW_GB_SURFACE_CREATE_EXT 27 #define DRM_VMW_GB_SURFACE_REF_EXT 28 #define DRM_VMW_MSG 29 +#define DRM_VMW_MKSSTAT_RESET 30 +#define DRM_VMW_MKSSTAT_ADD 31 +#define DRM_VMW_MKSSTAT_REMOVE 32 /*************************************************************************/ /** @@ -1236,6 +1239,44 @@ struct drm_vmw_msg_arg { __u32 receive_len; }; +/** + * struct drm_vmw_mksstat_add_arg + * + * @stat: Pointer to user-space stat-counters array, page-aligned. + * @info: Pointer to user-space counter-infos array, page-aligned. + * @strs: Pointer to user-space stat strings, page-aligned. + * @stat_len: Length in bytes of stat-counters array. + * @info_len: Length in bytes of counter-infos array. + * @strs_len: Length in bytes of the stat strings, terminators included. + * @description: Pointer to instance descriptor string; will be truncated + * to MKS_GUEST_STAT_INSTANCE_DESC_LENGTH chars. + * @id: Output identifier of the produced record; -1 if error. + * + * Argument to the DRM_VMW_MKSSTAT_ADD ioctl. 
+ */ +struct drm_vmw_mksstat_add_arg { + __u64 stat; + __u64 info; + __u64 strs; + __u64 stat_len; + __u64 info_len; + __u64 strs_len; + __u64 description; + __u64 id; +}; + +/** + * struct drm_vmw_mksstat_remove_arg + * + * @id: Identifier of the record being disposed, originally obtained through + * DRM_VMW_MKSSTAT_ADD ioctl. + * + * Argument to the DRM_VMW_MKSSTAT_REMOVE ioctl. + */ +struct drm_vmw_mksstat_remove_arg { + __u64 id; +}; + #if defined(__cplusplus) } #endif -- cgit v1.2.3-71-gd317 From ade0e676ecdcdd3428210684c76493722e09a791 Mon Sep 17 00:00:00 2001 From: Leandro Ribeiro Date: Fri, 11 Jun 2021 18:35:16 -0300 Subject: drm/doc: document drm_mode_get_plane Add a small description and document struct fields of drm_mode_get_plane. Signed-off-by: Leandro Ribeiro Reviewed-by: Pekka Paalanen Signed-off-by: Simon Ser Link: https://patchwork.freedesktop.org/patch/msgid/20210611213516.77904-2-leandro.ribeiro@collabora.com --- include/uapi/drm/drm_mode.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/drm/drm_mode.h b/include/uapi/drm/drm_mode.h index 9b6722d45f36..98bf130feda5 100644 --- a/include/uapi/drm/drm_mode.h +++ b/include/uapi/drm/drm_mode.h @@ -312,16 +312,48 @@ struct drm_mode_set_plane { __u32 src_w; }; +/** + * struct drm_mode_get_plane - Get plane metadata. + * + * Userspace can perform a GETPLANE ioctl to retrieve information about a + * plane. + * + * To retrieve the number of formats supported, set @count_format_types to zero + * and call the ioctl. @count_format_types will be updated with the value. + * + * To retrieve these formats, allocate an array with the memory needed to store + * @count_format_types formats. Point @format_type_ptr to this array and call + * the ioctl again (with @count_format_types still set to the value returned in + * the first ioctl call). + */ struct drm_mode_get_plane { + /** + * @plane_id: Object ID of the plane whose information should be + * retrieved. Set by caller. + */ __u32 plane_id; + /** @crtc_id: Object ID of the current CRTC. */ __u32 crtc_id; + /** @fb_id: Object ID of the current fb. */ __u32 fb_id; + /** + * @possible_crtcs: Bitmask of CRTC's compatible with the plane. CRTC's + * are created and they receive an index, which corresponds to their + * position in the bitmask. Bit N corresponds to + * :ref:`CRTC index` N. + */ __u32 possible_crtcs; + /** @gamma_size: Never used. */ __u32 gamma_size; + /** @count_format_types: Number of formats. */ __u32 count_format_types; + /** + * @format_type_ptr: Pointer to ``__u32`` array of formats that are + * supported by the plane. These formats do not require modifiers. + */ __u64 format_type_ptr; }; -- cgit v1.2.3-71-gd317 From c649432e86ca677d8762c5764a2832509ca8d449 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Fri, 11 Jun 2021 14:22:21 +0100 Subject: drm/i915: Fix busy ioctl commentary Just tidy one instance of incorrect context parameter name and a stray sentence ending from before reporting was converted to be class based. 
Signed-off-by: Tvrtko Ursulin
Reviewed-by: Matthew Auld
Link: https://patchwork.freedesktop.org/patch/msgid/20210611132221.1055650-1-tvrtko.ursulin@linux.intel.com
---
 include/uapi/drm/i915_drm.h | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index c2c7759b7d2e..a1cb4aa035a9 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1348,12 +1348,11 @@ struct drm_i915_gem_busy {
 	 * reading from the object simultaneously.
 	 *
 	 * The value of each engine class is the same as specified in the
-	 * I915_CONTEXT_SET_ENGINES parameter and via perf, i.e.
+	 * I915_CONTEXT_PARAM_ENGINES context parameter and via perf, i.e.
 	 * I915_ENGINE_CLASS_RENDER, I915_ENGINE_CLASS_COPY, etc.
-	 * reported as active itself. Some hardware may have parallel
-	 * execution engines, e.g. multiple media engines, which are
-	 * mapped to the same class identifier and so are not separately
-	 * reported for busyness.
+	 * Some hardware may have parallel execution engines, e.g. multiple
+	 * media engines, which are mapped to the same class identifier and so
+	 * are not separately reported for busyness.
 	 *
 	 * Caveat emptor:
 	 * Only the boolean result of this query is reliable; that is whether
--
cgit v1.2.3-71-gd317


From 51f52547df700819db0d0e2b17b677cb209212b4 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand
Date: Thu, 17 Jun 2021 14:42:58 -0500
Subject: dma-buf: Document DMA_BUF_IOCTL_SYNC (v3)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds a new "DMA Buffer ioctls" section to the dma-buf docs and adds
documentation for DMA_BUF_IOCTL_SYNC.

v2 (Daniel Vetter):
 - Fix a couple typos
 - Add commentary about synchronization with other devices
 - Use item list format for describing flags

v3 (Pekka Paalanen):
 - Clarify stalling requirements.
 - Be more clear that DMA_BUF_IOCTL_SYNC with SYNC_END has to be called
   before more GPU work happens.

Signed-off-by: Jason Ekstrand
Reviewed-by: Daniel Vetter
Acked-by: Christian König
Acked-by: Pekka Paalanen
Cc: Sumit Semwal
Signed-off-by: Daniel Vetter
Link: https://patchwork.freedesktop.org/patch/msgid/20210617194258.579011-1-jason@jlekstrand.net
---
 Documentation/driver-api/dma-buf.rst | 8 ++++++
 include/uapi/linux/dma-buf.h | 50 +++++++++++++++++++++++++++++++++++-
 2 files changed, 57 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/Documentation/driver-api/dma-buf.rst b/Documentation/driver-api/dma-buf.rst
index 84aab65c4962..f5ac4c90b237 100644
--- a/Documentation/driver-api/dma-buf.rst
+++ b/Documentation/driver-api/dma-buf.rst
@@ -88,6 +88,9 @@ consider though:
 - The DMA buffer FD is also pollable, see `Implicit Fence Poll Support`_ below for
   details.
 
+- The DMA buffer FD also supports a few dma-buf-specific ioctls, see
+  `DMA Buffer ioctls`_ below for details.
+
 Basic Operation and Device DMA Access
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -111,6 +114,11 @@ DMA-BUF statistics
 .. kernel-doc:: drivers/dma-buf/dma-buf-sysfs-stats.c
    :doc: overview
 
+DMA Buffer ioctls
+~~~~~~~~~~~~~~~~~
+
+.. kernel-doc:: include/uapi/linux/dma-buf.h
+
 Kernel Functions and Structures Reference
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/include/uapi/linux/dma-buf.h b/include/uapi/linux/dma-buf.h
index 7f30393b92c3..8e4a2ca0bcbf 100644
--- a/include/uapi/linux/dma-buf.h
+++ b/include/uapi/linux/dma-buf.h
@@ -22,8 +22,56 @@
 #include
 
-/* begin/end dma-buf functions used for userspace mmap.
*/ +/** + * struct dma_buf_sync - Synchronize with CPU access. + * + * When a DMA buffer is accessed from the CPU via mmap, it is not always + * possible to guarantee coherency between the CPU-visible map and underlying + * memory. To manage coherency, DMA_BUF_IOCTL_SYNC must be used to bracket + * any CPU access to give the kernel the chance to shuffle memory around if + * needed. + * + * Prior to accessing the map, the client must call DMA_BUF_IOCTL_SYNC + * with DMA_BUF_SYNC_START and the appropriate read/write flags. Once the + * access is complete, the client should call DMA_BUF_IOCTL_SYNC with + * DMA_BUF_SYNC_END and the same read/write flags. + * + * The synchronization provided via DMA_BUF_IOCTL_SYNC only provides cache + * coherency. It does not prevent other processes or devices from + * accessing the memory at the same time. If synchronization with a GPU or + * other device driver is required, it is the client's responsibility to + * wait for buffer to be ready for reading or writing before calling this + * ioctl with DMA_BUF_SYNC_START. Likewise, the client must ensure that + * follow-up work is not submitted to GPU or other device driver until + * after this ioctl has been called with DMA_BUF_SYNC_END? + * + * If the driver or API with which the client is interacting uses implicit + * synchronization, waiting for prior work to complete can be done via + * poll() on the DMA buffer file descriptor. If the driver or API requires + * explicit synchronization, the client may have to wait on a sync_file or + * other synchronization primitive outside the scope of the DMA buffer API. + */ struct dma_buf_sync { + /** + * @flags: Set of access flags + * + * DMA_BUF_SYNC_START: + * Indicates the start of a map access session. + * + * DMA_BUF_SYNC_END: + * Indicates the end of a map access session. + * + * DMA_BUF_SYNC_READ: + * Indicates that the mapped DMA buffer will be read by the + * client via the CPU map. + * + * DMA_BUF_SYNC_WRITE: + * Indicates that the mapped DMA buffer will be written by the + * client via the CPU map. + * + * DMA_BUF_SYNC_RW: + * An alias for DMA_BUF_SYNC_READ | DMA_BUF_SYNC_WRITE. + */ __u64 flags; }; -- cgit v1.2.3-71-gd317 From 577729533cdc4e37a8c230e404a44ad7a3ff4eda Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Fri, 18 Jun 2021 16:00:36 +0100 Subject: drm/i915: Document the Virtual Engine uAPI A little bit of documentation covering the topics of engine discovery, context engine maps and virtual engines. It is not very detailed but supposed to be a starting point of giving a brief high level overview of general principles and intended use cases. v2: * Have the text in uapi header and link from there. v4: * Link from driver-uapi.rst. Signed-off-by: Tvrtko Ursulin Cc: Daniel Vetter Acked-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20210618150036.2507653-1-tvrtko.ursulin@linux.intel.com --- Documentation/gpu/driver-uapi.rst | 21 +++++ include/uapi/drm/i915_drm.h | 188 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 209 insertions(+) (limited to 'include/uapi') diff --git a/Documentation/gpu/driver-uapi.rst b/Documentation/gpu/driver-uapi.rst index 4411e6919a3d..27d0fbe33e87 100644 --- a/Documentation/gpu/driver-uapi.rst +++ b/Documentation/gpu/driver-uapi.rst @@ -5,4 +5,25 @@ DRM Driver uAPI drm/i915 uAPI ============= +Engine Discovery uAPI +--------------------- + +.. kernel-doc:: include/uapi/drm/i915_drm.h + :doc: Engine Discovery uAPI + +Context Engine Map uAPI +----------------------- + +.. 
kernel-doc:: include/uapi/drm/i915_drm.h + :doc: Context Engine Map uAPI + +Virtual Engine uAPI +------------------- + +.. kernel-doc:: include/uapi/drm/i915_drm.h + :doc: Virtual Engine uAPI + +i915_drm.h +---------- .. kernel-doc:: include/uapi/drm/i915_drm.h + :internal: diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index a1cb4aa035a9..2f70c48567c0 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -1806,6 +1806,69 @@ struct drm_i915_gem_context_param_sseu { __u32 rsvd; }; +/** + * DOC: Virtual Engine uAPI + * + * Virtual engine is a concept where userspace is able to configure a set of + * physical engines, submit a batch buffer, and let the driver execute it on any + * engine from the set as it sees fit. + * + * This is primarily useful on parts which have multiple instances of a same + * class engine, like for example GT3+ Skylake parts with their two VCS engines. + * + * For instance userspace can enumerate all engines of a certain class using the + * previously described `Engine Discovery uAPI`_. After that userspace can + * create a GEM context with a placeholder slot for the virtual engine (using + * `I915_ENGINE_CLASS_INVALID` and `I915_ENGINE_CLASS_INVALID_NONE` for class + * and instance respectively) and finally using the + * `I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE` extension place a virtual engine in + * the same reserved slot. + * + * Example of creating a virtual engine and submitting a batch buffer to it: + * + * .. code-block:: C + * + * I915_DEFINE_CONTEXT_ENGINES_LOAD_BALANCE(virtual, 2) = { + * .base.name = I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE, + * .engine_index = 0, // Place this virtual engine into engine map slot 0 + * .num_siblings = 2, + * .engines = { { I915_ENGINE_CLASS_VIDEO, 0 }, + * { I915_ENGINE_CLASS_VIDEO, 1 }, }, + * }; + * I915_DEFINE_CONTEXT_PARAM_ENGINES(engines, 1) = { + * .engines = { { I915_ENGINE_CLASS_INVALID, + * I915_ENGINE_CLASS_INVALID_NONE } }, + * .extensions = to_user_pointer(&virtual), // Chains after load_balance extension + * }; + * struct drm_i915_gem_context_create_ext_setparam p_engines = { + * .base = { + * .name = I915_CONTEXT_CREATE_EXT_SETPARAM, + * }, + * .param = { + * .param = I915_CONTEXT_PARAM_ENGINES, + * .value = to_user_pointer(&engines), + * .size = sizeof(engines), + * }, + * }; + * struct drm_i915_gem_context_create_ext create = { + * .flags = I915_CONTEXT_CREATE_FLAGS_USE_EXTENSIONS, + * .extensions = to_user_pointer(&p_engines); + * }; + * + * ctx_id = gem_context_create_ext(drm_fd, &create); + * + * // Now we have created a GEM context with its engine map containing a + * // single virtual engine. Submissions to this slot can go either to + * // vcs0 or vcs1, depending on the load balancing algorithm used inside + * // the driver. The load balancing is dynamic from one batch buffer to + * // another and transparent to userspace. + * + * ... + * execbuf.rsvd1 = ctx_id; + * execbuf.flags = 0; // Submits to index 0 which is the virtual engine + * gem_execbuf(drm_fd, &execbuf); + */ + /* * i915_context_engines_load_balance: * @@ -1882,6 +1945,61 @@ struct i915_context_engines_bond { struct i915_engine_class_instance engines[N__]; \ } __attribute__((packed)) name__ +/** + * DOC: Context Engine Map uAPI + * + * Context engine map is a new way of addressing engines when submitting batch- + * buffers, replacing the existing way of using identifiers like `I915_EXEC_BLT` + * inside the flags field of `struct drm_i915_gem_execbuffer2`. 
+ * + * To use it created GEM contexts need to be configured with a list of engines + * the user is intending to submit to. This is accomplished using the + * `I915_CONTEXT_PARAM_ENGINES` parameter and `struct + * i915_context_param_engines`. + * + * For such contexts the `I915_EXEC_RING_MASK` field becomes an index into the + * configured map. + * + * Example of creating such context and submitting against it: + * + * .. code-block:: C + * + * I915_DEFINE_CONTEXT_PARAM_ENGINES(engines, 2) = { + * .engines = { { I915_ENGINE_CLASS_RENDER, 0 }, + * { I915_ENGINE_CLASS_COPY, 0 } } + * }; + * struct drm_i915_gem_context_create_ext_setparam p_engines = { + * .base = { + * .name = I915_CONTEXT_CREATE_EXT_SETPARAM, + * }, + * .param = { + * .param = I915_CONTEXT_PARAM_ENGINES, + * .value = to_user_pointer(&engines), + * .size = sizeof(engines), + * }, + * }; + * struct drm_i915_gem_context_create_ext create = { + * .flags = I915_CONTEXT_CREATE_FLAGS_USE_EXTENSIONS, + * .extensions = to_user_pointer(&p_engines); + * }; + * + * ctx_id = gem_context_create_ext(drm_fd, &create); + * + * // We have now created a GEM context with two engines in the map: + * // Index 0 points to rcs0 while index 1 points to bcs0. Other engines + * // will not be accessible from this context. + * + * ... + * execbuf.rsvd1 = ctx_id; + * execbuf.flags = 0; // Submits to index 0, which is rcs0 for this context + * gem_execbuf(drm_fd, &execbuf); + * + * ... + * execbuf.rsvd1 = ctx_id; + * execbuf.flags = 1; // Submits to index 0, which is bcs0 for this context + * gem_execbuf(drm_fd, &execbuf); + */ + struct i915_context_param_engines { __u64 extensions; /* linked chain of extension blocks, 0 terminates */ #define I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE 0 /* see i915_context_engines_load_balance */ @@ -2375,6 +2493,76 @@ struct drm_i915_query_topology_info { __u8 data[]; }; +/** + * DOC: Engine Discovery uAPI + * + * Engine discovery uAPI is a way of enumerating physical engines present in a + * GPU associated with an open i915 DRM file descriptor. This supersedes the old + * way of using `DRM_IOCTL_I915_GETPARAM` and engine identifiers like + * `I915_PARAM_HAS_BLT`. + * + * The need for this interface came starting with Icelake and newer GPUs, which + * started to establish a pattern of having multiple engines of a same class, + * where not all instances were always completely functionally equivalent. + * + * Entry point for this uapi is `DRM_IOCTL_I915_QUERY` with the + * `DRM_I915_QUERY_ENGINE_INFO` as the queried item id. + * + * Example for getting the list of engines: + * + * .. code-block:: C + * + * struct drm_i915_query_engine_info *info; + * struct drm_i915_query_item item = { + * .query_id = DRM_I915_QUERY_ENGINE_INFO; + * }; + * struct drm_i915_query query = { + * .num_items = 1, + * .items_ptr = (uintptr_t)&item, + * }; + * int err, i; + * + * // First query the size of the blob we need, this needs to be large + * // enough to hold our array of engines. The kernel will fill out the + * // item.length for us, which is the number of bytes we need. + * // + * // Alternatively a large buffer can be allocated straight away enabling + * // querying in one pass, in which case item.length should contain the + * // length of the provided buffer. + * err = ioctl(fd, DRM_IOCTL_I915_QUERY, &query); + * if (err) ... 
+ * + * info = calloc(1, item.length); + * // Now that we allocated the required number of bytes, we call the ioctl + * // again, this time with the data_ptr pointing to our newly allocated + * // blob, which the kernel can then populate with info on all engines. + * item.data_ptr = (uintptr_t)&info, + * + * err = ioctl(fd, DRM_IOCTL_I915_QUERY, &query); + * if (err) ... + * + * // We can now access each engine in the array + * for (i = 0; i < info->num_engines; i++) { + * struct drm_i915_engine_info einfo = info->engines[i]; + * u16 class = einfo.engine.class; + * u16 instance = einfo.engine.instance; + * .... + * } + * + * free(info); + * + * Each of the enumerated engines, apart from being defined by its class and + * instance (see `struct i915_engine_class_instance`), also can have flags and + * capabilities defined as documented in i915_drm.h. + * + * For instance video engines which support HEVC encoding will have the + * `I915_VIDEO_CLASS_CAPABILITY_HEVC` capability bit set. + * + * Engine discovery only fully comes to its own when combined with the new way + * of addressing engines when submitting batch buffers using contexts with + * engine maps configured. + */ + /** * struct drm_i915_engine_info * -- cgit v1.2.3-71-gd317 From c156174a67070042d51d2c866146d3c934d5468c Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Wed, 30 Jun 2021 16:11:56 +0800 Subject: ethtool: add a new command for getting PHC virtual clocks Add an interface for getting PHC (PTP Hardware Clock) virtual clocks, which are based on PHC physical clock providing hardware timestamp to network packets. Signed-off-by: Yangbo Lu Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 22 +++++++ include/linux/ethtool.h | 10 +++ include/uapi/linux/ethtool_netlink.h | 15 +++++ net/ethtool/Makefile | 2 +- net/ethtool/common.c | 13 ++++ net/ethtool/netlink.c | 10 +++ net/ethtool/netlink.h | 2 + net/ethtool/phc_vclocks.c | 94 ++++++++++++++++++++++++++++ 8 files changed, 167 insertions(+), 1 deletion(-) create mode 100644 net/ethtool/phc_vclocks.c (limited to 'include/uapi') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 6ea91e41593f..c86628e6a235 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -212,6 +212,7 @@ Userspace to kernel: ``ETHTOOL_MSG_FEC_SET`` set FEC settings ``ETHTOOL_MSG_MODULE_EEPROM_GET`` read SFP module EEPROM ``ETHTOOL_MSG_STATS_GET`` get standard statistics + ``ETHTOOL_MSG_PHC_VCLOCKS_GET`` get PHC virtual clocks info ===================================== ================================ Kernel to userspace: @@ -250,6 +251,7 @@ Kernel to userspace: ``ETHTOOL_MSG_FEC_NTF`` FEC settings ``ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY`` read SFP module EEPROM ``ETHTOOL_MSG_STATS_GET_REPLY`` standard statistics + ``ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY`` PHC virtual clocks info ======================================== ================================= ``GET`` requests are sent by userspace applications to retrieve device @@ -1477,6 +1479,25 @@ Low and high bounds are inclusive, for example: etherStatsPkts512to1023Octets 512 1023 ============================= ==== ==== +PHC_VCLOCKS_GET +=============== + +Query device PHC virtual clocks information. 
+ +Request contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_PHC_VCLOCKS_HEADER`` nested request header + ==================================== ====== ========================== + +Kernel response contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_PHC_VCLOCKS_HEADER`` nested reply header + ``ETHTOOL_A_PHC_VCLOCKS_NUM`` u32 PHC virtual clocks number + ``ETHTOOL_A_PHC_VCLOCKS_INDEX`` s32 PHC index array + ==================================== ====== ========================== + Request translation =================== @@ -1575,4 +1596,5 @@ are netlink only. n/a ``ETHTOOL_MSG_CABLE_TEST_ACT`` n/a ``ETHTOOL_MSG_CABLE_TEST_TDR_ACT`` n/a ``ETHTOOL_MSG_TUNNEL_INFO_GET`` + n/a ``ETHTOOL_MSG_PHC_VCLOCKS_GET`` =================================== ===================================== diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 29dbb603bc91..232daaec56e4 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -757,6 +757,16 @@ void ethtool_params_from_link_mode(struct ethtool_link_ksettings *link_ksettings, enum ethtool_link_mode_bit_indices link_mode); +/** + * ethtool_get_phc_vclocks - Derive phc vclocks information, and caller + * is responsible to free memory of vclock_index + * @dev: pointer to net_device structure + * @vclock_index: pointer to pointer of vclock index + * + * Return number of phc vclocks + */ +int ethtool_get_phc_vclocks(struct net_device *dev, int **vclock_index); + /** * ethtool_sprintf - Write formatted string to ethtool string data * @data: Pointer to start of string to update diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index c7135c9c37a5..b3b93710eff7 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -46,6 +46,7 @@ enum { ETHTOOL_MSG_FEC_SET, ETHTOOL_MSG_MODULE_EEPROM_GET, ETHTOOL_MSG_STATS_GET, + ETHTOOL_MSG_PHC_VCLOCKS_GET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -88,6 +89,7 @@ enum { ETHTOOL_MSG_FEC_NTF, ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY, ETHTOOL_MSG_STATS_GET_REPLY, + ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -440,6 +442,19 @@ enum { ETHTOOL_A_TSINFO_MAX = (__ETHTOOL_A_TSINFO_CNT - 1) }; +/* PHC VCLOCKS */ + +enum { + ETHTOOL_A_PHC_VCLOCKS_UNSPEC, + ETHTOOL_A_PHC_VCLOCKS_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_PHC_VCLOCKS_NUM, /* u32 */ + ETHTOOL_A_PHC_VCLOCKS_INDEX, /* array, s32 */ + + /* add new constants above here */ + __ETHTOOL_A_PHC_VCLOCKS_CNT, + ETHTOOL_A_PHC_VCLOCKS_MAX = (__ETHTOOL_A_PHC_VCLOCKS_CNT - 1) +}; + /* CABLE TEST */ enum { diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 723c9a8a8cdf..0a19470efbfb 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -7,4 +7,4 @@ obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ linkstate.o debug.o wol.o features.o privflags.o rings.o \ channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \ - tunnels.o fec.o eeprom.o stats.o + tunnels.o fec.o eeprom.o stats.o phc_vclocks.o diff --git a/net/ethtool/common.c b/net/ethtool/common.c index f9dcbad84788..798231b07676 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "common.h" @@ -554,6 +555,18 @@ int __ethtool_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info) return 0; } +int 
ethtool_get_phc_vclocks(struct net_device *dev, int **vclock_index) +{ + struct ethtool_ts_info info = { }; + int num = 0; + + if (!__ethtool_get_ts_info(dev, &info)) + num = ptp_get_vclocks_index(info.phc_index, vclock_index); + + return num; +} +EXPORT_SYMBOL(ethtool_get_phc_vclocks); + const struct ethtool_phy_ops *ethtool_phy_ops; void ethtool_set_ethtool_phy_ops(const struct ethtool_phy_ops *ops) diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index a7346346114f..73e0f5b626bf 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -248,6 +248,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_TSINFO_GET] = ðnl_tsinfo_request_ops, [ETHTOOL_MSG_MODULE_EEPROM_GET] = ðnl_module_eeprom_request_ops, [ETHTOOL_MSG_STATS_GET] = ðnl_stats_request_ops, + [ETHTOOL_MSG_PHC_VCLOCKS_GET] = ðnl_phc_vclocks_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) @@ -958,6 +959,15 @@ static const struct genl_ops ethtool_genl_ops[] = { .policy = ethnl_stats_get_policy, .maxattr = ARRAY_SIZE(ethnl_stats_get_policy) - 1, }, + { + .cmd = ETHTOOL_MSG_PHC_VCLOCKS_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + .policy = ethnl_phc_vclocks_get_policy, + .maxattr = ARRAY_SIZE(ethnl_phc_vclocks_get_policy) - 1, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 3e25a47fd482..3fc395c86702 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -347,6 +347,7 @@ extern const struct ethnl_request_ops ethnl_tsinfo_request_ops; extern const struct ethnl_request_ops ethnl_fec_request_ops; extern const struct ethnl_request_ops ethnl_module_eeprom_request_ops; extern const struct ethnl_request_ops ethnl_stats_request_ops; +extern const struct ethnl_request_ops ethnl_phc_vclocks_request_ops; extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1]; @@ -382,6 +383,7 @@ extern const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1]; extern const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1]; extern const struct nla_policy ethnl_module_eeprom_get_policy[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS + 1]; extern const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1]; +extern const struct nla_policy ethnl_phc_vclocks_get_policy[ETHTOOL_A_PHC_VCLOCKS_HEADER + 1]; int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); diff --git a/net/ethtool/phc_vclocks.c b/net/ethtool/phc_vclocks.c new file mode 100644 index 000000000000..637b2f5297d5 --- /dev/null +++ b/net/ethtool/phc_vclocks.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2021 NXP + */ +#include "netlink.h" +#include "common.h" + +struct phc_vclocks_req_info { + struct ethnl_req_info base; +}; + +struct phc_vclocks_reply_data { + struct ethnl_reply_data base; + int num; + int *index; +}; + +#define PHC_VCLOCKS_REPDATA(__reply_base) \ + container_of(__reply_base, struct phc_vclocks_reply_data, base) + +const struct nla_policy ethnl_phc_vclocks_get_policy[] = { + [ETHTOOL_A_PHC_VCLOCKS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), +}; + +static int phc_vclocks_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct 
genl_info *info) +{ + struct phc_vclocks_reply_data *data = PHC_VCLOCKS_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + int ret; + + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + data->num = ethtool_get_phc_vclocks(dev, &data->index); + ethnl_ops_complete(dev); + + return ret; +} + +static int phc_vclocks_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct phc_vclocks_reply_data *data = + PHC_VCLOCKS_REPDATA(reply_base); + int len = 0; + + if (data->num > 0) { + len += nla_total_size(sizeof(u32)); + len += nla_total_size(sizeof(s32) * data->num); + } + + return len; +} + +static int phc_vclocks_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct phc_vclocks_reply_data *data = + PHC_VCLOCKS_REPDATA(reply_base); + + if (data->num <= 0) + return 0; + + if (nla_put_u32(skb, ETHTOOL_A_PHC_VCLOCKS_NUM, data->num) || + nla_put(skb, ETHTOOL_A_PHC_VCLOCKS_INDEX, + sizeof(s32) * data->num, data->index)) + return -EMSGSIZE; + + return 0; +} + +static void phc_vclocks_cleanup_data(struct ethnl_reply_data *reply_base) +{ + const struct phc_vclocks_reply_data *data = + PHC_VCLOCKS_REPDATA(reply_base); + + kfree(data->index); +} + +const struct ethnl_request_ops ethnl_phc_vclocks_request_ops = { + .request_cmd = ETHTOOL_MSG_PHC_VCLOCKS_GET, + .reply_cmd = ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY, + .hdr_attr = ETHTOOL_A_PHC_VCLOCKS_HEADER, + .req_info_size = sizeof(struct phc_vclocks_req_info), + .reply_data_size = sizeof(struct phc_vclocks_reply_data), + + .prepare_data = phc_vclocks_prepare_data, + .reply_size = phc_vclocks_reply_size, + .fill_reply = phc_vclocks_fill_reply, + .cleanup_data = phc_vclocks_cleanup_data, +}; -- cgit v1.2.3-71-gd317 From d463126e23f112629edb01594141ca437a92a108 Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Wed, 30 Jun 2021 16:11:59 +0800 Subject: net: sock: extend SO_TIMESTAMPING for PHC binding Since PTP virtual clock support is added, there can be several PTP virtual clocks based on one PTP physical clock for timestamping. This patch is to extend SO_TIMESTAMPING API to support PHC (PTP Hardware Clock) binding by adding a new flag SOF_TIMESTAMPING_BIND_PHC. When PTP virtual clocks are in use, user space can configure to bind one for timestamping, but PTP physical clock is not supported and not needed to bind. This patch is preparation for timestamp conversion from raw timestamp to a specific PTP virtual clock time in core net. Signed-off-by: Yangbo Lu Signed-off-by: David S. 
Miller --- include/net/sock.h | 8 +++-- include/uapi/linux/net_tstamp.h | 17 +++++++++-- net/core/sock.c | 65 +++++++++++++++++++++++++++++++++++++++-- net/ethtool/common.c | 1 + net/mptcp/sockopt.c | 23 +++++++++++---- 5 files changed, 101 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/include/net/sock.h b/include/net/sock.h index 8bdd80027ffb..f23cb259b0e2 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -316,7 +316,9 @@ struct bpf_local_storage; * @sk_timer: sock cleanup timer * @sk_stamp: time stamp of last packet received * @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only - * @sk_tsflags: SO_TIMESTAMPING socket options + * @sk_tsflags: SO_TIMESTAMPING flags + * @sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock + * for timestamping * @sk_tskey: counter to disambiguate concurrent tstamp requests * @sk_zckey: counter to order MSG_ZEROCOPY notifications * @sk_socket: Identd and reporting IO signals @@ -493,6 +495,7 @@ struct sock { seqlock_t sk_stamp_seq; #endif u16 sk_tsflags; + int sk_bind_phc; u8 sk_shutdown; u32 sk_tskey; atomic_t sk_zckey; @@ -2755,7 +2758,8 @@ void sock_def_readable(struct sock *sk); int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk); void sock_set_timestamp(struct sock *sk, int optname, bool valbool); -int sock_set_timestamping(struct sock *sk, int optname, int val); +int sock_set_timestamping(struct sock *sk, int optname, + struct so_timestamping timestamping); void sock_enable_timestamps(struct sock *sk); void sock_no_linger(struct sock *sk); diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index 7ed0b3d1c00a..fcc61c73a666 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -13,7 +13,7 @@ #include #include /* for SO_TIMESTAMPING */ -/* SO_TIMESTAMPING gets an integer bit field comprised of these values */ +/* SO_TIMESTAMPING flags */ enum { SOF_TIMESTAMPING_TX_HARDWARE = (1<<0), SOF_TIMESTAMPING_TX_SOFTWARE = (1<<1), @@ -30,8 +30,9 @@ enum { SOF_TIMESTAMPING_OPT_STATS = (1<<12), SOF_TIMESTAMPING_OPT_PKTINFO = (1<<13), SOF_TIMESTAMPING_OPT_TX_SWHW = (1<<14), + SOF_TIMESTAMPING_BIND_PHC = (1 << 15), - SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_TX_SWHW, + SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_BIND_PHC, SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) | SOF_TIMESTAMPING_LAST }; @@ -46,6 +47,18 @@ enum { SOF_TIMESTAMPING_TX_SCHED | \ SOF_TIMESTAMPING_TX_ACK) +/** + * struct so_timestamping - SO_TIMESTAMPING parameter + * + * @flags: SO_TIMESTAMPING flags + * @bind_phc: Index of PTP virtual clock bound to sock. This is available + * if flag SOF_TIMESTAMPING_BIND_PHC is set. 
+ */ +struct so_timestamping { + int flags; + int bind_phc; +}; + /** * struct hwtstamp_config - %SIOCGHWTSTAMP and %SIOCSHWTSTAMP parameter * diff --git a/net/core/sock.c b/net/core/sock.c index dd9599656c40..cad107112204 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -139,6 +139,8 @@ #include #include +#include + static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); @@ -810,8 +812,47 @@ void sock_set_timestamp(struct sock *sk, int optname, bool valbool) } } -int sock_set_timestamping(struct sock *sk, int optname, int val) +static int sock_timestamping_bind_phc(struct sock *sk, int phc_index) { + struct net *net = sock_net(sk); + struct net_device *dev = NULL; + bool match = false; + int *vclock_index; + int i, num; + + if (sk->sk_bound_dev_if) + dev = dev_get_by_index(net, sk->sk_bound_dev_if); + + if (!dev) { + pr_err("%s: sock not bind to device\n", __func__); + return -EOPNOTSUPP; + } + + num = ethtool_get_phc_vclocks(dev, &vclock_index); + for (i = 0; i < num; i++) { + if (*(vclock_index + i) == phc_index) { + match = true; + break; + } + } + + if (num > 0) + kfree(vclock_index); + + if (!match) + return -EINVAL; + + sk->sk_bind_phc = phc_index; + + return 0; +} + +int sock_set_timestamping(struct sock *sk, int optname, + struct so_timestamping timestamping) +{ + int val = timestamping.flags; + int ret; + if (val & ~SOF_TIMESTAMPING_MASK) return -EINVAL; @@ -832,6 +873,12 @@ int sock_set_timestamping(struct sock *sk, int optname, int val) !(val & SOF_TIMESTAMPING_OPT_TSONLY)) return -EINVAL; + if (val & SOF_TIMESTAMPING_BIND_PHC) { + ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc); + if (ret) + return ret; + } + sk->sk_tsflags = val; sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); @@ -907,6 +954,7 @@ EXPORT_SYMBOL(sock_set_mark); int sock_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { + struct so_timestamping timestamping; struct sock_txtime sk_txtime; struct sock *sk = sock->sk; int val; @@ -1073,7 +1121,15 @@ set_sndbuf: case SO_TIMESTAMPING_NEW: case SO_TIMESTAMPING_OLD: - ret = sock_set_timestamping(sk, optname, val); + if (optlen == sizeof(timestamping)) { + if (copy_from_sockptr(×tamping, optval, + sizeof(timestamping))) + return -EFAULT; + } else { + memset(×tamping, 0, sizeof(timestamping)); + timestamping.flags = val; + } + ret = sock_set_timestamping(sk, optname, timestamping); break; case SO_RCVLOWAT: @@ -1348,6 +1404,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, struct __kernel_old_timeval tm; struct __kernel_sock_timeval stm; struct sock_txtime txtime; + struct so_timestamping timestamping; } v; int lv = sizeof(int); @@ -1451,7 +1508,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; case SO_TIMESTAMPING_OLD: - v.val = sk->sk_tsflags; + lv = sizeof(v.timestamping); + v.timestamping.flags = sk->sk_tsflags; + v.timestamping.bind_phc = sk->sk_bind_phc; break; case SO_RCVTIMEO_OLD: diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 798231b07676..c63e0739dc6a 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -398,6 +398,7 @@ const char sof_timestamping_names[][ETH_GSTRING_LEN] = { [const_ilog2(SOF_TIMESTAMPING_OPT_STATS)] = "option-stats", [const_ilog2(SOF_TIMESTAMPING_OPT_PKTINFO)] = "option-pktinfo", [const_ilog2(SOF_TIMESTAMPING_OPT_TX_SWHW)] = "option-tx-swhw", + [const_ilog2(SOF_TIMESTAMPING_BIND_PHC)] = "bind-phc", }; static_assert(ARRAY_SIZE(sof_timestamping_names) == 
__SOF_TIMESTAMPING_CNT); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index ea38cbcd2ad4..8c03afac5ca0 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -207,14 +207,25 @@ static int mptcp_setsockopt_sol_socket_timestamping(struct mptcp_sock *msk, { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; - int val, ret; + struct so_timestamping timestamping; + int ret; - ret = mptcp_get_int_option(msk, optval, optlen, &val); - if (ret) - return ret; + if (optlen == sizeof(timestamping)) { + if (copy_from_sockptr(×tamping, optval, + sizeof(timestamping))) + return -EFAULT; + } else if (optlen == sizeof(int)) { + memset(×tamping, 0, sizeof(timestamping)); + + if (copy_from_sockptr(×tamping.flags, optval, sizeof(int))) + return -EFAULT; + } else { + return -EINVAL; + } ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, - KERNEL_SOCKPTR(&val), sizeof(val)); + KERNEL_SOCKPTR(×tamping), + sizeof(timestamping)); if (ret) return ret; @@ -224,7 +235,7 @@ static int mptcp_setsockopt_sol_socket_timestamping(struct mptcp_sock *msk, struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast(ssk); - sock_set_timestamping(sk, optname, val); + sock_set_timestamping(sk, optname, timestamping); unlock_sock_fast(ssk, slow); } -- cgit v1.2.3-71-gd317 From d322957ebfb9c21c2c72b66680f7c3ccd724e081 Mon Sep 17 00:00:00 2001 From: Duncan Roe Date: Wed, 7 Jul 2021 10:57:51 +1000 Subject: netfilter: uapi: refer to nfnetlink_conntrack.h, not nf_conntrack_netlink.h nf_conntrack_netlink.h does not exist, refer to nfnetlink_conntrack.h instead. Signed-off-by: Duncan Roe Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nfnetlink_log.h | 2 +- include/uapi/linux/netfilter/nfnetlink_queue.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nfnetlink_log.h b/include/uapi/linux/netfilter/nfnetlink_log.h index 45c8d3b027e0..0af9c113d665 100644 --- a/include/uapi/linux/netfilter/nfnetlink_log.h +++ b/include/uapi/linux/netfilter/nfnetlink_log.h @@ -61,7 +61,7 @@ enum nfulnl_attr_type { NFULA_HWTYPE, /* hardware type */ NFULA_HWHEADER, /* hardware header */ NFULA_HWLEN, /* hardware header length */ - NFULA_CT, /* nf_conntrack_netlink.h */ + NFULA_CT, /* nfnetlink_conntrack.h */ NFULA_CT_INFO, /* enum ip_conntrack_info */ NFULA_VLAN, /* nested attribute: packet vlan info */ NFULA_L2HDR, /* full L2 header */ diff --git a/include/uapi/linux/netfilter/nfnetlink_queue.h b/include/uapi/linux/netfilter/nfnetlink_queue.h index bcb2cb5d40b9..aed90c4df0c8 100644 --- a/include/uapi/linux/netfilter/nfnetlink_queue.h +++ b/include/uapi/linux/netfilter/nfnetlink_queue.h @@ -51,11 +51,11 @@ enum nfqnl_attr_type { NFQA_IFINDEX_PHYSOUTDEV, /* __u32 ifindex */ NFQA_HWADDR, /* nfqnl_msg_packet_hw */ NFQA_PAYLOAD, /* opaque data payload */ - NFQA_CT, /* nf_conntrack_netlink.h */ + NFQA_CT, /* nfnetlink_conntrack.h */ NFQA_CT_INFO, /* enum ip_conntrack_info */ NFQA_CAP_LEN, /* __u32 length of captured packet */ NFQA_SKB_INFO, /* __u32 skb meta information */ - NFQA_EXP, /* nf_conntrack_netlink.h */ + NFQA_EXP, /* nfnetlink_conntrack.h */ NFQA_UID, /* __u32 sk uid */ NFQA_GID, /* __u32 sk gid */ NFQA_SECCTX, /* security context string */ -- cgit v1.2.3-71-gd317 From 47316f4a305367794fc04f23e5c778678d8f1d8e Mon Sep 17 00:00:00 2001 From: Zvi Effron Date: Wed, 7 Jul 2021 22:16:55 +0000 Subject: bpf: Support input xdp_md context in BPF_PROG_TEST_RUN Support passing a xdp_md via 
ctx_in/ctx_out in bpf_attr for BPF_PROG_TEST_RUN. The intended use case is to pass some XDP meta data to the test runs of XDP programs that are used as tail calls. For programs that use bpf_prog_test_run_xdp, support xdp_md input and output. Unlike with an actual xdp_md during a non-test run, data_meta must be 0 because it must point to the start of the provided user data. From the initial xdp_md, use data and data_end to adjust the pointers in the generated xdp_buff. All other non-zero fields are prohibited (with EINVAL). If the user has set ctx_out/ctx_size_out, copy the (potentially different) xdp_md back to the userspace. We require all fields of input xdp_md except the ones we explicitly support to be set to zero. The expectation is that in the future we might add support for more fields and we want to fail explicitly if the user runs the program on the kernel where we don't yet support them. Co-developed-by: Cody Haas Co-developed-by: Lisa Watanabe Signed-off-by: Cody Haas Signed-off-by: Lisa Watanabe Signed-off-by: Zvi Effron Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210707221657.3985075-3-zeffron@riotgames.com --- include/uapi/linux/bpf.h | 3 --- net/bpf/test_run.c | 67 ++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 59 insertions(+), 11 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bf9252c7381e..b46a383e8db7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -324,9 +324,6 @@ union bpf_iter_link_info { * **BPF_PROG_TYPE_SK_LOOKUP** * *data_in* and *data_out* must be NULL. * - * **BPF_PROG_TYPE_XDP** - * *ctx_in* and *ctx_out* must be NULL. - * * **BPF_PROG_TYPE_RAW_TRACEPOINT**, * **BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE** * diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index aa47af349ba8..229c5deb813c 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -15,6 +15,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -687,6 +688,22 @@ out: return ret; } +static int xdp_convert_md_to_buff(struct xdp_md *xdp_md, struct xdp_buff *xdp) +{ + if (!xdp_md) + return 0; + + if (xdp_md->egress_ifindex != 0) + return -EINVAL; + + if (xdp_md->ingress_ifindex != 0 || xdp_md->rx_queue_index != 0) + return -EINVAL; + + xdp->data = xdp->data_meta + xdp_md->data; + + return 0; +} + int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { @@ -697,35 +714,69 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, struct netdev_rx_queue *rxqueue; struct xdp_buff xdp = {}; u32 retval, duration; + struct xdp_md *ctx; u32 max_data_sz; void *data; - int ret; + int ret = -EINVAL; - if (kattr->test.ctx_in || kattr->test.ctx_out) - return -EINVAL; + ctx = bpf_ctx_init(kattr, sizeof(struct xdp_md)); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + if (ctx) { + /* There can't be user provided data before the meta data */ + if (ctx->data_meta || ctx->data_end != size || + ctx->data > ctx->data_end || + unlikely(xdp_metalen_invalid(ctx->data))) + goto free_ctx; + /* Meta data is allocated from the headroom */ + headroom -= ctx->data; + } /* XDP have extra tailroom as (most) drivers use full page */ max_data_sz = 4096 - headroom - tailroom; data = bpf_test_init(kattr, max_data_sz, headroom, tailroom); - if (IS_ERR(data)) - return PTR_ERR(data); + if (IS_ERR(data)) { + ret = PTR_ERR(data); + goto free_ctx; + } rxqueue = 
__netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); xdp_init_buff(&xdp, headroom + max_data_sz + tailroom, &rxqueue->xdp_rxq); xdp_prepare_buff(&xdp, data, headroom, size, true); + ret = xdp_convert_md_to_buff(ctx, &xdp); + if (ret) + goto free_data; + bpf_prog_change_xdp(NULL, prog); ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true); if (ret) goto out; - if (xdp.data != data + headroom || xdp.data_end != xdp.data + size) - size = xdp.data_end - xdp.data; - ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration); + + if (xdp.data_meta != data + headroom || + xdp.data_end != xdp.data_meta + size) + size = xdp.data_end - xdp.data_meta; + + if (ctx) { + ctx->data = xdp.data - xdp.data_meta; + ctx->data_end = xdp.data_end - xdp.data_meta; + } + + ret = bpf_test_finish(kattr, uattr, xdp.data_meta, size, retval, + duration); + if (!ret) + ret = bpf_ctx_finish(kattr, uattr, ctx, + sizeof(struct xdp_md)); + out: bpf_prog_change_xdp(prog, NULL); +free_data: kfree(data); +free_ctx: + kfree(ctx); return ret; } -- cgit v1.2.3-71-gd317 From fe4751c3d513ff4f5422dbf55a966abafe39255e Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 8 Jul 2021 10:48:06 -0500 Subject: drm/i915: Drop I915_CONTEXT_PARAM_RINGSIZE This reverts commit 88be76cdafc7 ("drm/i915: Allow userspace to specify ringsize on construction"). This API was originally added for OpenCL but the compute-runtime PR has sat open for a year without action so we can still pull it out if we want. I argue we should drop it for three reasons: 1. If the compute-runtime PR has sat open for a year, this clearly isn't that important. 2. It's a very leaky API. Ring size is an implementation detail of the current execlist scheduler and really only makes sense there. It can't apply to the older ring-buffer scheduler on pre-execlist hardware because that's shared across all contexts and it won't apply to the GuC scheduler that's in the pipeline. 3. Having userspace set a ring size in bytes is a bad solution to the problem of having too small a ring. There is no way that userspace has the information to know how to properly set the ring size so it's just going to detect the feature and always set it to the maximum of 512K. This is what the compute-runtime PR does. The scheduler in i915, on the other hand, does have the information to make an informed choice. It could detect if the ring size is a problem and grow it itself. Or, if that's too hard, we could just increase the default size from 16K to 32K or even 64K instead of relying on userspace to do it. Let's drop this API for now and, if someone decides they really care about solving this problem, they can do it properly. 
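For reference, the userspace side of this parameter was a single setparam call made
blindly at context creation. The sketch below is illustrative only: the fd and
ctx_id handling around it is assumed, and it is not taken from the compute-runtime PR.

    /* Illustrative only: the "just ask for the maximum" pattern this param invited. */
    struct drm_i915_gem_context_param p = {
        .ctx_id = ctx_id,                       /* context created earlier (assumed) */
        .param  = I915_CONTEXT_PARAM_RINGSIZE,
        .value  = 512 * 1024,                   /* always the documented maximum */
    };

    if (ioctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p))
        ; /* feature absent or rejected: silently keep the default ring size */

Userspace has no better number to pick than the maximum, which is exactly point 3 above.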
Signed-off-by: Jason Ekstrand Reviewed-by: Daniel Vetter Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20210708154835.528166-2-jason@jlekstrand.net --- drivers/gpu/drm/i915/Makefile | 1 - drivers/gpu/drm/i915/gem/i915_gem_context.c | 85 +-------------------------- drivers/gpu/drm/i915/gt/intel_context_param.c | 63 -------------------- drivers/gpu/drm/i915/gt/intel_context_param.h | 3 - include/uapi/drm/i915_drm.h | 20 +------ 5 files changed, 4 insertions(+), 168 deletions(-) delete mode 100644 drivers/gpu/drm/i915/gt/intel_context_param.c (limited to 'include/uapi') diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile index 01f28ad5ea57..10b3bb6207ba 100644 --- a/drivers/gpu/drm/i915/Makefile +++ b/drivers/gpu/drm/i915/Makefile @@ -89,7 +89,6 @@ gt-y += \ gt/gen8_ppgtt.o \ gt/intel_breadcrumbs.o \ gt/intel_context.o \ - gt/intel_context_param.o \ gt/intel_context_sseu.o \ gt/intel_engine_cs.o \ gt/intel_engine_heartbeat.o \ diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c index 7720b8c22c81..ddc3cc3f8f09 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c @@ -1334,63 +1334,6 @@ out: return err; } -static int __apply_ringsize(struct intel_context *ce, void *sz) -{ - return intel_context_set_ring_size(ce, (unsigned long)sz); -} - -static int set_ringsize(struct i915_gem_context *ctx, - struct drm_i915_gem_context_param *args) -{ - if (!HAS_LOGICAL_RING_CONTEXTS(ctx->i915)) - return -ENODEV; - - if (args->size) - return -EINVAL; - - if (!IS_ALIGNED(args->value, I915_GTT_PAGE_SIZE)) - return -EINVAL; - - if (args->value < I915_GTT_PAGE_SIZE) - return -EINVAL; - - if (args->value > 128 * I915_GTT_PAGE_SIZE) - return -EINVAL; - - return context_apply_all(ctx, - __apply_ringsize, - __intel_context_ring_size(args->value)); -} - -static int __get_ringsize(struct intel_context *ce, void *arg) -{ - long sz; - - sz = intel_context_get_ring_size(ce); - GEM_BUG_ON(sz > INT_MAX); - - return sz; /* stop on first engine */ -} - -static int get_ringsize(struct i915_gem_context *ctx, - struct drm_i915_gem_context_param *args) -{ - int sz; - - if (!HAS_LOGICAL_RING_CONTEXTS(ctx->i915)) - return -ENODEV; - - if (args->size) - return -EINVAL; - - sz = context_apply_all(ctx, __get_ringsize, NULL); - if (sz < 0) - return sz; - - args->value = sz; - return 0; -} - int i915_gem_user_to_context_sseu(struct intel_gt *gt, const struct drm_i915_gem_context_param_sseu *user, @@ -2036,11 +1979,8 @@ static int ctx_setparam(struct drm_i915_file_private *fpriv, ret = set_persistence(ctx, args); break; - case I915_CONTEXT_PARAM_RINGSIZE: - ret = set_ringsize(ctx, args); - break; - case I915_CONTEXT_PARAM_BAN_PERIOD: + case I915_CONTEXT_PARAM_RINGSIZE: default: ret = -EINVAL; break; @@ -2068,18 +2008,6 @@ static int create_setparam(struct i915_user_extension __user *ext, void *data) return ctx_setparam(arg->fpriv, arg->ctx, &local.param); } -static int copy_ring_size(struct intel_context *dst, - struct intel_context *src) -{ - long sz; - - sz = intel_context_get_ring_size(src); - if (sz < 0) - return sz; - - return intel_context_set_ring_size(dst, sz); -} - static int clone_engines(struct i915_gem_context *dst, struct i915_gem_context *src) { @@ -2124,12 +2052,6 @@ static int clone_engines(struct i915_gem_context *dst, } intel_context_set_gem(clone->engines[n], dst); - - /* Copy across the preferred ringsize */ - if (copy_ring_size(clone->engines[n], 
e->engines[n])) { - __free_engines(clone, n + 1); - goto err_unlock; - } } clone->num_engines = n; i915_sw_fence_complete(&e->fence); @@ -2489,11 +2411,8 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, args->value = i915_gem_context_is_persistent(ctx); break; - case I915_CONTEXT_PARAM_RINGSIZE: - ret = get_ringsize(ctx, args); - break; - case I915_CONTEXT_PARAM_BAN_PERIOD: + case I915_CONTEXT_PARAM_RINGSIZE: default: ret = -EINVAL; break; diff --git a/drivers/gpu/drm/i915/gt/intel_context_param.c b/drivers/gpu/drm/i915/gt/intel_context_param.c deleted file mode 100644 index 65dcd090245d..000000000000 --- a/drivers/gpu/drm/i915/gt/intel_context_param.c +++ /dev/null @@ -1,63 +0,0 @@ -// SPDX-License-Identifier: MIT -/* - * Copyright © 2019 Intel Corporation - */ - -#include "i915_active.h" -#include "intel_context.h" -#include "intel_context_param.h" -#include "intel_ring.h" - -int intel_context_set_ring_size(struct intel_context *ce, long sz) -{ - int err; - - if (intel_context_lock_pinned(ce)) - return -EINTR; - - err = i915_active_wait(&ce->active); - if (err < 0) - goto unlock; - - if (intel_context_is_pinned(ce)) { - err = -EBUSY; /* In active use, come back later! */ - goto unlock; - } - - if (test_bit(CONTEXT_ALLOC_BIT, &ce->flags)) { - struct intel_ring *ring; - - /* Replace the existing ringbuffer */ - ring = intel_engine_create_ring(ce->engine, sz); - if (IS_ERR(ring)) { - err = PTR_ERR(ring); - goto unlock; - } - - intel_ring_put(ce->ring); - ce->ring = ring; - - /* Context image will be updated on next pin */ - } else { - ce->ring = __intel_context_ring_size(sz); - } - -unlock: - intel_context_unlock_pinned(ce); - return err; -} - -long intel_context_get_ring_size(struct intel_context *ce) -{ - long sz = (unsigned long)READ_ONCE(ce->ring); - - if (test_bit(CONTEXT_ALLOC_BIT, &ce->flags)) { - if (intel_context_lock_pinned(ce)) - return -EINTR; - - sz = ce->ring->size; - intel_context_unlock_pinned(ce); - } - - return sz; -} diff --git a/drivers/gpu/drm/i915/gt/intel_context_param.h b/drivers/gpu/drm/i915/gt/intel_context_param.h index 3ecacc675f41..dffedd983693 100644 --- a/drivers/gpu/drm/i915/gt/intel_context_param.h +++ b/drivers/gpu/drm/i915/gt/intel_context_param.h @@ -10,9 +10,6 @@ #include "intel_context.h" -int intel_context_set_ring_size(struct intel_context *ce, long sz); -long intel_context_get_ring_size(struct intel_context *ce); - static inline int intel_context_set_watchdog_us(struct intel_context *ce, u64 timeout_us) { diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 2f70c48567c0..f229c0abcbb5 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -1722,24 +1722,8 @@ struct drm_i915_gem_context_param { */ #define I915_CONTEXT_PARAM_PERSISTENCE 0xb -/* - * I915_CONTEXT_PARAM_RINGSIZE: - * - * Sets the size of the CS ringbuffer to use for logical ring contexts. This - * applies a limit of how many batches can be queued to HW before the caller - * is blocked due to lack of space for more commands. - * - * Only reliably possible to be set prior to first use, i.e. during - * construction. At any later point, the current execution must be flushed as - * the ring can only be changed while the context is idle. Note, the ringsize - * can be specified as a constructor property, see - * I915_CONTEXT_CREATE_EXT_SETPARAM, but can also be set later if required. 
- * - * Only applies to the current set of engine and lost when those engines - * are replaced by a new mapping (see I915_CONTEXT_PARAM_ENGINES). - * - * Must be between 4 - 512 KiB, in intervals of page size [4 KiB]. - * Default is 16 KiB. +/* This API has been removed. On the off chance someone somewhere has + * attempted to use it, never re-use this context param number. */ #define I915_CONTEXT_PARAM_RINGSIZE 0xc /* Must be kept compact -- no holes and well documented */ -- cgit v1.2.3-71-gd317 From 6ff6d61dd2a943bd0c80bb77eb5630e8aa0cac15 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 8 Jul 2021 10:48:08 -0500 Subject: drm/i915: Drop I915_CONTEXT_PARAM_NO_ZEROMAP The idea behind this param is to support OpenCL drivers with relocations because OpenCL reserves 0x0 for NULL and, if we placed memory there, it would confuse CL kernels. It was originally sent out as part of a patch series including libdrm [1] and Beignet [2] support. However, the libdrm and Beignet patches never landed in their respective upstream projects so this API has never been used. It's never been used in Mesa or any other driver, either. Dropping this API allows us to delete a small bit of code. [1]: https://lists.freedesktop.org/archives/intel-gfx/2015-May/067030.html [2]: https://lists.freedesktop.org/archives/intel-gfx/2015-May/067031.html Signed-off-by: Jason Ekstrand Reviewed-by: Daniel Vetter Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20210708154835.528166-4-jason@jlekstrand.net --- drivers/gpu/drm/i915/gem/i915_gem_context.c | 16 ++-------------- drivers/gpu/drm/i915/gem/i915_gem_context_types.h | 1 - drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 8 -------- include/uapi/drm/i915_drm.h | 4 ++++ 4 files changed, 6 insertions(+), 23 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c index a4faf06022d5..5fc0eb4beeea 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c @@ -1920,15 +1920,6 @@ static int ctx_setparam(struct drm_i915_file_private *fpriv, int ret = 0; switch (args->param) { - case I915_CONTEXT_PARAM_NO_ZEROMAP: - if (args->size) - ret = -EINVAL; - else if (args->value) - set_bit(UCONTEXT_NO_ZEROMAP, &ctx->user_flags); - else - clear_bit(UCONTEXT_NO_ZEROMAP, &ctx->user_flags); - break; - case I915_CONTEXT_PARAM_NO_ERROR_CAPTURE: if (args->size) ret = -EINVAL; @@ -1978,6 +1969,7 @@ static int ctx_setparam(struct drm_i915_file_private *fpriv, ret = set_persistence(ctx, args); break; + case I915_CONTEXT_PARAM_NO_ZEROMAP: case I915_CONTEXT_PARAM_BAN_PERIOD: case I915_CONTEXT_PARAM_RINGSIZE: default: @@ -2358,11 +2350,6 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, return -ENOENT; switch (args->param) { - case I915_CONTEXT_PARAM_NO_ZEROMAP: - args->size = 0; - args->value = test_bit(UCONTEXT_NO_ZEROMAP, &ctx->user_flags); - break; - case I915_CONTEXT_PARAM_GTT_SIZE: args->size = 0; rcu_read_lock(); @@ -2410,6 +2397,7 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, args->value = i915_gem_context_is_persistent(ctx); break; + case I915_CONTEXT_PARAM_NO_ZEROMAP: case I915_CONTEXT_PARAM_BAN_PERIOD: case I915_CONTEXT_PARAM_RINGSIZE: default: diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h index 340473aa70de..5ae71ec936f7 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h +++ 
b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h @@ -129,7 +129,6 @@ struct i915_gem_context { * @user_flags: small set of booleans controlled by the user */ unsigned long user_flags; -#define UCONTEXT_NO_ZEROMAP 0 #define UCONTEXT_NO_ERROR_CAPTURE 1 #define UCONTEXT_BANNABLE 2 #define UCONTEXT_RECOVERABLE 3 diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index 3661461bc04e..cb86574bdab4 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -290,7 +290,6 @@ struct i915_execbuffer { struct intel_context *reloc_context; u64 invalid_flags; /** Set of execobj.flags that are invalid */ - u32 context_flags; /** Set of execobj.flags to insert from the ctx */ u64 batch_len; /** Length of batch within object */ u32 batch_start_offset; /** Location within object of batch */ @@ -541,9 +540,6 @@ eb_validate_vma(struct i915_execbuffer *eb, entry->flags |= EXEC_OBJECT_NEEDS_GTT | __EXEC_OBJECT_NEEDS_MAP; } - if (!(entry->flags & EXEC_OBJECT_PINNED)) - entry->flags |= eb->context_flags; - return 0; } @@ -750,10 +746,6 @@ static int eb_select_context(struct i915_execbuffer *eb) if (rcu_access_pointer(ctx->vm)) eb->invalid_flags |= EXEC_OBJECT_NEEDS_GTT; - eb->context_flags = 0; - if (test_bit(UCONTEXT_NO_ZEROMAP, &ctx->user_flags)) - eb->context_flags |= __EXEC_OBJECT_NEEDS_BIAS; - return 0; } diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index f229c0abcbb5..79dcafaf476e 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -1638,6 +1638,10 @@ struct drm_i915_gem_context_param { __u32 size; __u64 param; #define I915_CONTEXT_PARAM_BAN_PERIOD 0x1 +/* I915_CONTEXT_PARAM_NO_ZEROMAP has been removed. On the off chance + * someone somewhere has attempted to use it, never re-use this context + * param number. + */ #define I915_CONTEXT_PARAM_NO_ZEROMAP 0x2 #define I915_CONTEXT_PARAM_GTT_SIZE 0x3 #define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE 0x4 -- cgit v1.2.3-71-gd317 From 4a766ae40ec8330103a27922b5aa978fdf8bc005 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 8 Jul 2021 10:48:11 -0500 Subject: drm/i915: Drop the CONTEXT_CLONE API (v2) This API allows one context to grab bits out of another context upon creation. It can be used as a short-cut for setparam(getparam()) for things like I915_CONTEXT_PARAM_VM. However, it's never been used by any real userspace. It's used by a few IGT tests and that's it. Since it doesn't add any real value (most of the stuff you can CLONE you can copy in other ways), drop it. There is one thing that this API allows you to clone which you cannot clone via getparam/setparam: timelines. However, timelines are an implementation detail of i915 and not really something that needs to be exposed to userspace. Also, sharing timelines between contexts isn't obviously useful and supporting it has the potential to complicate i915 internally. It also doesn't add any functionality that the client can't get in other ways. If a client really wants a shared timeline, they can use a syncobj and set it as an in and out fence on every submit. 
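As a rough sketch of that replacement (assuming already-created contexts, an otherwise
fully populated execbuf, and no error handling), a single syncobj can stand in for the
shared timeline:

    /* One syncobj shared by all participating contexts. Created signaled so the
     * very first wait does not block. */
    struct drm_syncobj_create sc = { .flags = DRM_SYNCOBJ_CREATE_SIGNALED };
    ioctl(fd, DRM_IOCTL_SYNCOBJ_CREATE, &sc);

    struct drm_i915_gem_exec_fence fence = {
        .handle = sc.handle,
        /* wait for the previous submit, then signal for the next one */
        .flags  = I915_EXEC_FENCE_WAIT | I915_EXEC_FENCE_SIGNAL,
    };

    struct drm_i915_gem_execbuffer2 execbuf = {
        /* buffers_ptr, buffer_count and rsvd1 = ctx_id assumed to be set up as usual */
        .flags         = I915_EXEC_DEFAULT | I915_EXEC_FENCE_ARRAY,
        .num_cliprects = 1,                  /* fence count when FENCE_ARRAY is set */
        .cliprects_ptr = (uintptr_t)&fence,  /* fence array when FENCE_ARRAY is set */
    };
    ioctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);

Every context submits with the same fence handle, so batches execute in signal order
across contexts, which is the observable effect timeline cloning provided, without
i915 having to share internal timeline state.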
v2 (Jason Ekstrand): - More detailed commit message Signed-off-by: Jason Ekstrand Reviewed-by: Daniel Vetter Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20210708154835.528166-7-jason@jlekstrand.net --- drivers/gpu/drm/i915/gem/i915_gem_context.c | 199 +-------------------- .../gpu/drm/i915/gt/intel_execlists_submission.c | 28 --- .../gpu/drm/i915/gt/intel_execlists_submission.h | 3 - include/uapi/drm/i915_drm.h | 16 +- 4 files changed, 6 insertions(+), 240 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c index 3503d46c88cb..9f9369d3c000 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c @@ -1957,207 +1957,14 @@ static int create_setparam(struct i915_user_extension __user *ext, void *data) return ctx_setparam(arg->fpriv, arg->ctx, &local.param); } -static int clone_engines(struct i915_gem_context *dst, - struct i915_gem_context *src) +static int invalid_ext(struct i915_user_extension __user *ext, void *data) { - struct i915_gem_engines *clone, *e; - bool user_engines; - unsigned long n; - - e = __context_engines_await(src, &user_engines); - if (!e) - return -ENOENT; - - clone = alloc_engines(e->num_engines); - if (!clone) - goto err_unlock; - - for (n = 0; n < e->num_engines; n++) { - struct intel_engine_cs *engine; - - if (!e->engines[n]) { - clone->engines[n] = NULL; - continue; - } - engine = e->engines[n]->engine; - - /* - * Virtual engines are singletons; they can only exist - * inside a single context, because they embed their - * HW context... As each virtual context implies a single - * timeline (each engine can only dequeue a single request - * at any time), it would be surprising for two contexts - * to use the same engine. So let's create a copy of - * the virtual engine instead. - */ - if (intel_engine_is_virtual(engine)) - clone->engines[n] = - intel_execlists_clone_virtual(engine); - else - clone->engines[n] = intel_context_create(engine); - if (IS_ERR_OR_NULL(clone->engines[n])) { - __free_engines(clone, n); - goto err_unlock; - } - - intel_context_set_gem(clone->engines[n], dst); - } - clone->num_engines = n; - i915_sw_fence_complete(&e->fence); - - /* Serialised by constructor */ - engines_idle_release(dst, rcu_replace_pointer(dst->engines, clone, 1)); - if (user_engines) - i915_gem_context_set_user_engines(dst); - else - i915_gem_context_clear_user_engines(dst); - return 0; - -err_unlock: - i915_sw_fence_complete(&e->fence); - return -ENOMEM; -} - -static int clone_flags(struct i915_gem_context *dst, - struct i915_gem_context *src) -{ - dst->user_flags = src->user_flags; - return 0; -} - -static int clone_schedattr(struct i915_gem_context *dst, - struct i915_gem_context *src) -{ - dst->sched = src->sched; - return 0; -} - -static int clone_sseu(struct i915_gem_context *dst, - struct i915_gem_context *src) -{ - struct i915_gem_engines *e = i915_gem_context_lock_engines(src); - struct i915_gem_engines *clone; - unsigned long n; - int err; - - /* no locking required; sole access under constructor*/ - clone = __context_engines_static(dst); - if (e->num_engines != clone->num_engines) { - err = -EINVAL; - goto unlock; - } - - for (n = 0; n < e->num_engines; n++) { - struct intel_context *ce = e->engines[n]; - - if (clone->engines[n]->engine->class != ce->engine->class) { - /* Must have compatible engine maps! 
*/ - err = -EINVAL; - goto unlock; - } - - /* serialises with set_sseu */ - err = intel_context_lock_pinned(ce); - if (err) - goto unlock; - - clone->engines[n]->sseu = ce->sseu; - intel_context_unlock_pinned(ce); - } - - err = 0; -unlock: - i915_gem_context_unlock_engines(src); - return err; -} - -static int clone_timeline(struct i915_gem_context *dst, - struct i915_gem_context *src) -{ - if (src->timeline) - __assign_timeline(dst, src->timeline); - - return 0; -} - -static int clone_vm(struct i915_gem_context *dst, - struct i915_gem_context *src) -{ - struct i915_address_space *vm; - int err = 0; - - if (!rcu_access_pointer(src->vm)) - return 0; - - rcu_read_lock(); - vm = context_get_vm_rcu(src); - rcu_read_unlock(); - - if (!mutex_lock_interruptible(&dst->mutex)) { - __assign_ppgtt(dst, vm); - mutex_unlock(&dst->mutex); - } else { - err = -EINTR; - } - - i915_vm_put(vm); - return err; -} - -static int create_clone(struct i915_user_extension __user *ext, void *data) -{ - static int (* const fn[])(struct i915_gem_context *dst, - struct i915_gem_context *src) = { -#define MAP(x, y) [ilog2(I915_CONTEXT_CLONE_##x)] = y - MAP(ENGINES, clone_engines), - MAP(FLAGS, clone_flags), - MAP(SCHEDATTR, clone_schedattr), - MAP(SSEU, clone_sseu), - MAP(TIMELINE, clone_timeline), - MAP(VM, clone_vm), -#undef MAP - }; - struct drm_i915_gem_context_create_ext_clone local; - const struct create_ext *arg = data; - struct i915_gem_context *dst = arg->ctx; - struct i915_gem_context *src; - int err, bit; - - if (copy_from_user(&local, ext, sizeof(local))) - return -EFAULT; - - BUILD_BUG_ON(GENMASK(BITS_PER_TYPE(local.flags) - 1, ARRAY_SIZE(fn)) != - I915_CONTEXT_CLONE_UNKNOWN); - - if (local.flags & I915_CONTEXT_CLONE_UNKNOWN) - return -EINVAL; - - if (local.rsvd) - return -EINVAL; - - rcu_read_lock(); - src = __i915_gem_context_lookup_rcu(arg->fpriv, local.clone_id); - rcu_read_unlock(); - if (!src) - return -ENOENT; - - GEM_BUG_ON(src == dst); - - for (bit = 0; bit < ARRAY_SIZE(fn); bit++) { - if (!(local.flags & BIT(bit))) - continue; - - err = fn[bit](dst, src); - if (err) - return err; - } - - return 0; + return -EINVAL; } static const i915_user_extension_fn create_extensions[] = { [I915_CONTEXT_CREATE_EXT_SETPARAM] = create_setparam, - [I915_CONTEXT_CREATE_EXT_CLONE] = create_clone, + [I915_CONTEXT_CREATE_EXT_CLONE] = invalid_ext, }; static bool client_is_banned(struct drm_i915_file_private *file_priv) diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c index cdb2126a159a..7dd7afccb3ad 100644 --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c @@ -3850,34 +3850,6 @@ err_put: return ERR_PTR(err); } -struct intel_context * -intel_execlists_clone_virtual(struct intel_engine_cs *src) -{ - struct virtual_engine *se = to_virtual_engine(src); - struct intel_context *dst; - - dst = intel_execlists_create_virtual(se->siblings, - se->num_siblings); - if (IS_ERR(dst)) - return dst; - - if (se->num_bonds) { - struct virtual_engine *de = to_virtual_engine(dst->engine); - - de->bonds = kmemdup(se->bonds, - sizeof(*se->bonds) * se->num_bonds, - GFP_KERNEL); - if (!de->bonds) { - intel_context_put(dst); - return ERR_PTR(-ENOMEM); - } - - de->num_bonds = se->num_bonds; - } - - return dst; -} - int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine, const struct intel_engine_cs *master, const struct intel_engine_cs *sibling) diff --git 
a/drivers/gpu/drm/i915/gt/intel_execlists_submission.h b/drivers/gpu/drm/i915/gt/intel_execlists_submission.h index 4ca9b475e252..c0b23f69535e 100644 --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.h +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.h @@ -36,9 +36,6 @@ struct intel_context * intel_execlists_create_virtual(struct intel_engine_cs **siblings, unsigned int count); -struct intel_context * -intel_execlists_clone_virtual(struct intel_engine_cs *src); - int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine, const struct intel_engine_cs *master, const struct intel_engine_cs *sibling); diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 79dcafaf476e..e334a8b14ef2 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -2006,20 +2006,10 @@ struct drm_i915_gem_context_create_ext_setparam { struct drm_i915_gem_context_param param; }; -struct drm_i915_gem_context_create_ext_clone { +/* This API has been removed. On the off chance someone somewhere has + * attempted to use it, never re-use this extension number. + */ #define I915_CONTEXT_CREATE_EXT_CLONE 1 - struct i915_user_extension base; - __u32 clone_id; - __u32 flags; -#define I915_CONTEXT_CLONE_ENGINES (1u << 0) -#define I915_CONTEXT_CLONE_FLAGS (1u << 1) -#define I915_CONTEXT_CLONE_SCHEDATTR (1u << 2) -#define I915_CONTEXT_CLONE_SSEU (1u << 3) -#define I915_CONTEXT_CLONE_TIMELINE (1u << 4) -#define I915_CONTEXT_CLONE_VM (1u << 5) -#define I915_CONTEXT_CLONE_UNKNOWN -(I915_CONTEXT_CLONE_VM << 1) - __u64 rsvd; -}; struct drm_i915_gem_context_destroy { __u32 ctx_id; -- cgit v1.2.3-71-gd317 From caa7302b3a346d9a9ec85568cff52d2cee5f32c4 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Fri, 18 Jun 2021 16:21:36 +0200 Subject: media: include/uapi/linux/cec.h: typo: SATERDAY -> SATURDAY Fix typo in a define: CEC_OP_REC_SEQ_SATERDAY -> CEC_OP_REC_SEQ_SATURDAY This isn't used yet in actual applications to the best of my knowledge, and it certainly doesn't break the ABI since the value doesn't change. 
Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/userspace-api/media/cec.h.rst.exceptions | 2 +- include/uapi/linux/cec.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/userspace-api/media/cec.h.rst.exceptions b/Documentation/userspace-api/media/cec.h.rst.exceptions index d83790ccac8e..13de01d9555e 100644 --- a/Documentation/userspace-api/media/cec.h.rst.exceptions +++ b/Documentation/userspace-api/media/cec.h.rst.exceptions @@ -140,7 +140,7 @@ ignore define CEC_OP_REC_SEQ_TUESDAY ignore define CEC_OP_REC_SEQ_WEDNESDAY ignore define CEC_OP_REC_SEQ_THURSDAY ignore define CEC_OP_REC_SEQ_FRIDAY -ignore define CEC_OP_REC_SEQ_SATERDAY +ignore define CEC_OP_REC_SEQ_SATURDAY ignore define CEC_OP_REC_SEQ_ONCE_ONLY ignore define CEC_MSG_CLEAR_DIGITAL_TIMER diff --git a/include/uapi/linux/cec.h b/include/uapi/linux/cec.h index dc8879d179fd..de936f5e446d 100644 --- a/include/uapi/linux/cec.h +++ b/include/uapi/linux/cec.h @@ -642,7 +642,7 @@ struct cec_event { #define CEC_OP_REC_SEQ_WEDNESDAY 0x08 #define CEC_OP_REC_SEQ_THURSDAY 0x10 #define CEC_OP_REC_SEQ_FRIDAY 0x20 -#define CEC_OP_REC_SEQ_SATERDAY 0x40 +#define CEC_OP_REC_SEQ_SATURDAY 0x40 #define CEC_OP_REC_SEQ_ONCE_ONLY 0x00 #define CEC_MSG_CLEAR_DIGITAL_TIMER 0x99 -- cgit v1.2.3-71-gd317 From 514305ee0a1dade95c6ff1eb5735de5a329d1f89 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Thu, 1 Jul 2021 12:41:27 +0200 Subject: RDMA/irdma: Make spdxcheck.py happy Commit 48d6b3336a9f ("RDMA/irdma: Add ABI definitions") adds ./include/uapi/rdma/irdma-abi.h with an additional unneeded closing bracket at the end of the SPDX-License-Identifier line. Hence, ./scripts/spdxcheck.py complains: include/uapi/rdma/irdma-abi.h: 1:77 Syntax error: ) Remove that closing bracket to make spdxcheck.py happy. Fixes: 48d6b3336a9f ("RDMA/irdma: Add ABI definitions") Link: https://lore.kernel.org/r/20210701104127.1877-1-lukas.bulwahn@gmail.com Signed-off-by: Lukas Bulwahn Acked-by: Tatyana Nikolova Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/irdma-abi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/rdma/irdma-abi.h b/include/uapi/rdma/irdma-abi.h index 26b638a7ad97..a7085e092d34 100644 --- a/include/uapi/rdma/irdma-abi.h +++ b/include/uapi/rdma/irdma-abi.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ +/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB */ /* * Copyright (c) 2006 - 2021 Intel Corporation. All rights reserved. * Copyright (c) 2005 Topspin Communications. All rights reserved. -- cgit v1.2.3-71-gd317 From b48c7236b13cb5ef1b5fdf744aa8841df0f7b43a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 29 Jun 2021 15:11:44 -0500 Subject: exit/bdflush: Remove the deprecated bdflush system call The bdflush system call has been deprecated for a very long time. Recently Michael Schmitz tested[1] and found that the last known caller of of the bdflush system call is unaffected by it's removal. Since the code is not needed delete it. [1] https://lkml.kernel.org/r/36123b5d-daa0-6c2b-f2d4-a942f069fd54@gmail.com Link: https://lkml.kernel.org/r/87sg10quue.fsf_-_@disp2133 Tested-by: Michael Schmitz Acked-by: Geert Uytterhoeven Reviewed-by: Arnd Bergmann Acked-by: Cyril Hrubis Signed-off-by: "Eric W. 
Biederman" --- arch/alpha/kernel/syscalls/syscall.tbl | 2 +- arch/arm/tools/syscall.tbl | 2 +- arch/arm64/include/asm/unistd32.h | 2 +- arch/ia64/kernel/syscalls/syscall.tbl | 2 +- arch/m68k/kernel/syscalls/syscall.tbl | 2 +- arch/microblaze/kernel/syscalls/syscall.tbl | 2 +- arch/mips/kernel/syscalls/syscall_o32.tbl | 2 +- arch/parisc/kernel/syscalls/syscall.tbl | 2 +- arch/powerpc/kernel/syscalls/syscall.tbl | 2 +- arch/s390/kernel/syscalls/syscall.tbl | 2 +- arch/sh/kernel/syscalls/syscall.tbl | 2 +- arch/sparc/kernel/syscalls/syscall.tbl | 2 +- arch/x86/entry/syscalls/syscall_32.tbl | 2 +- arch/xtensa/kernel/syscalls/syscall.tbl | 2 +- fs/buffer.c | 27 ---------------------- include/linux/syscalls.h | 1 - include/uapi/linux/capability.h | 1 - kernel/sys_ni.c | 1 - tools/perf/arch/powerpc/entry/syscalls/syscall.tbl | 2 +- tools/perf/arch/s390/entry/syscalls/syscall.tbl | 2 +- 20 files changed, 16 insertions(+), 46 deletions(-) (limited to 'include/uapi') diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index a17687ed4b51..7ac22e007d52 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -230,7 +230,7 @@ 259 common osf_swapctl sys_ni_syscall 260 common osf_memcntl sys_ni_syscall 261 common osf_fdatasync sys_ni_syscall -300 common bdflush sys_bdflush +300 common bdflush sys_ni_syscall 301 common sethae sys_sethae 302 common mount sys_mount 303 common old_adjtimex sys_old_adjtimex diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index c5df1179fc5d..f8a2d5aa17b7 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -147,7 +147,7 @@ 131 common quotactl sys_quotactl 132 common getpgid sys_getpgid 133 common fchdir sys_fchdir -134 common bdflush sys_bdflush +134 common bdflush sys_ni_syscall 135 common sysfs sys_sysfs 136 common personality sys_personality # 137 was sys_afs_syscall diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 99ffcafc736c..03d4ca47d253 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -279,7 +279,7 @@ __SYSCALL(__NR_getpgid, sys_getpgid) #define __NR_fchdir 133 __SYSCALL(__NR_fchdir, sys_fchdir) #define __NR_bdflush 134 -__SYSCALL(__NR_bdflush, sys_bdflush) +__SYSCALL(__NR_bdflush, sys_ni_syscall) #define __NR_sysfs 135 __SYSCALL(__NR_sysfs, sys_sysfs) #define __NR_personality 136 diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index 6d07742c57b8..4b20224b14d9 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -123,7 +123,7 @@ # 1135 was get_kernel_syms # 1136 was query_module 113 common quotactl sys_quotactl -114 common bdflush sys_bdflush +114 common bdflush sys_ni_syscall 115 common sysfs sys_sysfs 116 common personality sys_personality 117 common afs_syscall sys_ni_syscall diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 541bc1b3a8f9..3ec1291c268d 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -141,7 +141,7 @@ 131 common quotactl sys_quotactl 132 common getpgid sys_getpgid 133 common fchdir sys_fchdir -134 common bdflush sys_bdflush +134 common bdflush sys_ni_syscall 135 common sysfs sys_sysfs 136 common personality sys_personality # 137 was afs_syscall diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 
a176faca2927..9be3ace12938 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -141,7 +141,7 @@ 131 common quotactl sys_quotactl 132 common getpgid sys_getpgid 133 common fchdir sys_fchdir -134 common bdflush sys_bdflush +134 common bdflush sys_ni_syscall 135 common sysfs sys_sysfs 136 common personality sys_personality 137 common afs_syscall sys_ni_syscall diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 253f2cd70b6b..fae35882a165 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -145,7 +145,7 @@ 131 o32 quotactl sys_quotactl 132 o32 getpgid sys_getpgid 133 o32 fchdir sys_fchdir -134 o32 bdflush sys_bdflush +134 o32 bdflush sys_ni_syscall 135 o32 sysfs sys_sysfs 136 o32 personality sys_personality sys_32_personality 137 o32 afs_syscall sys_ni_syscall diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index e26187b9ab87..eaf0603ae781 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -147,7 +147,7 @@ 131 common quotactl sys_quotactl 132 common getpgid sys_getpgid 133 common fchdir sys_fchdir -134 common bdflush sys_bdflush +134 common bdflush sys_ni_syscall 135 common sysfs sys_sysfs 136 32 personality parisc_personality 136 64 personality sys_personality diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index aef2a290e71a..6f3953f2a0d5 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -176,7 +176,7 @@ 131 nospu quotactl sys_quotactl 132 common getpgid sys_getpgid 133 common fchdir sys_fchdir -134 common bdflush sys_bdflush +134 common bdflush sys_ni_syscall 135 common sysfs sys_sysfs 136 32 personality sys_personality ppc64_personality 136 64 personality ppc64_personality diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 64d51ab5a8b4..aa705e1bd0dc 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -122,7 +122,7 @@ 131 common quotactl sys_quotactl sys_quotactl 132 common getpgid sys_getpgid sys_getpgid 133 common fchdir sys_fchdir sys_fchdir -134 common bdflush sys_bdflush sys_bdflush +134 common bdflush sys_ni_syscall sys_ni_syscall 135 common sysfs sys_sysfs sys_sysfs 136 common personality sys_s390_personality sys_s390_personality 137 common afs_syscall - - diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index e0a70be77d84..7bbd6700ae4b 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -141,7 +141,7 @@ 131 common quotactl sys_quotactl 132 common getpgid sys_getpgid 133 common fchdir sys_fchdir -134 common bdflush sys_bdflush +134 common bdflush sys_ni_syscall 135 common sysfs sys_sysfs 136 common personality sys_personality # 137 was afs_syscall diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 603f5a821502..f520e9cd2c78 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -270,7 +270,7 @@ 222 common delete_module sys_delete_module 223 common get_kernel_syms sys_ni_syscall 224 common getpgid sys_getpgid -225 common bdflush sys_bdflush +225 common bdflush sys_ni_syscall 226 common sysfs sys_sysfs 227 common afs_syscall sys_nis_syscall 228 common setfsuid 
sys_setfsuid16 diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index ce763a12311c..a5beae6daf20 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -145,7 +145,7 @@ 131 i386 quotactl sys_quotactl 132 i386 getpgid sys_getpgid 133 i386 fchdir sys_fchdir -134 i386 bdflush sys_bdflush +134 i386 bdflush sys_ni_syscall 135 i386 sysfs sys_sysfs 136 i386 personality sys_personality 137 i386 afs_syscall diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 235d67d6ceb4..b3d1bc8a9095 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -223,7 +223,7 @@ # 205 was old nfsservctl 205 common nfsservctl sys_ni_syscall 206 common _sysctl sys_ni_syscall -207 common bdflush sys_bdflush +207 common bdflush sys_ni_syscall 208 common uname sys_newuname 209 common sysinfo sys_sysinfo 210 common init_module sys_init_module diff --git a/fs/buffer.c b/fs/buffer.c index 6290c3afdba4..32718ee13b08 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3267,33 +3267,6 @@ out: } EXPORT_SYMBOL(try_to_free_buffers); -/* - * There are no bdflush tunables left. But distributions are - * still running obsolete flush daemons, so we terminate them here. - * - * Use of bdflush() is deprecated and will be removed in a future kernel. - * The `flush-X' kernel threads fully replace bdflush daemons and this call. - */ -SYSCALL_DEFINE2(bdflush, int, func, long, data) -{ - static int msg_count; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (msg_count < 5) { - msg_count++; - printk(KERN_INFO - "warning: process `%s' used the obsolete bdflush" - " system call\n", current->comm); - printk(KERN_INFO "Fix your initscripts?\n"); - } - - if (func == 1) - do_exit(0); - return 0; -} - /* * Buffer-head allocation */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 69c9a7010081..2b47584eb843 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1158,7 +1158,6 @@ asmlinkage long sys_ustat(unsigned dev, struct ustat __user *ubuf); asmlinkage long sys_vfork(void); asmlinkage long sys_recv(int, void __user *, size_t, unsigned); asmlinkage long sys_send(int, void __user *, size_t, unsigned); -asmlinkage long sys_bdflush(int func, long data); asmlinkage long sys_oldumount(char __user *name); asmlinkage long sys_uselib(const char __user *library); asmlinkage long sys_sysfs(int option, diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index 2ddb4226cd23..463d1ba2232a 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -243,7 +243,6 @@ struct vfs_ns_cap_data { /* Allow examination and configuration of disk quotas */ /* Allow setting the domainname */ /* Allow setting the hostname */ -/* Allow calling bdflush() */ /* Allow mount() and umount(), setting up new smb connection */ /* Allow some autofs root ioctls */ /* Allow nfsservctl */ diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 30971b1dd4a9..cb6f98f5c97a 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -416,7 +416,6 @@ COND_SYSCALL(epoll_wait); COND_SYSCALL(recv); COND_SYSCALL_COMPAT(recv); COND_SYSCALL(send); -COND_SYSCALL(bdflush); COND_SYSCALL(uselib); /* optional: time32 */ diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl index aef2a290e71a..6f3953f2a0d5 100644 --- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +++ 
b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl @@ -176,7 +176,7 @@ 131 nospu quotactl sys_quotactl 132 common getpgid sys_getpgid 133 common fchdir sys_fchdir -134 common bdflush sys_bdflush +134 common bdflush sys_ni_syscall 135 common sysfs sys_sysfs 136 32 personality sys_personality ppc64_personality 136 64 personality ppc64_personality diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl index 64d51ab5a8b4..8d619ec86dcc 100644 --- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl @@ -122,7 +122,7 @@ 131 common quotactl sys_quotactl sys_quotactl 132 common getpgid sys_getpgid sys_getpgid 133 common fchdir sys_fchdir sys_fchdir -134 common bdflush sys_bdflush sys_bdflush +134 common bdflush - - 135 common sysfs sys_sysfs sys_sysfs 136 common personality sys_s390_personality sys_s390_personality 137 common afs_syscall - - -- cgit v1.2.3-71-gd317 From 289f5a72009b8f67334c9f911f7f5fe6e8a80049 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Mon, 5 Jul 2021 14:53:07 +0100 Subject: drm/i915/uapi: convert drm_i915_gem_caching to kernel doc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert all the drm_i915_gem_caching bits to proper kernel doc. Suggested-by: Daniel Vetter Signed-off-by: Matthew Auld Cc: Thomas Hellström Cc: Maarten Lankhorst Cc: Tvrtko Ursulin Cc: Jordan Justen Cc: Kenneth Graunke Cc: Jason Ekstrand Cc: Daniel Vetter Cc: Ramalingam C Reviewed-by: Ramalingam C Link: https://patchwork.freedesktop.org/patch/msgid/20210705135310.1502437-2-matthew.auld@intel.com --- include/uapi/drm/i915_drm.h | 65 +++++++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 29 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index e334a8b14ef2..9078ded93558 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -1363,43 +1363,50 @@ struct drm_i915_gem_busy { }; /** - * I915_CACHING_NONE + * struct drm_i915_gem_caching - Set or get the caching for given object + * handle. * - * GPU access is not coherent with cpu caches. Default for machines without an - * LLC. + * Allow userspace to control the GTT caching bits for a given object when the + * object is later mapped through the ppGTT(or GGTT on older platforms lacking + * ppGTT support, or if the object is used for scanout). Note that this might + * require unbinding the object from the GTT first, if its current caching value + * doesn't match. */ -#define I915_CACHING_NONE 0 -/** - * I915_CACHING_CACHED - * - * GPU access is coherent with cpu caches and furthermore the data is cached in - * last-level caches shared between cpu cores and the gpu GT. Default on - * machines with HAS_LLC. - */ -#define I915_CACHING_CACHED 1 -/** - * I915_CACHING_DISPLAY - * - * Special GPU caching mode which is coherent with the scanout engines. - * Transparently falls back to I915_CACHING_NONE on platforms where no special - * cache mode (like write-through or gfdt flushing) is available. The kernel - * automatically sets this mode when using a buffer as a scanout target. - * Userspace can manually set this mode to avoid a costly stall and clflush in - * the hotpath of drawing the first frame. - */ -#define I915_CACHING_DISPLAY 2 - struct drm_i915_gem_caching { /** - * Handle of the buffer to set/get the caching level of. */ + * @handle: Handle of the buffer to set/get the caching level. 
+ */ __u32 handle; /** - * Cacheing level to apply or return value + * @caching: The GTT caching level to apply or possible return value. * - * bits0-15 are for generic caching control (i.e. the above defined - * values). bits16-31 are reserved for platform-specific variations - * (e.g. l3$ caching on gen7). */ + * The supported @caching values: + * + * I915_CACHING_NONE: + * + * GPU access is not coherent with CPU caches. Default for machines + * without an LLC. This means manual flushing might be needed, if we + * want GPU access to be coherent. + * + * I915_CACHING_CACHED: + * + * GPU access is coherent with CPU caches and furthermore the data is + * cached in last-level caches shared between CPU cores and the GPU GT. + * + * I915_CACHING_DISPLAY: + * + * Special GPU caching mode which is coherent with the scanout engines. + * Transparently falls back to I915_CACHING_NONE on platforms where no + * special cache mode (like write-through or gfdt flushing) is + * available. The kernel automatically sets this mode when using a + * buffer as a scanout target. Userspace can manually set this mode to + * avoid a costly stall and clflush in the hotpath of drawing the first + * frame. + */ +#define I915_CACHING_NONE 0 +#define I915_CACHING_CACHED 1 +#define I915_CACHING_DISPLAY 2 __u32 caching; }; -- cgit v1.2.3-71-gd317 From 3aa8c57fe25a9247e25977f1c2302395cbbd8242 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Mon, 5 Jul 2021 14:53:09 +0100 Subject: drm/i915/uapi: convert drm_i915_gem_set_domain to kernel doc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert all the drm_i915_gem_set_domain bits to proper kernel doc. Suggested-by: Daniel Vetter Signed-off-by: Matthew Auld Cc: Thomas Hellström Cc: Maarten Lankhorst Cc: Tvrtko Ursulin Cc: Jordan Justen Cc: Kenneth Graunke Cc: Jason Ekstrand Cc: Daniel Vetter Cc: Ramalingam C Reviewed-by: Ramalingam C Link: https://patchwork.freedesktop.org/patch/msgid/20210705135310.1502437-4-matthew.auld@intel.com --- include/uapi/drm/i915_drm.h | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 9078ded93558..e54f9efaead0 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -880,14 +880,38 @@ struct drm_i915_gem_mmap_offset { __u64 extensions; }; +/** + * struct drm_i915_gem_set_domain - Adjust the objects write or read domain, in + * preparation for accessing the pages via some CPU domain. + * + * Specifying a new write or read domain will flush the object out of the + * previous domain(if required), before then updating the objects domain + * tracking with the new domain. + * + * Note this might involve waiting for the object first if it is still active on + * the GPU. + * + * Supported values for @read_domains and @write_domain: + * + * - I915_GEM_DOMAIN_WC: Uncached write-combined domain + * - I915_GEM_DOMAIN_CPU: CPU cache domain + * - I915_GEM_DOMAIN_GTT: Mappable aperture domain + * + * All other domains are rejected. + */ struct drm_i915_gem_set_domain { - /** Handle for the object */ + /** @handle: Handle for the object. */ __u32 handle; - /** New read domains */ + /** @read_domains: New read domains. */ __u32 read_domains; - /** New write domain */ + /** + * @write_domain: New write domain. + * + * Note that having something in the write domain implies it's in the + * read domain, and only that read domain. 
+ */ __u32 write_domain; }; -- cgit v1.2.3-71-gd317 From f170acda7ffaf0473d06e1e17c12cd9fd63904f5 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 14 Jul 2021 21:43:17 +0900 Subject: bpf: Fix a typo of reuseport map in bpf.h. Fix s/BPF_MAP_TYPE_REUSEPORT_ARRAY/BPF_MAP_TYPE_REUSEPORT_SOCKARRAY/ typo in bpf.h. Fixes: 2dbb9b9e6df6 ("bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210714124317.67526-1-kuniyu@amazon.co.jp --- include/uapi/linux/bpf.h | 2 +- tools/include/uapi/linux/bpf.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b46a383e8db7..bafb6282032b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3246,7 +3246,7 @@ union bpf_attr { * long bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) * Description * Select a **SO_REUSEPORT** socket from a - * **BPF_MAP_TYPE_REUSEPORT_ARRAY** *map*. + * **BPF_MAP_TYPE_REUSEPORT_SOCKARRAY** *map*. * It checks the selected socket is matching the incoming * request in the socket buffer. * Return diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b46a383e8db7..bafb6282032b 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3246,7 +3246,7 @@ union bpf_attr { * long bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) * Description * Select a **SO_REUSEPORT** socket from a - * **BPF_MAP_TYPE_REUSEPORT_ARRAY** *map*. + * **BPF_MAP_TYPE_REUSEPORT_SOCKARRAY** *map*. * It checks the selected socket is matching the incoming * request in the socket buffer. * Return -- cgit v1.2.3-71-gd317 From b00628b1c7d595ae5b544e059c27b1f5828314b4 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 14 Jul 2021 17:54:09 -0700 Subject: bpf: Introduce bpf timers. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce 'struct bpf_timer { __u64 :64; __u64 :64; };' that can be embedded in hash/array/lru maps as a regular field and helpers to operate on it: // Initialize the timer. // First 4 bits of 'flags' specify clockid. // Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed. long bpf_timer_init(struct bpf_timer *timer, struct bpf_map *map, int flags); // Configure the timer to call 'callback_fn' static function. long bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn); // Arm the timer to expire 'nsec' nanoseconds from the current time. long bpf_timer_start(struct bpf_timer *timer, u64 nsec, u64 flags); // Cancel the timer and wait for callback_fn to finish if it was running. long bpf_timer_cancel(struct bpf_timer *timer); Here is how BPF program might look like: struct map_elem { int counter; struct bpf_timer timer; }; struct { __uint(type, BPF_MAP_TYPE_HASH); __uint(max_entries, 1000); __type(key, int); __type(value, struct map_elem); } hmap SEC(".maps"); static int timer_cb(void *map, int *key, struct map_elem *val); /* val points to particular map element that contains bpf_timer. 
*/ SEC("fentry/bpf_fentry_test1") int BPF_PROG(test1, int a) { struct map_elem *val; int key = 0; val = bpf_map_lookup_elem(&hmap, &key); if (val) { bpf_timer_init(&val->timer, &hmap, CLOCK_REALTIME); bpf_timer_set_callback(&val->timer, timer_cb); bpf_timer_start(&val->timer, 1000 /* call timer_cb2 in 1 usec */, 0); } } This patch adds helper implementations that rely on hrtimers to call bpf functions as timers expire. The following patches add necessary safety checks. Only programs with CAP_BPF are allowed to use bpf_timer. The amount of timers used by the program is constrained by the memcg recorded at map creation time. The bpf_timer_init() helper needs explicit 'map' argument because inner maps are dynamic and not known at load time. While the bpf_timer_set_callback() is receiving hidden 'aux->prog' argument supplied by the verifier. The prog pointer is needed to do refcnting of bpf program to make sure that program doesn't get freed while the timer is armed. This approach relies on "user refcnt" scheme used in prog_array that stores bpf programs for bpf_tail_call. The bpf_timer_set_callback() will increment the prog refcnt which is paired with bpf_timer_cancel() that will drop the prog refcnt. The ops->map_release_uref is responsible for cancelling the timers and dropping prog refcnt when user space reference to a map reaches zero. This uref approach is done to make sure that Ctrl-C of user space process will not leave timers running forever unless the user space explicitly pinned a map that contained timers in bpffs. bpf_timer_init() and bpf_timer_set_callback() will return -EPERM if map doesn't have user references (is not held by open file descriptor from user space and not pinned in bpffs). The bpf_map_delete_elem() and bpf_map_update_elem() operations cancel and free the timer if given map element had it allocated. "bpftool map update" command can be used to cancel timers. The 'struct bpf_timer' is explicitly __attribute__((aligned(8))) because '__u64 :64' has 1 byte alignment of 8 byte padding. 
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210715005417.78572-4-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 3 + include/uapi/linux/bpf.h | 73 ++++++++++ kernel/bpf/helpers.c | 324 +++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 109 ++++++++++++++ kernel/trace/bpf_trace.c | 2 +- scripts/bpf_doc.py | 2 + tools/include/uapi/linux/bpf.h | 73 ++++++++++ 7 files changed, 585 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4afbff308ca3..125240b7cefb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -168,6 +168,7 @@ struct bpf_map { u32 max_entries; u32 map_flags; int spin_lock_off; /* >=0 valid offset, <0 error */ + int timer_off; /* >=0 valid offset, <0 error */ u32 id; int numa_node; u32 btf_key_type_id; @@ -221,6 +222,7 @@ static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) } void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); +void bpf_timer_cancel_and_free(void *timer); int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size); struct bpf_offload_dev; @@ -314,6 +316,7 @@ enum bpf_arg_type { ARG_PTR_TO_FUNC, /* pointer to a bpf program function */ ARG_PTR_TO_STACK_OR_NULL, /* pointer to stack or NULL */ ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */ + ARG_PTR_TO_TIMER, /* pointer to bpf_timer */ __BPF_ARG_TYPE_MAX, }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bafb6282032b..3544ec5234f0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4777,6 +4777,70 @@ union bpf_attr { * Execute close syscall for given FD. * Return * A syscall result. + * + * long bpf_timer_init(struct bpf_timer *timer, struct bpf_map *map, u64 flags) + * Description + * Initialize the timer. + * First 4 bits of *flags* specify clockid. + * Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed. + * All other bits of *flags* are reserved. + * The verifier will reject the program if *timer* is not from + * the same *map*. + * Return + * 0 on success. + * **-EBUSY** if *timer* is already initialized. + * **-EINVAL** if invalid *flags* are passed. + * **-EPERM** if *timer* is in a map that doesn't have any user references. + * The user space should either hold a file descriptor to a map with timers + * or pin such map in bpffs. When map is unpinned or file descriptor is + * closed all timers in the map will be cancelled and freed. + * + * long bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn) + * Description + * Configure the timer to call *callback_fn* static function. + * Return + * 0 on success. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. + * **-EPERM** if *timer* is in a map that doesn't have any user references. + * The user space should either hold a file descriptor to a map with timers + * or pin such map in bpffs. When map is unpinned or file descriptor is + * closed all timers in the map will be cancelled and freed. + * + * long bpf_timer_start(struct bpf_timer *timer, u64 nsecs, u64 flags) + * Description + * Set timer expiration N nanoseconds from the current time. The + * configured callback will be invoked in soft irq context on some cpu + * and will not repeat unless another bpf_timer_start() is made. 
+ * In such case the next invocation can migrate to a different cpu. + * Since struct bpf_timer is a field inside map element the map + * owns the timer. The bpf_timer_set_callback() will increment refcnt + * of BPF program to make sure that callback_fn code stays valid. + * When user space reference to a map reaches zero all timers + * in a map are cancelled and corresponding program's refcnts are + * decremented. This is done to make sure that Ctrl-C of a user + * process doesn't leave any timers running. If map is pinned in + * bpffs the callback_fn can re-arm itself indefinitely. + * bpf_map_update/delete_elem() helpers and user space sys_bpf commands + * cancel and free the timer in the given map element. + * The map can contain timers that invoke callback_fn-s from different + * programs. The same callback_fn can serve different timers from + * different maps if key/value layout matches across maps. + * Every bpf_timer_set_callback() can have different callback_fn. + * + * Return + * 0 on success. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier + * or invalid *flags* are passed. + * + * long bpf_timer_cancel(struct bpf_timer *timer) + * Description + * Cancel the timer and wait for callback_fn to finish if it was running. + * Return + * 0 if the timer was not active. + * 1 if the timer was active. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. + * **-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its + * own timer which would have led to a deadlock otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4948,6 +5012,10 @@ union bpf_attr { FN(sys_bpf), \ FN(btf_find_by_name_kind), \ FN(sys_close), \ + FN(timer_init), \ + FN(timer_set_callback), \ + FN(timer_start), \ + FN(timer_cancel), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -6074,6 +6142,11 @@ struct bpf_spin_lock { __u32 val; }; +struct bpf_timer { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + struct bpf_sysctl { __u32 write; /* Sysctl is being read (= 0) or written (= 1). * Allows 1,2,4-byte read, but no write. diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 38be3cfc2f58..74b16593983d 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -999,6 +999,322 @@ const struct bpf_func_proto bpf_snprintf_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +/* BPF map elements can contain 'struct bpf_timer'. + * Such map owns all of its BPF timers. + * 'struct bpf_timer' is allocated as part of map element allocation + * and it's zero initialized. + * That space is used to keep 'struct bpf_timer_kern'. + * bpf_timer_init() allocates 'struct bpf_hrtimer', inits hrtimer, and + * remembers 'struct bpf_map *' pointer it's part of. + * bpf_timer_set_callback() increments prog refcnt and assign bpf callback_fn. + * bpf_timer_start() arms the timer. + * If user space reference to a map goes to zero at this point + * ops->map_release_uref callback is responsible for cancelling the timers, + * freeing their memory, and decrementing prog's refcnts. + * bpf_timer_cancel() cancels the timer and decrements prog's refcnt. + * Inner maps can contain bpf timers as well. ops->map_release_uref is + * freeing the timers when inner map is replaced or deleted by user space. 
+ */ +struct bpf_hrtimer { + struct hrtimer timer; + struct bpf_map *map; + struct bpf_prog *prog; + void __rcu *callback_fn; + void *value; +}; + +/* the actual struct hidden inside uapi struct bpf_timer */ +struct bpf_timer_kern { + struct bpf_hrtimer *timer; + /* bpf_spin_lock is used here instead of spinlock_t to make + * sure that it always fits into space resereved by struct bpf_timer + * regardless of LOCKDEP and spinlock debug flags. + */ + struct bpf_spin_lock lock; +} __attribute__((aligned(8))); + +static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running); + +static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer) +{ + struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer); + struct bpf_map *map = t->map; + void *value = t->value; + void *callback_fn; + void *key; + u32 idx; + int ret; + + callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held()); + if (!callback_fn) + goto out; + + /* bpf_timer_cb() runs in hrtimer_run_softirq. It doesn't migrate and + * cannot be preempted by another bpf_timer_cb() on the same cpu. + * Remember the timer this callback is servicing to prevent + * deadlock if callback_fn() calls bpf_timer_cancel() or + * bpf_map_delete_elem() on the same timer. + */ + this_cpu_write(hrtimer_running, t); + if (map->map_type == BPF_MAP_TYPE_ARRAY) { + struct bpf_array *array = container_of(map, struct bpf_array, map); + + /* compute the key */ + idx = ((char *)value - array->value) / array->elem_size; + key = &idx; + } else { /* hash or lru */ + key = value - round_up(map->key_size, 8); + } + + ret = BPF_CAST_CALL(callback_fn)((u64)(long)map, + (u64)(long)key, + (u64)(long)value, 0, 0); + WARN_ON(ret != 0); /* Next patch moves this check into the verifier */ + + this_cpu_write(hrtimer_running, NULL); +out: + return HRTIMER_NORESTART; +} + +BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map, + u64, flags) +{ + clockid_t clockid = flags & (MAX_CLOCKS - 1); + struct bpf_hrtimer *t; + int ret = 0; + + BUILD_BUG_ON(MAX_CLOCKS != 16); + BUILD_BUG_ON(sizeof(struct bpf_timer_kern) > sizeof(struct bpf_timer)); + BUILD_BUG_ON(__alignof__(struct bpf_timer_kern) != __alignof__(struct bpf_timer)); + + if (in_nmi()) + return -EOPNOTSUPP; + + if (flags >= MAX_CLOCKS || + /* similar to timerfd except _ALARM variants are not supported */ + (clockid != CLOCK_MONOTONIC && + clockid != CLOCK_REALTIME && + clockid != CLOCK_BOOTTIME)) + return -EINVAL; + __bpf_spin_lock_irqsave(&timer->lock); + t = timer->timer; + if (t) { + ret = -EBUSY; + goto out; + } + if (!atomic64_read(&map->usercnt)) { + /* maps with timers must be either held by user space + * or pinned in bpffs. 
+ */ + ret = -EPERM; + goto out; + } + /* allocate hrtimer via map_kmalloc to use memcg accounting */ + t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, map->numa_node); + if (!t) { + ret = -ENOMEM; + goto out; + } + t->value = (void *)timer - map->timer_off; + t->map = map; + t->prog = NULL; + rcu_assign_pointer(t->callback_fn, NULL); + hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); + t->timer.function = bpf_timer_cb; + timer->timer = t; +out: + __bpf_spin_unlock_irqrestore(&timer->lock); + return ret; +} + +static const struct bpf_func_proto bpf_timer_init_proto = { + .func = bpf_timer_init, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_TIMER, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callback_fn, + struct bpf_prog_aux *, aux) +{ + struct bpf_prog *prev, *prog = aux->prog; + struct bpf_hrtimer *t; + int ret = 0; + + if (in_nmi()) + return -EOPNOTSUPP; + __bpf_spin_lock_irqsave(&timer->lock); + t = timer->timer; + if (!t) { + ret = -EINVAL; + goto out; + } + if (!atomic64_read(&t->map->usercnt)) { + /* maps with timers must be either held by user space + * or pinned in bpffs. Otherwise timer might still be + * running even when bpf prog is detached and user space + * is gone, since map_release_uref won't ever be called. + */ + ret = -EPERM; + goto out; + } + prev = t->prog; + if (prev != prog) { + /* Bump prog refcnt once. Every bpf_timer_set_callback() + * can pick different callback_fn-s within the same prog. + */ + prog = bpf_prog_inc_not_zero(prog); + if (IS_ERR(prog)) { + ret = PTR_ERR(prog); + goto out; + } + if (prev) + /* Drop prev prog refcnt when swapping with new prog */ + bpf_prog_put(prev); + t->prog = prog; + } + rcu_assign_pointer(t->callback_fn, callback_fn); +out: + __bpf_spin_unlock_irqrestore(&timer->lock); + return ret; +} + +static const struct bpf_func_proto bpf_timer_set_callback_proto = { + .func = bpf_timer_set_callback, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_TIMER, + .arg2_type = ARG_PTR_TO_FUNC, +}; + +BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, flags) +{ + struct bpf_hrtimer *t; + int ret = 0; + + if (in_nmi()) + return -EOPNOTSUPP; + if (flags) + return -EINVAL; + __bpf_spin_lock_irqsave(&timer->lock); + t = timer->timer; + if (!t || !t->prog) { + ret = -EINVAL; + goto out; + } + hrtimer_start(&t->timer, ns_to_ktime(nsecs), HRTIMER_MODE_REL_SOFT); +out: + __bpf_spin_unlock_irqrestore(&timer->lock); + return ret; +} + +static const struct bpf_func_proto bpf_timer_start_proto = { + .func = bpf_timer_start, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_TIMER, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +static void drop_prog_refcnt(struct bpf_hrtimer *t) +{ + struct bpf_prog *prog = t->prog; + + if (prog) { + bpf_prog_put(prog); + t->prog = NULL; + rcu_assign_pointer(t->callback_fn, NULL); + } +} + +BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer) +{ + struct bpf_hrtimer *t; + int ret = 0; + + if (in_nmi()) + return -EOPNOTSUPP; + __bpf_spin_lock_irqsave(&timer->lock); + t = timer->timer; + if (!t) { + ret = -EINVAL; + goto out; + } + if (this_cpu_read(hrtimer_running) == t) { + /* If bpf callback_fn is trying to bpf_timer_cancel() + * its own timer the hrtimer_cancel() will deadlock + * since it waits for callback_fn to finish + */ + ret = -EDEADLK; + goto out; + } + drop_prog_refcnt(t); +out: + 
__bpf_spin_unlock_irqrestore(&timer->lock); + /* Cancel the timer and wait for associated callback to finish + * if it was running. + */ + ret = ret ?: hrtimer_cancel(&t->timer); + return ret; +} + +static const struct bpf_func_proto bpf_timer_cancel_proto = { + .func = bpf_timer_cancel, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_TIMER, +}; + +/* This function is called by map_delete/update_elem for individual element and + * by ops->map_release_uref when the user space reference to a map reaches zero. + */ +void bpf_timer_cancel_and_free(void *val) +{ + struct bpf_timer_kern *timer = val; + struct bpf_hrtimer *t; + + /* Performance optimization: read timer->timer without lock first. */ + if (!READ_ONCE(timer->timer)) + return; + + __bpf_spin_lock_irqsave(&timer->lock); + /* re-read it under lock */ + t = timer->timer; + if (!t) + goto out; + drop_prog_refcnt(t); + /* The subsequent bpf_timer_start/cancel() helpers won't be able to use + * this timer, since it won't be initialized. + */ + timer->timer = NULL; +out: + __bpf_spin_unlock_irqrestore(&timer->lock); + if (!t) + return; + /* Cancel the timer and wait for callback to complete if it was running. + * If hrtimer_cancel() can be safely called it's safe to call kfree(t) + * right after for both preallocated and non-preallocated maps. + * The timer->timer = NULL was already done and no code path can + * see address 't' anymore. + * + * Check that bpf_map_delete/update_elem() wasn't called from timer + * callback_fn. In such case don't call hrtimer_cancel() (since it will + * deadlock) and don't call hrtimer_try_to_cancel() (since it will just + * return -1). Though callback_fn is still running on this cpu it's + * safe to do kfree(t) because bpf_timer_cb() read everything it needed + * from 't'. The bpf subprog callback_fn won't be able to access 't', + * since timer->timer = NULL was already done. The timer will be + * effectively cancelled because bpf_timer_cb() will return + * HRTIMER_NORESTART. + */ + if (this_cpu_read(hrtimer_running) != t) + hrtimer_cancel(&t->timer); + kfree(t); +} + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; @@ -1065,6 +1381,14 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_per_cpu_ptr_proto; case BPF_FUNC_this_cpu_ptr: return &bpf_this_cpu_ptr_proto; + case BPF_FUNC_timer_init: + return &bpf_timer_init_proto; + case BPF_FUNC_timer_set_callback: + return &bpf_timer_set_callback_proto; + case BPF_FUNC_timer_start: + return &bpf_timer_start_proto; + case BPF_FUNC_timer_cancel: + return &bpf_timer_cancel_proto; default: break; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3dbb3b40b754..e8645c819803 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4656,6 +4656,38 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, return 0; } +static int process_timer_func(struct bpf_verifier_env *env, int regno, + struct bpf_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + bool is_const = tnum_is_const(reg->var_off); + struct bpf_map *map = reg->map_ptr; + u64 val = reg->var_off.value; + + if (!is_const) { + verbose(env, + "R%d doesn't have constant offset. 
bpf_timer has to be at the constant offset\n", + regno); + return -EINVAL; + } + if (!map->btf) { + verbose(env, "map '%s' has to have BTF in order to use bpf_timer\n", + map->name); + return -EINVAL; + } + if (val) { + /* This restriction will be removed in the next patch */ + verbose(env, "bpf_timer field can only be first in the map value element\n"); + return -EINVAL; + } + if (meta->map_ptr) { + verbose(env, "verifier bug. Two map pointers in a timer helper\n"); + return -EFAULT; + } + meta->map_ptr = map; + return 0; +} + static bool arg_type_is_mem_ptr(enum bpf_arg_type type) { return type == ARG_PTR_TO_MEM || @@ -4788,6 +4820,7 @@ static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PER static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } }; static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } }; static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } }; +static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, @@ -4819,6 +4852,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_FUNC] = &func_ptr_types, [ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types, [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types, + [ARG_PTR_TO_TIMER] = &timer_types, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, @@ -4948,6 +4982,10 @@ skip_type_check: if (arg_type == ARG_CONST_MAP_PTR) { /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ + if (meta->map_ptr && meta->map_ptr != reg->map_ptr) { + verbose(env, "Map pointer doesn't match bpf_timer.\n"); + return -EINVAL; + } meta->map_ptr = reg->map_ptr; } else if (arg_type == ARG_PTR_TO_MAP_KEY) { /* bpf_map_xxx(..., map_ptr, ..., key) call: @@ -5000,6 +5038,9 @@ skip_type_check: verbose(env, "verifier internal error\n"); return -EFAULT; } + } else if (arg_type == ARG_PTR_TO_TIMER) { + if (process_timer_func(env, regno, meta)) + return -EACCES; } else if (arg_type == ARG_PTR_TO_FUNC) { meta->subprogno = reg->subprogno; } else if (arg_type_is_mem_ptr(arg_type)) { @@ -5742,6 +5783,34 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env, return 0; } +static int set_timer_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + struct bpf_map *map_ptr = caller->regs[BPF_REG_1].map_ptr; + + /* bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn); + * callback_fn(struct bpf_map *map, void *key, void *value); + */ + callee->regs[BPF_REG_1].type = CONST_PTR_TO_MAP; + __mark_reg_known_zero(&callee->regs[BPF_REG_1]); + callee->regs[BPF_REG_1].map_ptr = map_ptr; + + callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY; + __mark_reg_known_zero(&callee->regs[BPF_REG_2]); + callee->regs[BPF_REG_2].map_ptr = map_ptr; + + callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE; + __mark_reg_known_zero(&callee->regs[BPF_REG_3]); + callee->regs[BPF_REG_3].map_ptr = map_ptr; + + /* unused */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + return 0; +} + static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; @@ -6069,6 +6138,13 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EINVAL; } + if (func_id 
== BPF_FUNC_timer_set_callback) { + err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, + set_timer_callback_state); + if (err < 0) + return -EINVAL; + } + if (func_id == BPF_FUNC_snprintf) { err = check_bpf_snprintf_call(env, regs); if (err < 0) @@ -12591,6 +12667,39 @@ static int do_misc_fixups(struct bpf_verifier_env *env) continue; } + if (insn->imm == BPF_FUNC_timer_set_callback) { + /* The verifier will process callback_fn as many times as necessary + * with different maps and the register states prepared by + * set_timer_callback_state will be accurate. + * + * The following use case is valid: + * map1 is shared by prog1, prog2, prog3. + * prog1 calls bpf_timer_init for some map1 elements + * prog2 calls bpf_timer_set_callback for some map1 elements. + * Those that were not bpf_timer_init-ed will return -EINVAL. + * prog3 calls bpf_timer_start for some map1 elements. + * Those that were not both bpf_timer_init-ed and + * bpf_timer_set_callback-ed will return -EINVAL. + */ + struct bpf_insn ld_addrs[2] = { + BPF_LD_IMM64(BPF_REG_3, (long)prog->aux), + }; + + insn_buf[0] = ld_addrs[0]; + insn_buf[1] = ld_addrs[1]; + insn_buf[2] = *insn; + cnt = 3; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto patch_call_imm; + } + /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup * and other inlining handlers are currently limited to 64 bit * only. diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 64bd2d84367f..6c77d25137e0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1059,7 +1059,7 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_snprintf: return &bpf_snprintf_proto; default: - return NULL; + return bpf_base_func_proto(func_id); } } diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py index 2d94025b38e9..00ac7b79cddb 100755 --- a/scripts/bpf_doc.py +++ b/scripts/bpf_doc.py @@ -547,6 +547,7 @@ class PrinterHelpers(Printer): 'struct inode', 'struct socket', 'struct file', + 'struct bpf_timer', ] known_types = { '...', @@ -594,6 +595,7 @@ class PrinterHelpers(Printer): 'struct inode', 'struct socket', 'struct file', + 'struct bpf_timer', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index bafb6282032b..3544ec5234f0 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4777,6 +4777,70 @@ union bpf_attr { * Execute close syscall for given FD. * Return * A syscall result. + * + * long bpf_timer_init(struct bpf_timer *timer, struct bpf_map *map, u64 flags) + * Description + * Initialize the timer. + * First 4 bits of *flags* specify clockid. + * Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed. + * All other bits of *flags* are reserved. + * The verifier will reject the program if *timer* is not from + * the same *map*. + * Return + * 0 on success. + * **-EBUSY** if *timer* is already initialized. + * **-EINVAL** if invalid *flags* are passed. + * **-EPERM** if *timer* is in a map that doesn't have any user references. + * The user space should either hold a file descriptor to a map with timers + * or pin such map in bpffs. When map is unpinned or file descriptor is + * closed all timers in the map will be cancelled and freed. 
+ * + * long bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn) + * Description + * Configure the timer to call *callback_fn* static function. + * Return + * 0 on success. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. + * **-EPERM** if *timer* is in a map that doesn't have any user references. + * The user space should either hold a file descriptor to a map with timers + * or pin such map in bpffs. When map is unpinned or file descriptor is + * closed all timers in the map will be cancelled and freed. + * + * long bpf_timer_start(struct bpf_timer *timer, u64 nsecs, u64 flags) + * Description + * Set timer expiration N nanoseconds from the current time. The + * configured callback will be invoked in soft irq context on some cpu + * and will not repeat unless another bpf_timer_start() is made. + * In such case the next invocation can migrate to a different cpu. + * Since struct bpf_timer is a field inside map element the map + * owns the timer. The bpf_timer_set_callback() will increment refcnt + * of BPF program to make sure that callback_fn code stays valid. + * When user space reference to a map reaches zero all timers + * in a map are cancelled and corresponding program's refcnts are + * decremented. This is done to make sure that Ctrl-C of a user + * process doesn't leave any timers running. If map is pinned in + * bpffs the callback_fn can re-arm itself indefinitely. + * bpf_map_update/delete_elem() helpers and user space sys_bpf commands + * cancel and free the timer in the given map element. + * The map can contain timers that invoke callback_fn-s from different + * programs. The same callback_fn can serve different timers from + * different maps if key/value layout matches across maps. + * Every bpf_timer_set_callback() can have different callback_fn. + * + * Return + * 0 on success. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier + * or invalid *flags* are passed. + * + * long bpf_timer_cancel(struct bpf_timer *timer) + * Description + * Cancel the timer and wait for callback_fn to finish if it was running. + * Return + * 0 if the timer was not active. + * 1 if the timer was active. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. + * **-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its + * own timer which would have led to a deadlock otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4948,6 +5012,10 @@ union bpf_attr { FN(sys_bpf), \ FN(btf_find_by_name_kind), \ FN(sys_close), \ + FN(timer_init), \ + FN(timer_set_callback), \ + FN(timer_start), \ + FN(timer_cancel), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -6074,6 +6142,11 @@ struct bpf_spin_lock { __u32 val; }; +struct bpf_timer { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + struct bpf_sysctl { __u32 write; /* Sysctl is being read (= 0) or written (= 1). * Allows 1,2,4-byte read, but no write. -- cgit v1.2.3-71-gd317 From 9b99edcae5c80c8fb9f8e7149bae528c9e610a72 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 14 Jul 2021 11:43:55 +0200 Subject: bpf: Add bpf_get_func_ip helper for tracing programs Adding bpf_get_func_ip helper for BPF_PROG_TYPE_TRACING programs, specifically for all trampoline attach types. The trampoline's caller IP address is stored in (ctx - 8) address. so there's no reason to actually call the helper, but rather fixup the call instruction and return [ctx - 8] value directly. 
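
For illustration only, a minimal sketch of a tracing program that could use
the new helper (a hypothetical example assuming the usual SEC()/BPF_PROG()/
bpf_printk() macros from libbpf's bpf_helpers.h and bpf_tracing.h; the fentry
target chosen here is arbitrary):

char LICENSE[] SEC("license") = "GPL"; /* the helper proto is gpl_only */

SEC("fentry/bpf_fentry_test1")
int BPF_PROG(fentry_ip, int a)
{
	/* Inlined by the verifier into a single load of *(u64 *)(ctx - 8). */
	__u64 ip = bpf_get_func_ip(ctx);

	bpf_printk("traced function entered at %llx", ip);
	return 0;
}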
Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210714094400.396467-4-jolsa@kernel.org --- include/uapi/linux/bpf.h | 7 +++++++ kernel/bpf/verifier.c | 43 ++++++++++++++++++++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 15 +++++++++++++++ tools/include/uapi/linux/bpf.h | 7 +++++++ 4 files changed, 72 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3544ec5234f0..89688f16ad60 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4841,6 +4841,12 @@ union bpf_attr { * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. * **-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its * own timer which would have led to a deadlock otherwise. + * + * u64 bpf_get_func_ip(void *ctx) + * Description + * Get address of the traced function (for tracing programs). + * Return + * Address of the traced function. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5016,6 +5022,7 @@ union bpf_attr { FN(timer_set_callback), \ FN(timer_start), \ FN(timer_cancel), \ + FN(get_func_ip), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 344ee67265cc..ceef190514e4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6161,6 +6161,27 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env, return err; } +static int check_get_func_ip(struct bpf_verifier_env *env) +{ + enum bpf_attach_type eatype = env->prog->expected_attach_type; + enum bpf_prog_type type = resolve_prog_type(env->prog); + int func_id = BPF_FUNC_get_func_ip; + + if (type == BPF_PROG_TYPE_TRACING) { + if (eatype != BPF_TRACE_FENTRY && eatype != BPF_TRACE_FEXIT && + eatype != BPF_MODIFY_RETURN) { + verbose(env, "func %s#%d supported only for fentry/fexit/fmod_ret programs\n", + func_id_name(func_id), func_id); + return -ENOTSUPP; + } + return 0; + } + + verbose(env, "func %s#%d not supported for program type %d\n", + func_id_name(func_id), func_id, type); + return -ENOTSUPP; +} + static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { @@ -6439,6 +6460,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (func_id == BPF_FUNC_get_stackid || func_id == BPF_FUNC_get_stack) env->prog->call_get_stack = true; + if (func_id == BPF_FUNC_get_func_ip) { + if (check_get_func_ip(env)) + return -ENOTSUPP; + env->prog->call_get_func_ip = true; + } + if (changes_data) clear_all_pkt_pointers(env); return 0; @@ -12632,6 +12659,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; bool expect_blinding = bpf_jit_blinding_enabled(prog); + enum bpf_prog_type prog_type = resolve_prog_type(prog); struct bpf_insn *insn = prog->insnsi; const struct bpf_func_proto *fn; const int insn_cnt = prog->len; @@ -12998,6 +13026,21 @@ patch_map_ops_generic: continue; } + /* Implement bpf_get_func_ip inline. 
*/ + if (prog_type == BPF_PROG_TYPE_TRACING && + insn->imm == BPF_FUNC_get_func_ip) { + /* Load IP address from ctx - 8 */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1); + if (!new_prog) + return -ENOMEM; + + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + patch_call_imm: fn = env->ops->get_func_proto(insn->imm, env->prog); /* all functions that have prototype and verifier allowed diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 6c77d25137e0..3e71503eeb23 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -948,6 +948,19 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = { .arg5_type = ARG_ANYTHING, }; +BPF_CALL_1(bpf_get_func_ip_tracing, void *, ctx) +{ + /* This helper call is inlined by verifier. */ + return ((u64 *)ctx)[-1]; +} + +static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = { + .func = bpf_get_func_ip_tracing, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1058,6 +1071,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_for_each_map_elem_proto; case BPF_FUNC_snprintf: return &bpf_snprintf_proto; + case BPF_FUNC_get_func_ip: + return &bpf_get_func_ip_proto_tracing; default: return bpf_base_func_proto(func_id); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3544ec5234f0..89688f16ad60 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4841,6 +4841,12 @@ union bpf_attr { * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. * **-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its * own timer which would have led to a deadlock otherwise. + * + * u64 bpf_get_func_ip(void *ctx) + * Description + * Get address of the traced function (for tracing programs). + * Return + * Address of the traced function. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5016,6 +5022,7 @@ union bpf_attr { FN(timer_set_callback), \ FN(timer_start), \ FN(timer_cancel), \ + FN(get_func_ip), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3-71-gd317 From 9ffd9f3ff7193933dae171740ab70a103d460065 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 14 Jul 2021 11:43:56 +0200 Subject: bpf: Add bpf_get_func_ip helper for kprobe programs Adding bpf_get_func_ip helper for BPF_PROG_TYPE_KPROBE programs, so it's now possible to call bpf_get_func_ip from both kprobe and kretprobe programs. Taking the caller's address from 'struct kprobe::addr', which is defined for both kprobe and kretprobe. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Reviewed-by: Masami Hiramatsu Link: https://lore.kernel.org/bpf/20210714094400.396467-5-jolsa@kernel.org --- include/uapi/linux/bpf.h | 2 +- kernel/bpf/verifier.c | 2 ++ kernel/trace/bpf_trace.c | 16 ++++++++++++++++ tools/include/uapi/linux/bpf.h | 2 +- 4 files changed, 20 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 89688f16ad60..2db6925e04f4 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4844,7 +4844,7 @@ union bpf_attr { * * u64 bpf_get_func_ip(void *ctx) * Description - * Get address of the traced function (for tracing programs). 
+ * Get address of the traced function (for tracing and kprobe programs). * Return * Address of the traced function. */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ceef190514e4..97216f799ba8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6175,6 +6175,8 @@ static int check_get_func_ip(struct bpf_verifier_env *env) return -ENOTSUPP; } return 0; + } else if (type == BPF_PROG_TYPE_KPROBE) { + return 0; } verbose(env, "func %s#%d not supported for program type %d\n", diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3e71503eeb23..0b113716bc7a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -961,6 +961,20 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs) +{ + struct kprobe *kp = kprobe_running(); + + return kp ? (u64) kp->addr : 0; +} + +static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = { + .func = bpf_get_func_ip_kprobe, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1092,6 +1106,8 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_override_return: return &bpf_override_return_proto; #endif + case BPF_FUNC_get_func_ip: + return &bpf_get_func_ip_proto_kprobe; default: return bpf_tracing_func_proto(func_id, prog); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 89688f16ad60..2db6925e04f4 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4844,7 +4844,7 @@ union bpf_attr { * * u64 bpf_get_func_ip(void *ctx) * Description - * Get address of the traced function (for tracing programs). + * Get address of the traced function (for tracing and kprobe programs). * Return * Address of the traced function. */ -- cgit v1.2.3-71-gd317 From b83d23a2a38b1770da0491257ae81d52307f7816 Mon Sep 17 00:00:00 2001 From: Mark Gray Date: Thu, 15 Jul 2021 08:27:54 -0400 Subject: openvswitch: Introduce per-cpu upcall dispatch The Open vSwitch kernel module uses the upcall mechanism to send packets from kernel space to user space when it misses in the kernel space flow table. The upcall sends packets via a Netlink socket. Currently, a Netlink socket is created for every vport. In this way, there is a 1:1 mapping between a vport and a Netlink socket. When a packet is received by a vport, if it needs to be sent to user space, it is sent via the corresponding Netlink socket. This mechanism, with various iterations of the corresponding user space code, has seen some limitations and issues: * On systems with a large number of vports, there is a correspondingly large number of Netlink sockets which can limit scaling. (https://bugzilla.redhat.com/show_bug.cgi?id=1526306) * Packet reordering on upcalls. (https://bugzilla.redhat.com/show_bug.cgi?id=1844576) * A thundering herd issue. (https://bugzilla.redhat.com/show_bug.cgi?id=1834444) This patch introduces an alternative, feature-negotiated, upcall mode using a per-cpu dispatch rather than a per-vport dispatch. In this mode, the Netlink socket to be used for the upcall is selected based on the CPU of the thread that is executing the upcall. In this way, it resolves the issues above as: a) The number of Netlink sockets scales with the number of CPUs rather than the number of vports. 
b) Ordering per-flow is maintained as packets are distributed to CPUs based on mechanisms such as RSS and flows are distributed to a single user space thread. c) Packets from a flow can only wake up one user space thread. The corresponding user space code can be found at: https://mail.openvswitch.org/pipermail/ovs-dev/2021-July/385139.html Bugzilla: https://bugzilla.redhat.com/1844576 Signed-off-by: Mark Gray Acked-by: Flavio Leitner Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 8 +++++ net/openvswitch/actions.c | 6 +++- net/openvswitch/datapath.c | 72 ++++++++++++++++++++++++++++++++++++++-- net/openvswitch/datapath.h | 20 +++++++++++ 4 files changed, 103 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 8d16744edc31..6571b57b2268 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -70,6 +70,8 @@ enum ovs_datapath_cmd { * set on the datapath port (for OVS_ACTION_ATTR_MISS). Only valid on * %OVS_DP_CMD_NEW requests. A value of zero indicates that upcalls should * not be sent. + * OVS_DP_ATTR_PER_CPU_PIDS: Per-cpu array of PIDs for upcalls when + * OVS_DP_F_DISPATCH_UPCALL_PER_CPU feature is set. * @OVS_DP_ATTR_STATS: Statistics about packets that have passed through the * datapath. Always present in notifications. * @OVS_DP_ATTR_MEGAFLOW_STATS: Statistics about mega flow masks usage for the @@ -87,6 +89,9 @@ enum ovs_datapath_attr { OVS_DP_ATTR_USER_FEATURES, /* OVS_DP_F_* */ OVS_DP_ATTR_PAD, OVS_DP_ATTR_MASKS_CACHE_SIZE, + OVS_DP_ATTR_PER_CPU_PIDS, /* Netlink PIDS to receive upcalls in per-cpu + * dispatch mode + */ __OVS_DP_ATTR_MAX }; @@ -127,6 +132,9 @@ struct ovs_vport_stats { /* Allow tc offload recirc sharing */ #define OVS_DP_F_TC_RECIRC_SHARING (1 << 2) +/* Allow per-cpu dispatch of upcalls */ +#define OVS_DP_F_DISPATCH_UPCALL_PER_CPU (1 << 3) + /* Fixed logical ports. */ #define OVSP_LOCAL ((__u32)0) diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index ef15d9eb4774..f79679746c62 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -924,7 +924,11 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, break; case OVS_USERSPACE_ATTR_PID: - upcall.portid = nla_get_u32(a); + if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU) + upcall.portid = + ovs_dp_get_upcall_portid(dp, smp_processor_id()); + else + upcall.portid = nla_get_u32(a); break; case OVS_USERSPACE_ATTR_EGRESS_TUN_PORT: { diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index bc164b35e67d..7a4edafdc685 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -133,6 +133,8 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *, static void ovs_dp_masks_rebalance(struct work_struct *work); +static int ovs_dp_set_upcall_portids(struct datapath *, const struct nlattr *); + /* Must be called with rcu_read_lock or ovs_mutex. 
*/ const char *ovs_dp_name(const struct datapath *dp) { @@ -166,6 +168,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu) free_percpu(dp->stats_percpu); kfree(dp->ports); ovs_meters_exit(dp); + kfree(dp->upcall_portids); kfree(dp); } @@ -239,7 +242,12 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) memset(&upcall, 0, sizeof(upcall)); upcall.cmd = OVS_PACKET_CMD_MISS; - upcall.portid = ovs_vport_find_upcall_portid(p, skb); + + if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU) + upcall.portid = ovs_dp_get_upcall_portid(dp, smp_processor_id()); + else + upcall.portid = ovs_vport_find_upcall_portid(p, skb); + upcall.mru = OVS_CB(skb)->mru; error = ovs_dp_upcall(dp, skb, key, &upcall, 0); if (unlikely(error)) @@ -1594,16 +1602,67 @@ static void ovs_dp_reset_user_features(struct sk_buff *skb, DEFINE_STATIC_KEY_FALSE(tc_recirc_sharing_support); +static int ovs_dp_set_upcall_portids(struct datapath *dp, + const struct nlattr *ids) +{ + struct dp_nlsk_pids *old, *dp_nlsk_pids; + + if (!nla_len(ids) || nla_len(ids) % sizeof(u32)) + return -EINVAL; + + old = ovsl_dereference(dp->upcall_portids); + + dp_nlsk_pids = kmalloc(sizeof(*dp_nlsk_pids) + nla_len(ids), + GFP_KERNEL); + if (!dp_nlsk_pids) + return -ENOMEM; + + dp_nlsk_pids->n_pids = nla_len(ids) / sizeof(u32); + nla_memcpy(dp_nlsk_pids->pids, ids, nla_len(ids)); + + rcu_assign_pointer(dp->upcall_portids, dp_nlsk_pids); + + kfree_rcu(old, rcu); + + return 0; +} + +u32 ovs_dp_get_upcall_portid(const struct datapath *dp, uint32_t cpu_id) +{ + struct dp_nlsk_pids *dp_nlsk_pids; + + dp_nlsk_pids = rcu_dereference(dp->upcall_portids); + + if (dp_nlsk_pids) { + if (cpu_id < dp_nlsk_pids->n_pids) { + return dp_nlsk_pids->pids[cpu_id]; + } else if (dp_nlsk_pids->n_pids > 0 && cpu_id >= dp_nlsk_pids->n_pids) { + /* If the number of netlink PIDs is mismatched with the number of + * CPUs as seen by the kernel, log this and send the upcall to an + * arbitrary socket (0) in order to not drop packets + */ + pr_info_ratelimited("cpu_id mismatch with handler threads"); + return dp_nlsk_pids->pids[cpu_id % dp_nlsk_pids->n_pids]; + } else { + return 0; + } + } else { + return 0; + } +} + static int ovs_dp_change(struct datapath *dp, struct nlattr *a[]) { u32 user_features = 0; + int err; if (a[OVS_DP_ATTR_USER_FEATURES]) { user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]); if (user_features & ~(OVS_DP_F_VPORT_PIDS | OVS_DP_F_UNALIGNED | - OVS_DP_F_TC_RECIRC_SHARING)) + OVS_DP_F_TC_RECIRC_SHARING | + OVS_DP_F_DISPATCH_UPCALL_PER_CPU)) return -EOPNOTSUPP; #if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT) @@ -1624,6 +1683,15 @@ static int ovs_dp_change(struct datapath *dp, struct nlattr *a[]) dp->user_features = user_features; + if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU && + a[OVS_DP_ATTR_PER_CPU_PIDS]) { + /* Upcall Netlink Port IDs have been updated */ + err = ovs_dp_set_upcall_portids(dp, + a[OVS_DP_ATTR_PER_CPU_PIDS]); + if (err) + return err; + } + if (dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) static_branch_enable(&tc_recirc_sharing_support); else diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 38f7d3e66ca6..fcfe6cb46441 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -50,6 +50,21 @@ struct dp_stats_percpu { struct u64_stats_sync syncp; }; +/** + * struct dp_nlsk_pids - array of netlink portids of for a datapath. + * This is used when OVS_DP_F_DISPATCH_UPCALL_PER_CPU + * is enabled and must be protected by rcu. 
+ * @rcu: RCU callback head for deferred destruction. + * @n_pids: Size of @pids array. + * @pids: Array storing the Netlink socket PIDs indexed by CPU ID for packets + * that miss the flow table. + */ +struct dp_nlsk_pids { + struct rcu_head rcu; + u32 n_pids; + u32 pids[]; +}; + /** * struct datapath - datapath for flow-based packet switching * @rcu: RCU callback head for deferred destruction. @@ -61,6 +76,7 @@ struct dp_stats_percpu { * @net: Reference to net namespace. * @max_headroom: the maximum headroom of all vports in this datapath; it will * be used by all the internal vports in this dp. + * @upcall_portids: RCU protected 'struct dp_nlsk_pids'. * * Context: See the comment on locking at the top of datapath.c for additional * locking information. @@ -87,6 +103,8 @@ struct datapath { /* Switch meters. */ struct dp_meter_table meter_tbl; + + struct dp_nlsk_pids __rcu *upcall_portids; }; /** @@ -243,6 +261,8 @@ int ovs_dp_upcall(struct datapath *, struct sk_buff *, const struct sw_flow_key *, const struct dp_upcall_info *, uint32_t cutlen); +u32 ovs_dp_get_upcall_portid(const struct datapath *dp, uint32_t cpu_id); + const char *ovs_dp_name(const struct datapath *dp); struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net, u32 portid, u32 seq, u8 cmd); -- cgit v1.2.3-71-gd317 From e7737b67ab46ee0eeaa0ca1958f72d86f8d8ccf6 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Thu, 15 Jul 2021 11:15:33 +0100 Subject: drm/i915/uapi: reject caching ioctls for discrete MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's a noop on DG1, and in the future when need to support other devices which let us control the coherency, then it should be an immutable creation time property for the BO. This will likely be controlled through a new gem_create_ext extension. v2: add some kernel doc for the discrete changes, and document the implicit rules Suggested-by: Daniel Vetter Signed-off-by: Matthew Auld Cc: Thomas Hellström Cc: Maarten Lankhorst Cc: Tvrtko Ursulin Cc: Jordan Justen Cc: Kenneth Graunke Cc: Jason Ekstrand Cc: Daniel Vetter Cc: Ramalingam C Reviewed-by: Ramalingam C Reviewed-by: Kenneth Graunke Link: https://patchwork.freedesktop.org/patch/msgid/20210715101536.2606307-2-matthew.auld@intel.com --- drivers/gpu/drm/i915/gem/i915_gem_domain.c | 6 ++++++ include/uapi/drm/i915_drm.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/i915/gem/i915_gem_domain.c b/drivers/gpu/drm/i915/gem/i915_gem_domain.c index 7d1400b13429..43004bef55cb 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_domain.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_domain.c @@ -268,6 +268,9 @@ int i915_gem_get_caching_ioctl(struct drm_device *dev, void *data, struct drm_i915_gem_object *obj; int err = 0; + if (IS_DGFX(to_i915(dev))) + return -ENODEV; + rcu_read_lock(); obj = i915_gem_object_lookup_rcu(file, args->handle); if (!obj) { @@ -303,6 +306,9 @@ int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data, enum i915_cache_level level; int ret = 0; + if (IS_DGFX(i915)) + return -ENODEV; + switch (args->caching) { case I915_CACHING_NONE: level = I915_CACHE_NONE; diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index e54f9efaead0..868c2ee7be60 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -1395,6 +1395,35 @@ struct drm_i915_gem_busy { * ppGTT support, or if the object is used for scanout). 
Note that this might * require unbinding the object from the GTT first, if its current caching value * doesn't match. + * + * Note that this all changes on discrete platforms, starting from DG1, the + * set/get caching is no longer supported, and is now rejected. Instead the CPU + * caching attributes(WB vs WC) will become an immutable creation time property + * for the object, along with the GTT caching level. For now we don't expose any + * new uAPI for this, instead on DG1 this is all implicit, although this largely + * shouldn't matter since DG1 is coherent by default(without any way of + * controlling it). + * + * Implicit caching rules, starting from DG1: + * + * - If any of the object placements (see &drm_i915_gem_create_ext_memory_regions) + * contain I915_MEMORY_CLASS_DEVICE then the object will be allocated and + * mapped as write-combined only. + * + * - Everything else is always allocated and mapped as write-back, with the + * guarantee that everything is also coherent with the GPU. + * + * Note that this is likely to change in the future again, where we might need + * more flexibility on future devices, so making this all explicit as part of a + * new &drm_i915_gem_create_ext extension is probable. + * + * Side note: Part of the reason for this is that changing the at-allocation-time CPU + * caching attributes for the pages might be required(and is expensive) if we + * need to then CPU map the pages later with different caching attributes. This + * inconsistent caching behaviour, while supported on x86, is not universally + * supported on other architectures. So for simplicity we opt for setting + * everything at creation time, whilst also making it immutable, on discrete + * platforms. */ struct drm_i915_gem_caching { /** -- cgit v1.2.3-71-gd317 From aef7b67a79564f6cff488aff7f4b89438ca80b23 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Thu, 15 Jul 2021 11:15:34 +0100 Subject: drm/i915/uapi: convert drm_i915_gem_userptr to kernel doc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the missing kernel-doc. Signed-off-by: Matthew Auld Cc: Thomas Hellström Cc: Maarten Lankhorst Cc: Tvrtko Ursulin Cc: Jordan Justen Cc: Kenneth Graunke Cc: Jason Ekstrand Cc: Daniel Vetter Cc: Ramalingam C Reviewed-by: Maarten Lankhorst Link: https://patchwork.freedesktop.org/patch/msgid/20210715101536.2606307-3-matthew.auld@intel.com --- include/uapi/drm/i915_drm.h | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 868c2ee7be60..e20eeeca7a1c 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -2141,14 +2141,52 @@ struct drm_i915_reset_stats { __u32 pad; }; +/** + * struct drm_i915_gem_userptr - Create GEM object from user allocated memory. + * + * Userptr objects have several restrictions on what ioctls can be used with the + * object handle. + */ struct drm_i915_gem_userptr { + /** + * @user_ptr: The pointer to the allocated memory. + * + * Needs to be aligned to PAGE_SIZE. + */ __u64 user_ptr; + + /** + * @user_size: + * + * The size in bytes for the allocated memory. This will also become the + * object size. + * + * Needs to be aligned to PAGE_SIZE, and should be at least PAGE_SIZE, + * or larger. 
+ */ __u64 user_size; + + /** + * @flags: + * + * Supported flags: + * + * I915_USERPTR_READ_ONLY: + * + * Mark the object as readonly, this also means GPU access can only be + * readonly. This is only supported on HW which supports readonly access + * through the GTT. If the HW can't support readonly access, an error is + * returned. + * + * I915_USERPTR_UNSYNCHRONIZED: + * + * NOT USED. Setting this flag will result in an error. + */ __u32 flags; #define I915_USERPTR_READ_ONLY 0x1 #define I915_USERPTR_UNSYNCHRONIZED 0x80000000 /** - * Returned handle for the object. + * @handle: Returned handle for the object. * * Object handles are nonzero. */ -- cgit v1.2.3-71-gd317 From f4b7002a7076f025dce59647a77c8251175d2b34 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 19 Jul 2021 20:06:28 +0300 Subject: net: bridge: add vlan mcast snooping knob Add a global knob that controls if vlan multicast snooping is enabled. The proper contexts (vlan or bridge-wide) will be chosen based on the knob when processing packets and changing bridge device state. Note that vlans have their individual mcast snooping enabled by default, but this knob is needed to turn on bridge vlan snooping. It is disabled by default. To enable the knob vlan filtering must also be enabled, it doesn't make sense to have vlan mcast snooping without vlan filtering since that would lead to inconsistencies. Disabling vlan filtering will also automatically disable vlan mcast snooping. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 2 + net/bridge/br.c | 9 ++- net/bridge/br_device.c | 7 +- net/bridge/br_input.c | 5 +- net/bridge/br_multicast.c | 143 ++++++++++++++++++++++++++++++++--------- net/bridge/br_private.h | 37 +++++++++-- net/bridge/br_vlan.c | 20 ++++-- 7 files changed, 175 insertions(+), 48 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 6b56a7549531..7927ad80ee86 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -720,12 +720,14 @@ struct br_mcast_stats { /* bridge boolean options * BR_BOOLOPT_NO_LL_LEARN - disable learning from link-local packets + * BR_BOOLOPT_MCAST_VLAN_SNOOPING - control vlan multicast snooping * * IMPORTANT: if adding a new option do not forget to handle * it in br_boolopt_toggle/get and bridge sysfs */ enum br_boolopt_id { BR_BOOLOPT_NO_LL_LEARN, + BR_BOOLOPT_MCAST_VLAN_SNOOPING, BR_BOOLOPT_MAX }; diff --git a/net/bridge/br.c b/net/bridge/br.c index ef743f94254d..51f2e25c4cd6 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -214,17 +214,22 @@ static struct notifier_block br_switchdev_notifier = { int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on, struct netlink_ext_ack *extack) { + int err = 0; + switch (opt) { case BR_BOOLOPT_NO_LL_LEARN: br_opt_toggle(br, BROPT_NO_LL_LEARN, on); break; + case BR_BOOLOPT_MCAST_VLAN_SNOOPING: + err = br_multicast_toggle_vlan_snooping(br, on, extack); + break; default: /* shouldn't be called with unsupported options */ WARN_ON(1); break; } - return 0; + return err; } int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt) @@ -232,6 +237,8 @@ int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt) switch (opt) { case BR_BOOLOPT_NO_LL_LEARN: return br_opt_get(br, BROPT_NO_LL_LEARN); + case BR_BOOLOPT_MCAST_VLAN_SNOOPING: + return br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED); default: /* shouldn't be called with unsupported options 
*/ WARN_ON(1); diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index e815bf4f9f24..00daf35f54d5 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -27,12 +27,14 @@ EXPORT_SYMBOL_GPL(nf_br_ops); /* net device transmit always called with BH disabled */ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) { + struct net_bridge_mcast_port *pmctx_null = NULL; struct net_bridge *br = netdev_priv(dev); struct net_bridge_mcast *brmctx = &br->multicast_ctx; struct net_bridge_fdb_entry *dst; struct net_bridge_mdb_entry *mdst; const struct nf_br_ops *nf_ops; u8 state = BR_STATE_FORWARDING; + struct net_bridge_vlan *vlan; const unsigned char *dest; u16 vid = 0; @@ -54,7 +56,8 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) skb_reset_mac_header(skb); skb_pull(skb, ETH_HLEN); - if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid, &state)) + if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid, + &state, &vlan)) goto out; if (IS_ENABLED(CONFIG_INET) && @@ -83,7 +86,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) br_flood(br, skb, BR_PKT_MULTICAST, false, true); goto out; } - if (br_multicast_rcv(brmctx, NULL, skb, vid)) { + if (br_multicast_rcv(&brmctx, &pmctx_null, vlan, skb, vid)) { kfree_skb(skb); goto out; } diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index bb2036dd4934..8a0c0cc55cb4 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -73,6 +73,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb struct net_bridge_mdb_entry *mdst; bool local_rcv, mcast_hit = false; struct net_bridge_mcast *brmctx; + struct net_bridge_vlan *vlan; struct net_bridge *br; u16 vid = 0; u8 state; @@ -84,7 +85,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb pmctx = &p->multicast_ctx; state = p->state; if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid, - &state)) + &state, &vlan)) goto out; nbp_switchdev_frame_mark(p, skb); @@ -102,7 +103,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb local_rcv = true; } else { pkt_type = BR_PKT_MULTICAST; - if (br_multicast_rcv(brmctx, pmctx, skb, vid)) + if (br_multicast_rcv(&brmctx, &pmctx, vlan, skb, vid)) goto drop; } } diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index ef4e7de3f18d..b71772828b23 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1797,9 +1797,9 @@ void br_multicast_enable_port(struct net_bridge_port *port) { struct net_bridge *br = port->br; - spin_lock(&br->multicast_lock); + spin_lock_bh(&br->multicast_lock); __br_multicast_enable_port_ctx(&port->multicast_ctx); - spin_unlock(&br->multicast_lock); + spin_unlock_bh(&br->multicast_lock); } static void __br_multicast_disable_port_ctx(struct net_bridge_mcast_port *pmctx) @@ -1827,9 +1827,9 @@ static void __br_multicast_disable_port_ctx(struct net_bridge_mcast_port *pmctx) void br_multicast_disable_port(struct net_bridge_port *port) { - spin_lock(&port->br->multicast_lock); + spin_lock_bh(&port->br->multicast_lock); __br_multicast_disable_port_ctx(&port->multicast_ctx); - spin_unlock(&port->br->multicast_lock); + spin_unlock_bh(&port->br->multicast_lock); } static int __grp_src_delete_marked(struct net_bridge_port_group *pg) @@ -3510,8 +3510,9 @@ static int br_multicast_ipv6_rcv(struct net_bridge_mcast *brmctx, } #endif -int br_multicast_rcv(struct net_bridge_mcast *brmctx, - struct net_bridge_mcast_port *pmctx, +int 
br_multicast_rcv(struct net_bridge_mcast **brmctx, + struct net_bridge_mcast_port **pmctx, + struct net_bridge_vlan *vlan, struct sk_buff *skb, u16 vid) { int ret = 0; @@ -3519,16 +3520,36 @@ int br_multicast_rcv(struct net_bridge_mcast *brmctx, BR_INPUT_SKB_CB(skb)->igmp = 0; BR_INPUT_SKB_CB(skb)->mrouters_only = 0; - if (!br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED)) + if (!br_opt_get((*brmctx)->br, BROPT_MULTICAST_ENABLED)) return 0; + if (br_opt_get((*brmctx)->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) && vlan) { + const struct net_bridge_vlan *masterv; + + /* the vlan has the master flag set only when transmitting + * through the bridge device + */ + if (br_vlan_is_master(vlan)) { + masterv = vlan; + *brmctx = &vlan->br_mcast_ctx; + *pmctx = NULL; + } else { + masterv = vlan->brvlan; + *brmctx = &vlan->brvlan->br_mcast_ctx; + *pmctx = &vlan->port_mcast_ctx; + } + + if (!(masterv->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED)) + return 0; + } + switch (skb->protocol) { case htons(ETH_P_IP): - ret = br_multicast_ipv4_rcv(brmctx, pmctx, skb, vid); + ret = br_multicast_ipv4_rcv(*brmctx, *pmctx, skb, vid); break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): - ret = br_multicast_ipv6_rcv(brmctx, pmctx, skb, vid); + ret = br_multicast_ipv6_rcv(*brmctx, *pmctx, skb, vid); break; #endif } @@ -3727,20 +3748,22 @@ static void __br_multicast_open(struct net_bridge_mcast *brmctx) void br_multicast_open(struct net_bridge *br) { - struct net_bridge_vlan_group *vg; - struct net_bridge_vlan *vlan; - ASSERT_RTNL(); - vg = br_vlan_group(br); - if (vg) { - list_for_each_entry(vlan, &vg->vlan_list, vlist) { - struct net_bridge_mcast *brmctx; - - brmctx = &vlan->br_mcast_ctx; - if (br_vlan_is_brentry(vlan) && - !br_multicast_ctx_vlan_disabled(brmctx)) - __br_multicast_open(&vlan->br_mcast_ctx); + if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *vlan; + + vg = br_vlan_group(br); + if (vg) { + list_for_each_entry(vlan, &vg->vlan_list, vlist) { + struct net_bridge_mcast *brmctx; + + brmctx = &vlan->br_mcast_ctx; + if (br_vlan_is_brentry(vlan) && + !br_multicast_ctx_vlan_disabled(brmctx)) + __br_multicast_open(&vlan->br_mcast_ctx); + } } } @@ -3804,22 +3827,80 @@ void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on) } } -void br_multicast_stop(struct net_bridge *br) +void br_multicast_toggle_vlan(struct net_bridge_vlan *vlan, bool on) +{ + struct net_bridge_port *p; + + if (WARN_ON_ONCE(!br_vlan_is_master(vlan))) + return; + + list_for_each_entry(p, &vlan->br->port_list, list) { + struct net_bridge_vlan *vport; + + vport = br_vlan_find(nbp_vlan_group(p), vlan->vid); + if (!vport) + continue; + br_multicast_toggle_one_vlan(vport, on); + } +} + +int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on, + struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *vlan; + struct net_bridge_port *p; - ASSERT_RTNL(); + if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) == on) + return 0; + + if (on && !br_opt_get(br, BROPT_VLAN_ENABLED)) { + NL_SET_ERR_MSG_MOD(extack, "Cannot enable multicast vlan snooping with vlan filtering disabled"); + return -EINVAL; + } vg = br_vlan_group(br); - if (vg) { - list_for_each_entry(vlan, &vg->vlan_list, vlist) { - struct net_bridge_mcast *brmctx; - - brmctx = &vlan->br_mcast_ctx; - if (br_vlan_is_brentry(vlan) && - !br_multicast_ctx_vlan_disabled(brmctx)) - __br_multicast_stop(&vlan->br_mcast_ctx); + if (!vg) + return 0; + + br_opt_toggle(br, 
BROPT_MCAST_VLAN_SNOOPING_ENABLED, on); + + /* disable/enable non-vlan mcast contexts based on vlan snooping */ + if (on) + __br_multicast_stop(&br->multicast_ctx); + else + __br_multicast_open(&br->multicast_ctx); + list_for_each_entry(p, &br->port_list, list) { + if (on) + br_multicast_disable_port(p); + else + br_multicast_enable_port(p); + } + + list_for_each_entry(vlan, &vg->vlan_list, vlist) + br_multicast_toggle_vlan(vlan, on); + + return 0; +} + +void br_multicast_stop(struct net_bridge *br) +{ + ASSERT_RTNL(); + + if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *vlan; + + vg = br_vlan_group(br); + if (vg) { + list_for_each_entry(vlan, &vg->vlan_list, vlist) { + struct net_bridge_mcast *brmctx; + + brmctx = &vlan->br_mcast_ctx; + if (br_vlan_is_brentry(vlan) && + !br_multicast_ctx_vlan_disabled(brmctx)) + __br_multicast_stop(&vlan->br_mcast_ctx); + } } } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 5588f2d3546f..c3c2f19d3b71 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -433,6 +433,7 @@ enum net_bridge_opts { BROPT_VLAN_STATS_PER_PORT, BROPT_NO_LL_LEARN, BROPT_VLAN_BRIDGE_BINDING, + BROPT_MCAST_VLAN_SNOOPING_ENABLED, }; struct net_bridge { @@ -829,8 +830,9 @@ int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, /* br_multicast.c */ #ifdef CONFIG_BRIDGE_IGMP_SNOOPING -int br_multicast_rcv(struct net_bridge_mcast *brmctx, - struct net_bridge_mcast_port *pmctx, +int br_multicast_rcv(struct net_bridge_mcast **brmctx, + struct net_bridge_mcast_port **pmctx, + struct net_bridge_vlan *vlan, struct sk_buff *skb, u16 vid); struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge_mcast *brmctx, struct sk_buff *skb, u16 vid); @@ -904,6 +906,9 @@ void br_multicast_port_ctx_init(struct net_bridge_port *port, struct net_bridge_mcast_port *pmctx); void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx); void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on); +void br_multicast_toggle_vlan(struct net_bridge_vlan *vlan, bool on); +int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on, + struct netlink_ext_ack *extack); static inline bool br_group_is_l2(const struct br_ip *group) { @@ -1090,7 +1095,8 @@ br_multicast_port_ctx_get_global(const struct net_bridge_mcast_port *pmctx) static inline bool br_multicast_ctx_vlan_global_disabled(const struct net_bridge_mcast *brmctx) { - return br_multicast_ctx_is_vlan(brmctx) && + return br_opt_get(brmctx->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) && + br_multicast_ctx_is_vlan(brmctx) && !(brmctx->vlan->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED); } @@ -1108,8 +1114,9 @@ br_multicast_port_ctx_vlan_disabled(const struct net_bridge_mcast_port *pmctx) !(pmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED); } #else -static inline int br_multicast_rcv(struct net_bridge_mcast *brmctx, - struct net_bridge_mcast_port *pmctx, +static inline int br_multicast_rcv(struct net_bridge_mcast **brmctx, + struct net_bridge_mcast_port **pmctx, + struct net_bridge_vlan *vlan, struct sk_buff *skb, u16 vid) { @@ -1245,13 +1252,26 @@ static inline void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on) { } + +static inline void br_multicast_toggle_vlan(struct net_bridge_vlan *vlan, + bool on) +{ +} + +static inline int br_multicast_toggle_vlan_snooping(struct net_bridge *br, + bool on, + struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} #endif /* br_vlan.c */ #ifdef 
CONFIG_BRIDGE_VLAN_FILTERING bool br_allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, - u16 *vid, u8 *state); + u16 *vid, u8 *state, + struct net_bridge_vlan **vlan); bool br_allowed_egress(struct net_bridge_vlan_group *vg, const struct sk_buff *skb); bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid); @@ -1363,8 +1383,11 @@ static inline u16 br_vlan_flags(const struct net_bridge_vlan *v, u16 pvid) static inline bool br_allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, - u16 *vid, u8 *state) + u16 *vid, u8 *state, + struct net_bridge_vlan **vlan) + { + *vlan = NULL; return true; } diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 1a8cb2b1b762..ab4969a4a380 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -481,7 +481,8 @@ out: static bool __allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, u16 *vid, - u8 *state) + u8 *state, + struct net_bridge_vlan **vlan) { struct pcpu_sw_netstats *stats; struct net_bridge_vlan *v; @@ -546,8 +547,9 @@ static bool __allowed_ingress(const struct net_bridge *br, */ skb->vlan_tci |= pvid; - /* if stats are disabled we can avoid the lookup */ - if (!br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { + /* if snooping and stats are disabled we can avoid the lookup */ + if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) && + !br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { if (*state == BR_STATE_FORWARDING) { *state = br_vlan_get_pvid_state(vg); return br_vlan_state_allowed(*state, true); @@ -574,6 +576,8 @@ static bool __allowed_ingress(const struct net_bridge *br, u64_stats_update_end(&stats->syncp); } + *vlan = v; + return true; drop: @@ -583,17 +587,19 @@ drop: bool br_allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, - u16 *vid, u8 *state) + u16 *vid, u8 *state, + struct net_bridge_vlan **vlan) { /* If VLAN filtering is disabled on the bridge, all packets are * permitted. */ + *vlan = NULL; if (!br_opt_get(br, BROPT_VLAN_ENABLED)) { BR_INPUT_SKB_CB(skb)->vlan_filtered = false; return true; } - return __allowed_ingress(br, vg, skb, vid, state); + return __allowed_ingress(br, vg, skb, vid, state, vlan); } /* Called under RCU. */ @@ -834,6 +840,10 @@ int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val, br_manage_promisc(br); recalculate_group_addr(br); br_recalculate_fwd_mask(br); + if (!val && br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { + br_info(br, "vlan filtering disabled, automatically disabling multicast vlan snooping\n"); + br_multicast_toggle_vlan_snooping(br, false, NULL); + } return 0; } -- cgit v1.2.3-71-gd317 From 1e9ca45662d6bb65fb60d3fbb7737b081d9cffc9 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 19 Jul 2021 20:06:33 +0300 Subject: net: bridge: multicast: include router port vlan id in notifications Use the port multicast context to check if the router port is a vlan and in case it is include its vlan id in the notification. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- include/uapi/linux/if_bridge.h | 1 + net/bridge/br_mdb.c | 29 ++++++++++++++++++++++------- net/bridge/br_multicast.c | 4 ++-- net/bridge/br_private.h | 2 +- 4 files changed, 26 insertions(+), 10 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 7927ad80ee86..90ac9e11c15b 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -629,6 +629,7 @@ enum { MDBA_ROUTER_PATTR_TYPE, MDBA_ROUTER_PATTR_INET_TIMER, MDBA_ROUTER_PATTR_INET6_TIMER, + MDBA_ROUTER_PATTR_VID, __MDBA_ROUTER_PATTR_MAX }; #define MDBA_ROUTER_PATTR_MAX (__MDBA_ROUTER_PATTR_MAX - 1) diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 5319587198eb..d3383a47a2f2 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -781,12 +781,12 @@ errout: static int nlmsg_populate_rtr_fill(struct sk_buff *skb, struct net_device *dev, - int ifindex, u32 pid, + int ifindex, u16 vid, u32 pid, u32 seq, int type, unsigned int flags) { + struct nlattr *nest, *port_nest; struct br_port_msg *bpm; struct nlmsghdr *nlh; - struct nlattr *nest; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), 0); if (!nlh) @@ -800,8 +800,18 @@ static int nlmsg_populate_rtr_fill(struct sk_buff *skb, if (!nest) goto cancel; - if (nla_put_u32(skb, MDBA_ROUTER_PORT, ifindex)) + port_nest = nla_nest_start_noflag(skb, MDBA_ROUTER_PORT); + if (!port_nest) + goto end; + if (nla_put_nohdr(skb, sizeof(u32), &ifindex)) { + nla_nest_cancel(skb, port_nest); goto end; + } + if (vid && nla_put_u16(skb, MDBA_ROUTER_PATTR_VID, vid)) { + nla_nest_cancel(skb, port_nest); + goto end; + } + nla_nest_end(skb, port_nest); nla_nest_end(skb, nest); nlmsg_end(skb, nlh); @@ -817,23 +827,28 @@ cancel: static inline size_t rtnl_rtr_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct br_port_msg)) - + nla_total_size(sizeof(__u32)); + + nla_total_size(sizeof(__u32)) + + nla_total_size(sizeof(u16)); } -void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port, +void br_rtr_notify(struct net_device *dev, struct net_bridge_mcast_port *pmctx, int type) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; int ifindex; + u16 vid; - ifindex = port ? port->dev->ifindex : 0; + ifindex = pmctx ? pmctx->port->dev->ifindex : 0; + vid = pmctx && br_multicast_port_ctx_is_vlan(pmctx) ? pmctx->vlan->vid : + 0; skb = nlmsg_new(rtnl_rtr_nlmsg_size(), GFP_ATOMIC); if (!skb) goto errout; - err = nlmsg_populate_rtr_fill(skb, dev, ifindex, 0, 0, type, NTF_SELF); + err = nlmsg_populate_rtr_fill(skb, dev, ifindex, vid, 0, 0, type, + NTF_SELF); if (err < 0) { kfree_skb(skb); goto errout; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 9d4a18a711e4..fb5e5df571fd 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -2979,7 +2979,7 @@ static void br_multicast_add_router(struct net_bridge_mcast *brmctx, * IPv4 or IPv6 multicast router. 
*/ if (br_multicast_no_router_otherpf(pmctx, rlist)) { - br_rtr_notify(pmctx->port->br->dev, pmctx->port, RTM_NEWMDB); + br_rtr_notify(pmctx->port->br->dev, pmctx, RTM_NEWMDB); br_port_mc_router_state_change(pmctx->port, true); } } @@ -4078,7 +4078,7 @@ br_multicast_rport_del_notify(struct net_bridge_mcast_port *pmctx, bool deleted) return; #endif - br_rtr_notify(pmctx->port->br->dev, pmctx->port, RTM_DELMDB); + br_rtr_notify(pmctx->port->br->dev, pmctx, RTM_DELMDB); br_port_mc_router_state_change(pmctx->port, false); /* don't allow timer refresh */ diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 0d019b0b00e5..a18c27f581a3 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -872,7 +872,7 @@ int br_mdb_hash_init(struct net_bridge *br); void br_mdb_hash_fini(struct net_bridge *br); void br_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, int type); -void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port, +void br_rtr_notify(struct net_device *dev, struct net_bridge_mcast_port *pmctx, int type); void br_multicast_del_pg(struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, -- cgit v1.2.3-71-gd317 From 47ecd2dbd8ec43125ea75d7d2e73c888cda8663f Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 19 Jul 2021 20:06:34 +0300 Subject: net: bridge: vlan: add support for global options We can have two types of vlan options depending on context: - per-device vlan options (split in per-bridge and per-port) - global vlan options The second type wasn't supported in the bridge until now, but we need them for per-vlan multicast support, per-vlan STP support and other options which require global vlan context. They are contained in the global bridge vlan context even if the vlan is not configured on the bridge device itself. This patch adds initial netlink attributes and support for setting these global vlan options, they can only be set (RTM_NEWVLAN) and the operation must use the bridge device. Since there are no such options yet it shouldn't have any functional effect. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 13 +++++++ net/bridge/br_private.h | 4 ++ net/bridge/br_vlan.c | 16 ++++++-- net/bridge/br_vlan_options.c | 85 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 115 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 90ac9e11c15b..4ed57d1a5d89 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -485,10 +485,15 @@ enum { * [BRIDGE_VLANDB_ENTRY_INFO] * ... * } + * [BRIDGE_VLANDB_GLOBAL_OPTIONS] = { + * [BRIDGE_VLANDB_GOPTS_ID] + * ... 
+ * } */ enum { BRIDGE_VLANDB_UNSPEC, BRIDGE_VLANDB_ENTRY, + BRIDGE_VLANDB_GLOBAL_OPTIONS, __BRIDGE_VLANDB_MAX, }; #define BRIDGE_VLANDB_MAX (__BRIDGE_VLANDB_MAX - 1) @@ -538,6 +543,14 @@ enum { }; #define BRIDGE_VLANDB_STATS_MAX (__BRIDGE_VLANDB_STATS_MAX - 1) +enum { + BRIDGE_VLANDB_GOPTS_UNSPEC, + BRIDGE_VLANDB_GOPTS_ID, + BRIDGE_VLANDB_GOPTS_RANGE, + __BRIDGE_VLANDB_GOPTS_MAX +}; +#define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1) + /* Bridge multicast database attributes * [MDBA_MDB] = { * [MDBA_MDB_ENTRY] = { diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index a18c27f581a3..6a6ce233a999 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1592,6 +1592,10 @@ int br_vlan_process_options(const struct net_bridge *br, struct net_bridge_vlan *range_end, struct nlattr **tb, struct netlink_ext_ack *extack); +int br_vlan_rtm_process_global_options(struct net_device *dev, + const struct nlattr *attr, + int cmd, + struct netlink_ext_ack *extack); /* vlan state manipulation helpers using *_ONCE to annotate lock-free access */ static inline u8 br_vlan_get_state(const struct net_bridge_vlan *v) diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index ab4969a4a380..dcb5acf783d2 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -2203,12 +2203,22 @@ static int br_vlan_rtm_process(struct sk_buff *skb, struct nlmsghdr *nlh, } nlmsg_for_each_attr(attr, nlh, sizeof(*bvm), rem) { - if (nla_type(attr) != BRIDGE_VLANDB_ENTRY) + switch (nla_type(attr)) { + case BRIDGE_VLANDB_ENTRY: + err = br_vlan_rtm_process_one(dev, attr, + nlh->nlmsg_type, + extack); + break; + case BRIDGE_VLANDB_GLOBAL_OPTIONS: + err = br_vlan_rtm_process_global_options(dev, attr, + nlh->nlmsg_type, + extack); + break; + default: continue; + } vlans++; - err = br_vlan_rtm_process_one(dev, attr, nlh->nlmsg_type, - extack); if (err) break; } diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c index b4add9ea8964..a7d5a2334207 100644 --- a/net/bridge/br_vlan_options.c +++ b/net/bridge/br_vlan_options.c @@ -258,3 +258,88 @@ int br_vlan_process_options(const struct net_bridge *br, return err; } + +static int br_vlan_process_global_one_opts(const struct net_bridge *br, + struct net_bridge_vlan_group *vg, + struct net_bridge_vlan *v, + struct nlattr **tb, + bool *changed, + struct netlink_ext_ack *extack) +{ + *changed = false; + return 0; +} + +static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = { + [BRIDGE_VLANDB_GOPTS_ID] = { .type = NLA_U16 }, + [BRIDGE_VLANDB_GOPTS_RANGE] = { .type = NLA_U16 }, +}; + +int br_vlan_rtm_process_global_options(struct net_device *dev, + const struct nlattr *attr, + int cmd, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[BRIDGE_VLANDB_GOPTS_MAX + 1]; + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v; + u16 vid, vid_range = 0; + struct net_bridge *br; + int err = 0; + + if (cmd != RTM_NEWVLAN) { + NL_SET_ERR_MSG_MOD(extack, "Global vlan options support only set operation"); + return -EINVAL; + } + if (!netif_is_bridge_master(dev)) { + NL_SET_ERR_MSG_MOD(extack, "Global vlan options can only be set on bridge device"); + return -EINVAL; + } + br = netdev_priv(dev); + vg = br_vlan_group(br); + if (WARN_ON(!vg)) + return -ENODEV; + + err = nla_parse_nested(tb, BRIDGE_VLANDB_GOPTS_MAX, attr, + br_vlan_db_gpol, extack); + if (err) + return err; + + if (!tb[BRIDGE_VLANDB_GOPTS_ID]) { + NL_SET_ERR_MSG_MOD(extack, "Missing vlan entry id"); + return -EINVAL; + } + vid = 
nla_get_u16(tb[BRIDGE_VLANDB_GOPTS_ID]); + if (!br_vlan_valid_id(vid, extack)) + return -EINVAL; + + if (tb[BRIDGE_VLANDB_GOPTS_RANGE]) { + vid_range = nla_get_u16(tb[BRIDGE_VLANDB_GOPTS_RANGE]); + if (!br_vlan_valid_id(vid_range, extack)) + return -EINVAL; + if (vid >= vid_range) { + NL_SET_ERR_MSG_MOD(extack, "End vlan id is less than or equal to start vlan id"); + return -EINVAL; + } + } else { + vid_range = vid; + } + + for (; vid <= vid_range; vid++) { + bool changed = false; + + v = br_vlan_find(vg, vid); + if (!v) { + NL_SET_ERR_MSG_MOD(extack, "Vlan in range doesn't exist, can't process global options"); + err = -ENOENT; + break; + } + + err = br_vlan_process_global_one_opts(br, vg, v, tb, &changed, + extack); + if (err) + break; + } + + return err; +} -- cgit v1.2.3-71-gd317 From 743a53d9636aad83da63a8638e8365e817ef6365 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 19 Jul 2021 20:06:35 +0300 Subject: net: bridge: vlan: add support for dumping global vlan options Add a new vlan options dump flag which causes only global vlan options to be dumped. The dumps are done only with bridge devices, ports are ignored. They support vlan compression if the options in sequential vlans are equal (currently always true). Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + net/bridge/br_private.h | 4 ++++ net/bridge/br_vlan.c | 41 +++++++++++++++++++++++++++++++++-------- net/bridge/br_vlan_options.c | 31 +++++++++++++++++++++++++++++++ 4 files changed, 69 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 4ed57d1a5d89..946ccf33dc53 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -479,6 +479,7 @@ enum { /* flags used in BRIDGE_VLANDB_DUMP_FLAGS attribute to affect dumps */ #define BRIDGE_VLANDB_DUMPF_STATS (1 << 0) /* Include stats in the dump */ +#define BRIDGE_VLANDB_DUMPF_GLOBAL (1 << 1) /* Dump global vlan options only */ /* Bridge vlan RTM attributes * [BRIDGE_VLANDB_ENTRY] = { diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 6a6ce233a999..a19dbd63d670 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1596,6 +1596,10 @@ int br_vlan_rtm_process_global_options(struct net_device *dev, const struct nlattr *attr, int cmd, struct netlink_ext_ack *extack); +bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr, + const struct net_bridge_vlan *r_end); +bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, + const struct net_bridge_vlan *v_opts); /* vlan state manipulation helpers using *_ONCE to annotate lock-free access */ static inline u8 br_vlan_get_state(const struct net_bridge_vlan *v) diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index dcb5acf783d2..e66b004df763 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -1919,6 +1919,7 @@ static int br_vlan_dump_dev(const struct net_device *dev, u32 dump_flags) { struct net_bridge_vlan *v, *range_start = NULL, *range_end = NULL; + bool dump_global = !!(dump_flags & BRIDGE_VLANDB_DUMPF_GLOBAL); bool dump_stats = !!(dump_flags & BRIDGE_VLANDB_DUMPF_STATS); struct net_bridge_vlan_group *vg; int idx = 0, s_idx = cb->args[1]; @@ -1937,6 +1938,10 @@ static int br_vlan_dump_dev(const struct net_device *dev, vg = br_vlan_group_rcu(br); p = NULL; } else { + /* global options are dumped only for bridge devices */ + if (dump_global) + return 0; + p = 
br_port_get_rcu(dev); if (WARN_ON(!p)) return -EINVAL; @@ -1959,7 +1964,7 @@ static int br_vlan_dump_dev(const struct net_device *dev, /* idx must stay at range's beginning until it is filled in */ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { - if (!br_vlan_should_use(v)) + if (!dump_global && !br_vlan_should_use(v)) continue; if (idx < s_idx) { idx++; @@ -1972,8 +1977,21 @@ static int br_vlan_dump_dev(const struct net_device *dev, continue; } - if (dump_stats || v->vid == pvid || - !br_vlan_can_enter_range(v, range_end)) { + if (dump_global) { + if (br_vlan_global_opts_can_enter_range(v, range_end)) + continue; + if (!br_vlan_global_opts_fill(skb, range_start->vid, + range_end->vid, + range_start)) { + err = -EMSGSIZE; + break; + } + /* advance number of filled vlans */ + idx += range_end->vid - range_start->vid + 1; + + range_start = v; + } else if (dump_stats || v->vid == pvid || + !br_vlan_can_enter_range(v, range_end)) { u16 vlan_flags = br_vlan_flags(range_start, pvid); if (!br_vlan_fill_vids(skb, range_start->vid, @@ -1995,11 +2013,18 @@ static int br_vlan_dump_dev(const struct net_device *dev, * - last vlan (range_start == range_end, not in range) * - last vlan range (range_start != range_end, in range) */ - if (!err && range_start && - !br_vlan_fill_vids(skb, range_start->vid, range_end->vid, - range_start, br_vlan_flags(range_start, pvid), - dump_stats)) - err = -EMSGSIZE; + if (!err && range_start) { + if (dump_global && + !br_vlan_global_opts_fill(skb, range_start->vid, + range_end->vid, range_start)) + err = -EMSGSIZE; + else if (!dump_global && + !br_vlan_fill_vids(skb, range_start->vid, + range_end->vid, range_start, + br_vlan_flags(range_start, pvid), + dump_stats)) + err = -EMSGSIZE; + } cb->args[1] = err ? idx : 0; diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c index a7d5a2334207..f290f5140547 100644 --- a/net/bridge/br_vlan_options.c +++ b/net/bridge/br_vlan_options.c @@ -259,6 +259,37 @@ int br_vlan_process_options(const struct net_bridge *br, return err; } +bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr, + const struct net_bridge_vlan *r_end) +{ + return v_curr->vid - r_end->vid == 1; +} + +bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, + const struct net_bridge_vlan *v_opts) +{ + struct nlattr *nest; + + nest = nla_nest_start(skb, BRIDGE_VLANDB_GLOBAL_OPTIONS); + if (!nest) + return false; + + if (nla_put_u16(skb, BRIDGE_VLANDB_GOPTS_ID, vid)) + goto out_err; + + if (vid_range && vid < vid_range && + nla_put_u16(skb, BRIDGE_VLANDB_GOPTS_RANGE, vid_range)) + goto out_err; + + nla_nest_end(skb, nest); + + return true; + +out_err: + nla_nest_cancel(skb, nest); + return false; +} + static int br_vlan_process_global_one_opts(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct net_bridge_vlan *v, -- cgit v1.2.3-71-gd317 From 9dee572c384846f4ece029ab5688faed0682e48a Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 19 Jul 2021 20:06:37 +0300 Subject: net: bridge: vlan: add mcast snooping control Add a new global vlan option which controls whether multicast snooping is enabled or disabled for a single vlan. It controls the vlan private flag: BR_VLFLAG_GLOBAL_MCAST_ENABLED. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- include/uapi/linux/if_bridge.h | 1 + net/bridge/br_multicast.c | 16 ++++++++++++++++ net/bridge/br_private.h | 7 +++++++ net/bridge/br_vlan_options.c | 24 +++++++++++++++++++++++- 4 files changed, 47 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 946ccf33dc53..5aca85874447 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -548,6 +548,7 @@ enum { BRIDGE_VLANDB_GOPTS_UNSPEC, BRIDGE_VLANDB_GOPTS_ID, BRIDGE_VLANDB_GOPTS_RANGE, + BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING, __BRIDGE_VLANDB_GOPTS_MAX }; #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1) diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index fb5e5df571fd..976491951c82 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -3988,6 +3988,22 @@ int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on, return 0; } +bool br_multicast_toggle_global_vlan(struct net_bridge_vlan *vlan, bool on) +{ + ASSERT_RTNL(); + + /* BR_VLFLAG_GLOBAL_MCAST_ENABLED relies on eventual consistency and + * requires only RTNL to change + */ + if (on == !!(vlan->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED)) + return false; + + vlan->priv_flags ^= BR_VLFLAG_GLOBAL_MCAST_ENABLED; + br_multicast_toggle_vlan(vlan, on); + + return true; +} + void br_multicast_stop(struct net_bridge *br) { ASSERT_RTNL(); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index a19dbd63d670..4681a4b6020f 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -909,6 +909,7 @@ void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on); void br_multicast_toggle_vlan(struct net_bridge_vlan *vlan, bool on); int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on, struct netlink_ext_ack *extack); +bool br_multicast_toggle_global_vlan(struct net_bridge_vlan *vlan, bool on); static inline bool br_group_is_l2(const struct br_ip *group) { @@ -1282,6 +1283,12 @@ static inline int br_multicast_toggle_vlan_snooping(struct net_bridge *br, { return -EOPNOTSUPP; } + +static inline bool br_multicast_toggle_global_vlan(struct net_bridge_vlan *vlan, + bool on) +{ + return false; +} #endif /* br_vlan.c */ diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c index 827bfc319599..4ef975b20185 100644 --- a/net/bridge/br_vlan_options.c +++ b/net/bridge/br_vlan_options.c @@ -262,7 +262,9 @@ int br_vlan_process_options(const struct net_bridge *br, bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *r_end) { - return v_curr->vid - r_end->vid == 1; + return v_curr->vid - r_end->vid == 1 && + ((v_curr->priv_flags ^ r_end->priv_flags) & + BR_VLFLAG_GLOBAL_MCAST_ENABLED) == 0; } bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, @@ -281,6 +283,12 @@ bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, nla_put_u16(skb, BRIDGE_VLANDB_GOPTS_RANGE, vid_range)) goto out_err; +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING, + !!(v_opts->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED))) + goto out_err; +#endif + nla_nest_end(skb, nest); return true; @@ -295,6 +303,9 @@ static size_t rtnl_vlan_global_opts_nlmsg_size(void) return NLMSG_ALIGN(sizeof(struct br_vlan_msg)) + nla_total_size(0) /* BRIDGE_VLANDB_GLOBAL_OPTIONS */ + nla_total_size(sizeof(u16)) /* BRIDGE_VLANDB_GOPTS_ID */ +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + 
+ nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING */
+#endif
 		+ nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */
 }
 
@@ -349,12 +360,23 @@ static int br_vlan_process_global_one_opts(const struct net_bridge *br,
 					    struct netlink_ext_ack *extack)
 {
 	*changed = false;
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+	if (tb[BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING]) {
+		u8 mc_snooping;
+
+		mc_snooping = nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING]);
+		if (br_multicast_toggle_global_vlan(v, !!mc_snooping))
+			*changed = true;
+	}
+#endif
+
 	return 0;
 }
 
 static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = {
 	[BRIDGE_VLANDB_GOPTS_ID] = { .type = NLA_U16 },
 	[BRIDGE_VLANDB_GOPTS_RANGE] = { .type = NLA_U16 },
+	[BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING] = { .type = NLA_U8 },
 };
 
 int br_vlan_rtm_process_global_options(struct net_device *dev,
-- cgit v1.2.3-71-gd317

From 11656f593a869a4345e3421037614d2b75ae2ad3 Mon Sep 17 00:00:00 2001
From: Lior Nahmanson
Date: Mon, 21 Jun 2021 10:06:16 +0300
Subject: RDMA/mlx5: Add DCS offload support

DCS is an offload of the SW load balancing of DC initiator work requests.
A single DCI can be connected to only one target at a time and can't start
a new connection until the previous work request is completed. This
limitation causes delays when the initiator process needs to transfer data
to multiple targets at the same time. The SW solution is to use a process
that handles and spreads the work requests over many DCIs according to
their destinations.

This feature offloads that process in order to reduce the CPU load and
improve performance.

Link: https://lore.kernel.org/r/491c2c2afdb5b07de7f03eab3f93cf0704549dbc.1624258894.git.leonro@nvidia.com
Reviewed-by: Meir Lichtinger
Signed-off-by: Lior Nahmanson
Signed-off-by: Leon Romanovsky
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/mlx5/main.c | 10 ++++++++++
 drivers/infiniband/hw/mlx5/qp.c   | 11 +++++++++++
 include/uapi/rdma/mlx5-abi.h      | 17 +++++++++++++++--
 3 files changed, 36 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 094c976b1eed..cac0c52ed1d9 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1174,6 +1174,16 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 			MLX5_IB_TUNNELED_OFFLOADS_MPLS_UDP;
 	}
 
+	if (offsetofend(typeof(resp), dci_streams_caps) <= uhw_outlen) {
+		resp.response_length += sizeof(resp.dci_streams_caps);
+
+		resp.dci_streams_caps.max_log_num_concurent =
+			MLX5_CAP_GEN(mdev, log_max_dci_stream_channels);
+
+		resp.dci_streams_caps.max_log_num_errored =
+			MLX5_CAP_GEN(mdev, log_max_dci_errored_streams);
+	}
+
 	if (uhw_outlen) {
 		err = ib_copy_to_udata(uhw, &resp, resp.response_length);
 
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index b70fdfe6e8a5..a056b7a8e0c3 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -2064,6 +2064,13 @@ static int create_dci(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 		MLX5_SET(qpc, qpc, log_rq_size, ilog2(qp->rq.wqe_cnt));
 	}
 
+	if (qp->flags_en & MLX5_QP_FLAG_DCI_STREAM) {
+		MLX5_SET(qpc, qpc, log_num_dci_stream_channels,
+			 ucmd->dci_streams.log_num_concurent);
+		MLX5_SET(qpc, qpc, log_num_dci_errored_streams,
+			 ucmd->dci_streams.log_num_errored);
+	}
+
 	MLX5_SET(qpc, qpc, ts_format, ts_format);
 	MLX5_SET(qpc, qpc, rq_type, get_rx_type(qp, init_attr));
 
@@ -2807,6 +2814,10 @@ static int process_vendor_flags(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
 	process_vendor_flag(dev, &flags, MLX5_QP_FLAG_TYPE_DCI, true, qp);
 	process_vendor_flag(dev, &flags, MLX5_QP_FLAG_TYPE_DCT, true, qp);
+	process_vendor_flag(dev, &flags, MLX5_QP_FLAG_DCI_STREAM,
+			    MLX5_CAP_GEN(mdev, log_max_dci_stream_channels) &&
+			    MLX5_CAP_GEN(mdev, log_max_dci_errored_streams),
+			    qp);
 	process_vendor_flag(dev, &flags, MLX5_QP_FLAG_SIGNATURE, true, qp);
 	process_vendor_flag(dev, &flags, MLX5_QP_FLAG_SCATTER_CQE,
diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h
index 8597e6f22a1c..86be4a92b67b 100644
--- a/include/uapi/rdma/mlx5-abi.h
+++ b/include/uapi/rdma/mlx5-abi.h
@@ -50,6 +50,7 @@ enum {
 	MLX5_QP_FLAG_ALLOW_SCATTER_CQE = 1 << 8,
 	MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE = 1 << 9,
 	MLX5_QP_FLAG_UAR_PAGE_INDEX = 1 << 10,
+	MLX5_QP_FLAG_DCI_STREAM = 1 << 11,
 };
 
 enum {
@@ -238,6 +239,11 @@ struct mlx5_ib_striding_rq_caps {
 	__u32 reserved;
 };
 
+struct mlx5_ib_dci_streams_caps {
+	__u8 max_log_num_concurent;
+	__u8 max_log_num_errored;
+};
+
 enum mlx5_ib_query_dev_resp_flags {
 	/* Support 128B CQE compression */
 	MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_COMP = 1 << 0,
@@ -266,7 +272,8 @@ struct mlx5_ib_query_device_resp {
 	struct mlx5_ib_sw_parsing_caps sw_parsing_caps;
 	struct mlx5_ib_striding_rq_caps striding_rq_caps;
 	__u32 tunnel_offloads_caps; /* enum mlx5_ib_tunnel_offloads */
-	__u32 reserved;
+	struct mlx5_ib_dci_streams_caps dci_streams_caps;
+	__u16 reserved;
 };
 
 enum mlx5_ib_create_cq_flags {
@@ -313,6 +320,11 @@ struct mlx5_ib_create_srq_resp {
 	__u32 reserved;
 };
 
+struct mlx5_ib_create_qp_dci_streams {
+	__u8 log_num_concurent;
+	__u8 log_num_errored;
+};
+
 struct mlx5_ib_create_qp {
 	__aligned_u64 buf_addr;
 	__aligned_u64 db_addr;
@@ -327,7 +339,8 @@ struct mlx5_ib_create_qp {
 		__aligned_u64 access_key;
 	};
 	__u32 ece_options;
-	__u32 reserved;
+	struct mlx5_ib_create_qp_dci_streams dci_streams;
+	__u16 reserved;
 };
 
 /* RX Hash function flags */
-- cgit v1.2.3-71-gd317

From 26a4dc29b74a137f45665089f6d3d633fcc9b662 Mon Sep 17 00:00:00 2001
From: "Juan A. Suarez Romero"
Date: Tue, 8 Jun 2021 13:15:41 +0200
Subject: drm/v3d: Expose performance counters to userspace

The V3D engine has several hardware performance counters that can be of
interest to userspace performance analysis tools. This exposes new ioctls
to create and destroy performance monitor objects, as well as to query
the counter values.

Each created performance monitor object has an ID that can be attached to
CL/CSD submissions, so the driver enables the requested counters when the
job is submitted, and updates the performance monitor values when the job
is done.

It is up to the user to ensure all the jobs have been finished before
getting the performance monitor values. It is also up to the user to
properly synchronize BCL jobs when submitting jobs with different
performance monitors attached.

Cc: Daniel Vetter
Cc: David Airlie
Cc: Emma Anholt
To: dri-devel@lists.freedesktop.org
Signed-off-by: Juan A.
Suarez Romero Acked-by: Melissa Wen Signed-off-by: Melissa Wen Link: https://patchwork.freedesktop.org/patch/msgid/20210608111541.461991-1-jasuarez@igalia.com --- drivers/gpu/drm/v3d/Makefile | 1 + drivers/gpu/drm/v3d/v3d_drv.c | 8 ++ drivers/gpu/drm/v3d/v3d_drv.h | 63 +++++++++++ drivers/gpu/drm/v3d/v3d_gem.c | 31 ++++++ drivers/gpu/drm/v3d/v3d_perfmon.c | 213 ++++++++++++++++++++++++++++++++++++++ drivers/gpu/drm/v3d/v3d_regs.h | 2 + drivers/gpu/drm/v3d/v3d_sched.c | 16 +++ include/uapi/drm/v3d_drm.h | 136 ++++++++++++++++++++++++ 8 files changed, 470 insertions(+) create mode 100644 drivers/gpu/drm/v3d/v3d_perfmon.c (limited to 'include/uapi') diff --git a/drivers/gpu/drm/v3d/Makefile b/drivers/gpu/drm/v3d/Makefile index db4cfc155821..e8b314137020 100644 --- a/drivers/gpu/drm/v3d/Makefile +++ b/drivers/gpu/drm/v3d/Makefile @@ -9,6 +9,7 @@ v3d-y := \ v3d_gem.o \ v3d_irq.o \ v3d_mmu.o \ + v3d_perfmon.o \ v3d_trace_points.o \ v3d_sched.o diff --git a/drivers/gpu/drm/v3d/v3d_drv.c b/drivers/gpu/drm/v3d/v3d_drv.c index 99e22beea90b..9403c3b36aca 100644 --- a/drivers/gpu/drm/v3d/v3d_drv.c +++ b/drivers/gpu/drm/v3d/v3d_drv.c @@ -94,6 +94,9 @@ static int v3d_get_param_ioctl(struct drm_device *dev, void *data, case DRM_V3D_PARAM_SUPPORTS_CACHE_FLUSH: args->value = 1; return 0; + case DRM_V3D_PARAM_SUPPORTS_PERFMON: + args->value = (v3d->ver >= 40); + return 0; default: DRM_DEBUG("Unknown parameter %d\n", args->param); return -EINVAL; @@ -121,6 +124,7 @@ v3d_open(struct drm_device *dev, struct drm_file *file) 1, NULL); } + v3d_perfmon_open_file(v3d_priv); file->driver_priv = v3d_priv; return 0; @@ -136,6 +140,7 @@ v3d_postclose(struct drm_device *dev, struct drm_file *file) drm_sched_entity_destroy(&v3d_priv->sched_entity[q]); } + v3d_perfmon_close_file(v3d_priv); kfree(v3d_priv); } @@ -156,6 +161,9 @@ static const struct drm_ioctl_desc v3d_drm_ioctls[] = { DRM_IOCTL_DEF_DRV(V3D_GET_BO_OFFSET, v3d_get_bo_offset_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(V3D_SUBMIT_TFU, v3d_submit_tfu_ioctl, DRM_RENDER_ALLOW | DRM_AUTH), DRM_IOCTL_DEF_DRV(V3D_SUBMIT_CSD, v3d_submit_csd_ioctl, DRM_RENDER_ALLOW | DRM_AUTH), + DRM_IOCTL_DEF_DRV(V3D_PERFMON_CREATE, v3d_perfmon_create_ioctl, DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(V3D_PERFMON_DESTROY, v3d_perfmon_destroy_ioctl, DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(V3D_PERFMON_GET_VALUES, v3d_perfmon_get_values_ioctl, DRM_RENDER_ALLOW), }; static const struct drm_driver v3d_drm_driver = { diff --git a/drivers/gpu/drm/v3d/v3d_drv.h b/drivers/gpu/drm/v3d/v3d_drv.h index 8a390738d65b..270134779073 100644 --- a/drivers/gpu/drm/v3d/v3d_drv.h +++ b/drivers/gpu/drm/v3d/v3d_drv.h @@ -37,6 +37,40 @@ struct v3d_queue_state { u64 emit_seqno; }; +/* Performance monitor object. The perform lifetime is controlled by userspace + * using perfmon related ioctls. A perfmon can be attached to a submit_cl + * request, and when this is the case, HW perf counters will be activated just + * before the submit_cl is submitted to the GPU and disabled when the job is + * done. This way, only events related to a specific job will be counted. + */ +struct v3d_perfmon { + /* Tracks the number of users of the perfmon, when this counter reaches + * zero the perfmon is destroyed. + */ + refcount_t refcnt; + + /* Protects perfmon stop, as it can be invoked from multiple places. */ + struct mutex lock; + + /* Number of counters activated in this perfmon instance + * (should be less than DRM_V3D_MAX_PERF_COUNTERS). + */ + u8 ncounters; + + /* Events counted by the HW perf counters. 
*/ + u8 counters[DRM_V3D_MAX_PERF_COUNTERS]; + + /* Storage for counter values. Counters are incremented by the + * HW perf counter values every time the perfmon is attached + * to a GPU job. This way, perfmon users don't have to + * retrieve the results after each job if they want to track + * events covering several submissions. Note that counter + * values can't be reset, but you can fake a reset by + * destroying the perfmon and creating a new one. + */ + u64 values[]; +}; + struct v3d_dev { struct drm_device drm; @@ -89,6 +123,9 @@ struct v3d_dev { */ spinlock_t job_lock; + /* Used to track the active perfmon if any. */ + struct v3d_perfmon *active_perfmon; + /* Protects bo_stats */ struct mutex bo_lock; @@ -133,6 +170,11 @@ v3d_has_csd(struct v3d_dev *v3d) struct v3d_file_priv { struct v3d_dev *v3d; + struct { + struct idr idr; + struct mutex lock; + } perfmon; + struct drm_sched_entity sched_entity[V3D_MAX_QUEUES]; }; @@ -205,6 +247,11 @@ struct v3d_job { */ struct dma_fence *done_fence; + /* Pointer to a performance monitor object if the user requested it, + * NULL otherwise. + */ + struct v3d_perfmon *perfmon; + /* Callback for the freeing of the job on refcount going to 0. */ void (*free)(struct kref *ref); }; @@ -353,3 +400,19 @@ void v3d_mmu_remove_ptes(struct v3d_bo *bo); /* v3d_sched.c */ int v3d_sched_init(struct v3d_dev *v3d); void v3d_sched_fini(struct v3d_dev *v3d); + +/* v3d_perfmon.c */ +void v3d_perfmon_get(struct v3d_perfmon *perfmon); +void v3d_perfmon_put(struct v3d_perfmon *perfmon); +void v3d_perfmon_start(struct v3d_dev *v3d, struct v3d_perfmon *perfmon); +void v3d_perfmon_stop(struct v3d_dev *v3d, struct v3d_perfmon *perfmon, + bool capture); +struct v3d_perfmon *v3d_perfmon_find(struct v3d_file_priv *v3d_priv, int id); +void v3d_perfmon_open_file(struct v3d_file_priv *v3d_priv); +void v3d_perfmon_close_file(struct v3d_file_priv *v3d_priv); +int v3d_perfmon_create_ioctl(struct drm_device *dev, void *data, + struct drm_file *file_priv); +int v3d_perfmon_destroy_ioctl(struct drm_device *dev, void *data, + struct drm_file *file_priv); +int v3d_perfmon_get_values_ioctl(struct drm_device *dev, void *data, + struct drm_file *file_priv); diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c index 4eb354226972..5689da118197 100644 --- a/drivers/gpu/drm/v3d/v3d_gem.c +++ b/drivers/gpu/drm/v3d/v3d_gem.c @@ -126,6 +126,8 @@ v3d_reset(struct v3d_dev *v3d) v3d_mmu_set_page_table(v3d); v3d_irq_reset(v3d); + v3d_perfmon_stop(v3d, v3d->active_perfmon, false); + trace_v3d_reset_end(dev); } @@ -375,6 +377,9 @@ v3d_job_free(struct kref *ref) pm_runtime_mark_last_busy(job->v3d->drm.dev); pm_runtime_put_autosuspend(job->v3d->drm.dev); + if (job->perfmon) + v3d_perfmon_put(job->perfmon); + kfree(job); } @@ -539,6 +544,9 @@ v3d_submit_cl_ioctl(struct drm_device *dev, void *data, trace_v3d_submit_cl_ioctl(&v3d->drm, args->rcl_start, args->rcl_end); + if (args->pad != 0) + return -EINVAL; + if (args->flags != 0 && args->flags != DRM_V3D_SUBMIT_CL_FLUSH_CACHE) { DRM_INFO("invalid flags: %d\n", args->flags); @@ -611,8 +619,20 @@ v3d_submit_cl_ioctl(struct drm_device *dev, void *data, if (ret) goto fail; + if (args->perfmon_id) { + render->base.perfmon = v3d_perfmon_find(v3d_priv, + args->perfmon_id); + + if (!render->base.perfmon) { + ret = -ENOENT; + goto fail; + } + } + mutex_lock(&v3d->sched_lock); if (bin) { + bin->base.perfmon = render->base.perfmon; + v3d_perfmon_get(bin->base.perfmon); ret = v3d_push_job(v3d_priv, &bin->base, V3D_BIN); if (ret) goto 
fail_unreserve; @@ -633,6 +653,8 @@ v3d_submit_cl_ioctl(struct drm_device *dev, void *data, ret = drm_gem_fence_array_add(&clean_job->deps, render_fence); if (ret) goto fail_unreserve; + clean_job->perfmon = render->base.perfmon; + v3d_perfmon_get(clean_job->perfmon); ret = v3d_push_job(v3d_priv, clean_job, V3D_CACHE_CLEAN); if (ret) goto fail_unreserve; @@ -827,6 +849,15 @@ v3d_submit_csd_ioctl(struct drm_device *dev, void *data, if (ret) goto fail; + if (args->perfmon_id) { + job->base.perfmon = v3d_perfmon_find(v3d_priv, + args->perfmon_id); + if (!job->base.perfmon) { + ret = -ENOENT; + goto fail; + } + } + mutex_lock(&v3d->sched_lock); ret = v3d_push_job(v3d_priv, &job->base, V3D_CSD); if (ret) diff --git a/drivers/gpu/drm/v3d/v3d_perfmon.c b/drivers/gpu/drm/v3d/v3d_perfmon.c new file mode 100644 index 000000000000..0288ef063513 --- /dev/null +++ b/drivers/gpu/drm/v3d/v3d_perfmon.c @@ -0,0 +1,213 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 Raspberry Pi + */ + +#include "v3d_drv.h" +#include "v3d_regs.h" + +#define V3D_PERFMONID_MIN 1 +#define V3D_PERFMONID_MAX U32_MAX + +void v3d_perfmon_get(struct v3d_perfmon *perfmon) +{ + if (perfmon) + refcount_inc(&perfmon->refcnt); +} + +void v3d_perfmon_put(struct v3d_perfmon *perfmon) +{ + if (perfmon && refcount_dec_and_test(&perfmon->refcnt)) + kfree(perfmon); +} + +void v3d_perfmon_start(struct v3d_dev *v3d, struct v3d_perfmon *perfmon) +{ + unsigned int i; + u32 mask; + u8 ncounters = perfmon->ncounters; + + if (WARN_ON_ONCE(!perfmon || v3d->active_perfmon)) + return; + + mask = GENMASK(ncounters - 1, 0); + + for (i = 0; i < ncounters; i++) { + u32 source = i / 4; + u32 channel = V3D_SET_FIELD(perfmon->counters[i], V3D_PCTR_S0); + + i++; + channel |= V3D_SET_FIELD(i < ncounters ? perfmon->counters[i] : 0, + V3D_PCTR_S1); + i++; + channel |= V3D_SET_FIELD(i < ncounters ? perfmon->counters[i] : 0, + V3D_PCTR_S2); + i++; + channel |= V3D_SET_FIELD(i < ncounters ? 
perfmon->counters[i] : 0, + V3D_PCTR_S3); + V3D_CORE_WRITE(0, V3D_V4_PCTR_0_SRC_X(source), channel); + } + + V3D_CORE_WRITE(0, V3D_V4_PCTR_0_CLR, mask); + V3D_CORE_WRITE(0, V3D_PCTR_0_OVERFLOW, mask); + V3D_CORE_WRITE(0, V3D_V4_PCTR_0_EN, mask); + + v3d->active_perfmon = perfmon; +} + +void v3d_perfmon_stop(struct v3d_dev *v3d, struct v3d_perfmon *perfmon, + bool capture) +{ + unsigned int i; + + if (!perfmon || !v3d->active_perfmon) + return; + + mutex_lock(&perfmon->lock); + if (perfmon != v3d->active_perfmon) { + mutex_unlock(&perfmon->lock); + return; + } + + if (capture) + for (i = 0; i < perfmon->ncounters; i++) + perfmon->values[i] += V3D_CORE_READ(0, V3D_PCTR_0_PCTRX(i)); + + V3D_CORE_WRITE(0, V3D_V4_PCTR_0_EN, 0); + + v3d->active_perfmon = NULL; + mutex_unlock(&perfmon->lock); +} + +struct v3d_perfmon *v3d_perfmon_find(struct v3d_file_priv *v3d_priv, int id) +{ + struct v3d_perfmon *perfmon; + + mutex_lock(&v3d_priv->perfmon.lock); + perfmon = idr_find(&v3d_priv->perfmon.idr, id); + v3d_perfmon_get(perfmon); + mutex_unlock(&v3d_priv->perfmon.lock); + + return perfmon; +} + +void v3d_perfmon_open_file(struct v3d_file_priv *v3d_priv) +{ + mutex_init(&v3d_priv->perfmon.lock); + idr_init(&v3d_priv->perfmon.idr); +} + +static int v3d_perfmon_idr_del(int id, void *elem, void *data) +{ + struct v3d_perfmon *perfmon = elem; + + v3d_perfmon_put(perfmon); + + return 0; +} + +void v3d_perfmon_close_file(struct v3d_file_priv *v3d_priv) +{ + mutex_lock(&v3d_priv->perfmon.lock); + idr_for_each(&v3d_priv->perfmon.idr, v3d_perfmon_idr_del, NULL); + idr_destroy(&v3d_priv->perfmon.idr); + mutex_unlock(&v3d_priv->perfmon.lock); +} + +int v3d_perfmon_create_ioctl(struct drm_device *dev, void *data, + struct drm_file *file_priv) +{ + struct v3d_file_priv *v3d_priv = file_priv->driver_priv; + struct drm_v3d_perfmon_create *req = data; + struct v3d_perfmon *perfmon; + unsigned int i; + int ret; + + /* Number of monitored counters cannot exceed HW limits. */ + if (req->ncounters > DRM_V3D_MAX_PERF_COUNTERS || + !req->ncounters) + return -EINVAL; + + /* Make sure all counters are valid. 
*/ + for (i = 0; i < req->ncounters; i++) { + if (req->counters[i] >= V3D_PERFCNT_NUM) + return -EINVAL; + } + + perfmon = kzalloc(struct_size(perfmon, values, req->ncounters), + GFP_KERNEL); + if (!perfmon) + return -ENOMEM; + + for (i = 0; i < req->ncounters; i++) + perfmon->counters[i] = req->counters[i]; + + perfmon->ncounters = req->ncounters; + + refcount_set(&perfmon->refcnt, 1); + mutex_init(&perfmon->lock); + + mutex_lock(&v3d_priv->perfmon.lock); + ret = idr_alloc(&v3d_priv->perfmon.idr, perfmon, V3D_PERFMONID_MIN, + V3D_PERFMONID_MAX, GFP_KERNEL); + mutex_unlock(&v3d_priv->perfmon.lock); + + if (ret < 0) { + kfree(perfmon); + return ret; + } + + req->id = ret; + + return 0; +} + +int v3d_perfmon_destroy_ioctl(struct drm_device *dev, void *data, + struct drm_file *file_priv) +{ + struct v3d_file_priv *v3d_priv = file_priv->driver_priv; + struct drm_v3d_perfmon_destroy *req = data; + struct v3d_perfmon *perfmon; + + mutex_lock(&v3d_priv->perfmon.lock); + perfmon = idr_remove(&v3d_priv->perfmon.idr, req->id); + mutex_unlock(&v3d_priv->perfmon.lock); + + if (!perfmon) + return -EINVAL; + + v3d_perfmon_put(perfmon); + + return 0; +} + +int v3d_perfmon_get_values_ioctl(struct drm_device *dev, void *data, + struct drm_file *file_priv) +{ + struct v3d_dev *v3d = to_v3d_dev(dev); + struct v3d_file_priv *v3d_priv = file_priv->driver_priv; + struct drm_v3d_perfmon_get_values *req = data; + struct v3d_perfmon *perfmon; + int ret = 0; + + if (req->pad != 0) + return -EINVAL; + + mutex_lock(&v3d_priv->perfmon.lock); + perfmon = idr_find(&v3d_priv->perfmon.idr, req->id); + v3d_perfmon_get(perfmon); + mutex_unlock(&v3d_priv->perfmon.lock); + + if (!perfmon) + return -EINVAL; + + v3d_perfmon_stop(v3d, perfmon, true); + + if (copy_to_user(u64_to_user_ptr(req->values_ptr), perfmon->values, + perfmon->ncounters * sizeof(u64))) + ret = -EFAULT; + + v3d_perfmon_put(perfmon); + + return ret; +} diff --git a/drivers/gpu/drm/v3d/v3d_regs.h b/drivers/gpu/drm/v3d/v3d_regs.h index 9bcb57781d31..3663e0d6bf76 100644 --- a/drivers/gpu/drm/v3d/v3d_regs.h +++ b/drivers/gpu/drm/v3d/v3d_regs.h @@ -347,6 +347,8 @@ /* Each src reg muxes four counters each. */ #define V3D_V4_PCTR_0_SRC_0_3 0x00660 #define V3D_V4_PCTR_0_SRC_28_31 0x0067c +#define V3D_V4_PCTR_0_SRC_X(x) (V3D_V4_PCTR_0_SRC_0_3 + \ + 4 * (x)) # define V3D_PCTR_S0_MASK V3D_MASK(6, 0) # define V3D_PCTR_S0_SHIFT 0 # define V3D_PCTR_S1_MASK V3D_MASK(14, 8) diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c index a39bdd5cfc4f..dd7fcc36d726 100644 --- a/drivers/gpu/drm/v3d/v3d_sched.c +++ b/drivers/gpu/drm/v3d/v3d_sched.c @@ -63,6 +63,16 @@ v3d_job_free(struct drm_sched_job *sched_job) v3d_job_put(job); } +static void +v3d_switch_perfmon(struct v3d_dev *v3d, struct v3d_job *job) +{ + if (job->perfmon != v3d->active_perfmon) + v3d_perfmon_stop(v3d, v3d->active_perfmon, true); + + if (job->perfmon && v3d->active_perfmon != job->perfmon) + v3d_perfmon_start(v3d, job->perfmon); +} + /* * Returns the fences that the job depends on, one by one. * @@ -120,6 +130,8 @@ static struct dma_fence *v3d_bin_job_run(struct drm_sched_job *sched_job) trace_v3d_submit_cl(dev, false, to_v3d_fence(fence)->seqno, job->start, job->end); + v3d_switch_perfmon(v3d, &job->base); + /* Set the current and end address of the control list. * Writing the end register is what starts the job. 
*/ @@ -169,6 +181,8 @@ static struct dma_fence *v3d_render_job_run(struct drm_sched_job *sched_job) trace_v3d_submit_cl(dev, true, to_v3d_fence(fence)->seqno, job->start, job->end); + v3d_switch_perfmon(v3d, &job->base); + /* XXX: Set the QCFG */ /* Set the current and end address of the control list. @@ -240,6 +254,8 @@ v3d_csd_job_run(struct drm_sched_job *sched_job) trace_v3d_submit_csd(dev, to_v3d_fence(fence)->seqno); + v3d_switch_perfmon(v3d, &job->base); + for (i = 1; i <= 6; i++) V3D_CORE_WRITE(0, V3D_CSD_QUEUED_CFG0 + 4 * i, job->args.cfg[i]); /* CFG0 write kicks off the job. */ diff --git a/include/uapi/drm/v3d_drm.h b/include/uapi/drm/v3d_drm.h index 1ce746e228d9..4104f22fb3d3 100644 --- a/include/uapi/drm/v3d_drm.h +++ b/include/uapi/drm/v3d_drm.h @@ -38,6 +38,9 @@ extern "C" { #define DRM_V3D_GET_BO_OFFSET 0x05 #define DRM_V3D_SUBMIT_TFU 0x06 #define DRM_V3D_SUBMIT_CSD 0x07 +#define DRM_V3D_PERFMON_CREATE 0x08 +#define DRM_V3D_PERFMON_DESTROY 0x09 +#define DRM_V3D_PERFMON_GET_VALUES 0x0a #define DRM_IOCTL_V3D_SUBMIT_CL DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_CL, struct drm_v3d_submit_cl) #define DRM_IOCTL_V3D_WAIT_BO DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_WAIT_BO, struct drm_v3d_wait_bo) @@ -47,6 +50,12 @@ extern "C" { #define DRM_IOCTL_V3D_GET_BO_OFFSET DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_GET_BO_OFFSET, struct drm_v3d_get_bo_offset) #define DRM_IOCTL_V3D_SUBMIT_TFU DRM_IOW(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_TFU, struct drm_v3d_submit_tfu) #define DRM_IOCTL_V3D_SUBMIT_CSD DRM_IOW(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_CSD, struct drm_v3d_submit_csd) +#define DRM_IOCTL_V3D_PERFMON_CREATE DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_PERFMON_CREATE, \ + struct drm_v3d_perfmon_create) +#define DRM_IOCTL_V3D_PERFMON_DESTROY DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_PERFMON_DESTROY, \ + struct drm_v3d_perfmon_destroy) +#define DRM_IOCTL_V3D_PERFMON_GET_VALUES DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_PERFMON_GET_VALUES, \ + struct drm_v3d_perfmon_get_values) #define DRM_V3D_SUBMIT_CL_FLUSH_CACHE 0x01 @@ -127,6 +136,11 @@ struct drm_v3d_submit_cl { __u32 bo_handle_count; __u32 flags; + + /* ID of the perfmon to attach to this job. 0 means no perfmon. */ + __u32 perfmon_id; + + __u32 pad; }; /** @@ -195,6 +209,7 @@ enum drm_v3d_param { DRM_V3D_PARAM_SUPPORTS_TFU, DRM_V3D_PARAM_SUPPORTS_CSD, DRM_V3D_PARAM_SUPPORTS_CACHE_FLUSH, + DRM_V3D_PARAM_SUPPORTS_PERFMON, }; struct drm_v3d_get_param { @@ -258,6 +273,127 @@ struct drm_v3d_submit_csd { __u32 in_sync; /* Sync object to signal when the CSD job is done. */ __u32 out_sync; + + /* ID of the perfmon to attach to this job. 0 means no perfmon. 
*/ + __u32 perfmon_id; +}; + +enum { + V3D_PERFCNT_FEP_VALID_PRIMTS_NO_PIXELS, + V3D_PERFCNT_FEP_VALID_PRIMS, + V3D_PERFCNT_FEP_EZ_NFCLIP_QUADS, + V3D_PERFCNT_FEP_VALID_QUADS, + V3D_PERFCNT_TLB_QUADS_STENCIL_FAIL, + V3D_PERFCNT_TLB_QUADS_STENCILZ_FAIL, + V3D_PERFCNT_TLB_QUADS_STENCILZ_PASS, + V3D_PERFCNT_TLB_QUADS_ZERO_COV, + V3D_PERFCNT_TLB_QUADS_NONZERO_COV, + V3D_PERFCNT_TLB_QUADS_WRITTEN, + V3D_PERFCNT_PTB_PRIM_VIEWPOINT_DISCARD, + V3D_PERFCNT_PTB_PRIM_CLIP, + V3D_PERFCNT_PTB_PRIM_REV, + V3D_PERFCNT_QPU_IDLE_CYCLES, + V3D_PERFCNT_QPU_ACTIVE_CYCLES_VERTEX_COORD_USER, + V3D_PERFCNT_QPU_ACTIVE_CYCLES_FRAG, + V3D_PERFCNT_QPU_CYCLES_VALID_INSTR, + V3D_PERFCNT_QPU_CYCLES_TMU_STALL, + V3D_PERFCNT_QPU_CYCLES_SCOREBOARD_STALL, + V3D_PERFCNT_QPU_CYCLES_VARYINGS_STALL, + V3D_PERFCNT_QPU_IC_HIT, + V3D_PERFCNT_QPU_IC_MISS, + V3D_PERFCNT_QPU_UC_HIT, + V3D_PERFCNT_QPU_UC_MISS, + V3D_PERFCNT_TMU_TCACHE_ACCESS, + V3D_PERFCNT_TMU_TCACHE_MISS, + V3D_PERFCNT_VPM_VDW_STALL, + V3D_PERFCNT_VPM_VCD_STALL, + V3D_PERFCNT_BIN_ACTIVE, + V3D_PERFCNT_RDR_ACTIVE, + V3D_PERFCNT_L2T_HITS, + V3D_PERFCNT_L2T_MISSES, + V3D_PERFCNT_CYCLE_COUNT, + V3D_PERFCNT_QPU_CYCLES_STALLED_VERTEX_COORD_USER, + V3D_PERFCNT_QPU_CYCLES_STALLED_FRAGMENT, + V3D_PERFCNT_PTB_PRIMS_BINNED, + V3D_PERFCNT_AXI_WRITES_WATCH_0, + V3D_PERFCNT_AXI_READS_WATCH_0, + V3D_PERFCNT_AXI_WRITE_STALLS_WATCH_0, + V3D_PERFCNT_AXI_READ_STALLS_WATCH_0, + V3D_PERFCNT_AXI_WRITE_BYTES_WATCH_0, + V3D_PERFCNT_AXI_READ_BYTES_WATCH_0, + V3D_PERFCNT_AXI_WRITES_WATCH_1, + V3D_PERFCNT_AXI_READS_WATCH_1, + V3D_PERFCNT_AXI_WRITE_STALLS_WATCH_1, + V3D_PERFCNT_AXI_READ_STALLS_WATCH_1, + V3D_PERFCNT_AXI_WRITE_BYTES_WATCH_1, + V3D_PERFCNT_AXI_READ_BYTES_WATCH_1, + V3D_PERFCNT_TLB_PARTIAL_QUADS, + V3D_PERFCNT_TMU_CONFIG_ACCESSES, + V3D_PERFCNT_L2T_NO_ID_STALL, + V3D_PERFCNT_L2T_COM_QUE_STALL, + V3D_PERFCNT_L2T_TMU_WRITES, + V3D_PERFCNT_TMU_ACTIVE_CYCLES, + V3D_PERFCNT_TMU_STALLED_CYCLES, + V3D_PERFCNT_CLE_ACTIVE, + V3D_PERFCNT_L2T_TMU_READS, + V3D_PERFCNT_L2T_CLE_READS, + V3D_PERFCNT_L2T_VCD_READS, + V3D_PERFCNT_L2T_TMUCFG_READS, + V3D_PERFCNT_L2T_SLC0_READS, + V3D_PERFCNT_L2T_SLC1_READS, + V3D_PERFCNT_L2T_SLC2_READS, + V3D_PERFCNT_L2T_TMU_W_MISSES, + V3D_PERFCNT_L2T_TMU_R_MISSES, + V3D_PERFCNT_L2T_CLE_MISSES, + V3D_PERFCNT_L2T_VCD_MISSES, + V3D_PERFCNT_L2T_TMUCFG_MISSES, + V3D_PERFCNT_L2T_SLC0_MISSES, + V3D_PERFCNT_L2T_SLC1_MISSES, + V3D_PERFCNT_L2T_SLC2_MISSES, + V3D_PERFCNT_CORE_MEM_WRITES, + V3D_PERFCNT_L2T_MEM_WRITES, + V3D_PERFCNT_PTB_MEM_WRITES, + V3D_PERFCNT_TLB_MEM_WRITES, + V3D_PERFCNT_CORE_MEM_READS, + V3D_PERFCNT_L2T_MEM_READS, + V3D_PERFCNT_PTB_MEM_READS, + V3D_PERFCNT_PSE_MEM_READS, + V3D_PERFCNT_TLB_MEM_READS, + V3D_PERFCNT_GMP_MEM_READS, + V3D_PERFCNT_PTB_W_MEM_WORDS, + V3D_PERFCNT_TLB_W_MEM_WORDS, + V3D_PERFCNT_PSE_R_MEM_WORDS, + V3D_PERFCNT_TLB_R_MEM_WORDS, + V3D_PERFCNT_TMU_MRU_HITS, + V3D_PERFCNT_COMPUTE_ACTIVE, + V3D_PERFCNT_NUM, +}; + +#define DRM_V3D_MAX_PERF_COUNTERS 32 + +struct drm_v3d_perfmon_create { + __u32 id; + __u32 ncounters; + __u8 counters[DRM_V3D_MAX_PERF_COUNTERS]; +}; + +struct drm_v3d_perfmon_destroy { + __u32 id; +}; + +/* + * Returns the values of the performance counters tracked by this + * perfmon (as an array of ncounters u64 values). + * + * No implicit synchronization is performed, so the user has to + * guarantee that any jobs using this perfmon have already been + * completed (probably by blocking on the seqno returned by the + * last exec that used the perfmon). 
+ */ +struct drm_v3d_perfmon_get_values { + __u32 id; + __u32 pad; + __u64 values_ptr; }; #if defined(__cplusplus) -- cgit v1.2.3-71-gd317 From 2d151d39073aff498358543801fca0f670fea981 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Sun, 18 Jul 2021 09:11:06 +0200 Subject: xfrm: Add possibility to set the default to block if we have no policy As the default we assume the traffic to pass, if we have no matching IPsec policy. With this patch, we have a possibility to change this default from allow to block. It can be configured via netlink. Each direction (input/output/forward) can be configured separately. With the default to block configuered, we need allow policies for all packet flows we accept. We do not use default policy lookup for the loopback device. v1->v2 - fix compiling when XFRM is disabled - Reported-by: kernel test robot Co-developed-by: Christian Langrock Signed-off-by: Christian Langrock Co-developed-by: Antony Antony Signed-off-by: Antony Antony Signed-off-by: Steffen Klassert --- include/net/netns/xfrm.h | 7 +++++++ include/net/xfrm.h | 36 ++++++++++++++++++++++++++------ include/uapi/linux/xfrm.h | 10 +++++++++ net/xfrm/xfrm_policy.c | 16 +++++++++++++++ net/xfrm/xfrm_user.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 115 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index e946366e8ba5..88c647302977 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -65,6 +65,13 @@ struct netns_xfrm { u32 sysctl_aevent_rseqth; int sysctl_larval_drop; u32 sysctl_acq_expires; + + u8 policy_default; +#define XFRM_POL_DEFAULT_IN 1 +#define XFRM_POL_DEFAULT_OUT 2 +#define XFRM_POL_DEFAULT_FWD 4 +#define XFRM_POL_DEFAULT_MASK 7 + #ifdef CONFIG_SYSCTL struct ctl_table_header *sysctl_hdr; #endif diff --git a/include/net/xfrm.h b/include/net/xfrm.h index cbff7c2a9724..2308210793a0 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1075,6 +1075,22 @@ xfrm_state_addr_cmp(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x, un } #ifdef CONFIG_XFRM +static inline bool +xfrm_default_allow(struct net *net, int dir) +{ + u8 def = net->xfrm.policy_default; + + switch (dir) { + case XFRM_POLICY_IN: + return def & XFRM_POL_DEFAULT_IN ? false : true; + case XFRM_POLICY_OUT: + return def & XFRM_POL_DEFAULT_OUT ? false : true; + case XFRM_POLICY_FWD: + return def & XFRM_POL_DEFAULT_FWD ? 
false : true; + } + return false; +} + int __xfrm_policy_check(struct sock *, int dir, struct sk_buff *skb, unsigned short family); @@ -1088,9 +1104,13 @@ static inline int __xfrm_policy_check2(struct sock *sk, int dir, if (sk && sk->sk_policy[XFRM_POLICY_IN]) return __xfrm_policy_check(sk, ndir, skb, family); - return (!net->xfrm.policy_count[dir] && !secpath_exists(skb)) || - (skb_dst(skb) && (skb_dst(skb)->flags & DST_NOPOLICY)) || - __xfrm_policy_check(sk, ndir, skb, family); + if (xfrm_default_allow(net, dir)) + return (!net->xfrm.policy_count[dir] && !secpath_exists(skb)) || + (skb_dst(skb) && (skb_dst(skb)->flags & DST_NOPOLICY)) || + __xfrm_policy_check(sk, ndir, skb, family); + else + return (skb_dst(skb) && (skb_dst(skb)->flags & DST_NOPOLICY)) || + __xfrm_policy_check(sk, ndir, skb, family); } static inline int xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, unsigned short family) @@ -1142,9 +1162,13 @@ static inline int xfrm_route_forward(struct sk_buff *skb, unsigned short family) { struct net *net = dev_net(skb->dev); - return !net->xfrm.policy_count[XFRM_POLICY_OUT] || - (skb_dst(skb)->flags & DST_NOXFRM) || - __xfrm_route_forward(skb, family); + if (xfrm_default_allow(net, XFRM_POLICY_FWD)) + return !net->xfrm.policy_count[XFRM_POLICY_OUT] || + (skb_dst(skb)->flags & DST_NOXFRM) || + __xfrm_route_forward(skb, family); + else + return (skb_dst(skb)->flags & DST_NOXFRM) || + __xfrm_route_forward(skb, family); } static inline int xfrm4_route_forward(struct sk_buff *skb) diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index ffc6a5391bb7..6e8095106192 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -213,6 +213,11 @@ enum { XFRM_MSG_GETSPDINFO, #define XFRM_MSG_GETSPDINFO XFRM_MSG_GETSPDINFO + XFRM_MSG_SETDEFAULT, +#define XFRM_MSG_SETDEFAULT XFRM_MSG_SETDEFAULT + XFRM_MSG_GETDEFAULT, +#define XFRM_MSG_GETDEFAULT XFRM_MSG_GETDEFAULT + XFRM_MSG_MAPPING, #define XFRM_MSG_MAPPING XFRM_MSG_MAPPING __XFRM_MSG_MAX @@ -508,6 +513,11 @@ struct xfrm_user_offload { #define XFRM_OFFLOAD_IPV6 1 #define XFRM_OFFLOAD_INBOUND 2 +struct xfrm_userpolicy_default { + __u8 dirmask; + __u8 action; +}; + #ifndef __KERNEL__ /* backwards compatibility for userspace */ #define XFRMGRP_ACQUIRE 1 diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 827d84255021..d5cb082e11fc 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3165,6 +3165,11 @@ ok: return dst; nopol: + if (!(dst_orig->dev->flags & IFF_LOOPBACK) && + !xfrm_default_allow(net, dir)) { + err = -EPERM; + goto error; + } if (!(flags & XFRM_LOOKUP_ICMP)) { dst = dst_orig; goto ok; @@ -3553,6 +3558,11 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, } if (!pol) { + if (!xfrm_default_allow(net, dir)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS); + return 0; + } + if (sp && secpath_has_nontransport(sp, 0, &xerr_idx)) { xfrm_secpath_reject(xerr_idx, skb, &fl); XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS); @@ -3607,6 +3617,12 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, tpp[ti++] = &pols[pi]->xfrm_vec[i]; } xfrm_nr = ti; + + if (!xfrm_default_allow(net, dir) && !xfrm_nr) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES); + goto reject; + } + if (npols > 1) { xfrm_tmpl_sort(stp, tpp, xfrm_nr, family); tpp = stp; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index b47d613409b7..4eafd1130c3e 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1961,6 +1961,54 @@ static struct sk_buff 
*xfrm_policy_netlink(struct sk_buff *in_skb, return skb; } +static int xfrm_set_default(struct sk_buff *skb, struct nlmsghdr *nlh, + struct nlattr **attrs) +{ + struct net *net = sock_net(skb->sk); + struct xfrm_userpolicy_default *up = nlmsg_data(nlh); + u8 dirmask = (1 << up->dirmask) & XFRM_POL_DEFAULT_MASK; + u8 old_default = net->xfrm.policy_default; + + net->xfrm.policy_default = (old_default & (0xff ^ dirmask)) + | (up->action << up->dirmask); + + rt_genid_bump_all(net); + + return 0; +} + +static int xfrm_get_default(struct sk_buff *skb, struct nlmsghdr *nlh, + struct nlattr **attrs) +{ + struct sk_buff *r_skb; + struct nlmsghdr *r_nlh; + struct net *net = sock_net(skb->sk); + struct xfrm_userpolicy_default *r_up, *up; + int len = NLMSG_ALIGN(sizeof(struct xfrm_userpolicy_default)); + u32 portid = NETLINK_CB(skb).portid; + u32 seq = nlh->nlmsg_seq; + + up = nlmsg_data(nlh); + + r_skb = nlmsg_new(len, GFP_ATOMIC); + if (!r_skb) + return -ENOMEM; + + r_nlh = nlmsg_put(r_skb, portid, seq, XFRM_MSG_GETDEFAULT, sizeof(*r_up), 0); + if (!r_nlh) { + kfree_skb(r_skb); + return -EMSGSIZE; + } + + r_up = nlmsg_data(r_nlh); + + r_up->action = ((net->xfrm.policy_default & (1 << up->dirmask)) >> up->dirmask); + r_up->dirmask = up->dirmask; + nlmsg_end(r_skb, r_nlh); + + return nlmsg_unicast(net->xfrm.nlsk, r_skb, portid); +} + static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs) { @@ -2664,6 +2712,8 @@ const int xfrm_msg_min[XFRM_NR_MSGTYPES] = { [XFRM_MSG_GETSADINFO - XFRM_MSG_BASE] = sizeof(u32), [XFRM_MSG_NEWSPDINFO - XFRM_MSG_BASE] = sizeof(u32), [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = sizeof(u32), + [XFRM_MSG_SETDEFAULT - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_default), + [XFRM_MSG_GETDEFAULT - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_default), }; EXPORT_SYMBOL_GPL(xfrm_msg_min); @@ -2743,6 +2793,8 @@ static const struct xfrm_link { .nla_pol = xfrma_spd_policy, .nla_max = XFRMA_SPD_MAX }, [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = { .doit = xfrm_get_spdinfo }, + [XFRM_MSG_SETDEFAULT - XFRM_MSG_BASE] = { .doit = xfrm_set_default }, + [XFRM_MSG_GETDEFAULT - XFRM_MSG_BASE] = { .doit = xfrm_get_default }, }; static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, -- cgit v1.2.3-71-gd317 From db67f219fc9365a0c456666ed7c134d43ab0be8a Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Tue, 20 Jul 2021 21:42:56 +0200 Subject: uapi: IPv6 IOAM headers definition This patch provides the IPv6 IOAM option header [1] as well as the IOAM Trace header [2]. An IOAM option must be 4n-aligned. Here is an overview of a Hop-by-Hop with an IOAM Trace option: +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | Next header | Hdr Ext Len | Padding | Padding | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | Option Type | Opt Data Len | Reserved | IOAM Type | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | Namespace-ID | NodeLen | Flags | RemainingLen| +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | IOAM-Trace-Type | Reserved | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+<-+ | | | | node data [n] | | | | | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ D | | a | node data [n-1] | t | | a +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ~ ... 
~ S +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ p | | a | node data [1] | c | | e +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | | | | | node data [0] | | | | | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+<-+ The IOAM option header starts at "Option Type" and ends after "IOAM Type". The IOAM Trace header starts at "Namespace-ID" and ends after "IOAM-Trace-Type/Reserved". IOAM Type: either Pre-allocated Trace (=0), Incremental Trace (=1), Proof-of-Transit (=2) or Edge-to-Edge (=3). Note that both the Pre-allocated Trace and the Incremental Trace look the same. The two others are not implemented. Namespace-ID: IOAM namespace identifier, not to be confused with network namespaces. It adds further context to IOAM options and associated data, and allows devices which are IOAM capable to determine whether IOAM options must be processed or ignored. It can also be used by an operator to distinguish different operational domains or to identify different sets of devices. NodeLen: Length of data added by each node. It depends on the Trace Type. Flags: Only the Overflow (O) flag for now. The O flag is set by a transit node when there are not enough octets left to record its data. RemainingLen: Remaining free space to record data. IOAM-Trace-Type: Bit field where each bit corresponds to a specific kind of IOAM data. See [2] for a detailed list. [1] https://tools.ietf.org/html/draft-ietf-ippm-ioam-ipv6-options [2] https://tools.ietf.org/html/draft-ietf-ippm-ioam-data Signed-off-by: Justin Iurman Signed-off-by: David S. Miller --- include/uapi/linux/ioam6.h | 123 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 include/uapi/linux/ioam6.h (limited to 'include/uapi') diff --git a/include/uapi/linux/ioam6.h b/include/uapi/linux/ioam6.h new file mode 100644 index 000000000000..2177e4e49566 --- /dev/null +++ b/include/uapi/linux/ioam6.h @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * IPv6 IOAM implementation + * + * Author: + * Justin Iurman + */ + +#ifndef _UAPI_LINUX_IOAM6_H +#define _UAPI_LINUX_IOAM6_H + +#include +#include + +/* + * IPv6 IOAM Option Header + */ +struct ioam6_hdr { + __u8 opt_type; + __u8 opt_len; + __u8 :8; /* reserved */ +#define IOAM6_TYPE_PREALLOC 0 + __u8 type; +} __attribute__((packed)); + +/* + * IOAM Trace Header + */ +struct ioam6_trace_hdr { + __be16 namespace_id; + +#if defined(__LITTLE_ENDIAN_BITFIELD) + + __u8 :1, /* unused */ + :1, /* unused */ + overflow:1, + nodelen:5; + + __u8 remlen:7, + :1; /* unused */ + + union { + __be32 type_be32; + + struct { + __u32 bit7:1, + bit6:1, + bit5:1, + bit4:1, + bit3:1, + bit2:1, + bit1:1, + bit0:1, + bit15:1, /* unused */ + bit14:1, /* unused */ + bit13:1, /* unused */ + bit12:1, /* unused */ + bit11:1, + bit10:1, + bit9:1, + bit8:1, + bit23:1, /* reserved */ + bit22:1, + bit21:1, /* unused */ + bit20:1, /* unused */ + bit19:1, /* unused */ + bit18:1, /* unused */ + bit17:1, /* unused */ + bit16:1, /* unused */ + :8; /* reserved */ + } type; + }; + +#elif defined(__BIG_ENDIAN_BITFIELD) + + __u8 nodelen:5, + overflow:1, + :1, /* unused */ + :1; /* unused */ + + __u8 :1, /* unused */ + remlen:7; + + union { + __be32 type_be32; + + struct { + __u32 bit0:1, + bit1:1, + bit2:1, + bit3:1, + bit4:1, + bit5:1, + bit6:1, + bit7:1, + bit8:1, + bit9:1, + bit10:1, + bit11:1, + bit12:1, /* unused */ + bit13:1, /* unused */ + bit14:1, /* unused */ + bit15:1, /* unused */ + bit16:1, /* unused */ + 
bit17:1, /* unused */ + bit18:1, /* unused */ + bit19:1, /* unused */ + bit20:1, /* unused */ + bit21:1, /* unused */ + bit22:1, + bit23:1, /* reserved */ + :8; /* reserved */ + } type; + }; + +#else +#error "Please fix " +#endif + + __u8 data[0]; +} __attribute__((packed)); + +#endif /* _UAPI_LINUX_IOAM6_H */ -- cgit v1.2.3-71-gd317 From 9ee11f0fff205b4b3df9750bff5e94f97c71b6a0 Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Tue, 20 Jul 2021 21:42:57 +0200 Subject: ipv6: ioam: Data plane support for Pre-allocated Trace Implement support for processing the IOAM Pre-allocated Trace with IPv6, see [1] and [2]. Introduce a new IPv6 Hop-by-Hop TLV option, see IANA [3]. A new per-interface sysctl is introduced. The value is a boolean to accept (=1) or ignore (=0, by default) IPv6 IOAM options on ingress for an interface: - net.ipv6.conf.XXX.ioam6_enabled Two other sysctls are introduced to define IOAM IDs, represented by an integer. They are respectively per-namespace and per-interface: - net.ipv6.ioam6_id - net.ipv6.conf.XXX.ioam6_id The value of the first one represents the IOAM ID of the node itself (u32; max and default value = U32_MAX>>8, due to hop limit concatenation) while the other represents the IOAM ID of an interface (u16; max and default value = U16_MAX). Each "ioam6_id" sysctl has a "_wide" equivalent: - net.ipv6.ioam6_id_wide - net.ipv6.conf.XXX.ioam6_id_wide The value of the first one represents the wide IOAM ID of the node itself (u64; max and default value = U64_MAX>>8, due to hop limit concatenation) while the other represents the wide IOAM ID of an interface (u32; max and default value = U32_MAX). The use of short and wide equivalents is not exclusive, a deployment could choose to leverage both. For example, net.ipv6.conf.XXX.ioam6_id (short format) could be an identifier for a physical interface, whereas net.ipv6.conf.XXX.ioam6_id_wide (wide format) could be an identifier for a logical sub-interface. Documentation about new sysctls is provided at the end of this patchset. Two relativistic hash tables are used: one for IOAM namespaces, the other for IOAM schemas. A namespace can only have a single active schema and a schema can only be attached to a single namespace (1:1 relationship). [1] https://tools.ietf.org/html/draft-ietf-ippm-ioam-ipv6-options [2] https://tools.ietf.org/html/draft-ietf-ippm-ioam-data [3] https://www.iana.org/assignments/ipv6-parameters/ipv6-parameters.xhtml#ipv6-parameters-2 Signed-off-by: Justin Iurman Signed-off-by: David S. 
Miller --- include/linux/ioam6.h | 13 ++ include/linux/ipv6.h | 3 + include/net/ioam6.h | 64 +++++++++ include/net/netns/ipv6.h | 3 + include/uapi/linux/in6.h | 1 + include/uapi/linux/ioam6.h | 9 ++ include/uapi/linux/ipv6.h | 3 + net/ipv6/Makefile | 2 +- net/ipv6/addrconf.c | 37 +++++ net/ipv6/af_inet6.c | 10 ++ net/ipv6/exthdrs.c | 61 +++++++++ net/ipv6/ioam6.c | 333 +++++++++++++++++++++++++++++++++++++++++++++ net/ipv6/sysctl_net_ipv6.c | 19 +++ 13 files changed, 557 insertions(+), 1 deletion(-) create mode 100644 include/linux/ioam6.h create mode 100644 include/net/ioam6.h create mode 100644 net/ipv6/ioam6.c (limited to 'include/uapi') diff --git a/include/linux/ioam6.h b/include/linux/ioam6.h new file mode 100644 index 000000000000..94a24b36998f --- /dev/null +++ b/include/linux/ioam6.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * IPv6 IOAM + * + * Author: + * Justin Iurman + */ +#ifndef _LINUX_IOAM6_H +#define _LINUX_IOAM6_H + +#include + +#endif /* _LINUX_IOAM6_H */ diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 70b2ad3b9884..ef4a69865737 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -76,6 +76,9 @@ struct ipv6_devconf { __s32 disable_policy; __s32 ndisc_tclass; __s32 rpl_seg_enabled; + __u32 ioam6_id; + __u32 ioam6_id_wide; + __u8 ioam6_enabled; struct ctl_table_header *sysctl_header; }; diff --git a/include/net/ioam6.h b/include/net/ioam6.h new file mode 100644 index 000000000000..772b91ee2e87 --- /dev/null +++ b/include/net/ioam6.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * IPv6 IOAM implementation + * + * Author: + * Justin Iurman + */ + +#ifndef _NET_IOAM6_H +#define _NET_IOAM6_H + +#include +#include +#include +#include + +struct ioam6_namespace { + struct rhash_head head; + struct rcu_head rcu; + + struct ioam6_schema __rcu *schema; + + __be16 id; + __be32 data; + __be64 data_wide; +}; + +struct ioam6_schema { + struct rhash_head head; + struct rcu_head rcu; + + struct ioam6_namespace __rcu *ns; + + u32 id; + int len; + __be32 hdr; + + u8 data[0]; +}; + +struct ioam6_pernet_data { + struct mutex lock; + struct rhashtable namespaces; + struct rhashtable schemas; +}; + +static inline struct ioam6_pernet_data *ioam6_pernet(struct net *net) +{ +#if IS_ENABLED(CONFIG_IPV6) + return net->ipv6.ioam6_data; +#else + return NULL; +#endif +} + +struct ioam6_namespace *ioam6_namespace(struct net *net, __be16 id); +void ioam6_fill_trace_data(struct sk_buff *skb, + struct ioam6_namespace *ns, + struct ioam6_trace_hdr *trace); + +int ioam6_init(void); +void ioam6_exit(void); + +#endif /* _NET_IOAM6_H */ diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index bde0b7adb4a3..a4b550380316 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -51,6 +51,8 @@ struct netns_sysctl_ipv6 { int max_dst_opts_len; int max_hbh_opts_len; int seg6_flowlabel; + u32 ioam6_id; + u64 ioam6_id_wide; bool skip_notify_on_dev_down; u8 fib_notify_on_flag_change; }; @@ -110,6 +112,7 @@ struct netns_ipv6 { spinlock_t lock; u32 seq; } ip6addrlbl_table; + struct ioam6_pernet_data *ioam6_data; }; #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index 5ad396a57eb3..c4c53a9ab959 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -145,6 +145,7 @@ struct in6_flowlabel_req { #define IPV6_TLV_PADN 1 #define IPV6_TLV_ROUTERALERT 5 #define IPV6_TLV_CALIPSO 7 /* RFC 5570 */ +#define IPV6_TLV_IOAM 49 /* TEMPORARY IANA allocation for IOAM */ #define 
IPV6_TLV_JUMBO 194 #define IPV6_TLV_HAO 201 /* home address option */ diff --git a/include/uapi/linux/ioam6.h b/include/uapi/linux/ioam6.h index 2177e4e49566..23ba6e85582f 100644 --- a/include/uapi/linux/ioam6.h +++ b/include/uapi/linux/ioam6.h @@ -12,6 +12,15 @@ #include #include +#define IOAM6_U16_UNAVAILABLE U16_MAX +#define IOAM6_U32_UNAVAILABLE U32_MAX +#define IOAM6_U64_UNAVAILABLE U64_MAX + +#define IOAM6_DEFAULT_ID (IOAM6_U32_UNAVAILABLE >> 8) +#define IOAM6_DEFAULT_ID_WIDE (IOAM6_U64_UNAVAILABLE >> 8) +#define IOAM6_DEFAULT_IF_ID IOAM6_U16_UNAVAILABLE +#define IOAM6_DEFAULT_IF_ID_WIDE IOAM6_U32_UNAVAILABLE + /* * IPv6 IOAM Option Header */ diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 70603775fe91..b243a53fa985 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -190,6 +190,9 @@ enum { DEVCONF_NDISC_TCLASS, DEVCONF_RPL_SEG_ENABLED, DEVCONF_RA_DEFRTR_METRIC, + DEVCONF_IOAM6_ENABLED, + DEVCONF_IOAM6_ID, + DEVCONF_IOAM6_ID_WIDE, DEVCONF_MAX }; diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index cf7b47bdb9b3..b7ef10d417d6 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -10,7 +10,7 @@ ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \ route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \ raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \ exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \ - udp_offload.o seg6.o fib6_notifier.o rpl.o + udp_offload.o seg6.o fib6_notifier.o rpl.o ioam6.o ipv6-offload := ip6_offload.o tcpv6_offload.o exthdrs_offload.o diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index bc330fffb4a8..1802287977f1 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -89,12 +89,15 @@ #include #include #include +#include #define INFINITY_LIFE_TIME 0xFFFFFFFF #define IPV6_MAX_STRLEN \ sizeof("ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255") +static u32 ioam6_if_id_max = U16_MAX; + static inline u32 cstamp_delta(unsigned long cstamp) { return (cstamp - INITIAL_JIFFIES) * 100UL / HZ; @@ -237,6 +240,9 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64, .disable_policy = 0, .rpl_seg_enabled = 0, + .ioam6_enabled = 0, + .ioam6_id = IOAM6_DEFAULT_IF_ID, + .ioam6_id_wide = IOAM6_DEFAULT_IF_ID_WIDE, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -293,6 +299,9 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64, .disable_policy = 0, .rpl_seg_enabled = 0, + .ioam6_enabled = 0, + .ioam6_id = IOAM6_DEFAULT_IF_ID, + .ioam6_id_wide = IOAM6_DEFAULT_IF_ID_WIDE, }; /* Check if link is ready: is it up and is a valid qdisc available */ @@ -5524,6 +5533,9 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_DISABLE_POLICY] = cnf->disable_policy; array[DEVCONF_NDISC_TCLASS] = cnf->ndisc_tclass; array[DEVCONF_RPL_SEG_ENABLED] = cnf->rpl_seg_enabled; + array[DEVCONF_IOAM6_ENABLED] = cnf->ioam6_enabled; + array[DEVCONF_IOAM6_ID] = cnf->ioam6_id; + array[DEVCONF_IOAM6_ID_WIDE] = cnf->ioam6_id_wide; } static inline size_t inet6_ifla6_size(void) @@ -6930,6 +6942,31 @@ static const struct ctl_table addrconf_sysctl[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "ioam6_enabled", + .data = &ipv6_devconf.ioam6_enabled, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = (void *)SYSCTL_ZERO, + .extra2 = (void *)SYSCTL_ONE, + }, + { + .procname = "ioam6_id", + .data = 
&ipv6_devconf.ioam6_id, + .maxlen = sizeof(u32), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = (void *)SYSCTL_ZERO, + .extra2 = (void *)&ioam6_if_id_max, + }, + { + .procname = "ioam6_id_wide", + .data = &ipv6_devconf.ioam6_id_wide, + .maxlen = sizeof(u32), + .mode = 0644, + .proc_handler = proc_douintvec, + }, { /* sentinel */ } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 2389ff702f51..d92c90d97763 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include @@ -961,6 +962,9 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.fib_notify_on_flag_change = 0; atomic_set(&net->ipv6.fib6_sernum, 1); + net->ipv6.sysctl.ioam6_id = IOAM6_DEFAULT_ID; + net->ipv6.sysctl.ioam6_id_wide = IOAM6_DEFAULT_ID_WIDE; + err = ipv6_init_mibs(net); if (err) return err; @@ -1191,6 +1195,10 @@ static int __init inet6_init(void) if (err) goto rpl_fail; + err = ioam6_init(); + if (err) + goto ioam6_fail; + err = igmp6_late_init(); if (err) goto igmp6_late_err; @@ -1213,6 +1221,8 @@ sysctl_fail: igmp6_late_cleanup(); #endif igmp6_late_err: + ioam6_exit(); +ioam6_fail: rpl_exit(); rpl_fail: seg6_exit(); diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 26882e165c9e..d897faa4e9e6 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -49,6 +49,9 @@ #include #endif #include +#include +#include +#include #include @@ -928,6 +931,60 @@ static bool ipv6_hop_ra(struct sk_buff *skb, int optoff) return false; } +/* IOAM */ + +static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff) +{ + struct ioam6_trace_hdr *trace; + struct ioam6_namespace *ns; + struct ioam6_hdr *hdr; + + /* Bad alignment (must be 4n-aligned) */ + if (optoff & 3) + goto drop; + + /* Ignore if IOAM is not enabled on ingress */ + if (!__in6_dev_get(skb->dev)->cnf.ioam6_enabled) + goto ignore; + + /* Truncated Option header */ + hdr = (struct ioam6_hdr *)(skb_network_header(skb) + optoff); + if (hdr->opt_len < 2) + goto drop; + + switch (hdr->type) { + case IOAM6_TYPE_PREALLOC: + /* Truncated Pre-allocated Trace header */ + if (hdr->opt_len < 2 + sizeof(*trace)) + goto drop; + + /* Malformed Pre-allocated Trace header */ + trace = (struct ioam6_trace_hdr *)((u8 *)hdr + sizeof(*hdr)); + if (hdr->opt_len < 2 + sizeof(*trace) + trace->remlen * 4) + goto drop; + + /* Ignore if the IOAM namespace is unknown */ + ns = ioam6_namespace(ipv6_skb_net(skb), trace->namespace_id); + if (!ns) + goto ignore; + + if (!skb_valid_dst(skb)) + ip6_route_input(skb); + + ioam6_fill_trace_data(skb, ns, trace); + break; + default: + break; + } + +ignore: + return true; + +drop: + kfree_skb(skb); + return false; +} + /* Jumbo payload */ static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff) @@ -999,6 +1056,10 @@ static const struct tlvtype_proc tlvprochopopt_lst[] = { .type = IPV6_TLV_ROUTERALERT, .func = ipv6_hop_ra, }, + { + .type = IPV6_TLV_IOAM, + .func = ipv6_hop_ioam, + }, { .type = IPV6_TLV_JUMBO, .func = ipv6_hop_jumbo, diff --git a/net/ipv6/ioam6.c b/net/ipv6/ioam6.c new file mode 100644 index 000000000000..ba629e1b9408 --- /dev/null +++ b/net/ipv6/ioam6.c @@ -0,0 +1,333 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * IPv6 IOAM implementation + * + * Author: + * Justin Iurman + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +static void ioam6_ns_release(struct ioam6_namespace *ns) +{ + kfree_rcu(ns, rcu); +} + +static void ioam6_sc_release(struct ioam6_schema *sc) +{ + 
kfree_rcu(sc, rcu); +} + +static void ioam6_free_ns(void *ptr, void *arg) +{ + struct ioam6_namespace *ns = (struct ioam6_namespace *)ptr; + + if (ns) + ioam6_ns_release(ns); +} + +static void ioam6_free_sc(void *ptr, void *arg) +{ + struct ioam6_schema *sc = (struct ioam6_schema *)ptr; + + if (sc) + ioam6_sc_release(sc); +} + +static int ioam6_ns_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) +{ + const struct ioam6_namespace *ns = obj; + + return (ns->id != *(__be16 *)arg->key); +} + +static int ioam6_sc_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) +{ + const struct ioam6_schema *sc = obj; + + return (sc->id != *(u32 *)arg->key); +} + +static const struct rhashtable_params rht_ns_params = { + .key_len = sizeof(__be16), + .key_offset = offsetof(struct ioam6_namespace, id), + .head_offset = offsetof(struct ioam6_namespace, head), + .automatic_shrinking = true, + .obj_cmpfn = ioam6_ns_cmpfn, +}; + +static const struct rhashtable_params rht_sc_params = { + .key_len = sizeof(u32), + .key_offset = offsetof(struct ioam6_schema, id), + .head_offset = offsetof(struct ioam6_schema, head), + .automatic_shrinking = true, + .obj_cmpfn = ioam6_sc_cmpfn, +}; + +struct ioam6_namespace *ioam6_namespace(struct net *net, __be16 id) +{ + struct ioam6_pernet_data *nsdata = ioam6_pernet(net); + + return rhashtable_lookup_fast(&nsdata->namespaces, &id, rht_ns_params); +} + +static void __ioam6_fill_trace_data(struct sk_buff *skb, + struct ioam6_namespace *ns, + struct ioam6_trace_hdr *trace, + struct ioam6_schema *sc, + u8 sclen) +{ + struct __kernel_sock_timeval ts; + u64 raw64; + u32 raw32; + u16 raw16; + u8 *data; + u8 byte; + + data = trace->data + trace->remlen * 4 - trace->nodelen * 4 - sclen * 4; + + /* hop_lim and node_id */ + if (trace->type.bit0) { + byte = ipv6_hdr(skb)->hop_limit; + if (skb->dev) + byte--; + + raw32 = dev_net(skb->dev)->ipv6.sysctl.ioam6_id; + + *(__be32 *)data = cpu_to_be32((byte << 24) | raw32); + data += sizeof(__be32); + } + + /* ingress_if_id and egress_if_id */ + if (trace->type.bit1) { + if (!skb->dev) + raw16 = IOAM6_U16_UNAVAILABLE; + else + raw16 = (__force u16)__in6_dev_get(skb->dev)->cnf.ioam6_id; + + *(__be16 *)data = cpu_to_be16(raw16); + data += sizeof(__be16); + + if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) + raw16 = IOAM6_U16_UNAVAILABLE; + else + raw16 = (__force u16)__in6_dev_get(skb_dst(skb)->dev)->cnf.ioam6_id; + + *(__be16 *)data = cpu_to_be16(raw16); + data += sizeof(__be16); + } + + /* timestamp seconds */ + if (trace->type.bit2) { + if (!skb->tstamp) + __net_timestamp(skb); + + skb_get_new_timestamp(skb, &ts); + + *(__be32 *)data = cpu_to_be32((u32)ts.tv_sec); + data += sizeof(__be32); + } + + /* timestamp subseconds */ + if (trace->type.bit3) { + if (!skb->tstamp) + __net_timestamp(skb); + + if (!trace->type.bit2) + skb_get_new_timestamp(skb, &ts); + + *(__be32 *)data = cpu_to_be32((u32)ts.tv_usec); + data += sizeof(__be32); + } + + /* transit delay */ + if (trace->type.bit4) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* namespace data */ + if (trace->type.bit5) { + *(__be32 *)data = ns->data; + data += sizeof(__be32); + } + + /* queue depth */ + if (trace->type.bit6) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* checksum complement */ + if (trace->type.bit7) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* hop_lim and node_id (wide) */ + if (trace->type.bit8) { + byte = 
ipv6_hdr(skb)->hop_limit; + if (skb->dev) + byte--; + + raw64 = dev_net(skb->dev)->ipv6.sysctl.ioam6_id_wide; + + *(__be64 *)data = cpu_to_be64(((u64)byte << 56) | raw64); + data += sizeof(__be64); + } + + /* ingress_if_id and egress_if_id (wide) */ + if (trace->type.bit9) { + if (!skb->dev) + raw32 = IOAM6_U32_UNAVAILABLE; + else + raw32 = __in6_dev_get(skb->dev)->cnf.ioam6_id_wide; + + *(__be32 *)data = cpu_to_be32(raw32); + data += sizeof(__be32); + + if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) + raw32 = IOAM6_U32_UNAVAILABLE; + else + raw32 = __in6_dev_get(skb_dst(skb)->dev)->cnf.ioam6_id_wide; + + *(__be32 *)data = cpu_to_be32(raw32); + data += sizeof(__be32); + } + + /* namespace data (wide) */ + if (trace->type.bit10) { + *(__be64 *)data = ns->data_wide; + data += sizeof(__be64); + } + + /* buffer occupancy */ + if (trace->type.bit11) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* opaque state snapshot */ + if (trace->type.bit22) { + if (!sc) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE >> 8); + } else { + *(__be32 *)data = sc->hdr; + data += sizeof(__be32); + + memcpy(data, sc->data, sc->len); + } + } +} + +/* called with rcu_read_lock() */ +void ioam6_fill_trace_data(struct sk_buff *skb, + struct ioam6_namespace *ns, + struct ioam6_trace_hdr *trace) +{ + struct ioam6_schema *sc; + u8 sclen = 0; + + /* Skip if Overflow flag is set OR + * if an unknown type (bit 12-21) is set + */ + if (trace->overflow || + trace->type.bit12 | trace->type.bit13 | trace->type.bit14 | + trace->type.bit15 | trace->type.bit16 | trace->type.bit17 | + trace->type.bit18 | trace->type.bit19 | trace->type.bit20 | + trace->type.bit21) { + return; + } + + /* NodeLen does not include Opaque State Snapshot length. 
We need to + * take it into account if the corresponding bit is set (bit 22) and + * if the current IOAM namespace has an active schema attached to it + */ + sc = rcu_dereference(ns->schema); + if (trace->type.bit22) { + sclen = sizeof_field(struct ioam6_schema, hdr) / 4; + + if (sc) + sclen += sc->len / 4; + } + + /* If there is no space remaining, we set the Overflow flag and we + * skip without filling the trace + */ + if (!trace->remlen || trace->remlen < trace->nodelen + sclen) { + trace->overflow = 1; + return; + } + + __ioam6_fill_trace_data(skb, ns, trace, sc, sclen); + trace->remlen -= trace->nodelen + sclen; +} + +static int __net_init ioam6_net_init(struct net *net) +{ + struct ioam6_pernet_data *nsdata; + int err = -ENOMEM; + + nsdata = kzalloc(sizeof(*nsdata), GFP_KERNEL); + if (!nsdata) + goto out; + + mutex_init(&nsdata->lock); + net->ipv6.ioam6_data = nsdata; + + err = rhashtable_init(&nsdata->namespaces, &rht_ns_params); + if (err) + goto free_nsdata; + + err = rhashtable_init(&nsdata->schemas, &rht_sc_params); + if (err) + goto free_rht_ns; + +out: + return err; +free_rht_ns: + rhashtable_destroy(&nsdata->namespaces); +free_nsdata: + kfree(nsdata); + net->ipv6.ioam6_data = NULL; + goto out; +} + +static void __net_exit ioam6_net_exit(struct net *net) +{ + struct ioam6_pernet_data *nsdata = ioam6_pernet(net); + + rhashtable_free_and_destroy(&nsdata->namespaces, ioam6_free_ns, NULL); + rhashtable_free_and_destroy(&nsdata->schemas, ioam6_free_sc, NULL); + + kfree(nsdata); +} + +static struct pernet_operations ioam6_net_ops = { + .init = ioam6_net_init, + .exit = ioam6_net_exit, +}; + +int __init ioam6_init(void) +{ + int err = register_pernet_subsys(&ioam6_net_ops); + + if (err) + return err; + + pr_info("In-situ OAM (IOAM) with IPv6\n"); + return 0; +} + +void ioam6_exit(void) +{ + unregister_pernet_subsys(&ioam6_net_ops); +} diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index d7cf26f730d7..d53dd142bf87 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -21,6 +21,7 @@ #ifdef CONFIG_NETLABEL #include #endif +#include static int two = 2; static int three = 3; @@ -28,6 +29,8 @@ static int flowlabel_reflect_max = 0x7; static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; static u32 rt6_multipath_hash_fields_all_mask = FIB_MULTIPATH_HASH_FIELD_ALL_MASK; +static u32 ioam6_id_max = IOAM6_DEFAULT_ID; +static u64 ioam6_id_wide_max = IOAM6_DEFAULT_ID_WIDE; static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -196,6 +199,22 @@ static struct ctl_table ipv6_table_template[] = { .extra1 = SYSCTL_ZERO, .extra2 = &two, }, + { + .procname = "ioam6_id", + .data = &init_net.ipv6.sysctl.ioam6_id, + .maxlen = sizeof(u32), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra2 = &ioam6_id_max, + }, + { + .procname = "ioam6_id_wide", + .data = &init_net.ipv6.sysctl.ioam6_id_wide, + .maxlen = sizeof(u64), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra2 = &ioam6_id_wide_max, + }, { } }; -- cgit v1.2.3-71-gd317 From 8c6f6fa6772696be0c047a711858084b38763728 Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Tue, 20 Jul 2021 21:42:58 +0200 Subject: ipv6: ioam: IOAM Generic Netlink API Add Generic Netlink commands to allow userspace to configure IOAM namespaces and schemas. The target is iproute2 and the patch is ready. It will be posted as soon as this patchset is merged. 
Here is an overview: $ ip ioam Usage: ip ioam { COMMAND | help } ip ioam namespace show ip ioam namespace add ID [ data DATA32 ] [ wide DATA64 ] ip ioam namespace del ID ip ioam schema show ip ioam schema add ID DATA ip ioam schema del ID ip ioam namespace set ID schema { ID | none } Signed-off-by: Justin Iurman Signed-off-by: David S. Miller --- include/linux/ioam6_genl.h | 13 + include/uapi/linux/ioam6_genl.h | 52 ++++ net/ipv6/ioam6.c | 561 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 624 insertions(+), 2 deletions(-) create mode 100644 include/linux/ioam6_genl.h create mode 100644 include/uapi/linux/ioam6_genl.h (limited to 'include/uapi') diff --git a/include/linux/ioam6_genl.h b/include/linux/ioam6_genl.h new file mode 100644 index 000000000000..176e67919de3 --- /dev/null +++ b/include/linux/ioam6_genl.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * IPv6 IOAM Generic Netlink API + * + * Author: + * Justin Iurman + */ +#ifndef _LINUX_IOAM6_GENL_H +#define _LINUX_IOAM6_GENL_H + +#include + +#endif /* _LINUX_IOAM6_GENL_H */ diff --git a/include/uapi/linux/ioam6_genl.h b/include/uapi/linux/ioam6_genl.h new file mode 100644 index 000000000000..ca4b22833754 --- /dev/null +++ b/include/uapi/linux/ioam6_genl.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * IPv6 IOAM Generic Netlink API + * + * Author: + * Justin Iurman + */ + +#ifndef _UAPI_LINUX_IOAM6_GENL_H +#define _UAPI_LINUX_IOAM6_GENL_H + +#define IOAM6_GENL_NAME "IOAM6" +#define IOAM6_GENL_VERSION 0x1 + +enum { + IOAM6_ATTR_UNSPEC, + + IOAM6_ATTR_NS_ID, /* u16 */ + IOAM6_ATTR_NS_DATA, /* u32 */ + IOAM6_ATTR_NS_DATA_WIDE,/* u64 */ + +#define IOAM6_MAX_SCHEMA_DATA_LEN (255 * 4) + IOAM6_ATTR_SC_ID, /* u32 */ + IOAM6_ATTR_SC_DATA, /* Binary */ + IOAM6_ATTR_SC_NONE, /* Flag */ + + IOAM6_ATTR_PAD, + + __IOAM6_ATTR_MAX, +}; + +#define IOAM6_ATTR_MAX (__IOAM6_ATTR_MAX - 1) + +enum { + IOAM6_CMD_UNSPEC, + + IOAM6_CMD_ADD_NAMESPACE, + IOAM6_CMD_DEL_NAMESPACE, + IOAM6_CMD_DUMP_NAMESPACES, + + IOAM6_CMD_ADD_SCHEMA, + IOAM6_CMD_DEL_SCHEMA, + IOAM6_CMD_DUMP_SCHEMAS, + + IOAM6_CMD_NS_SET_SCHEMA, + + __IOAM6_CMD_MAX, +}; + +#define IOAM6_CMD_MAX (__IOAM6_CMD_MAX - 1) + +#endif /* _UAPI_LINUX_IOAM6_GENL_H */ diff --git a/net/ipv6/ioam6.c b/net/ipv6/ioam6.c index ba629e1b9408..ba59671f32b8 100644 --- a/net/ipv6/ioam6.c +++ b/net/ipv6/ioam6.c @@ -11,9 +11,11 @@ #include #include #include +#include #include #include +#include #include static void ioam6_ns_release(struct ioam6_namespace *ns) @@ -72,6 +74,552 @@ static const struct rhashtable_params rht_sc_params = { .obj_cmpfn = ioam6_sc_cmpfn, }; +static struct genl_family ioam6_genl_family; + +static const struct nla_policy ioam6_genl_policy_addns[] = { + [IOAM6_ATTR_NS_ID] = { .type = NLA_U16 }, + [IOAM6_ATTR_NS_DATA] = { .type = NLA_U32 }, + [IOAM6_ATTR_NS_DATA_WIDE] = { .type = NLA_U64 }, +}; + +static const struct nla_policy ioam6_genl_policy_delns[] = { + [IOAM6_ATTR_NS_ID] = { .type = NLA_U16 }, +}; + +static const struct nla_policy ioam6_genl_policy_addsc[] = { + [IOAM6_ATTR_SC_ID] = { .type = NLA_U32 }, + [IOAM6_ATTR_SC_DATA] = { .type = NLA_BINARY, + .len = IOAM6_MAX_SCHEMA_DATA_LEN }, +}; + +static const struct nla_policy ioam6_genl_policy_delsc[] = { + [IOAM6_ATTR_SC_ID] = { .type = NLA_U32 }, +}; + +static const struct nla_policy ioam6_genl_policy_ns_sc[] = { + [IOAM6_ATTR_NS_ID] = { .type = NLA_U16 }, + [IOAM6_ATTR_SC_ID] = { .type = NLA_U32 }, + [IOAM6_ATTR_SC_NONE] = { .type = NLA_FLAG }, +}; + +static int 
ioam6_genl_addns(struct sk_buff *skb, struct genl_info *info) +{ + struct ioam6_pernet_data *nsdata; + struct ioam6_namespace *ns; + u64 data64; + u32 data32; + __be16 id; + int err; + + if (!info->attrs[IOAM6_ATTR_NS_ID]) + return -EINVAL; + + id = cpu_to_be16(nla_get_u16(info->attrs[IOAM6_ATTR_NS_ID])); + nsdata = ioam6_pernet(genl_info_net(info)); + + mutex_lock(&nsdata->lock); + + ns = rhashtable_lookup_fast(&nsdata->namespaces, &id, rht_ns_params); + if (ns) { + err = -EEXIST; + goto out_unlock; + } + + ns = kzalloc(sizeof(*ns), GFP_KERNEL); + if (!ns) { + err = -ENOMEM; + goto out_unlock; + } + + ns->id = id; + + if (!info->attrs[IOAM6_ATTR_NS_DATA]) + data32 = IOAM6_U32_UNAVAILABLE; + else + data32 = nla_get_u32(info->attrs[IOAM6_ATTR_NS_DATA]); + + if (!info->attrs[IOAM6_ATTR_NS_DATA_WIDE]) + data64 = IOAM6_U64_UNAVAILABLE; + else + data64 = nla_get_u64(info->attrs[IOAM6_ATTR_NS_DATA_WIDE]); + + ns->data = cpu_to_be32(data32); + ns->data_wide = cpu_to_be64(data64); + + err = rhashtable_lookup_insert_fast(&nsdata->namespaces, &ns->head, + rht_ns_params); + if (err) + kfree(ns); + +out_unlock: + mutex_unlock(&nsdata->lock); + return err; +} + +static int ioam6_genl_delns(struct sk_buff *skb, struct genl_info *info) +{ + struct ioam6_pernet_data *nsdata; + struct ioam6_namespace *ns; + struct ioam6_schema *sc; + __be16 id; + int err; + + if (!info->attrs[IOAM6_ATTR_NS_ID]) + return -EINVAL; + + id = cpu_to_be16(nla_get_u16(info->attrs[IOAM6_ATTR_NS_ID])); + nsdata = ioam6_pernet(genl_info_net(info)); + + mutex_lock(&nsdata->lock); + + ns = rhashtable_lookup_fast(&nsdata->namespaces, &id, rht_ns_params); + if (!ns) { + err = -ENOENT; + goto out_unlock; + } + + sc = rcu_dereference_protected(ns->schema, + lockdep_is_held(&nsdata->lock)); + + err = rhashtable_remove_fast(&nsdata->namespaces, &ns->head, + rht_ns_params); + if (err) + goto out_unlock; + + if (sc) + rcu_assign_pointer(sc->ns, NULL); + + ioam6_ns_release(ns); + +out_unlock: + mutex_unlock(&nsdata->lock); + return err; +} + +static int __ioam6_genl_dumpns_element(struct ioam6_namespace *ns, + u32 portid, + u32 seq, + u32 flags, + struct sk_buff *skb, + u8 cmd) +{ + struct ioam6_schema *sc; + u64 data64; + u32 data32; + void *hdr; + + hdr = genlmsg_put(skb, portid, seq, &ioam6_genl_family, flags, cmd); + if (!hdr) + return -ENOMEM; + + data32 = be32_to_cpu(ns->data); + data64 = be64_to_cpu(ns->data_wide); + + if (nla_put_u16(skb, IOAM6_ATTR_NS_ID, be16_to_cpu(ns->id)) || + (data32 != IOAM6_U32_UNAVAILABLE && + nla_put_u32(skb, IOAM6_ATTR_NS_DATA, data32)) || + (data64 != IOAM6_U64_UNAVAILABLE && + nla_put_u64_64bit(skb, IOAM6_ATTR_NS_DATA_WIDE, + data64, IOAM6_ATTR_PAD))) + goto nla_put_failure; + + rcu_read_lock(); + + sc = rcu_dereference(ns->schema); + if (sc && nla_put_u32(skb, IOAM6_ATTR_SC_ID, sc->id)) { + rcu_read_unlock(); + goto nla_put_failure; + } + + rcu_read_unlock(); + + genlmsg_end(skb, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ioam6_genl_dumpns_start(struct netlink_callback *cb) +{ + struct ioam6_pernet_data *nsdata = ioam6_pernet(sock_net(cb->skb->sk)); + struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0]; + + if (!iter) { + iter = kmalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + cb->args[0] = (long)iter; + } + + rhashtable_walk_enter(&nsdata->namespaces, iter); + + return 0; +} + +static int ioam6_genl_dumpns_done(struct netlink_callback *cb) +{ + struct rhashtable_iter *iter = (struct rhashtable_iter 
*)cb->args[0]; + + rhashtable_walk_exit(iter); + kfree(iter); + + return 0; +} + +static int ioam6_genl_dumpns(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct rhashtable_iter *iter; + struct ioam6_namespace *ns; + int err; + + iter = (struct rhashtable_iter *)cb->args[0]; + rhashtable_walk_start(iter); + + for (;;) { + ns = rhashtable_walk_next(iter); + + if (IS_ERR(ns)) { + if (PTR_ERR(ns) == -EAGAIN) + continue; + err = PTR_ERR(ns); + goto done; + } else if (!ns) { + break; + } + + err = __ioam6_genl_dumpns_element(ns, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, + skb, + IOAM6_CMD_DUMP_NAMESPACES); + if (err) + goto done; + } + + err = skb->len; + +done: + rhashtable_walk_stop(iter); + return err; +} + +static int ioam6_genl_addsc(struct sk_buff *skb, struct genl_info *info) +{ + struct ioam6_pernet_data *nsdata; + int len, len_aligned, err; + struct ioam6_schema *sc; + u32 id; + + if (!info->attrs[IOAM6_ATTR_SC_ID] || !info->attrs[IOAM6_ATTR_SC_DATA]) + return -EINVAL; + + id = nla_get_u32(info->attrs[IOAM6_ATTR_SC_ID]); + nsdata = ioam6_pernet(genl_info_net(info)); + + mutex_lock(&nsdata->lock); + + sc = rhashtable_lookup_fast(&nsdata->schemas, &id, rht_sc_params); + if (sc) { + err = -EEXIST; + goto out_unlock; + } + + len = nla_len(info->attrs[IOAM6_ATTR_SC_DATA]); + len_aligned = ALIGN(len, 4); + + sc = kzalloc(sizeof(*sc) + len_aligned, GFP_KERNEL); + if (!sc) { + err = -ENOMEM; + goto out_unlock; + } + + sc->id = id; + sc->len = len_aligned; + sc->hdr = cpu_to_be32(sc->id | ((u8)(sc->len / 4) << 24)); + nla_memcpy(sc->data, info->attrs[IOAM6_ATTR_SC_DATA], len); + + err = rhashtable_lookup_insert_fast(&nsdata->schemas, &sc->head, + rht_sc_params); + if (err) + goto free_sc; + +out_unlock: + mutex_unlock(&nsdata->lock); + return err; +free_sc: + kfree(sc); + goto out_unlock; +} + +static int ioam6_genl_delsc(struct sk_buff *skb, struct genl_info *info) +{ + struct ioam6_pernet_data *nsdata; + struct ioam6_namespace *ns; + struct ioam6_schema *sc; + int err; + u32 id; + + if (!info->attrs[IOAM6_ATTR_SC_ID]) + return -EINVAL; + + id = nla_get_u32(info->attrs[IOAM6_ATTR_SC_ID]); + nsdata = ioam6_pernet(genl_info_net(info)); + + mutex_lock(&nsdata->lock); + + sc = rhashtable_lookup_fast(&nsdata->schemas, &id, rht_sc_params); + if (!sc) { + err = -ENOENT; + goto out_unlock; + } + + ns = rcu_dereference_protected(sc->ns, lockdep_is_held(&nsdata->lock)); + + err = rhashtable_remove_fast(&nsdata->schemas, &sc->head, + rht_sc_params); + if (err) + goto out_unlock; + + if (ns) + rcu_assign_pointer(ns->schema, NULL); + + ioam6_sc_release(sc); + +out_unlock: + mutex_unlock(&nsdata->lock); + return err; +} + +static int __ioam6_genl_dumpsc_element(struct ioam6_schema *sc, + u32 portid, u32 seq, u32 flags, + struct sk_buff *skb, u8 cmd) +{ + struct ioam6_namespace *ns; + void *hdr; + + hdr = genlmsg_put(skb, portid, seq, &ioam6_genl_family, flags, cmd); + if (!hdr) + return -ENOMEM; + + if (nla_put_u32(skb, IOAM6_ATTR_SC_ID, sc->id) || + nla_put(skb, IOAM6_ATTR_SC_DATA, sc->len, sc->data)) + goto nla_put_failure; + + rcu_read_lock(); + + ns = rcu_dereference(sc->ns); + if (ns && nla_put_u16(skb, IOAM6_ATTR_NS_ID, be16_to_cpu(ns->id))) { + rcu_read_unlock(); + goto nla_put_failure; + } + + rcu_read_unlock(); + + genlmsg_end(skb, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ioam6_genl_dumpsc_start(struct netlink_callback *cb) +{ + struct ioam6_pernet_data *nsdata = ioam6_pernet(sock_net(cb->skb->sk)); + 
struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0]; + + if (!iter) { + iter = kmalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + cb->args[0] = (long)iter; + } + + rhashtable_walk_enter(&nsdata->schemas, iter); + + return 0; +} + +static int ioam6_genl_dumpsc_done(struct netlink_callback *cb) +{ + struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0]; + + rhashtable_walk_exit(iter); + kfree(iter); + + return 0; +} + +static int ioam6_genl_dumpsc(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct rhashtable_iter *iter; + struct ioam6_schema *sc; + int err; + + iter = (struct rhashtable_iter *)cb->args[0]; + rhashtable_walk_start(iter); + + for (;;) { + sc = rhashtable_walk_next(iter); + + if (IS_ERR(sc)) { + if (PTR_ERR(sc) == -EAGAIN) + continue; + err = PTR_ERR(sc); + goto done; + } else if (!sc) { + break; + } + + err = __ioam6_genl_dumpsc_element(sc, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, + skb, + IOAM6_CMD_DUMP_SCHEMAS); + if (err) + goto done; + } + + err = skb->len; + +done: + rhashtable_walk_stop(iter); + return err; +} + +static int ioam6_genl_ns_set_schema(struct sk_buff *skb, struct genl_info *info) +{ + struct ioam6_namespace *ns, *ns_ref; + struct ioam6_schema *sc, *sc_ref; + struct ioam6_pernet_data *nsdata; + __be16 ns_id; + u32 sc_id; + int err; + + if (!info->attrs[IOAM6_ATTR_NS_ID] || + (!info->attrs[IOAM6_ATTR_SC_ID] && + !info->attrs[IOAM6_ATTR_SC_NONE])) + return -EINVAL; + + ns_id = cpu_to_be16(nla_get_u16(info->attrs[IOAM6_ATTR_NS_ID])); + nsdata = ioam6_pernet(genl_info_net(info)); + + mutex_lock(&nsdata->lock); + + ns = rhashtable_lookup_fast(&nsdata->namespaces, &ns_id, rht_ns_params); + if (!ns) { + err = -ENOENT; + goto out_unlock; + } + + if (info->attrs[IOAM6_ATTR_SC_NONE]) { + sc = NULL; + } else { + sc_id = nla_get_u32(info->attrs[IOAM6_ATTR_SC_ID]); + sc = rhashtable_lookup_fast(&nsdata->schemas, &sc_id, + rht_sc_params); + if (!sc) { + err = -ENOENT; + goto out_unlock; + } + } + + sc_ref = rcu_dereference_protected(ns->schema, + lockdep_is_held(&nsdata->lock)); + if (sc_ref) + rcu_assign_pointer(sc_ref->ns, NULL); + rcu_assign_pointer(ns->schema, sc); + + if (sc) { + ns_ref = rcu_dereference_protected(sc->ns, + lockdep_is_held(&nsdata->lock)); + if (ns_ref) + rcu_assign_pointer(ns_ref->schema, NULL); + rcu_assign_pointer(sc->ns, ns); + } + + err = 0; + +out_unlock: + mutex_unlock(&nsdata->lock); + return err; +} + +static const struct genl_ops ioam6_genl_ops[] = { + { + .cmd = IOAM6_CMD_ADD_NAMESPACE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = ioam6_genl_addns, + .flags = GENL_ADMIN_PERM, + .policy = ioam6_genl_policy_addns, + .maxattr = ARRAY_SIZE(ioam6_genl_policy_addns) - 1, + }, + { + .cmd = IOAM6_CMD_DEL_NAMESPACE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = ioam6_genl_delns, + .flags = GENL_ADMIN_PERM, + .policy = ioam6_genl_policy_delns, + .maxattr = ARRAY_SIZE(ioam6_genl_policy_delns) - 1, + }, + { + .cmd = IOAM6_CMD_DUMP_NAMESPACES, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .start = ioam6_genl_dumpns_start, + .dumpit = ioam6_genl_dumpns, + .done = ioam6_genl_dumpns_done, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = IOAM6_CMD_ADD_SCHEMA, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = ioam6_genl_addsc, + .flags = GENL_ADMIN_PERM, + .policy = ioam6_genl_policy_addsc, + .maxattr = ARRAY_SIZE(ioam6_genl_policy_addsc) - 1, + }, + { + .cmd = 
IOAM6_CMD_DEL_SCHEMA, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = ioam6_genl_delsc, + .flags = GENL_ADMIN_PERM, + .policy = ioam6_genl_policy_delsc, + .maxattr = ARRAY_SIZE(ioam6_genl_policy_delsc) - 1, + }, + { + .cmd = IOAM6_CMD_DUMP_SCHEMAS, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .start = ioam6_genl_dumpsc_start, + .dumpit = ioam6_genl_dumpsc, + .done = ioam6_genl_dumpsc_done, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = IOAM6_CMD_NS_SET_SCHEMA, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = ioam6_genl_ns_set_schema, + .flags = GENL_ADMIN_PERM, + .policy = ioam6_genl_policy_ns_sc, + .maxattr = ARRAY_SIZE(ioam6_genl_policy_ns_sc) - 1, + }, +}; + +static struct genl_family ioam6_genl_family __ro_after_init = { + .name = IOAM6_GENL_NAME, + .version = IOAM6_GENL_VERSION, + .netnsok = true, + .parallel_ops = true, + .ops = ioam6_genl_ops, + .n_ops = ARRAY_SIZE(ioam6_genl_ops), + .module = THIS_MODULE, +}; + struct ioam6_namespace *ioam6_namespace(struct net *net, __be16 id) { struct ioam6_pernet_data *nsdata = ioam6_pernet(net); @@ -319,15 +867,24 @@ static struct pernet_operations ioam6_net_ops = { int __init ioam6_init(void) { int err = register_pernet_subsys(&ioam6_net_ops); + if (err) + goto out; + err = genl_register_family(&ioam6_genl_family); if (err) - return err; + goto out_unregister_pernet_subsys; pr_info("In-situ OAM (IOAM) with IPv6\n"); - return 0; + +out: + return err; +out_unregister_pernet_subsys: + unregister_pernet_subsys(&ioam6_net_ops); + goto out; } void ioam6_exit(void) { + genl_unregister_family(&ioam6_genl_family); unregister_pernet_subsys(&ioam6_net_ops); } -- cgit v1.2.3-71-gd317 From 3edede08ff37c6a9370510508d5eeb54890baf47 Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Tue, 20 Jul 2021 21:42:59 +0200 Subject: ipv6: ioam: Support for IOAM injection with lwtunnels Add support for the IOAM inline insertion (only for the host-to-host use case) which is per-route configured with lightweight tunnels. The target is iproute2 and the patch is ready. It will be posted as soon as this patchset is merged. Here is an overview: $ ip -6 ro ad fc00::1/128 encap ioam6 trace type 0x800000 ns 1 size 12 dev eth0 This example configures an IOAM Pre-allocated Trace option attached to the fc00::1/128 prefix. The IOAM namespace (ns) is 1, the size of the pre-allocated trace data block is 12 octets (size) and only the first IOAM data (bit 0: hop_limit + node id) is included in the trace (type) represented as a bitfield. The reason why the in-transit (IPv6-in-IPv6 encapsulation) use case is not implemented is explained on the patchset cover. Signed-off-by: Justin Iurman Signed-off-by: David S. 
Miller --- include/linux/ioam6_iptunnel.h | 13 ++ include/net/ioam6.h | 3 + include/uapi/linux/ioam6.h | 1 + include/uapi/linux/ioam6_iptunnel.h | 20 +++ include/uapi/linux/lwtunnel.h | 1 + net/core/lwtunnel.c | 2 + net/ipv6/Kconfig | 11 ++ net/ipv6/Makefile | 1 + net/ipv6/ioam6.c | 44 ++++-- net/ipv6/ioam6_iptunnel.c | 274 ++++++++++++++++++++++++++++++++++++ 10 files changed, 358 insertions(+), 12 deletions(-) create mode 100644 include/linux/ioam6_iptunnel.h create mode 100644 include/uapi/linux/ioam6_iptunnel.h create mode 100644 net/ipv6/ioam6_iptunnel.c (limited to 'include/uapi') diff --git a/include/linux/ioam6_iptunnel.h b/include/linux/ioam6_iptunnel.h new file mode 100644 index 000000000000..07d9dfedd29d --- /dev/null +++ b/include/linux/ioam6_iptunnel.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * IPv6 IOAM Lightweight Tunnel API + * + * Author: + * Justin Iurman + */ +#ifndef _LINUX_IOAM6_IPTUNNEL_H +#define _LINUX_IOAM6_IPTUNNEL_H + +#include + +#endif /* _LINUX_IOAM6_IPTUNNEL_H */ diff --git a/include/net/ioam6.h b/include/net/ioam6.h index 772b91ee2e87..3c2993bc48c8 100644 --- a/include/net/ioam6.h +++ b/include/net/ioam6.h @@ -61,4 +61,7 @@ void ioam6_fill_trace_data(struct sk_buff *skb, int ioam6_init(void); void ioam6_exit(void); +int ioam6_iptunnel_init(void); +void ioam6_iptunnel_exit(void); + #endif /* _NET_IOAM6_H */ diff --git a/include/uapi/linux/ioam6.h b/include/uapi/linux/ioam6.h index 23ba6e85582f..ac4de376f0ce 100644 --- a/include/uapi/linux/ioam6.h +++ b/include/uapi/linux/ioam6.h @@ -126,6 +126,7 @@ struct ioam6_trace_hdr { #error "Please fix " #endif +#define IOAM6_TRACE_DATA_SIZE_MAX 244 __u8 data[0]; } __attribute__((packed)); diff --git a/include/uapi/linux/ioam6_iptunnel.h b/include/uapi/linux/ioam6_iptunnel.h new file mode 100644 index 000000000000..bae14636a8c8 --- /dev/null +++ b/include/uapi/linux/ioam6_iptunnel.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * IPv6 IOAM Lightweight Tunnel API + * + * Author: + * Justin Iurman + */ + +#ifndef _UAPI_LINUX_IOAM6_IPTUNNEL_H +#define _UAPI_LINUX_IOAM6_IPTUNNEL_H + +enum { + IOAM6_IPTUNNEL_UNSPEC, + IOAM6_IPTUNNEL_TRACE, /* struct ioam6_trace_hdr */ + __IOAM6_IPTUNNEL_MAX, +}; + +#define IOAM6_IPTUNNEL_MAX (__IOAM6_IPTUNNEL_MAX - 1) + +#endif /* _UAPI_LINUX_IOAM6_IPTUNNEL_H */ diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index 568a4303ccce..2e206919125c 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -14,6 +14,7 @@ enum lwtunnel_encap_types { LWTUNNEL_ENCAP_BPF, LWTUNNEL_ENCAP_SEG6_LOCAL, LWTUNNEL_ENCAP_RPL, + LWTUNNEL_ENCAP_IOAM6, __LWTUNNEL_ENCAP_MAX, }; diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 8ec7d13d2860..d0ae987d2de9 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -43,6 +43,8 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) return "SEG6LOCAL"; case LWTUNNEL_ENCAP_RPL: return "RPL"; + case LWTUNNEL_ENCAP_IOAM6: + return "IOAM6"; case LWTUNNEL_ENCAP_IP6: case LWTUNNEL_ENCAP_IP: case LWTUNNEL_ENCAP_NONE: diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 747f56e0c636..e504204bca92 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -328,4 +328,15 @@ config IPV6_RPL_LWTUNNEL If unsure, say N. 
+config IPV6_IOAM6_LWTUNNEL + bool "IPv6: IOAM Pre-allocated Trace insertion support" + depends on IPV6 + select LWTUNNEL + help + Support for the inline insertion of IOAM Pre-allocated + Trace Header (only on locally generated packets), using + the lightweight tunnels mechanism. + + If unsure, say N. + endif # IPV6 diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index b7ef10d417d6..1bc7e143217b 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -27,6 +27,7 @@ ipv6-$(CONFIG_NETLABEL) += calipso.o ipv6-$(CONFIG_IPV6_SEG6_LWTUNNEL) += seg6_iptunnel.o seg6_local.o ipv6-$(CONFIG_IPV6_SEG6_HMAC) += seg6_hmac.o ipv6-$(CONFIG_IPV6_RPL_LWTUNNEL) += rpl_iptunnel.o +ipv6-$(CONFIG_IPV6_IOAM6_LWTUNNEL) += ioam6_iptunnel.o ipv6-objs += $(ipv6-y) diff --git a/net/ipv6/ioam6.c b/net/ipv6/ioam6.c index ba59671f32b8..5e8961004832 100644 --- a/net/ipv6/ioam6.c +++ b/net/ipv6/ioam6.c @@ -648,7 +648,7 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, if (skb->dev) byte--; - raw32 = dev_net(skb->dev)->ipv6.sysctl.ioam6_id; + raw32 = dev_net(skb_dst(skb)->dev)->ipv6.sysctl.ioam6_id; *(__be32 *)data = cpu_to_be32((byte << 24) | raw32); data += sizeof(__be32); @@ -675,24 +675,31 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, /* timestamp seconds */ if (trace->type.bit2) { - if (!skb->tstamp) - __net_timestamp(skb); - - skb_get_new_timestamp(skb, &ts); + if (!skb->dev) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + } else { + if (!skb->tstamp) + __net_timestamp(skb); - *(__be32 *)data = cpu_to_be32((u32)ts.tv_sec); + skb_get_new_timestamp(skb, &ts); + *(__be32 *)data = cpu_to_be32((u32)ts.tv_sec); + } data += sizeof(__be32); } /* timestamp subseconds */ if (trace->type.bit3) { - if (!skb->tstamp) - __net_timestamp(skb); + if (!skb->dev) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + } else { + if (!skb->tstamp) + __net_timestamp(skb); - if (!trace->type.bit2) - skb_get_new_timestamp(skb, &ts); + if (!trace->type.bit2) + skb_get_new_timestamp(skb, &ts); - *(__be32 *)data = cpu_to_be32((u32)ts.tv_usec); + *(__be32 *)data = cpu_to_be32((u32)ts.tv_usec); + } data += sizeof(__be32); } @@ -726,7 +733,7 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, if (skb->dev) byte--; - raw64 = dev_net(skb->dev)->ipv6.sysctl.ioam6_id_wide; + raw64 = dev_net(skb_dst(skb)->dev)->ipv6.sysctl.ioam6_id_wide; *(__be64 *)data = cpu_to_be64(((u64)byte << 56) | raw64); data += sizeof(__be64); @@ -874,10 +881,20 @@ int __init ioam6_init(void) if (err) goto out_unregister_pernet_subsys; +#ifdef CONFIG_IPV6_IOAM6_LWTUNNEL + err = ioam6_iptunnel_init(); + if (err) + goto out_unregister_genl; +#endif + pr_info("In-situ OAM (IOAM) with IPv6\n"); out: return err; +#ifdef CONFIG_IPV6_IOAM6_LWTUNNEL +out_unregister_genl: + genl_unregister_family(&ioam6_genl_family); +#endif out_unregister_pernet_subsys: unregister_pernet_subsys(&ioam6_net_ops); goto out; @@ -885,6 +902,9 @@ out_unregister_pernet_subsys: void ioam6_exit(void) { +#ifdef CONFIG_IPV6_IOAM6_LWTUNNEL + ioam6_iptunnel_exit(); +#endif genl_unregister_family(&ioam6_genl_family); unregister_pernet_subsys(&ioam6_net_ops); } diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c new file mode 100644 index 000000000000..f9ee04541c17 --- /dev/null +++ b/net/ipv6/ioam6_iptunnel.c @@ -0,0 +1,274 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * IPv6 IOAM Lightweight Tunnel implementation + * + * Author: + * Justin Iurman + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include 
+#include +#include + +#define IOAM6_MASK_SHORT_FIELDS 0xff100000 +#define IOAM6_MASK_WIDE_FIELDS 0xe00000 + +struct ioam6_lwt_encap { + struct ipv6_hopopt_hdr eh; + u8 pad[2]; /* 2-octet padding for 4n-alignment */ + struct ioam6_hdr ioamh; + struct ioam6_trace_hdr traceh; +} __packed; + +struct ioam6_lwt { + struct ioam6_lwt_encap tuninfo; +}; + +static struct ioam6_lwt *ioam6_lwt_state(struct lwtunnel_state *lwt) +{ + return (struct ioam6_lwt *)lwt->data; +} + +static struct ioam6_lwt_encap *ioam6_lwt_info(struct lwtunnel_state *lwt) +{ + return &ioam6_lwt_state(lwt)->tuninfo; +} + +static struct ioam6_trace_hdr *ioam6_trace(struct lwtunnel_state *lwt) +{ + return &(ioam6_lwt_state(lwt)->tuninfo.traceh); +} + +static const struct nla_policy ioam6_iptunnel_policy[IOAM6_IPTUNNEL_MAX + 1] = { + [IOAM6_IPTUNNEL_TRACE] = NLA_POLICY_EXACT_LEN(sizeof(struct ioam6_trace_hdr)), +}; + +static int nla_put_ioam6_trace(struct sk_buff *skb, int attrtype, + struct ioam6_trace_hdr *trace) +{ + struct ioam6_trace_hdr *data; + struct nlattr *nla; + int len; + + len = sizeof(*trace); + + nla = nla_reserve(skb, attrtype, len); + if (!nla) + return -EMSGSIZE; + + data = nla_data(nla); + memcpy(data, trace, len); + + return 0; +} + +static bool ioam6_validate_trace_hdr(struct ioam6_trace_hdr *trace) +{ + u32 fields; + + if (!trace->type_be32 || !trace->remlen || + trace->remlen > IOAM6_TRACE_DATA_SIZE_MAX / 4) + return false; + + trace->nodelen = 0; + fields = be32_to_cpu(trace->type_be32); + + trace->nodelen += hweight32(fields & IOAM6_MASK_SHORT_FIELDS) + * (sizeof(__be32) / 4); + trace->nodelen += hweight32(fields & IOAM6_MASK_WIDE_FIELDS) + * (sizeof(__be64) / 4); + + return true; +} + +static int ioam6_build_state(struct net *net, struct nlattr *nla, + unsigned int family, const void *cfg, + struct lwtunnel_state **ts, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IOAM6_IPTUNNEL_MAX + 1]; + struct ioam6_lwt_encap *tuninfo; + struct ioam6_trace_hdr *trace; + struct lwtunnel_state *s; + int len_aligned; + int len, err; + + if (family != AF_INET6) + return -EINVAL; + + err = nla_parse_nested(tb, IOAM6_IPTUNNEL_MAX, nla, + ioam6_iptunnel_policy, extack); + if (err < 0) + return err; + + if (!tb[IOAM6_IPTUNNEL_TRACE]) { + NL_SET_ERR_MSG(extack, "missing trace"); + return -EINVAL; + } + + trace = nla_data(tb[IOAM6_IPTUNNEL_TRACE]); + if (!ioam6_validate_trace_hdr(trace)) { + NL_SET_ERR_MSG_ATTR(extack, tb[IOAM6_IPTUNNEL_TRACE], + "invalid trace validation"); + return -EINVAL; + } + + len = sizeof(*tuninfo) + trace->remlen * 4; + len_aligned = ALIGN(len, 8); + + s = lwtunnel_state_alloc(len_aligned); + if (!s) + return -ENOMEM; + + tuninfo = ioam6_lwt_info(s); + tuninfo->eh.hdrlen = (len_aligned >> 3) - 1; + tuninfo->pad[0] = IPV6_TLV_PADN; + tuninfo->ioamh.type = IOAM6_TYPE_PREALLOC; + tuninfo->ioamh.opt_type = IPV6_TLV_IOAM; + tuninfo->ioamh.opt_len = sizeof(tuninfo->ioamh) - 2 + sizeof(*trace) + + trace->remlen * 4; + + memcpy(&tuninfo->traceh, trace, sizeof(*trace)); + + len = len_aligned - len; + if (len == 1) { + tuninfo->traceh.data[trace->remlen * 4] = IPV6_TLV_PAD1; + } else if (len > 0) { + tuninfo->traceh.data[trace->remlen * 4] = IPV6_TLV_PADN; + tuninfo->traceh.data[trace->remlen * 4 + 1] = len - 2; + } + + s->type = LWTUNNEL_ENCAP_IOAM6; + s->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; + + *ts = s; + + return 0; +} + +static int ioam6_do_inline(struct sk_buff *skb, struct ioam6_lwt_encap *tuninfo) +{ + struct ioam6_trace_hdr *trace; + struct ipv6hdr *oldhdr, *hdr; + struct ioam6_namespace 
*ns; + int hdrlen, err; + + hdrlen = (tuninfo->eh.hdrlen + 1) << 3; + + err = skb_cow_head(skb, hdrlen + skb->mac_len); + if (unlikely(err)) + return err; + + oldhdr = ipv6_hdr(skb); + skb_pull(skb, sizeof(*oldhdr)); + skb_postpull_rcsum(skb, skb_network_header(skb), sizeof(*oldhdr)); + + skb_push(skb, sizeof(*oldhdr) + hdrlen); + skb_reset_network_header(skb); + skb_mac_header_rebuild(skb); + + hdr = ipv6_hdr(skb); + memmove(hdr, oldhdr, sizeof(*oldhdr)); + tuninfo->eh.nexthdr = hdr->nexthdr; + + skb_set_transport_header(skb, sizeof(*hdr)); + skb_postpush_rcsum(skb, hdr, sizeof(*hdr) + hdrlen); + + memcpy(skb_transport_header(skb), (u8 *)tuninfo, hdrlen); + + hdr->nexthdr = NEXTHDR_HOP; + hdr->payload_len = cpu_to_be16(skb->len - sizeof(*hdr)); + + trace = (struct ioam6_trace_hdr *)(skb_transport_header(skb) + + sizeof(struct ipv6_hopopt_hdr) + 2 + + sizeof(struct ioam6_hdr)); + + ns = ioam6_namespace(dev_net(skb_dst(skb)->dev), trace->namespace_id); + if (ns) + ioam6_fill_trace_data(skb, ns, trace); + + return 0; +} + +static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct lwtunnel_state *lwt = skb_dst(skb)->lwtstate; + int err = -EINVAL; + + if (skb->protocol != htons(ETH_P_IPV6)) + goto drop; + + /* Only for packets we send and + * that do not contain a Hop-by-Hop yet + */ + if (skb->dev || ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP) + goto out; + + err = ioam6_do_inline(skb, ioam6_lwt_info(lwt)); + if (unlikely(err)) + goto drop; + + err = skb_cow_head(skb, LL_RESERVED_SPACE(skb_dst(skb)->dev)); + if (unlikely(err)) + goto drop; + +out: + return lwt->orig_output(net, sk, skb); + +drop: + kfree_skb(skb); + return err; +} + +static int ioam6_fill_encap_info(struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + struct ioam6_trace_hdr *trace = ioam6_trace(lwtstate); + + if (nla_put_ioam6_trace(skb, IOAM6_IPTUNNEL_TRACE, trace)) + return -EMSGSIZE; + + return 0; +} + +static int ioam6_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + struct ioam6_trace_hdr *trace = ioam6_trace(lwtstate); + + return nla_total_size(sizeof(*trace)); +} + +static int ioam6_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + struct ioam6_trace_hdr *a_hdr = ioam6_trace(a); + struct ioam6_trace_hdr *b_hdr = ioam6_trace(b); + + return (a_hdr->namespace_id != b_hdr->namespace_id); +} + +static const struct lwtunnel_encap_ops ioam6_iptun_ops = { + .build_state = ioam6_build_state, + .output = ioam6_output, + .fill_encap = ioam6_fill_encap_info, + .get_encap_size = ioam6_encap_nlsize, + .cmp_encap = ioam6_encap_cmp, + .owner = THIS_MODULE, +}; + +int __init ioam6_iptunnel_init(void) +{ + return lwtunnel_encap_add_ops(&ioam6_iptun_ops, LWTUNNEL_ENCAP_IOAM6); +} + +void ioam6_iptunnel_exit(void) +{ + lwtunnel_encap_del_ops(&ioam6_iptun_ops, LWTUNNEL_ENCAP_IOAM6); +} -- cgit v1.2.3-71-gd317 From 7961c5b60f23dff5d82a523f9aeb8ebf34cf9926 Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Wed, 14 Jul 2021 14:28:33 +0200 Subject: drm/i915: Add TTM offset argument to mmap. The FIXED mapping is only used for ttm, and tells userspace that the mapping type is pre-defined. This disables the other type of mmap offsets when discrete memory is used, so fix the selftests as well. Document the struct as well, so it shows up in docbook. 
Cc: Jason Ekstrand Reviewed-by: Daniel Vetter Reviewed-by: Jason Ekstrand Signed-off-by: Maarten Lankhorst [mauld: Included minor fixes from the review comments] Signed-off-by: Matthew Auld Link: https://patchwork.freedesktop.org/patch/msgid/20210714122833.766586-1-maarten.lankhorst@linux.intel.com --- drivers/gpu/drm/i915/gem/i915_gem_mman.c | 17 +++++++- drivers/gpu/drm/i915/gem/i915_gem_object_types.h | 1 + drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c | 27 ++++++++++++- include/uapi/drm/i915_drm.h | 47 +++++++++++++++++----- 4 files changed, 78 insertions(+), 14 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c index a90f796e85c0..2f3b7dc7b0e6 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c @@ -679,10 +679,16 @@ __assign_mmap_offset(struct drm_i915_gem_object *obj, return -ENODEV; if (obj->ops->mmap_offset) { + if (mmap_type != I915_MMAP_TYPE_FIXED) + return -ENODEV; + *offset = obj->ops->mmap_offset(obj); return 0; } + if (mmap_type == I915_MMAP_TYPE_FIXED) + return -ENODEV; + if (mmap_type != I915_MMAP_TYPE_GTT && !i915_gem_object_has_struct_page(obj) && !i915_gem_object_has_iomem(obj)) @@ -727,7 +733,9 @@ i915_gem_dumb_mmap_offset(struct drm_file *file, { enum i915_mmap_type mmap_type; - if (boot_cpu_has(X86_FEATURE_PAT)) + if (HAS_LMEM(to_i915(dev))) + mmap_type = I915_MMAP_TYPE_FIXED; + else if (boot_cpu_has(X86_FEATURE_PAT)) mmap_type = I915_MMAP_TYPE_WC; else if (!i915_ggtt_has_aperture(&to_i915(dev)->ggtt)) return -ENODEV; @@ -798,6 +806,10 @@ i915_gem_mmap_offset_ioctl(struct drm_device *dev, void *data, type = I915_MMAP_TYPE_UC; break; + case I915_MMAP_OFFSET_FIXED: + type = I915_MMAP_TYPE_FIXED; + break; + default: return -EINVAL; } @@ -968,6 +980,9 @@ int i915_gem_mmap(struct file *filp, struct vm_area_struct *vma) vma->vm_ops = &vm_ops_cpu; break; + case I915_MMAP_TYPE_FIXED: + GEM_WARN_ON(1); + fallthrough; case I915_MMAP_TYPE_WB: vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); vma->vm_ops = &vm_ops_cpu; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h index ef3de2ae9723..afbadfc5516b 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h @@ -105,6 +105,7 @@ enum i915_mmap_type { I915_MMAP_TYPE_WC, I915_MMAP_TYPE_WB, I915_MMAP_TYPE_UC, + I915_MMAP_TYPE_FIXED, }; struct i915_mmap_offset { diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c index fda017c19ef6..0b2b73d8a364 100644 --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c @@ -573,6 +573,14 @@ err: return 0; } +static enum i915_mmap_type default_mapping(struct drm_i915_private *i915) +{ + if (HAS_LMEM(i915)) + return I915_MMAP_TYPE_FIXED; + + return I915_MMAP_TYPE_GTT; +} + static bool assert_mmap_offset(struct drm_i915_private *i915, unsigned long size, int expected) @@ -585,7 +593,7 @@ static bool assert_mmap_offset(struct drm_i915_private *i915, if (IS_ERR(obj)) return expected && expected == PTR_ERR(obj); - ret = __assign_mmap_offset(obj, I915_MMAP_TYPE_GTT, &offset, NULL); + ret = __assign_mmap_offset(obj, default_mapping(i915), &offset, NULL); i915_gem_object_put(obj); return ret == expected; @@ -689,7 +697,7 @@ static int igt_mmap_offset_exhaustion(void *arg) goto out; } - err = __assign_mmap_offset(obj, 
I915_MMAP_TYPE_GTT, &offset, NULL); + err = __assign_mmap_offset(obj, default_mapping(i915), &offset, NULL); if (err) { pr_err("Unable to insert object into reclaimed hole\n"); goto err_obj; @@ -831,8 +839,14 @@ static int wc_check(struct drm_i915_gem_object *obj) static bool can_mmap(struct drm_i915_gem_object *obj, enum i915_mmap_type type) { + struct drm_i915_private *i915 = to_i915(obj->base.dev); bool no_map; + if (HAS_LMEM(i915)) + return type == I915_MMAP_TYPE_FIXED; + else if (type == I915_MMAP_TYPE_FIXED) + return false; + if (type == I915_MMAP_TYPE_GTT && !i915_ggtt_has_aperture(&to_i915(obj->base.dev)->ggtt)) return false; @@ -970,6 +984,8 @@ static int igt_mmap(void *arg) err = __igt_mmap(i915, obj, I915_MMAP_TYPE_GTT); if (err == 0) err = __igt_mmap(i915, obj, I915_MMAP_TYPE_WC); + if (err == 0) + err = __igt_mmap(i915, obj, I915_MMAP_TYPE_FIXED); i915_gem_object_put(obj); if (err) @@ -987,6 +1003,7 @@ static const char *repr_mmap_type(enum i915_mmap_type type) case I915_MMAP_TYPE_WB: return "wb"; case I915_MMAP_TYPE_WC: return "wc"; case I915_MMAP_TYPE_UC: return "uc"; + case I915_MMAP_TYPE_FIXED: return "fixed"; default: return "unknown"; } } @@ -1100,6 +1117,8 @@ static int igt_mmap_access(void *arg) err = __igt_mmap_access(i915, obj, I915_MMAP_TYPE_WC); if (err == 0) err = __igt_mmap_access(i915, obj, I915_MMAP_TYPE_UC); + if (err == 0) + err = __igt_mmap_access(i915, obj, I915_MMAP_TYPE_FIXED); i915_gem_object_put(obj); if (err) @@ -1241,6 +1260,8 @@ static int igt_mmap_gpu(void *arg) err = __igt_mmap_gpu(i915, obj, I915_MMAP_TYPE_GTT); if (err == 0) err = __igt_mmap_gpu(i915, obj, I915_MMAP_TYPE_WC); + if (err == 0) + err = __igt_mmap_gpu(i915, obj, I915_MMAP_TYPE_FIXED); i915_gem_object_put(obj); if (err) @@ -1396,6 +1417,8 @@ static int igt_mmap_revoke(void *arg) err = __igt_mmap_revoke(i915, obj, I915_MMAP_TYPE_GTT); if (err == 0) err = __igt_mmap_revoke(i915, obj, I915_MMAP_TYPE_WC); + if (err == 0) + err = __igt_mmap_revoke(i915, obj, I915_MMAP_TYPE_FIXED); i915_gem_object_put(obj); if (err) diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index e20eeeca7a1c..0aea82657cdc 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -849,31 +849,56 @@ struct drm_i915_gem_mmap_gtt { __u64 offset; }; +/** + * struct drm_i915_gem_mmap_offset - Retrieve an offset so we can mmap this buffer object. + * + * This struct is passed as argument to the `DRM_IOCTL_I915_GEM_MMAP_OFFSET` ioctl, + * and is used to retrieve the fake offset to mmap an object specified by &handle. + * + * The legacy way of using `DRM_IOCTL_I915_GEM_MMAP` is removed on gen12+. + * `DRM_IOCTL_I915_GEM_MMAP_GTT` is an older supported alias to this struct, but will behave + * as setting the &extensions to 0, and &flags to `I915_MMAP_OFFSET_GTT`. + */ struct drm_i915_gem_mmap_offset { - /** Handle for the object being mapped. */ + /** @handle: Handle for the object being mapped. */ __u32 handle; + /** @pad: Must be zero */ __u32 pad; /** - * Fake offset to use for subsequent mmap call + * @offset: The fake offset to use for subsequent mmap call * * This is a fixed-size type for 32/64 compatibility. */ __u64 offset; /** - * Flags for extended behaviour. + * @flags: Flags for extended behaviour. * - * It is mandatory that one of the MMAP_OFFSET types - * (GTT, WC, WB, UC, etc) should be included. + * It is mandatory that one of the `MMAP_OFFSET` types + * should be included: + * + * - `I915_MMAP_OFFSET_GTT`: Use mmap with the object bound to GTT. 
(Write-Combined) + * - `I915_MMAP_OFFSET_WC`: Use Write-Combined caching. + * - `I915_MMAP_OFFSET_WB`: Use Write-Back caching. + * - `I915_MMAP_OFFSET_FIXED`: Use object placement to determine caching. + * + * On devices with local memory `I915_MMAP_OFFSET_FIXED` is the only valid + * type. On devices without local memory, this caching mode is invalid. + * + * As caching mode when specifying `I915_MMAP_OFFSET_FIXED`, WC or WB will + * be used, depending on the object placement on creation. WB will be used + * when the object can only exist in system memory, WC otherwise. */ __u64 flags; -#define I915_MMAP_OFFSET_GTT 0 -#define I915_MMAP_OFFSET_WC 1 -#define I915_MMAP_OFFSET_WB 2 -#define I915_MMAP_OFFSET_UC 3 - /* - * Zero-terminated chain of extensions. +#define I915_MMAP_OFFSET_GTT 0 +#define I915_MMAP_OFFSET_WC 1 +#define I915_MMAP_OFFSET_WB 2 +#define I915_MMAP_OFFSET_UC 3 +#define I915_MMAP_OFFSET_FIXED 4 + + /** + * @extensions: Zero-terminated chain of extensions. * * No current extensions defined; mbz. */ -- cgit v1.2.3-71-gd317 From 81340cf3bddded4fe23a55148152e6d5e2460351 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Thu, 15 Jul 2021 11:15:36 +0100 Subject: drm/i915/uapi: reject set_domain for discrete MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CPU domain should be static for discrete, and on DG1 we don't need any flushing since everything is already coherent, so really all this does is an object wait, for which we have an ioctl. Longer term the desired caching should be an immutable creation time property for the BO, which can be set with something like gem_create_ext. One other user is iris + userptr, which uses the set_domain to probe all the pages to check if the GUP succeeds, however we now have a PROBE flag for this purpose. v2: add some more kernel doc, also add the implicit rules with caching Suggested-by: Daniel Vetter Signed-off-by: Matthew Auld Cc: Thomas Hellström Cc: Maarten Lankhorst Cc: Tvrtko Ursulin Cc: Jordan Justen Cc: Kenneth Graunke Cc: Jason Ekstrand Cc: Daniel Vetter Cc: Ramalingam C Reviewed-by: Ramalingam C Acked-by: Kenneth Graunke Link: https://patchwork.freedesktop.org/patch/msgid/20210715101536.2606307-5-matthew.auld@intel.com --- drivers/gpu/drm/i915/gem/i915_gem_domain.c | 3 +++ include/uapi/drm/i915_drm.h | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/i915/gem/i915_gem_domain.c b/drivers/gpu/drm/i915/gem/i915_gem_domain.c index 43004bef55cb..b684a62bf3b0 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_domain.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_domain.c @@ -490,6 +490,9 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data, u32 write_domain = args->write_domain; int err; + if (IS_DGFX(to_i915(dev))) + return -ENODEV; + /* Only handle setting domains to types used by the CPU. */ if ((write_domain | read_domains) & I915_GEM_GPU_DOMAINS) return -EINVAL; diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 0aea82657cdc..975087553ea0 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -923,6 +923,25 @@ struct drm_i915_gem_mmap_offset { * - I915_GEM_DOMAIN_GTT: Mappable aperture domain * * All other domains are rejected. + * + * Note that for discrete, starting from DG1, this is no longer supported, and + * is instead rejected. 
On such platforms the CPU domain is effectively static, + * where we also only support a single &drm_i915_gem_mmap_offset cache mode, + * which can't be set explicitly and instead depends on the object placements, + * as per the below. + * + * Implicit caching rules, starting from DG1: + * + * - If any of the object placements (see &drm_i915_gem_create_ext_memory_regions) + * contain I915_MEMORY_CLASS_DEVICE then the object will be allocated and + * mapped as write-combined only. + * + * - Everything else is always allocated and mapped as write-back, with the + * guarantee that everything is also coherent with the GPU. + * + * Note that this is likely to change in the future again, where we might need + * more flexibility on future devices, so making this all explicit as part of a + * new &drm_i915_gem_create_ext extension is probable. */ struct drm_i915_gem_set_domain { /** @handle: Handle for the object. */ -- cgit v1.2.3-71-gd317 From 90c7c70a0a909de87572b9ff81aa35acd49f6473 Mon Sep 17 00:00:00 2001 From: Normunds Rieksts Date: Thu, 1 Jul 2021 18:07:09 +0100 Subject: drm/fourcc: Add modifier definitions for Arm Fixed Rate Compression Arm Fixed Rate Compression (AFRC) is a proprietary fixed rate image compression protocol and format. It is designed to provide guaranteed bandwidth and memory footprint reductions in graphics and media use-cases. This patch aims to add modifier definitions for describing AFRC. Signed-off-by: Normunds Rieksts Reviewed-by: Liviu Dudau Signed-off-by: Liviu Dudau Link: https://patchwork.freedesktop.org/patch/msgid/20210701170709.39922-1-normunds.rieksts@arm.com --- include/uapi/drm/drm_fourcc.h | 109 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 106 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h index f7156322aba5..9f4bb4a6f358 100644 --- a/include/uapi/drm/drm_fourcc.h +++ b/include/uapi/drm/drm_fourcc.h @@ -900,9 +900,9 @@ drm_fourcc_canonicalize_nvidia_format_mod(__u64 modifier) /* * The top 4 bits (out of the 56 bits alloted for specifying vendor specific - * modifiers) denote the category for modifiers. Currently we have only two - * categories of modifiers ie AFBC and MISC. We can have a maximum of sixteen - * different categories. + * modifiers) denote the category for modifiers. Currently we have three + * categories of modifiers ie AFBC, MISC and AFRC. We can have a maximum of + * sixteen different categories. */ #define DRM_FORMAT_MOD_ARM_CODE(__type, __val) \ fourcc_mod_code(ARM, ((__u64)(__type) << 52) | ((__val) & 0x000fffffffffffffULL)) @@ -1017,6 +1017,109 @@ drm_fourcc_canonicalize_nvidia_format_mod(__u64 modifier) */ #define AFBC_FORMAT_MOD_USM (1ULL << 12) +/* + * Arm Fixed-Rate Compression (AFRC) modifiers + * + * AFRC is a proprietary fixed rate image compression protocol and format, + * designed to provide guaranteed bandwidth and memory footprint + * reductions in graphics and media use-cases. + * + * AFRC buffers consist of one or more planes, with the same components + * and meaning as an uncompressed buffer using the same pixel format. + * + * Within each plane, the pixel/luma/chroma values are grouped into + * "coding unit" blocks which are individually compressed to a + * fixed size (in bytes). All coding units within a given plane of a buffer + * store the same number of values, and have the same compressed size. + * + * The coding unit size is configurable, allowing different rates of compression. 
+ * + * The start of each AFRC buffer plane must be aligned to an alignment granule which + * depends on the coding unit size. + * + * Coding Unit Size Plane Alignment + * ---------------- --------------- + * 16 bytes 1024 bytes + * 24 bytes 512 bytes + * 32 bytes 2048 bytes + * + * Coding units are grouped into paging tiles. AFRC buffer dimensions must be aligned + * to a multiple of the paging tile dimensions. + * The dimensions of each paging tile depend on whether the buffer is optimised for + * scanline (SCAN layout) or rotated (ROT layout) access. + * + * Layout Paging Tile Width Paging Tile Height + * ------ ----------------- ------------------ + * SCAN 16 coding units 4 coding units + * ROT 8 coding units 8 coding units + * + * The dimensions of each coding unit depend on the number of components + * in the compressed plane and whether the buffer is optimised for + * scanline (SCAN layout) or rotated (ROT layout) access. + * + * Number of Components in Plane Layout Coding Unit Width Coding Unit Height + * ----------------------------- --------- ----------------- ------------------ + * 1 SCAN 16 samples 4 samples + * Example: 16x4 luma samples in a 'Y' plane + * 16x4 chroma 'V' values, in the 'V' plane of a fully-planar YUV buffer + * ----------------------------- --------- ----------------- ------------------ + * 1 ROT 8 samples 8 samples + * Example: 8x8 luma samples in a 'Y' plane + * 8x8 chroma 'V' values, in the 'V' plane of a fully-planar YUV buffer + * ----------------------------- --------- ----------------- ------------------ + * 2 DONT CARE 8 samples 4 samples + * Example: 8x4 chroma pairs in the 'UV' plane of a semi-planar YUV buffer + * ----------------------------- --------- ----------------- ------------------ + * 3 DONT CARE 4 samples 4 samples + * Example: 4x4 pixels in an RGB buffer without alpha + * ----------------------------- --------- ----------------- ------------------ + * 4 DONT CARE 4 samples 4 samples + * Example: 4x4 pixels in an RGB buffer with alpha + */ + +#define DRM_FORMAT_MOD_ARM_TYPE_AFRC 0x02 + +#define DRM_FORMAT_MOD_ARM_AFRC(__afrc_mode) \ + DRM_FORMAT_MOD_ARM_CODE(DRM_FORMAT_MOD_ARM_TYPE_AFRC, __afrc_mode) + +/* + * AFRC coding unit size modifier. + * + * Indicates the number of bytes used to store each compressed coding unit for + * one or more planes in an AFRC encoded buffer. The coding unit size for chrominance + * is the same for both Cb and Cr, which may be stored in separate planes. + * + * AFRC_FORMAT_MOD_CU_SIZE_P0 indicates the number of bytes used to store + * each compressed coding unit in the first plane of the buffer. For RGBA buffers + * this is the only plane, while for semi-planar and fully-planar YUV buffers, + * this corresponds to the luma plane. + * + * AFRC_FORMAT_MOD_CU_SIZE_P12 indicates the number of bytes used to store + * each compressed coding unit in the second and third planes in the buffer. + * For semi-planar and fully-planar YUV buffers, this corresponds to the chroma plane(s). + * + * For single-plane buffers, AFRC_FORMAT_MOD_CU_SIZE_P0 must be specified + * and AFRC_FORMAT_MOD_CU_SIZE_P12 must be zero. + * For semi-planar and fully-planar buffers, both AFRC_FORMAT_MOD_CU_SIZE_P0 and + * AFRC_FORMAT_MOD_CU_SIZE_P12 must be specified. 
+ */ +#define AFRC_FORMAT_MOD_CU_SIZE_MASK 0xf +#define AFRC_FORMAT_MOD_CU_SIZE_16 (1ULL) +#define AFRC_FORMAT_MOD_CU_SIZE_24 (2ULL) +#define AFRC_FORMAT_MOD_CU_SIZE_32 (3ULL) + +#define AFRC_FORMAT_MOD_CU_SIZE_P0(__afrc_cu_size) (__afrc_cu_size) +#define AFRC_FORMAT_MOD_CU_SIZE_P12(__afrc_cu_size) ((__afrc_cu_size) << 4) + +/* + * AFRC scanline memory layout. + * + * Indicates if the buffer uses the scanline-optimised layout + * for an AFRC encoded buffer, otherwise, it uses the rotation-optimised layout. + * The memory layout is the same for all planes. + */ +#define AFRC_FORMAT_MOD_LAYOUT_SCAN (1ULL << 8) + /* * Arm 16x16 Block U-Interleaved modifier * -- cgit v1.2.3-71-gd317 From 50ae81305c7a3ee802f0a1988503ce913c79cd6e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 4 May 2021 11:25:22 -0500 Subject: signal: Verify the alignment and size of siginfo_t Update the static assertions about siginfo_t to also describe it's alignment and size. While investigating if it was possible to add a 64bit field into siginfo_t[1] it became apparent that the alignment of siginfo_t is as much a part of the ABI as the size of the structure. If the alignment changes siginfo_t when embedded in another structure can move to a different offset. Which is not acceptable from an ABI structure. So document that fact and add static assertions to notify developers if they change change the alignment by accident. [1] https://lkml.kernel.org/r/YJEZdhe6JGFNYlum@elver.google.com Acked-by: Marco Elver v1: https://lkml.kernel.org/r/20210505141101.11519-4-ebiederm@xmission.co Link: https://lkml.kernel.org/r/875yxaxmyl.fsf_-_@disp2133 Signed-off-by: "Eric W. Biederman" --- arch/arm/kernel/signal.c | 2 ++ arch/arm64/kernel/signal.c | 2 ++ arch/arm64/kernel/signal32.c | 2 ++ arch/sparc/kernel/signal32.c | 2 ++ arch/sparc/kernel/signal_64.c | 2 ++ arch/x86/kernel/signal_compat.c | 6 ++++++ include/uapi/asm-generic/siginfo.h | 5 +++++ 7 files changed, 21 insertions(+) (limited to 'include/uapi') diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 7ef453e8a96f..f3800c0f428b 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -737,6 +737,8 @@ static_assert(NSIGBUS == 5); static_assert(NSIGTRAP == 6); static_assert(NSIGCHLD == 6); static_assert(NSIGSYS == 2); +static_assert(sizeof(siginfo_t) == 128); +static_assert(__alignof__(siginfo_t) == 4); static_assert(offsetof(siginfo_t, si_signo) == 0x00); static_assert(offsetof(siginfo_t, si_errno) == 0x04); static_assert(offsetof(siginfo_t, si_code) == 0x08); diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 4413b6a4e32a..d3721e01441b 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -1011,6 +1011,8 @@ static_assert(NSIGBUS == 5); static_assert(NSIGTRAP == 6); static_assert(NSIGCHLD == 6); static_assert(NSIGSYS == 2); +static_assert(sizeof(siginfo_t) == 128); +static_assert(__alignof__(siginfo_t) == 8); static_assert(offsetof(siginfo_t, si_signo) == 0x00); static_assert(offsetof(siginfo_t, si_errno) == 0x04); static_assert(offsetof(siginfo_t, si_code) == 0x08); diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index ab1775216712..d3be01c46bec 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -469,6 +469,8 @@ static_assert(NSIGBUS == 5); static_assert(NSIGTRAP == 6); static_assert(NSIGCHLD == 6); static_assert(NSIGSYS == 2); +static_assert(sizeof(compat_siginfo_t) == 128); +static_assert(__alignof__(compat_siginfo_t) == 4); 
static_assert(offsetof(compat_siginfo_t, si_signo) == 0x00); static_assert(offsetof(compat_siginfo_t, si_errno) == 0x04); static_assert(offsetof(compat_siginfo_t, si_code) == 0x08); diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c index 65fd26ae9d25..4276b9e003ca 100644 --- a/arch/sparc/kernel/signal32.c +++ b/arch/sparc/kernel/signal32.c @@ -757,6 +757,8 @@ static_assert(NSIGBUS == 5); static_assert(NSIGTRAP == 6); static_assert(NSIGCHLD == 6); static_assert(NSIGSYS == 2); +static_assert(sizeof(compat_siginfo_t) == 128); +static_assert(__alignof__(compat_siginfo_t) == 4); static_assert(offsetof(compat_siginfo_t, si_signo) == 0x00); static_assert(offsetof(compat_siginfo_t, si_errno) == 0x04); static_assert(offsetof(compat_siginfo_t, si_code) == 0x08); diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c index a58e0cc45d24..cea23cf95600 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c @@ -567,6 +567,8 @@ static_assert(NSIGBUS == 5); static_assert(NSIGTRAP == 6); static_assert(NSIGCHLD == 6); static_assert(NSIGSYS == 2); +static_assert(sizeof(siginfo_t) == 128); +static_assert(__alignof__(siginfo_t) == 8); static_assert(offsetof(siginfo_t, si_signo) == 0x00); static_assert(offsetof(siginfo_t, si_errno) == 0x04); static_assert(offsetof(siginfo_t, si_code) == 0x08); diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index 06743ec054d2..b52407c56000 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -34,7 +34,13 @@ static inline void signal_compat_build_tests(void) BUILD_BUG_ON(NSIGSYS != 2); /* This is part of the ABI and can never change in size: */ + BUILD_BUG_ON(sizeof(siginfo_t) != 128); BUILD_BUG_ON(sizeof(compat_siginfo_t) != 128); + + /* This is a part of the ABI and can never change in alignment */ + BUILD_BUG_ON(__alignof__(siginfo_t) != 8); + BUILD_BUG_ON(__alignof__(compat_siginfo_t) != 4); + /* * The offsets of all the (unioned) si_fields are fixed * in the ABI, of course. Make sure none of them ever diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h index 5a3c221f4c9d..3ba180f550d7 100644 --- a/include/uapi/asm-generic/siginfo.h +++ b/include/uapi/asm-generic/siginfo.h @@ -29,6 +29,11 @@ typedef union sigval { #define __ARCH_SI_ATTRIBUTES #endif +/* + * Be careful when extending this union. On 32bit siginfo_t is 32bit + * aligned. Which means that a 64bit field or any other field that + * would increase the alignment of siginfo_t will break the ABI. + */ union __sifields { /* kill() */ struct { -- cgit v1.2.3-71-gd317 From 9ffb14ef61bab83fa818736bf3e7e6b6e182e8e2 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 15 Jul 2021 13:07:13 +0300 Subject: move_mount: allow to add a mount into an existing group Previously a sharing group (shared and master ids pair) can be only inherited when mount is created via bindmount. This patch adds an ability to add an existing private mount into an existing sharing group. With this functionality one can first create the desired mount tree from only private mounts (without the need to care about undesired mount propagation or mount creation order implied by sharing group dependencies), and next then setup any desired mount sharing between those mounts in tree as needed. This allows CRIU to restore any set of mount namespaces, mount trees and sharing group trees for a container. 
We have many issues with restoring mounts in CRIU related to sharing groups and propagation: - reverse sharing groups vs mount tree order requires complex mounts reordering which mostly implies also using some temporary mounts (please see https://lkml.org/lkml/2021/3/23/569 for more info) - mount() syscall creates tons of mounts due to propagation - mount re-parenting due to propagation - "Mount Trap" due to propagation - "Non Uniform" propagation, meaning that with different tricks with mount order and temporary children-"lock" mounts one can create mount trees which can't be restored without those tricks (see https://www.linuxplumbersconf.org/event/7/contributions/640/) With this new functionality we can resolve all the problems with propagation at once. Link: https://lore.kernel.org/r/20210715100714.120228-1-ptikhomirov@virtuozzo.com Cc: Eric W. Biederman Cc: Alexander Viro Cc: Christian Brauner Cc: Mattias Nissler Cc: Aleksa Sarai Cc: Andrei Vagin Cc: linux-fsdevel@vger.kernel.org Cc: linux-api@vger.kernel.org Cc: lkml Co-developed-by: Andrei Vagin Acked-by: Christian Brauner Signed-off-by: Pavel Tikhomirov Signed-off-by: Andrei Vagin Signed-off-by: Christian Brauner --- fs/namespace.c | 77 +++++++++++++++++++++++++++++++++++++++++++++- include/uapi/linux/mount.h | 3 +- 2 files changed, 78 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/fs/namespace.c b/fs/namespace.c index ab4174a3c802..5d0b477c2682 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2684,6 +2684,78 @@ out: return ret; } +static int do_set_group(struct path *from_path, struct path *to_path) +{ + struct mount *from, *to; + int err; + + from = real_mount(from_path->mnt); + to = real_mount(to_path->mnt); + + namespace_lock(); + + err = -EINVAL; + /* To and From must be mounted */ + if (!is_mounted(&from->mnt)) + goto out; + if (!is_mounted(&to->mnt)) + goto out; + + err = -EPERM; + /* We should be allowed to modify mount namespaces of both mounts */ + if (!ns_capable(from->mnt_ns->user_ns, CAP_SYS_ADMIN)) + goto out; + if (!ns_capable(to->mnt_ns->user_ns, CAP_SYS_ADMIN)) + goto out; + + err = -EINVAL; + /* To and From paths should be mount roots */ + if (from_path->dentry != from_path->mnt->mnt_root) + goto out; + if (to_path->dentry != to_path->mnt->mnt_root) + goto out; + + /* Setting sharing groups is only allowed across same superblock */ + if (from->mnt.mnt_sb != to->mnt.mnt_sb) + goto out; + + /* From mount root should be wider than To mount root */ + if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root)) + goto out; + + /* From mount should not have locked children in place of To's root */ + if (has_locked_children(from, to->mnt.mnt_root)) + goto out; + + /* Setting sharing groups is only allowed on private mounts */ + if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to)) + goto out; + + /* From should not be private */ + if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from)) + goto out; + + if (IS_MNT_SLAVE(from)) { + struct mount *m = from->mnt_master; + + list_add(&to->mnt_slave, &m->mnt_slave_list); + to->mnt_master = m; + } + + if (IS_MNT_SHARED(from)) { + to->mnt_group_id = from->mnt_group_id; + list_add(&to->mnt_share, &from->mnt_share); + lock_mount_hash(); + set_mnt_shared(to); + unlock_mount_hash(); + } + + err = 0; +out: + namespace_unlock(); + return err; +} + static int do_move_mount(struct path *old_path, struct path *new_path) { struct mnt_namespace *ns; @@ -3669,7 +3741,10 @@ SYSCALL_DEFINE5(move_mount, if (ret < 0) goto out_to; - ret = do_move_mount(&from_path, &to_path); + if (flags & 
MOVE_MOUNT_SET_GROUP) + ret = do_set_group(&from_path, &to_path); + else + ret = do_move_mount(&from_path, &to_path); out_to: path_put(&to_path); diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index dd7a166fdf9c..4d93967f8aea 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -73,7 +73,8 @@ #define MOVE_MOUNT_T_SYMLINKS 0x00000010 /* Follow symlinks on to path */ #define MOVE_MOUNT_T_AUTOMOUNTS 0x00000020 /* Follow automounts on to path */ #define MOVE_MOUNT_T_EMPTY_PATH 0x00000040 /* Empty to path permitted */ -#define MOVE_MOUNT__MASK 0x00000077 +#define MOVE_MOUNT_SET_GROUP 0x00000100 /* Set sharing group instead */ +#define MOVE_MOUNT__MASK 0x00000177 /* * fsopen() flags. -- cgit v1.2.3-71-gd317 From d7aff291d069c4418285f3c8ee27b0ff67ce5998 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Sat, 26 Jun 2021 06:11:51 +0200 Subject: serial: 8250: Define RX trigger levels for OxSemi 950 devices Oxford Semiconductor 950 serial port devices have a 128-byte FIFO and in the enhanced (650) mode, which we select in `autoconfig_has_efr' with the ECB bit set in the EFR register, they support the receive interrupt trigger level selectable with FCR bits 7:6 from the set of 16, 32, 112, 120. This applies to the original OX16C950 discrete UART[1] as well as 950 cores embedded into more complex devices. For these devices we set the default to 112, which sets an excessively high level of 112 or 7/8 of the FIFO capacity, unlike with other port types where we choose at most 1/2 of their respective FIFO capacities. Additionally we don't make the trigger level configurable. Consequently frequent input overruns happen with high bit rates where hardware flow control cannot be used (e.g. terminal applications) even with otherwise highly-performant systems. Lower the default receive interrupt trigger level to 32 then, and make it configurable. Document the trigger levels along with other port types, including the set of 16, 32, 64, 112 for the transmit interrupt as well[2]. References: [1] "OX16C950 rev B High Performance UART with 128 byte FIFOs", Oxford Semiconductor, Inc., DS-0031, Sep 05, Table 10: "Receiver Trigger Levels", p. 22 [2] same, Table 9: "Transmit Interrupt Trigger Levels", p. 22 Signed-off-by: Maciej W. Rozycki Link: https://lore.kernel.org/r/alpine.DEB.2.21.2106260608480.37803@angie.orcam.me.uk Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_port.c | 3 ++- include/uapi/linux/serial_reg.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index 2164290cbd31..badf5c320853 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -122,7 +122,8 @@ static const struct serial8250_config uart_config[] = { .name = "16C950/954", .fifo_size = 128, .tx_loadsz = 128, - .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, + .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01, + .rxtrig_bytes = {16, 32, 112, 120}, /* UART_CAP_EFR breaks billionon CF bluetooth card. 
*/ .flags = UART_CAP_FIFO | UART_CAP_SLEEP, }, diff --git a/include/uapi/linux/serial_reg.h b/include/uapi/linux/serial_reg.h index be07b5470f4b..f51bc8f36813 100644 --- a/include/uapi/linux/serial_reg.h +++ b/include/uapi/linux/serial_reg.h @@ -62,6 +62,7 @@ * ST16C654: 8 16 56 60 8 16 32 56 PORT_16654 * TI16C750: 1 16 32 56 xx xx xx xx PORT_16750 * TI16C752: 8 16 56 60 8 16 32 56 + * OX16C950: 16 32 112 120 16 32 64 112 PORT_16C950 * Tegra: 1 4 8 14 16 8 4 1 PORT_TEGRA */ #define UART_FCR_R_TRIG_00 0x00 -- cgit v1.2.3-71-gd317 From e4252cb66637b846b916cca7c2cdb4ed22ab2fc3 Mon Sep 17 00:00:00 2001 From: Mark Gray Date: Fri, 23 Jul 2021 10:24:12 -0400 Subject: openvswitch: update kdoc OVS_DP_ATTR_PER_CPU_PIDS Signed-off-by: Mark Gray Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 6571b57b2268..0e436a3755f1 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -70,7 +70,7 @@ enum ovs_datapath_cmd { * set on the datapath port (for OVS_ACTION_ATTR_MISS). Only valid on * %OVS_DP_CMD_NEW requests. A value of zero indicates that upcalls should * not be sent. - * OVS_DP_ATTR_PER_CPU_PIDS: Per-cpu array of PIDs for upcalls when + * @OVS_DP_ATTR_PER_CPU_PIDS: Per-cpu array of PIDs for upcalls when * OVS_DP_F_DISPATCH_UPCALL_PER_CPU feature is set. * @OVS_DP_ATTR_STATS: Statistics about packets that have passed through the * datapath. Always present in notifications. -- cgit v1.2.3-71-gd317 From 784dcfa56e0453bb197601ba0b8196f6f892ebcb Mon Sep 17 00:00:00 2001 From: Mark Gray Date: Fri, 23 Jul 2021 10:24:13 -0400 Subject: openvswitch: fix alignment issues Signed-off-by: Mark Gray Signed-off-by: David S. 
Miller --- include/uapi/linux/openvswitch.h | 4 ++-- net/openvswitch/actions.c | 6 ++++-- net/openvswitch/datapath.c | 16 ++++++++++------ 3 files changed, 16 insertions(+), 10 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 0e436a3755f1..150bcff49b1c 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -89,8 +89,8 @@ enum ovs_datapath_attr { OVS_DP_ATTR_USER_FEATURES, /* OVS_DP_F_* */ OVS_DP_ATTR_PAD, OVS_DP_ATTR_MASKS_CACHE_SIZE, - OVS_DP_ATTR_PER_CPU_PIDS, /* Netlink PIDS to receive upcalls in per-cpu - * dispatch mode + OVS_DP_ATTR_PER_CPU_PIDS, /* Netlink PIDS to receive upcalls in + * per-cpu dispatch mode */ __OVS_DP_ATTR_MAX }; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index f79679746c62..076774034bb9 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -924,9 +924,11 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, break; case OVS_USERSPACE_ATTR_PID: - if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU) + if (dp->user_features & + OVS_DP_F_DISPATCH_UPCALL_PER_CPU) upcall.portid = - ovs_dp_get_upcall_portid(dp, smp_processor_id()); + ovs_dp_get_upcall_portid(dp, + smp_processor_id()); else upcall.portid = nla_get_u32(a); break; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 7a4edafdc685..e6f0ae5618dd 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -244,7 +244,8 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) upcall.cmd = OVS_PACKET_CMD_MISS; if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU) - upcall.portid = ovs_dp_get_upcall_portid(dp, smp_processor_id()); + upcall.portid = + ovs_dp_get_upcall_portid(dp, smp_processor_id()); else upcall.portid = ovs_vport_find_upcall_portid(p, skb); @@ -1636,13 +1637,16 @@ u32 ovs_dp_get_upcall_portid(const struct datapath *dp, uint32_t cpu_id) if (dp_nlsk_pids) { if (cpu_id < dp_nlsk_pids->n_pids) { return dp_nlsk_pids->pids[cpu_id]; - } else if (dp_nlsk_pids->n_pids > 0 && cpu_id >= dp_nlsk_pids->n_pids) { - /* If the number of netlink PIDs is mismatched with the number of - * CPUs as seen by the kernel, log this and send the upcall to an - * arbitrary socket (0) in order to not drop packets + } else if (dp_nlsk_pids->n_pids > 0 && + cpu_id >= dp_nlsk_pids->n_pids) { + /* If the number of netlink PIDs is mismatched with + * the number of CPUs as seen by the kernel, log this + * and send the upcall to an arbitrary socket (0) in + * order to not drop packets */ pr_info_ratelimited("cpu_id mismatch with handler threads"); - return dp_nlsk_pids->pids[cpu_id % dp_nlsk_pids->n_pids]; + return dp_nlsk_pids->pids[cpu_id % + dp_nlsk_pids->n_pids]; } else { return 0; } -- cgit v1.2.3-71-gd317 From 17ce9c61c71cbc0d7e6ba0a9f34c724fb316a31b Mon Sep 17 00:00:00 2001 From: Simon Ser Date: Mon, 26 Jul 2021 07:50:32 +0000 Subject: drm: document DRM_IOCTL_MODE_RMFB Since there's no struct to attach the docs to, document the IOCTL definition. 
Signed-off-by: Simon Ser Reviewed-by: Pekka Paalanen Reviewed-by: Daniel Vetter Cc: Leandro Ribeiro Link: https://patchwork.freedesktop.org/patch/msgid/ephVkof3uGu2RpOdBbrHE3qF98zBfIBRzXe4Vyoboh0@cp4-web-034.plabs.ch --- include/uapi/drm/drm.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/drm/drm.h b/include/uapi/drm/drm.h index e1f49dd241f7..3b810b53ba8b 100644 --- a/include/uapi/drm/drm.h +++ b/include/uapi/drm/drm.h @@ -1050,6 +1050,16 @@ extern "C" { #define DRM_IOCTL_MODE_GETPROPBLOB DRM_IOWR(0xAC, struct drm_mode_get_blob) #define DRM_IOCTL_MODE_GETFB DRM_IOWR(0xAD, struct drm_mode_fb_cmd) #define DRM_IOCTL_MODE_ADDFB DRM_IOWR(0xAE, struct drm_mode_fb_cmd) +/** + * DRM_IOCTL_MODE_RMFB - Remove a framebuffer. + * + * This removes a framebuffer previously added via ADDFB/ADDFB2. The IOCTL + * argument is a framebuffer object ID. + * + * Warning: removing a framebuffer currently in-use on an enabled plane will + * disable that plane. The CRTC the plane is linked to may also be disabled + * (depending on driver capabilities). + */ #define DRM_IOCTL_MODE_RMFB DRM_IOWR(0xAF, unsigned int) #define DRM_IOCTL_MODE_PAGE_FLIP DRM_IOWR(0xB0, struct drm_mode_crtc_page_flip) #define DRM_IOCTL_MODE_DIRTYFB DRM_IOWR(0xB1, struct drm_mode_fb_dirty_cmd) -- cgit v1.2.3-71-gd317 From ee242ca704d386991d7ece0c46134e211d52412b Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Mon, 26 Jul 2021 17:23:47 -0700 Subject: drm/i915/guc: Implement GuC priority management Implement a simple static mapping algorithm of the i915 priority levels (int, -1k to 1k exposed to user) to the 4 GuC levels. Mapping is as follows: i915 level < 0 -> GuC low level (3) i915 level == 0 -> GuC normal level (2) i915 level < INT_MAX -> GuC high level (1) i915 level == INT_MAX -> GuC highest level (0) We believe this mapping should cover the UMD use cases (3 distinct user levels + 1 kernel level). In addition to static mapping, a simple counter system is attached to each context tracking the number of requests inflight on the context at each level. This is needed as the GuC levels are per context while in the i915 levels are per request. 
v2: (Daniele) - Add BUILD_BUG_ON to enforce ordering of priority levels - Add missing lockdep to guc_prio_fini - Check for return before setting context registered flag - Map DISPLAY priority or higher to highest guc prio - Update comment for guc_prio Signed-off-by: Matthew Brost Cc: Daniele Ceraolo Spurio Reviewed-by: Daniele Ceraolo Spurio Signed-off-by: John Harrison Link: https://patchwork.freedesktop.org/patch/msgid/20210727002348.97202-33-matthew.brost@intel.com --- drivers/gpu/drm/i915/gt/intel_breadcrumbs.c | 3 + drivers/gpu/drm/i915/gt/intel_context_types.h | 9 +- drivers/gpu/drm/i915/gt/intel_engine_user.c | 4 + drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 212 +++++++++++++++++++++- drivers/gpu/drm/i915/i915_request.c | 5 + drivers/gpu/drm/i915/i915_request.h | 9 + drivers/gpu/drm/i915/i915_scheduler.c | 7 + drivers/gpu/drm/i915/i915_scheduler_types.h | 12 ++ drivers/gpu/drm/i915/i915_trace.h | 17 +- include/uapi/drm/i915_drm.h | 9 + 10 files changed, 282 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c index 2007dc6f6b99..209cf265bf74 100644 --- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c +++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c @@ -245,6 +245,9 @@ static void signal_irq_work(struct irq_work *work) llist_entry(signal, typeof(*rq), signal_node); struct list_head cb_list; + if (rq->engine->sched_engine->retire_inflight_request_prio) + rq->engine->sched_engine->retire_inflight_request_prio(rq); + spin_lock(&rq->lock); list_replace(&rq->fence.cb_list, &cb_list); __dma_fence_signal__timestamp(&rq->fence, timestamp); diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h index a5bc876face7..e54351a170e2 100644 --- a/drivers/gpu/drm/i915/gt/intel_context_types.h +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h @@ -18,8 +18,9 @@ #include "intel_engine_types.h" #include "intel_sseu.h" -#define CONTEXT_REDZONE POISON_INUSE +#include "uc/intel_guc_fwif.h" +#define CONTEXT_REDZONE POISON_INUSE DECLARE_EWMA(runtime, 3, 8); struct i915_gem_context; @@ -191,6 +192,12 @@ struct intel_context { /* GuC context blocked fence */ struct i915_sw_fence guc_blocked; + + /* + * GuC priority management + */ + u8 guc_prio; + u32 guc_prio_count[GUC_CLIENT_PRIORITY_NUM]; }; #endif /* __INTEL_CONTEXT_TYPES__ */ diff --git a/drivers/gpu/drm/i915/gt/intel_engine_user.c b/drivers/gpu/drm/i915/gt/intel_engine_user.c index 84142127ebd8..8f8bea08e734 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_user.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_user.c @@ -11,6 +11,7 @@ #include "intel_engine.h" #include "intel_engine_user.h" #include "intel_gt.h" +#include "uc/intel_guc_submission.h" struct intel_engine_cs * intel_engine_lookup_user(struct drm_i915_private *i915, u8 class, u8 instance) @@ -115,6 +116,9 @@ static void set_scheduler_caps(struct drm_i915_private *i915) disabled |= (I915_SCHEDULER_CAP_ENABLED | I915_SCHEDULER_CAP_PRIORITY); + if (intel_uc_uses_guc_submission(&i915->gt.uc)) + enabled |= I915_SCHEDULER_CAP_STATIC_PRIORITY_MAP; + for (i = 0; i < ARRAY_SIZE(map); i++) { if (engine->flags & BIT(map[i].engine)) enabled |= BIT(map[i].sched); diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 3ff42d6e934f..b760cbf6ca0e 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -81,6 +81,7 @@ 
guc_create_virtual(struct intel_engine_cs **siblings, unsigned int count); */ #define SCHED_STATE_NO_LOCK_ENABLED BIT(0) #define SCHED_STATE_NO_LOCK_PENDING_ENABLE BIT(1) +#define SCHED_STATE_NO_LOCK_REGISTERED BIT(2) static inline bool context_enabled(struct intel_context *ce) { return (atomic_read(&ce->guc_sched_state_no_lock) & @@ -116,6 +117,24 @@ static inline void clr_context_pending_enable(struct intel_context *ce) &ce->guc_sched_state_no_lock); } +static inline bool context_registered(struct intel_context *ce) +{ + return (atomic_read(&ce->guc_sched_state_no_lock) & + SCHED_STATE_NO_LOCK_REGISTERED); +} + +static inline void set_context_registered(struct intel_context *ce) +{ + atomic_or(SCHED_STATE_NO_LOCK_REGISTERED, + &ce->guc_sched_state_no_lock); +} + +static inline void clr_context_registered(struct intel_context *ce) +{ + atomic_and((u32)~SCHED_STATE_NO_LOCK_REGISTERED, + &ce->guc_sched_state_no_lock); +} + /* * Below is a set of functions which control the GuC scheduling state which * require a lock, aside from the special case where the functions are called @@ -1092,6 +1111,7 @@ static int steal_guc_id(struct intel_guc *guc) list_del_init(&ce->guc_id_link); guc_id = ce->guc_id; + clr_context_registered(ce); set_context_guc_id_invalid(ce); return guc_id; } else { @@ -1201,10 +1221,15 @@ static int register_context(struct intel_context *ce, bool loop) struct intel_guc *guc = ce_to_guc(ce); u32 offset = intel_guc_ggtt_offset(guc, guc->lrc_desc_pool) + ce->guc_id * sizeof(struct guc_lrc_desc); + int ret; trace_intel_context_register(ce); - return __guc_action_register_context(guc, ce->guc_id, offset, loop); + ret = __guc_action_register_context(guc, ce->guc_id, offset, loop); + if (likely(!ret)) + set_context_registered(ce); + + return ret; } static int __guc_action_deregister_context(struct intel_guc *guc, @@ -1260,6 +1285,8 @@ static void guc_context_policy_init(struct intel_engine_cs *engine, desc->preemption_timeout = engine->props.preempt_timeout_ms * 1000; } +static inline u8 map_i915_prio_to_guc_prio(int prio); + static int guc_lrc_desc_pin(struct intel_context *ce, bool loop) { struct intel_engine_cs *engine = ce->engine; @@ -1267,6 +1294,8 @@ static int guc_lrc_desc_pin(struct intel_context *ce, bool loop) struct intel_guc *guc = &engine->gt->uc.guc; u32 desc_idx = ce->guc_id; struct guc_lrc_desc *desc; + const struct i915_gem_context *ctx; + int prio = I915_CONTEXT_DEFAULT_PRIORITY; bool context_registered; intel_wakeref_t wakeref; int ret = 0; @@ -1282,6 +1311,12 @@ static int guc_lrc_desc_pin(struct intel_context *ce, bool loop) context_registered = lrc_desc_registered(guc, desc_idx); + rcu_read_lock(); + ctx = rcu_dereference(ce->gem_context); + if (ctx) + prio = ctx->sched.priority; + rcu_read_unlock(); + reset_lrc_desc(guc, desc_idx); set_lrc_desc_registered(guc, desc_idx, ce); @@ -1290,7 +1325,8 @@ static int guc_lrc_desc_pin(struct intel_context *ce, bool loop) desc->engine_submit_mask = adjust_engine_mask(engine->class, engine->mask); desc->hw_context_desc = ce->lrc.lrca; - desc->priority = GUC_CLIENT_PRIORITY_KMD_NORMAL; + ce->guc_prio = map_i915_prio_to_guc_prio(prio); + desc->priority = ce->guc_prio; desc->context_flags = CONTEXT_REGISTRATION_FLAG_KMD; guc_context_policy_init(engine, desc); init_sched_state(ce); @@ -1693,11 +1729,17 @@ static inline void guc_lrc_desc_unpin(struct intel_context *ce) GEM_BUG_ON(ce != __get_context(guc, ce->guc_id)); GEM_BUG_ON(context_enabled(ce)); + clr_context_registered(ce); deregister_context(ce, ce->guc_id, true); } 
static void __guc_context_destroy(struct intel_context *ce) { + GEM_BUG_ON(ce->guc_prio_count[GUC_CLIENT_PRIORITY_KMD_HIGH] || + ce->guc_prio_count[GUC_CLIENT_PRIORITY_HIGH] || + ce->guc_prio_count[GUC_CLIENT_PRIORITY_KMD_NORMAL] || + ce->guc_prio_count[GUC_CLIENT_PRIORITY_NORMAL]); + lrc_fini(ce); intel_context_fini(ce); @@ -1791,15 +1833,124 @@ static int guc_context_alloc(struct intel_context *ce) return lrc_alloc(ce, ce->engine); } +static void guc_context_set_prio(struct intel_guc *guc, + struct intel_context *ce, + u8 prio) +{ + u32 action[] = { + INTEL_GUC_ACTION_SET_CONTEXT_PRIORITY, + ce->guc_id, + prio, + }; + + GEM_BUG_ON(prio < GUC_CLIENT_PRIORITY_KMD_HIGH || + prio > GUC_CLIENT_PRIORITY_NORMAL); + + if (ce->guc_prio == prio || submission_disabled(guc) || + !context_registered(ce)) + return; + + guc_submission_send_busy_loop(guc, action, ARRAY_SIZE(action), 0, true); + + ce->guc_prio = prio; + trace_intel_context_set_prio(ce); +} + +static inline u8 map_i915_prio_to_guc_prio(int prio) +{ + if (prio == I915_PRIORITY_NORMAL) + return GUC_CLIENT_PRIORITY_KMD_NORMAL; + else if (prio < I915_PRIORITY_NORMAL) + return GUC_CLIENT_PRIORITY_NORMAL; + else if (prio < I915_PRIORITY_DISPLAY) + return GUC_CLIENT_PRIORITY_HIGH; + else + return GUC_CLIENT_PRIORITY_KMD_HIGH; +} + +static inline void add_context_inflight_prio(struct intel_context *ce, + u8 guc_prio) +{ + lockdep_assert_held(&ce->guc_active.lock); + GEM_BUG_ON(guc_prio >= ARRAY_SIZE(ce->guc_prio_count)); + + ++ce->guc_prio_count[guc_prio]; + + /* Overflow protection */ + GEM_WARN_ON(!ce->guc_prio_count[guc_prio]); +} + +static inline void sub_context_inflight_prio(struct intel_context *ce, + u8 guc_prio) +{ + lockdep_assert_held(&ce->guc_active.lock); + GEM_BUG_ON(guc_prio >= ARRAY_SIZE(ce->guc_prio_count)); + + /* Underflow protection */ + GEM_WARN_ON(!ce->guc_prio_count[guc_prio]); + + --ce->guc_prio_count[guc_prio]; +} + +static inline void update_context_prio(struct intel_context *ce) +{ + struct intel_guc *guc = &ce->engine->gt->uc.guc; + int i; + + BUILD_BUG_ON(GUC_CLIENT_PRIORITY_KMD_HIGH != 0); + BUILD_BUG_ON(GUC_CLIENT_PRIORITY_KMD_HIGH > GUC_CLIENT_PRIORITY_NORMAL); + + lockdep_assert_held(&ce->guc_active.lock); + + for (i = 0; i < ARRAY_SIZE(ce->guc_prio_count); ++i) { + if (ce->guc_prio_count[i]) { + guc_context_set_prio(guc, ce, i); + break; + } + } +} + +static inline bool new_guc_prio_higher(u8 old_guc_prio, u8 new_guc_prio) +{ + /* Lower value is higher priority */ + return new_guc_prio < old_guc_prio; +} + static void add_to_context(struct i915_request *rq) { struct intel_context *ce = rq->context; + u8 new_guc_prio = map_i915_prio_to_guc_prio(rq_prio(rq)); + + GEM_BUG_ON(rq->guc_prio == GUC_PRIO_FINI); spin_lock(&ce->guc_active.lock); list_move_tail(&rq->sched.link, &ce->guc_active.requests); + + if (rq->guc_prio == GUC_PRIO_INIT) { + rq->guc_prio = new_guc_prio; + add_context_inflight_prio(ce, rq->guc_prio); + } else if (new_guc_prio_higher(rq->guc_prio, new_guc_prio)) { + sub_context_inflight_prio(ce, rq->guc_prio); + rq->guc_prio = new_guc_prio; + add_context_inflight_prio(ce, rq->guc_prio); + } + update_context_prio(ce); + spin_unlock(&ce->guc_active.lock); } +static void guc_prio_fini(struct i915_request *rq, struct intel_context *ce) +{ + lockdep_assert_held(&ce->guc_active.lock); + + if (rq->guc_prio != GUC_PRIO_INIT && + rq->guc_prio != GUC_PRIO_FINI) { + sub_context_inflight_prio(ce, rq->guc_prio); + update_context_prio(ce); + } + rq->guc_prio = GUC_PRIO_FINI; +} + static void remove_from_context(struct 
i915_request *rq) { struct intel_context *ce = rq->context; @@ -1812,6 +1963,8 @@ static void remove_from_context(struct i915_request *rq) /* Prevent further __await_execution() registering a cb, then flush */ set_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags); + guc_prio_fini(rq, ce); + spin_unlock_irq(&ce->guc_active.lock); atomic_dec(&ce->guc_id_ref); @@ -2093,6 +2246,39 @@ static void guc_init_breadcrumbs(struct intel_engine_cs *engine) } } +static void guc_bump_inflight_request_prio(struct i915_request *rq, + int prio) +{ + struct intel_context *ce = rq->context; + u8 new_guc_prio = map_i915_prio_to_guc_prio(prio); + + /* Short circuit function */ + if (prio < I915_PRIORITY_NORMAL || + rq->guc_prio == GUC_PRIO_FINI || + (rq->guc_prio != GUC_PRIO_INIT && + !new_guc_prio_higher(rq->guc_prio, new_guc_prio))) + return; + + spin_lock(&ce->guc_active.lock); + if (rq->guc_prio != GUC_PRIO_FINI) { + if (rq->guc_prio != GUC_PRIO_INIT) + sub_context_inflight_prio(ce, rq->guc_prio); + rq->guc_prio = new_guc_prio; + add_context_inflight_prio(ce, rq->guc_prio); + update_context_prio(ce); + } + spin_unlock(&ce->guc_active.lock); +} + +static void guc_retire_inflight_request_prio(struct i915_request *rq) +{ + struct intel_context *ce = rq->context; + + spin_lock(&ce->guc_active.lock); + guc_prio_fini(rq, ce); + spin_unlock(&ce->guc_active.lock); +} + static void sanitize_hwsp(struct intel_engine_cs *engine) { struct intel_timeline *tl; @@ -2317,6 +2503,10 @@ int intel_guc_submission_setup(struct intel_engine_cs *engine) guc->sched_engine->disabled = guc_sched_engine_disabled; guc->sched_engine->private_data = guc; guc->sched_engine->destroy = guc_sched_engine_destroy; + guc->sched_engine->bump_inflight_request_prio = + guc_bump_inflight_request_prio; + guc->sched_engine->retire_inflight_request_prio = + guc_retire_inflight_request_prio; tasklet_setup(&guc->sched_engine->tasklet, guc_submission_tasklet); } @@ -2694,6 +2884,22 @@ void intel_guc_submission_print_info(struct intel_guc *guc, drm_printf(p, "\n"); } +static inline void guc_log_context_priority(struct drm_printer *p, + struct intel_context *ce) +{ + int i; + + drm_printf(p, "\t\tPriority: %d\n", + ce->guc_prio); + drm_printf(p, "\t\tNumber Requests (lower index == higher priority)\n"); + for (i = GUC_CLIENT_PRIORITY_KMD_HIGH; + i < GUC_CLIENT_PRIORITY_NUM; ++i) { + drm_printf(p, "\t\tNumber requests in priority band[%d]: %d\n", + i, ce->guc_prio_count[i]); + } + drm_printf(p, "\n"); +} + void intel_guc_submission_print_context_info(struct intel_guc *guc, struct drm_printer *p) { @@ -2716,6 +2922,8 @@ void intel_guc_submission_print_context_info(struct intel_guc *guc, drm_printf(p, "\t\tSchedule State: 0x%x, 0x%x\n\n", ce->guc_state.sched_state, atomic_read(&ce->guc_sched_state_no_lock)); + + guc_log_context_priority(p, ce); } } diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 541a20371502..1f1d4a6a0eff 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -114,6 +114,9 @@ static void i915_fence_release(struct dma_fence *fence) { struct i915_request *rq = to_request(fence); + GEM_BUG_ON(rq->guc_prio != GUC_PRIO_INIT && + rq->guc_prio != GUC_PRIO_FINI); + /* * The request is put onto a RCU freelist (i.e. the address * is immediately reused), mark the fences as being freed now. 
@@ -924,6 +927,8 @@ __i915_request_create(struct intel_context *ce, gfp_t gfp) rq->rcustate = get_state_synchronize_rcu(); /* acts as smp_mb() */ + rq->guc_prio = GUC_PRIO_INIT; + /* We bump the ref for the fence chain */ i915_sw_fence_reinit(&i915_request_get(rq)->submit); i915_sw_fence_reinit(&i915_request_get(rq)->semaphore); diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h index ac0e3326c067..e6a0e0ebc9aa 100644 --- a/drivers/gpu/drm/i915/i915_request.h +++ b/drivers/gpu/drm/i915/i915_request.h @@ -293,6 +293,15 @@ struct i915_request { */ struct list_head guc_fence_link; + /** + * Priority level while the request is inflight. Differs from i915 + * scheduler priority. See comment above + * I915_SCHEDULER_CAP_STATIC_PRIORITY_MAP for details. + */ +#define GUC_PRIO_INIT 0xff +#define GUC_PRIO_FINI 0xfe + u8 guc_prio; + I915_SELFTEST_DECLARE(struct { struct list_head link; unsigned long delay; diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c index 28dd887eb1be..17843c204356 100644 --- a/drivers/gpu/drm/i915/i915_scheduler.c +++ b/drivers/gpu/drm/i915/i915_scheduler.c @@ -241,6 +241,9 @@ static void __i915_schedule(struct i915_sched_node *node, /* Fifo and depth-first replacement ensure our deps execute before us */ sched_engine = lock_sched_engine(node, sched_engine, &cache); list_for_each_entry_safe_reverse(dep, p, &dfs, dfs_link) { + struct i915_request *from = container_of(dep->signaler, + struct i915_request, + sched); INIT_LIST_HEAD(&dep->dfs_link); node = dep->signaler; @@ -254,6 +257,10 @@ static void __i915_schedule(struct i915_sched_node *node, GEM_BUG_ON(node_to_request(node)->engine->sched_engine != sched_engine); + /* Must be called before changing the nodes priority */ + if (sched_engine->bump_inflight_request_prio) + sched_engine->bump_inflight_request_prio(from, prio); + WRITE_ONCE(node->attr.priority, prio); /* diff --git a/drivers/gpu/drm/i915/i915_scheduler_types.h b/drivers/gpu/drm/i915/i915_scheduler_types.h index eaef233e9080..b0a1b58c7893 100644 --- a/drivers/gpu/drm/i915/i915_scheduler_types.h +++ b/drivers/gpu/drm/i915/i915_scheduler_types.h @@ -179,6 +179,18 @@ struct i915_sched_engine { void (*kick_backend)(const struct i915_request *rq, int prio); + /** + * @bump_inflight_request_prio: update priority of an inflight request + */ + void (*bump_inflight_request_prio)(struct i915_request *rq, + int prio); + + /** + * @retire_inflight_request_prio: indicate request is retired to + * priority tracking + */ + void (*retire_inflight_request_prio)(struct i915_request *rq); + /** * @schedule: adjust priority of request * diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h index 9613a7c19661..806ad688274b 100644 --- a/drivers/gpu/drm/i915/i915_trace.h +++ b/drivers/gpu/drm/i915/i915_trace.h @@ -904,6 +904,7 @@ DECLARE_EVENT_CLASS(intel_context, __field(int, pin_count) __field(u32, sched_state) __field(u32, guc_sched_state_no_lock) + __field(u8, guc_prio) ), TP_fast_assign( @@ -912,12 +913,19 @@ DECLARE_EVENT_CLASS(intel_context, __entry->sched_state = ce->guc_state.sched_state; __entry->guc_sched_state_no_lock = atomic_read(&ce->guc_sched_state_no_lock); + __entry->guc_prio = ce->guc_prio; ), - TP_printk("guc_id=%d, pin_count=%d sched_state=0x%x,0x%x", + TP_printk("guc_id=%d, pin_count=%d sched_state=0x%x,0x%x, guc_prio=%u", __entry->guc_id, __entry->pin_count, __entry->sched_state, - __entry->guc_sched_state_no_lock) + __entry->guc_sched_state_no_lock, + 
__entry->guc_prio) +); + +DEFINE_EVENT(intel_context, intel_context_set_prio, + TP_PROTO(struct intel_context *ce), + TP_ARGS(ce) ); DEFINE_EVENT(intel_context, intel_context_reset, @@ -1017,6 +1025,11 @@ trace_i915_request_out(struct i915_request *rq) { } +static inline void +trace_intel_context_set_prio(struct intel_context *ce) +{ +} + static inline void trace_intel_context_reset(struct intel_context *ce) { diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 975087553ea0..7f13d241417f 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -572,6 +572,15 @@ typedef struct drm_i915_irq_wait { #define I915_SCHEDULER_CAP_PREEMPTION (1ul << 2) #define I915_SCHEDULER_CAP_SEMAPHORES (1ul << 3) #define I915_SCHEDULER_CAP_ENGINE_BUSY_STATS (1ul << 4) +/* + * Indicates the 2k user priority levels are statically mapped into 3 buckets as + * follows: + * + * -1k to -1 Low priority + * 0 Normal priority + * 1 to 1k Highest priority + */ +#define I915_SCHEDULER_CAP_STATIC_PRIORITY_MAP (1ul << 5) #define I915_PARAM_HUC_STATUS 42 -- cgit v1.2.3-71-gd317 From e893bb1bb4d2eb635eba61e5d9c5135d96855773 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Fri, 8 Jan 2021 23:10:55 +1100 Subject: x86, prctl: Hook L1D flushing in via prctl Use the existing PR_GET/SET_SPECULATION_CTRL API to expose the L1D flush capability. For L1D flushing PR_SPEC_FORCE_DISABLE and PR_SPEC_DISABLE_NOEXEC are not supported. Enabling L1D flush does not check if the task is running on an SMT enabled core, rather a check is done at runtime (at the time of flush), if the task runs on a SMT sibling then the task is sent a SIGBUS which is executed before the task returns to user space or to a guest. This is better than the other alternatives of: a. Ensuring strict affinity of the task (hard to enforce without further changes in the scheduler) b. Silently skipping flush for tasks that move to SMT enabled cores. Hook up the core prctl and implement the x86 specific parts which in turn makes it functional. 
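A hedged userspace sketch of the resulting interface (not part of the patch): the set call fails with EPERM unless the conditional L1D flush mitigation is enabled, and, per the commit message, a task that later runs on an SMT sibling receives SIGBUS at flush time.

    #include <stdio.h>
    #include <sys/prctl.h>

    #ifndef PR_SPEC_L1D_FLUSH
    #define PR_SPEC_L1D_FLUSH 2     /* value from the uapi hunk below */
    #endif

    int main(void)
    {
            /* Opt this task in to an L1D cache flush when it is switched out. */
            if (prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_L1D_FLUSH, PR_SPEC_ENABLE, 0, 0))
                    perror("enable PR_SPEC_L1D_FLUSH");

            /* Query: returns PR_SPEC_PRCTL | PR_SPEC_{ENABLE,DISABLE}, or PR_SPEC_FORCE_DISABLE
             * when the mitigation is not enabled. */
            printf("state: 0x%x\n",
                   (unsigned int)prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_L1D_FLUSH, 0, 0, 0));
            return 0;
    }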
Suggested-by: Thomas Gleixner Signed-off-by: Balbir Singh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210108121056.21940-5-sblbir@amazon.com --- arch/x86/kernel/cpu/bugs.c | 33 +++++++++++++++++++++++++++++++++ include/uapi/linux/prctl.h | 1 + 2 files changed, 34 insertions(+) (limited to 'include/uapi') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 1a5a1b085eaa..ecfca3bbcd96 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1252,6 +1252,24 @@ static void task_update_spec_tif(struct task_struct *tsk) speculation_ctrl_update_current(); } +static int l1d_flush_prctl_set(struct task_struct *task, unsigned long ctrl) +{ + + if (!static_branch_unlikely(&switch_mm_cond_l1d_flush)) + return -EPERM; + + switch (ctrl) { + case PR_SPEC_ENABLE: + set_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH); + return 0; + case PR_SPEC_DISABLE: + clear_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH); + return 0; + default: + return -ERANGE; + } +} + static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) { if (ssb_mode != SPEC_STORE_BYPASS_PRCTL && @@ -1361,6 +1379,8 @@ int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, return ssb_prctl_set(task, ctrl); case PR_SPEC_INDIRECT_BRANCH: return ib_prctl_set(task, ctrl); + case PR_SPEC_L1D_FLUSH: + return l1d_flush_prctl_set(task, ctrl); default: return -ENODEV; } @@ -1377,6 +1397,17 @@ void arch_seccomp_spec_mitigate(struct task_struct *task) } #endif +static int l1d_flush_prctl_get(struct task_struct *task) +{ + if (!static_branch_unlikely(&switch_mm_cond_l1d_flush)) + return PR_SPEC_FORCE_DISABLE; + + if (test_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH)) + return PR_SPEC_PRCTL | PR_SPEC_ENABLE; + else + return PR_SPEC_PRCTL | PR_SPEC_DISABLE; +} + static int ssb_prctl_get(struct task_struct *task) { switch (ssb_mode) { @@ -1427,6 +1458,8 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) return ssb_prctl_get(task); case PR_SPEC_INDIRECT_BRANCH: return ib_prctl_get(task); + case PR_SPEC_L1D_FLUSH: + return l1d_flush_prctl_get(task); default: return -ENODEV; } diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 967d9c55323d..964c41ed303e 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -213,6 +213,7 @@ struct prctl_mm_map { /* Speculation control variants */ # define PR_SPEC_STORE_BYPASS 0 # define PR_SPEC_INDIRECT_BRANCH 1 +# define PR_SPEC_L1D_FLUSH 2 /* Return and control values for PR_SET/GET_SPECULATION_CTRL */ # define PR_SPEC_NOT_AFFECTED 0 # define PR_SPEC_PRCTL (1UL << 0) -- cgit v1.2.3-71-gd317 From 56af5e749f20c3a540310c207dcc373f4f09156e Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Tue, 27 Jul 2021 18:33:15 -0700 Subject: net/sched: act_skbmod: Add SKBMOD_F_ECN option support Currently, when doing rate limiting using the tc-police(8) action, the easiest way is to simply drop the packets which exceed or conform the configured bandwidth limit. Add a new option to tc-skbmod(8), so that users may use the ECN [1] extension to explicitly inform the receiver about the congestion instead of dropping packets "on the floor". 
The 2 least significant bits of the Traffic Class field in IPv4 and IPv6 headers are used to represent different ECN states [2]: 0b00: "Non ECN-Capable Transport", Non-ECT 0b10: "ECN Capable Transport", ECT(0) 0b01: "ECN Capable Transport", ECT(1) 0b11: "Congestion Encountered", CE As an example: $ tc filter add dev eth0 parent 1: protocol ip prio 10 \ matchall action skbmod ecn Doing the above marks all ECT(0) and ECT(1) packets as CE. It does NOT affect Non-ECT or non-IP packets. In the tc-police scenario mentioned above, users may pipe a tc-police action and a tc-skbmod "ecn" action together to achieve ECN-based rate limiting. For TCP connections, upon receiving a CE packet, the receiver will respond with an ECE packet, asking the sender to reduce their congestion window. However ECN also works with other L4 protocols e.g. DCCP and SCTP [2], and our implementation does not touch or care about L4 headers. The updated tc-skbmod SYNOPSIS looks like the following: tc ... action skbmod { set SETTABLE | swap SWAPPABLE | ecn } ... Only one of "set", "swap" or "ecn" shall be used in a single tc-skbmod command. Trying to use more than one of them at a time is considered undefined behavior; pipe multiple tc-skbmod commands together instead. "set" and "swap" only affect Ethernet packets, while "ecn" only affects IPv{4,6} packets. It is also worth mentioning that, in theory, the same effect could be achieved by piping a "police" action and a "bpf" action using the bpf_skb_ecn_set_ce() helper, but this requires eBPF programming from the user, thus impractical. Depends on patch "net/sched: act_skbmod: Skip non-Ethernet packets". [1] https://datatracker.ietf.org/doc/html/rfc3168 [2] https://en.wikipedia.org/wiki/Explicit_Congestion_Notification Reviewed-by: Cong Wang Signed-off-by: Peilin Ye Signed-off-by: David S. Miller --- include/uapi/linux/tc_act/tc_skbmod.h | 1 + net/sched/act_skbmod.c | 44 +++++++++++++++++++++++++---------- 2 files changed, 33 insertions(+), 12 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/tc_act/tc_skbmod.h b/include/uapi/linux/tc_act/tc_skbmod.h index c525b3503797..af6ef2cfbf3d 100644 --- a/include/uapi/linux/tc_act/tc_skbmod.h +++ b/include/uapi/linux/tc_act/tc_skbmod.h @@ -17,6 +17,7 @@ #define SKBMOD_F_SMAC 0x2 #define SKBMOD_F_ETYPE 0x4 #define SKBMOD_F_SWAPMAC 0x8 +#define SKBMOD_F_ECN 0x10 struct tc_skbmod { tc_gen; diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index 8d17a543cc9f..762ceec3e6f6 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -21,15 +22,13 @@ static unsigned int skbmod_net_id; static struct tc_action_ops act_skbmod_ops; -#define MAX_EDIT_LEN ETH_HLEN static int tcf_skbmod_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_skbmod *d = to_skbmod(a); - int action; + int action, max_edit_len, err; struct tcf_skbmod_params *p; u64 flags; - int err; tcf_lastuse_update(&d->tcf_tm); bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb); @@ -38,19 +37,34 @@ static int tcf_skbmod_act(struct sk_buff *skb, const struct tc_action *a, if (unlikely(action == TC_ACT_SHOT)) goto drop; - if (!skb->dev || skb->dev->type != ARPHRD_ETHER) - return action; + max_edit_len = skb_mac_header_len(skb); + p = rcu_dereference_bh(d->skbmod_p); + flags = p->flags; + + /* tcf_skbmod_init() guarantees "flags" to be one of the following: + * 1. a combination of SKBMOD_F_{DMAC,SMAC,ETYPE} + * 2. 
SKBMOD_F_SWAPMAC + * 3. SKBMOD_F_ECN + * SKBMOD_F_ECN only works with IP packets; all other flags only work with Ethernet + * packets. + */ + if (flags == SKBMOD_F_ECN) { + switch (skb_protocol(skb, true)) { + case cpu_to_be16(ETH_P_IP): + case cpu_to_be16(ETH_P_IPV6): + max_edit_len += skb_network_header_len(skb); + break; + default: + goto out; + } + } else if (!skb->dev || skb->dev->type != ARPHRD_ETHER) { + goto out; + } - /* XXX: if you are going to edit more fields beyond ethernet header - * (example when you add IP header replacement or vlan swap) - * then MAX_EDIT_LEN needs to change appropriately - */ - err = skb_ensure_writable(skb, MAX_EDIT_LEN); + err = skb_ensure_writable(skb, max_edit_len); if (unlikely(err)) /* best policy is to drop on the floor */ goto drop; - p = rcu_dereference_bh(d->skbmod_p); - flags = p->flags; if (flags & SKBMOD_F_DMAC) ether_addr_copy(eth_hdr(skb)->h_dest, p->eth_dst); if (flags & SKBMOD_F_SMAC) @@ -66,6 +80,10 @@ static int tcf_skbmod_act(struct sk_buff *skb, const struct tc_action *a, ether_addr_copy(eth_hdr(skb)->h_source, (u8 *)tmpaddr); } + if (flags & SKBMOD_F_ECN) + INET_ECN_set_ce(skb); + +out: return action; drop: @@ -129,6 +147,8 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, index = parm->index; if (parm->flags & SKBMOD_F_SWAPMAC) lflags = SKBMOD_F_SWAPMAC; + if (parm->flags & SKBMOD_F_ECN) + lflags = SKBMOD_F_ECN; err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) -- cgit v1.2.3-71-gd317 From 125d10373ad991888c9e94d2da49bcc5ccba2127 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 20 Jul 2021 13:42:15 -0700 Subject: dmanegine: idxd: add software command status Enabling device and wq returns standard errno and that does not provide enough details to indicate what exactly failed. The hardware command status is only 8bits. Expand the command status to 32bits and use the upper 16 bits to define software errors to provide more details on the exact failure. Bit 31 will be used to indicate the error is software set as the driver is using some of the spec defined hardware error as well. Cc: Ramesh Thomas Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162681373579.1968485.5891788397526827892.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- Documentation/ABI/stable/sysfs-driver-dma-idxd | 2 ++ drivers/dma/idxd/cdev.c | 5 ++++- drivers/dma/idxd/device.c | 22 +++++++++++++++++++--- drivers/dma/idxd/dma.c | 4 ++++ drivers/dma/idxd/idxd.h | 2 +- drivers/dma/idxd/sysfs.c | 11 ++++++++++- include/uapi/linux/idxd.h | 23 +++++++++++++++++++++++ 7 files changed, 63 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/ABI/stable/sysfs-driver-dma-idxd b/Documentation/ABI/stable/sysfs-driver-dma-idxd index adb0c93e8dfc..df4afbccf037 100644 --- a/Documentation/ABI/stable/sysfs-driver-dma-idxd +++ b/Documentation/ABI/stable/sysfs-driver-dma-idxd @@ -128,6 +128,8 @@ Date: Aug 28, 2020 KernelVersion: 5.10.0 Contact: dmaengine@vger.kernel.org Description: The last executed device administrative command's status/error. + Also last configuration error overloaded. + Writing to it will clear the status. 
What: /sys/bus/dsa/devices/wq./block_on_fault Date: Oct 27, 2020 diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index f6a4603517ba..4d2ecdb130e7 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -320,9 +320,12 @@ static int idxd_user_drv_probe(struct idxd_dev *idxd_dev) goto err; rc = idxd_wq_add_cdev(wq); - if (rc < 0) + if (rc < 0) { + idxd->cmd_status = IDXD_SCMD_CDEV_ERR; goto err_cdev; + } + idxd->cmd_status = 0; mutex_unlock(&wq->wq_lock); return 0; diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 41f67a195eb6..86fa4b4590f9 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -840,6 +840,7 @@ static int idxd_wq_config_write(struct idxd_wq *wq) wq->wqcfg->wq_size = wq->size; if (wq->size == 0) { + idxd->cmd_status = IDXD_SCMD_WQ_NO_SIZE; dev_warn(dev, "Incorrect work queue size: 0\n"); return -EINVAL; } @@ -975,6 +976,7 @@ static int idxd_wqs_setup(struct idxd_device *idxd) continue; if (wq_shared(wq) && !device_swq_supported(idxd)) { + idxd->cmd_status = IDXD_SCMD_WQ_NO_SWQ_SUPPORT; dev_warn(dev, "No shared wq support but configured.\n"); return -EINVAL; } @@ -983,8 +985,10 @@ static int idxd_wqs_setup(struct idxd_device *idxd) configured++; } - if (configured == 0) + if (configured == 0) { + idxd->cmd_status = IDXD_SCMD_WQ_NONE_CONFIGURED; return -EINVAL; + } return 0; } @@ -1140,21 +1144,26 @@ int __drv_enable_wq(struct idxd_wq *wq) lockdep_assert_held(&wq->wq_lock); - if (idxd->state != IDXD_DEV_ENABLED) + if (idxd->state != IDXD_DEV_ENABLED) { + idxd->cmd_status = IDXD_SCMD_DEV_NOT_ENABLED; goto err; + } if (wq->state != IDXD_WQ_DISABLED) { dev_dbg(dev, "wq %d already enabled.\n", wq->id); + idxd->cmd_status = IDXD_SCMD_WQ_ENABLED; rc = -EBUSY; goto err; } if (!wq->group) { dev_dbg(dev, "wq %d not attached to group.\n", wq->id); + idxd->cmd_status = IDXD_SCMD_WQ_NO_GRP; goto err; } if (strlen(wq->name) == 0) { + idxd->cmd_status = IDXD_SCMD_WQ_NO_NAME; dev_dbg(dev, "wq %d name not set.\n", wq->id); goto err; } @@ -1162,6 +1171,7 @@ int __drv_enable_wq(struct idxd_wq *wq) /* Shared WQ checks */ if (wq_shared(wq)) { if (!device_swq_supported(idxd)) { + idxd->cmd_status = IDXD_SCMD_WQ_NO_SVM; dev_dbg(dev, "PASID not enabled and shared wq.\n"); goto err; } @@ -1174,6 +1184,7 @@ int __drv_enable_wq(struct idxd_wq *wq) * threshold via sysfs. */ if (wq->threshold == 0) { + idxd->cmd_status = IDXD_SCMD_WQ_NO_THRESH; dev_dbg(dev, "Shared wq and threshold 0.\n"); goto err; } @@ -1197,6 +1208,7 @@ int __drv_enable_wq(struct idxd_wq *wq) rc = idxd_wq_map_portal(wq); if (rc < 0) { + idxd->cmd_status = IDXD_SCMD_WQ_PORTAL_ERR; dev_dbg(dev, "wq %d portal mapping failed: %d\n", wq->id, rc); goto err_map_portal; } @@ -1259,8 +1271,10 @@ int idxd_device_drv_probe(struct idxd_dev *idxd_dev) * enabled state, then the device was altered outside of driver's control. * If the state is in halted state, then we don't want to proceed. 
*/ - if (idxd->state != IDXD_DEV_DISABLED) + if (idxd->state != IDXD_DEV_DISABLED) { + idxd->cmd_status = IDXD_SCMD_DEV_ENABLED; return -ENXIO; + } /* Device configuration */ spin_lock_irqsave(&idxd->dev_lock, flags); @@ -1279,9 +1293,11 @@ int idxd_device_drv_probe(struct idxd_dev *idxd_dev) rc = idxd_register_dma_device(idxd); if (rc < 0) { idxd_device_disable(idxd); + idxd->cmd_status = IDXD_SCMD_DEV_DMA_ERR; return rc; } + idxd->cmd_status = 0; return 0; } diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c index 2fd7ec29a08f..a195225687bb 100644 --- a/drivers/dma/idxd/dma.c +++ b/drivers/dma/idxd/dma.c @@ -284,22 +284,26 @@ static int idxd_dmaengine_drv_probe(struct idxd_dev *idxd_dev) rc = idxd_wq_alloc_resources(wq); if (rc < 0) { + idxd->cmd_status = IDXD_SCMD_WQ_RES_ALLOC_ERR; dev_dbg(dev, "WQ resource alloc failed\n"); goto err_res_alloc; } rc = idxd_wq_init_percpu_ref(wq); if (rc < 0) { + idxd->cmd_status = IDXD_SCMD_PERCPU_ERR; dev_dbg(dev, "percpu_ref setup failed\n"); goto err_ref; } rc = idxd_register_dma_channel(wq); if (rc < 0) { + idxd->cmd_status = IDXD_SCMD_DMA_CHAN_ERR; dev_dbg(dev, "Failed to register dma channel\n"); goto err_dma; } + idxd->cmd_status = 0; mutex_unlock(&wq->wq_lock); return 0; diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 94983bced189..bfcb03329f77 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -252,7 +252,7 @@ struct idxd_device { unsigned long flags; int id; int major; - u8 cmd_status; + u32 cmd_status; struct pci_dev *pdev; void __iomem *reg_base; diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 881a12596d4b..4c01587c9d4a 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -1217,7 +1217,16 @@ static ssize_t cmd_status_show(struct device *dev, return sysfs_emit(buf, "%#x\n", idxd->cmd_status); } -static DEVICE_ATTR_RO(cmd_status); + +static ssize_t cmd_status_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + idxd->cmd_status = 0; + return count; +} +static DEVICE_ATTR_RW(cmd_status); static struct attribute *idxd_device_attributes[] = { &dev_attr_version.attr, diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index e33997b4d750..1c0175aa0e42 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -9,6 +9,29 @@ #include #endif +/* Driver command error status */ +enum idxd_scmd_stat { + IDXD_SCMD_DEV_ENABLED = 0x80000010, + IDXD_SCMD_DEV_NOT_ENABLED = 0x80000020, + IDXD_SCMD_WQ_ENABLED = 0x80000021, + IDXD_SCMD_DEV_DMA_ERR = 0x80020000, + IDXD_SCMD_WQ_NO_GRP = 0x80030000, + IDXD_SCMD_WQ_NO_NAME = 0x80040000, + IDXD_SCMD_WQ_NO_SVM = 0x80050000, + IDXD_SCMD_WQ_NO_THRESH = 0x80060000, + IDXD_SCMD_WQ_PORTAL_ERR = 0x80070000, + IDXD_SCMD_WQ_RES_ALLOC_ERR = 0x80080000, + IDXD_SCMD_PERCPU_ERR = 0x80090000, + IDXD_SCMD_DMA_CHAN_ERR = 0x800a0000, + IDXD_SCMD_CDEV_ERR = 0x800b0000, + IDXD_SCMD_WQ_NO_SWQ_SUPPORT = 0x800c0000, + IDXD_SCMD_WQ_NONE_CONFIGURED = 0x800d0000, + IDXD_SCMD_WQ_NO_SIZE = 0x800e0000, +}; + +#define IDXD_SCMD_SOFTERR_MASK 0x80000000 +#define IDXD_SCMD_SOFTERR_SHIFT 16 + /* Descriptor flags */ #define IDXD_OP_FLAG_FENCE 0x0001 #define IDXD_OP_FLAG_BOF 0x0002 -- cgit v1.2.3-71-gd317 From fc40e5e10c3bcc36f3f765f0d9fae0a13efc7935 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Tue, 27 Jul 2021 18:06:17 -0700 Subject: drm/msm: Utilize gpu scheduler priorities MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit The drm/scheduler provides additional prioritization on top of that provided by however many number of ringbuffers (each with their own priority level) is supported on a given generation. Expose the additional levels of priority to userspace and map the userspace priority back to ring (first level of priority) and schedular priority (additional priority levels within the ring). Signed-off-by: Rob Clark Acked-by: Christian König Link: https://lore.kernel.org/r/20210728010632.2633470-13-robdclark@gmail.com Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 4 +-- drivers/gpu/drm/msm/msm_gem_submit.c | 4 +-- drivers/gpu/drm/msm/msm_gpu.h | 58 +++++++++++++++++++++++++++++++-- drivers/gpu/drm/msm/msm_submitqueue.c | 35 +++++++++----------- include/uapi/drm/msm_drm.h | 14 +++++++- 5 files changed, 88 insertions(+), 27 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index bad4809b68ef..748665232d29 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -261,8 +261,8 @@ int adreno_get_param(struct msm_gpu *gpu, uint32_t param, uint64_t *value) return ret; } return -EINVAL; - case MSM_PARAM_NR_RINGS: - *value = gpu->nr_rings; + case MSM_PARAM_PRIORITIES: + *value = gpu->nr_rings * NR_SCHED_PRIORITIES; return 0; case MSM_PARAM_PP_PGTABLE: *value = 0; diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index 450efe59abb5..c2ecec5b11c4 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -59,7 +59,7 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev, submit->gpu = gpu; submit->cmd = (void *)&submit->bos[nr_bos]; submit->queue = queue; - submit->ring = gpu->rb[queue->prio]; + submit->ring = gpu->rb[queue->ring_nr]; submit->fault_dumped = false; INIT_LIST_HEAD(&submit->node); @@ -749,7 +749,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, /* Get a unique identifier for the submission for logging purposes */ submitid = atomic_inc_return(&ident) - 1; - ring = gpu->rb[queue->prio]; + ring = gpu->rb[queue->ring_nr]; trace_msm_gpu_submit(pid_nr(pid), ring->id, submitid, args->nr_bos, args->nr_cmds); diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index b912cacaecc0..0e4b45bff2e6 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -250,6 +250,59 @@ struct msm_gpu_perfcntr { const char *name; }; +/* + * The number of priority levels provided by drm gpu scheduler. The + * DRM_SCHED_PRIORITY_KERNEL priority level is treated specially in some + * cases, so we don't use it (no need for kernel generated jobs). + */ +#define NR_SCHED_PRIORITIES (1 + DRM_SCHED_PRIORITY_HIGH - DRM_SCHED_PRIORITY_MIN) + +/** + * msm_gpu_convert_priority - Map userspace priority to ring # and sched priority + * + * @gpu: the gpu instance + * @prio: the userspace priority level + * @ring_nr: [out] the ringbuffer the userspace priority maps to + * @sched_prio: [out] the gpu scheduler priority level which the userspace + * priority maps to + * + * With drm/scheduler providing it's own level of prioritization, our total + * number of available priority levels is (nr_rings * NR_SCHED_PRIORITIES). + * Each ring is associated with it's own scheduler instance. However, our + * UABI is that lower numerical values are higher priority. 
So mapping the + * single userspace priority level into ring_nr and sched_prio takes some + * care. The userspace provided priority (when a submitqueue is created) + * is mapped to ring nr and scheduler priority as such: + * + * ring_nr = userspace_prio / NR_SCHED_PRIORITIES + * sched_prio = NR_SCHED_PRIORITIES - + * (userspace_prio % NR_SCHED_PRIORITIES) - 1 + * + * This allows generations without preemption (nr_rings==1) to have some + * amount of prioritization, and provides more priority levels for gens + * that do have preemption. + */ +static inline int msm_gpu_convert_priority(struct msm_gpu *gpu, int prio, + unsigned *ring_nr, enum drm_sched_priority *sched_prio) +{ + unsigned rn, sp; + + rn = div_u64_rem(prio, NR_SCHED_PRIORITIES, &sp); + + /* invert sched priority to map to higher-numeric-is-higher- + * priority convention + */ + sp = NR_SCHED_PRIORITIES - sp - 1; + + if (rn >= gpu->nr_rings) + return -EINVAL; + + *ring_nr = rn; + *sched_prio = sp; + + return 0; +} + /** * A submitqueue is associated with a gl context or vk queue (or equiv) * in userspace. @@ -257,7 +310,8 @@ struct msm_gpu_perfcntr { * @id: userspace id for the submitqueue, unique within the drm_file * @flags: userspace flags for the submitqueue, specified at creation * (currently unusued) - * @prio: the submitqueue priority + * @ring_nr: the ringbuffer used by this submitqueue, which is determined + * by the submitqueue's priority * @faults: the number of GPU hangs associated with this submitqueue * @ctx: the per-drm_file context associated with the submitqueue (ie. * which set of pgtables do submits jobs associated with the @@ -272,7 +326,7 @@ struct msm_gpu_perfcntr { struct msm_gpu_submitqueue { int id; u32 flags; - u32 prio; + u32 ring_nr; int faults; struct msm_file_private *ctx; struct list_head node; diff --git a/drivers/gpu/drm/msm/msm_submitqueue.c b/drivers/gpu/drm/msm/msm_submitqueue.c index 682ba2a7c0ec..32a55d81b58b 100644 --- a/drivers/gpu/drm/msm/msm_submitqueue.c +++ b/drivers/gpu/drm/msm/msm_submitqueue.c @@ -68,6 +68,8 @@ int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx, struct msm_gpu_submitqueue *queue; struct msm_ringbuffer *ring; struct drm_gpu_scheduler *sched; + enum drm_sched_priority sched_prio; + unsigned ring_nr; int ret; if (!ctx) @@ -76,8 +78,9 @@ int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx, if (!priv->gpu) return -ENODEV; - if (prio >= priv->gpu->nr_rings) - return -EINVAL; + ret = msm_gpu_convert_priority(priv->gpu, prio, &ring_nr, &sched_prio); + if (ret) + return ret; queue = kzalloc(sizeof(*queue), GFP_KERNEL); @@ -86,24 +89,13 @@ int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx, kref_init(&queue->ref); queue->flags = flags; - queue->prio = prio; + queue->ring_nr = ring_nr; - ring = priv->gpu->rb[prio]; + ring = priv->gpu->rb[ring_nr]; sched = &ring->sched; - /* - * TODO we can allow more priorities than we have ringbuffers by - * mapping: - * - * ring = prio / 3; - * ent_prio = DRM_SCHED_PRIORITY_MIN + (prio % 3); - * - * Probably avoid using DRM_SCHED_PRIORITY_KERNEL as that is - * treated specially in places. 
- */ ret = drm_sched_entity_init(&queue->entity, - DRM_SCHED_PRIORITY_NORMAL, - &sched, 1, NULL); + sched_prio, &sched, 1, NULL); if (ret) { kfree(queue); return ret; @@ -134,16 +126,19 @@ int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx, int msm_submitqueue_init(struct drm_device *drm, struct msm_file_private *ctx) { struct msm_drm_private *priv = drm->dev_private; - int default_prio; + int default_prio, max_priority; if (!priv->gpu) return -ENODEV; + max_priority = (priv->gpu->nr_rings * NR_SCHED_PRIORITIES) - 1; + /* - * Select priority 2 as the "default priority" unless nr_rings is less - * than 2 and then pick the lowest priority + * Pick a medium priority level as default. Lower numeric value is + * higher priority, so round-up to pick a priority that is not higher + * than the middle priority level. */ - default_prio = clamp_t(uint32_t, 2, 0, priv->gpu->nr_rings - 1); + default_prio = DIV_ROUND_UP(max_priority, 2); INIT_LIST_HEAD(&ctx->submitqueues); diff --git a/include/uapi/drm/msm_drm.h b/include/uapi/drm/msm_drm.h index f075851021c3..6b8fffc28a50 100644 --- a/include/uapi/drm/msm_drm.h +++ b/include/uapi/drm/msm_drm.h @@ -73,11 +73,19 @@ struct drm_msm_timespec { #define MSM_PARAM_MAX_FREQ 0x04 #define MSM_PARAM_TIMESTAMP 0x05 #define MSM_PARAM_GMEM_BASE 0x06 -#define MSM_PARAM_NR_RINGS 0x07 +#define MSM_PARAM_PRIORITIES 0x07 /* The # of priority levels */ #define MSM_PARAM_PP_PGTABLE 0x08 /* => 1 for per-process pagetables, else 0 */ #define MSM_PARAM_FAULTS 0x09 #define MSM_PARAM_SUSPENDS 0x0a +/* For backwards compat. The original support for preemption was based on + * a single ring per priority level so # of priority levels equals the # + * of rings. With drm/scheduler providing additional levels of priority, + * the number of priorities is greater than the # of rings. The param is + * renamed to better reflect this. + */ +#define MSM_PARAM_NR_RINGS MSM_PARAM_PRIORITIES + struct drm_msm_param { __u32 pipe; /* in, MSM_PIPE_x */ __u32 param; /* in, MSM_PARAM_x */ @@ -304,6 +312,10 @@ struct drm_msm_gem_madvise { #define MSM_SUBMITQUEUE_FLAGS (0) +/* + * The submitqueue priority should be between 0 and MSM_PARAM_PRIORITIES-1, + * a lower numeric value is higher priority. + */ struct drm_msm_submitqueue { __u32 flags; /* in, MSM_SUBMITQUEUE_x */ __u32 prio; /* in, Priority level */ -- cgit v1.2.3-71-gd317 From 25905f602fdb0cfa147017056636768a7aa1ff6f Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 21 Jul 2021 12:25:20 -0700 Subject: dmaengine: idxd: Change license on idxd.h to LGPL This file was given GPL-2.0 license. But LGPL-2.1 makes more sense as it needs to be used by libraries outside of the kernel source tree. Signed-off-by: Tony Luck Signed-off-by: Linus Torvalds --- include/uapi/linux/idxd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index e33997b4d750..edc346a77c91 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* SPDX-License-Identifier: LGPL-2.1 WITH Linux-syscall-note */ /* Copyright(c) 2019 Intel Corporation. All rights rsvd. 
*/ #ifndef _USR_IDXD_H_ #define _USR_IDXD_H_ -- cgit v1.2.3-71-gd317 From 433c38f40f6a81cf3988b9372f2983912737f322 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Tue, 27 Jul 2021 13:52:56 -0700 Subject: arm64: mte: change ASYNC and SYNC TCF settings into bitfields Allow the user program to specify both ASYNC and SYNC TCF modes by repurposing the existing constants as bitfields. This will allow the kernel to select one of the modes on behalf of the user program. With this patch the kernel will always select async mode, but a subsequent patch will make this configurable. Link: https://linux-review.googlesource.com/id/Icc5923c85a8ea284588cc399ae74fd19ec291230 Signed-off-by: Peter Collingbourne Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20210727205300.2554659-3-pcc@google.com Acked-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/processor.h | 3 ++ arch/arm64/kernel/mte.c | 70 +++++++++++++++----------------------- include/uapi/linux/prctl.h | 11 +++--- 3 files changed, 37 insertions(+), 47 deletions(-) (limited to 'include/uapi') diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 54d34276fa91..ee82ebbb5e5a 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -19,6 +19,9 @@ #define MTE_CTRL_GCR_USER_EXCL_SHIFT 0 #define MTE_CTRL_GCR_USER_EXCL_MASK 0xffff +#define MTE_CTRL_TCF_SYNC (1UL << 16) +#define MTE_CTRL_TCF_ASYNC (1UL << 17) + #ifndef __ASSEMBLY__ #include diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index a5269558210c..3b6b68518003 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -193,14 +193,19 @@ void mte_check_tfsr_el1(void) } #endif -static void set_gcr_el1_excl(u64 excl) +static void mte_update_sctlr_user(struct task_struct *task) { - current->thread.mte_ctrl = excl; + unsigned long sctlr = task->thread.sctlr_user; + unsigned long pref = MTE_CTRL_TCF_ASYNC; + unsigned long mte_ctrl = task->thread.mte_ctrl; + unsigned long resolved_mte_tcf = (mte_ctrl & pref) ? pref : mte_ctrl; - /* - * SYS_GCR_EL1 will be set to current->thread.gcr_user_excl value - * by mte_set_user_gcr() in kernel_exit, - */ + sctlr &= ~SCTLR_EL1_TCF0_MASK; + if (resolved_mte_tcf & MTE_CTRL_TCF_ASYNC) + sctlr |= SCTLR_EL1_TCF0_ASYNC; + else if (resolved_mte_tcf & MTE_CTRL_TCF_SYNC) + sctlr |= SCTLR_EL1_TCF0_SYNC; + task->thread.sctlr_user = sctlr; } void mte_thread_init_user(void) @@ -212,15 +217,16 @@ void mte_thread_init_user(void) dsb(ish); write_sysreg_s(0, SYS_TFSRE0_EL1); clear_thread_flag(TIF_MTE_ASYNC_FAULT); - /* disable tag checking */ - set_task_sctlr_el1((current->thread.sctlr_user & ~SCTLR_EL1_TCF0_MASK) | - SCTLR_EL1_TCF0_NONE); - /* reset tag generation mask */ - set_gcr_el1_excl(SYS_GCR_EL1_EXCL_MASK); + /* disable tag checking and reset tag generation mask */ + current->thread.mte_ctrl = MTE_CTRL_GCR_USER_EXCL_MASK; + mte_update_sctlr_user(current); + set_task_sctlr_el1(current->thread.sctlr_user); } void mte_thread_switch(struct task_struct *next) { + mte_update_sctlr_user(next); + /* * Check if an async tag exception occurred at EL1. 
* @@ -259,33 +265,21 @@ void mte_suspend_exit(void) long set_mte_ctrl(struct task_struct *task, unsigned long arg) { - u64 sctlr = task->thread.sctlr_user & ~SCTLR_EL1_TCF0_MASK; u64 mte_ctrl = (~((arg & PR_MTE_TAG_MASK) >> PR_MTE_TAG_SHIFT) & SYS_GCR_EL1_EXCL_MASK) << MTE_CTRL_GCR_USER_EXCL_SHIFT; if (!system_supports_mte()) return 0; - switch (arg & PR_MTE_TCF_MASK) { - case PR_MTE_TCF_NONE: - sctlr |= SCTLR_EL1_TCF0_NONE; - break; - case PR_MTE_TCF_SYNC: - sctlr |= SCTLR_EL1_TCF0_SYNC; - break; - case PR_MTE_TCF_ASYNC: - sctlr |= SCTLR_EL1_TCF0_ASYNC; - break; - default: - return -EINVAL; - } + if (arg & PR_MTE_TCF_ASYNC) + mte_ctrl |= MTE_CTRL_TCF_ASYNC; + if (arg & PR_MTE_TCF_SYNC) + mte_ctrl |= MTE_CTRL_TCF_SYNC; - if (task != current) { - task->thread.sctlr_user = sctlr; - task->thread.mte_ctrl = mte_ctrl; - } else { - set_task_sctlr_el1(sctlr); - set_gcr_el1_excl(mte_ctrl); + task->thread.mte_ctrl = mte_ctrl; + if (task == current) { + mte_update_sctlr_user(task); + set_task_sctlr_el1(task->thread.sctlr_user); } return 0; @@ -302,18 +296,10 @@ long get_mte_ctrl(struct task_struct *task) return 0; ret = incl << PR_MTE_TAG_SHIFT; - - switch (task->thread.sctlr_user & SCTLR_EL1_TCF0_MASK) { - case SCTLR_EL1_TCF0_NONE: - ret |= PR_MTE_TCF_NONE; - break; - case SCTLR_EL1_TCF0_SYNC: - ret |= PR_MTE_TCF_SYNC; - break; - case SCTLR_EL1_TCF0_ASYNC: + if (mte_ctrl & MTE_CTRL_TCF_ASYNC) ret |= PR_MTE_TCF_ASYNC; - break; - } + if (mte_ctrl & MTE_CTRL_TCF_SYNC) + ret |= PR_MTE_TCF_SYNC; return ret; } diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 967d9c55323d..89de78a14b9b 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -234,14 +234,15 @@ struct prctl_mm_map { #define PR_GET_TAGGED_ADDR_CTRL 56 # define PR_TAGGED_ADDR_ENABLE (1UL << 0) /* MTE tag check fault modes */ -# define PR_MTE_TCF_SHIFT 1 -# define PR_MTE_TCF_NONE (0UL << PR_MTE_TCF_SHIFT) -# define PR_MTE_TCF_SYNC (1UL << PR_MTE_TCF_SHIFT) -# define PR_MTE_TCF_ASYNC (2UL << PR_MTE_TCF_SHIFT) -# define PR_MTE_TCF_MASK (3UL << PR_MTE_TCF_SHIFT) +# define PR_MTE_TCF_NONE 0 +# define PR_MTE_TCF_SYNC (1UL << 1) +# define PR_MTE_TCF_ASYNC (1UL << 2) +# define PR_MTE_TCF_MASK (PR_MTE_TCF_SYNC | PR_MTE_TCF_ASYNC) /* MTE tag inclusion mask */ # define PR_MTE_TAG_SHIFT 3 # define PR_MTE_TAG_MASK (0xffffUL << PR_MTE_TAG_SHIFT) +/* Unused; kept only for source compatibility */ +# define PR_MTE_TCF_SHIFT 1 /* Control reclaim behavior when allocating memory */ #define PR_SET_IO_FLUSHER 57 -- cgit v1.2.3-71-gd317 From 5d8dbb7fb82b8661c16d496644b931c0e2e3a12e Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Wed, 28 Jul 2021 19:38:18 +0300 Subject: net: xfrm: fix shift-out-of-bounce We need to check up->dirmask to avoid shift-out-of-bounce bug, since up->dirmask comes from userspace. 
Also, added XFRM_USERPOLICY_DIRMASK_MAX constant to uapi to inform user-space that up->dirmask has maximum possible value Fixes: 2d151d39073a ("xfrm: Add possibility to set the default to block if we have no policy") Reported-and-tested-by: syzbot+9cd5837a045bbee5b810@syzkaller.appspotmail.com Signed-off-by: Pavel Skripkin Signed-off-by: Steffen Klassert --- include/uapi/linux/xfrm.h | 1 + net/xfrm/xfrm_user.c | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 6e8095106192..b96c1ea7166d 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -514,6 +514,7 @@ struct xfrm_user_offload { #define XFRM_OFFLOAD_INBOUND 2 struct xfrm_userpolicy_default { +#define XFRM_USERPOLICY_DIRMASK_MAX (sizeof(__u8) * 8) __u8 dirmask; __u8 action; }; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 4eafd1130c3e..127c99f71c99 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1966,9 +1966,14 @@ static int xfrm_set_default(struct sk_buff *skb, struct nlmsghdr *nlh, { struct net *net = sock_net(skb->sk); struct xfrm_userpolicy_default *up = nlmsg_data(nlh); - u8 dirmask = (1 << up->dirmask) & XFRM_POL_DEFAULT_MASK; + u8 dirmask; u8 old_default = net->xfrm.policy_default; + if (up->dirmask >= XFRM_USERPOLICY_DIRMASK_MAX) + return -EINVAL; + + dirmask = (1 << up->dirmask) & XFRM_POL_DEFAULT_MASK; + net->xfrm.policy_default = (old_default & (0xff ^ dirmask)) | (up->action << up->dirmask); -- cgit v1.2.3-71-gd317 From bc49d8169aa72295104f1558830c568efb946315 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Thu, 29 Jul 2021 10:20:39 +0800 Subject: mctp: Add MCTP base Add basic Kconfig, an initial (empty) af_mctp source object, and {AF,PF}_MCTP definitions, and the required definitions for a new protocol type. Signed-off-by: Jeremy Kerr Signed-off-by: David S. Miller --- MAINTAINERS | 7 +++++++ include/linux/socket.h | 6 +++++- include/uapi/linux/mctp.h | 15 +++++++++++++++ net/Kconfig | 1 + net/Makefile | 1 + net/core/sock.c | 1 + net/mctp/Kconfig | 13 +++++++++++++ net/mctp/Makefile | 3 +++ net/mctp/af_mctp.c | 13 +++++++++++++ net/socket.c | 1 + security/selinux/hooks.c | 4 +++- security/selinux/include/classmap.h | 4 +++- 12 files changed, 66 insertions(+), 3 deletions(-) create mode 100644 include/uapi/linux/mctp.h create mode 100644 net/mctp/Kconfig create mode 100644 net/mctp/Makefile create mode 100644 net/mctp/af_mctp.c (limited to 'include/uapi') diff --git a/MAINTAINERS b/MAINTAINERS index 4c32a9c532b7..22a1ff9afd9d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11032,6 +11032,13 @@ F: drivers/mailbox/arm_mhuv2.c F: include/linux/mailbox/arm_mhuv2_message.h F: Documentation/devicetree/bindings/mailbox/arm,mhuv2.yaml +MANAGEMENT COMPONENT TRANSPORT PROTOCOL (MCTP) +M: Jeremy Kerr +M: Matt Johnston +L: netdev@vger.kernel.org +S: Maintained +F: net/mctp/ + MAN-PAGES: MANUAL PAGES FOR LINUX -- Sections 2, 3, 4, 5, and 7 M: Michael Kerrisk L: linux-man@vger.kernel.org diff --git a/include/linux/socket.h b/include/linux/socket.h index 0d8e3dcb7f88..fd9ce51582d8 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -223,8 +223,11 @@ struct ucred { * reuses AF_INET address family */ #define AF_XDP 44 /* XDP sockets */ +#define AF_MCTP 45 /* Management component + * transport protocol + */ -#define AF_MAX 45 /* For now.. */ +#define AF_MAX 46 /* For now.. */ /* Protocol families, same as address families. 
*/ #define PF_UNSPEC AF_UNSPEC @@ -274,6 +277,7 @@ struct ucred { #define PF_QIPCRTR AF_QIPCRTR #define PF_SMC AF_SMC #define PF_XDP AF_XDP +#define PF_MCTP AF_MCTP #define PF_MAX AF_MAX /* Maximum queue length specifiable by listen. */ diff --git a/include/uapi/linux/mctp.h b/include/uapi/linux/mctp.h new file mode 100644 index 000000000000..2640a589c14c --- /dev/null +++ b/include/uapi/linux/mctp.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Management Component Transport Protocol (MCTP) + * + * Copyright (c) 2021 Code Construct + * Copyright (c) 2021 Google + */ + +#ifndef __UAPI_MCTP_H +#define __UAPI_MCTP_H + +struct sockaddr_mctp { +}; + +#endif /* __UAPI_MCTP_H */ diff --git a/net/Kconfig b/net/Kconfig index c7392c449b25..fb13460c6dab 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -363,6 +363,7 @@ source "net/bluetooth/Kconfig" source "net/rxrpc/Kconfig" source "net/kcm/Kconfig" source "net/strparser/Kconfig" +source "net/mctp/Kconfig" config FIB_RULES bool diff --git a/net/Makefile b/net/Makefile index 9ca9572188fe..fbfeb8a0bb37 100644 --- a/net/Makefile +++ b/net/Makefile @@ -78,3 +78,4 @@ obj-$(CONFIG_QRTR) += qrtr/ obj-$(CONFIG_NET_NCSI) += ncsi/ obj-$(CONFIG_XDP_SOCKETS) += xdp/ obj-$(CONFIG_MPTCP) += mptcp/ +obj-$(CONFIG_MCTP) += mctp/ diff --git a/net/core/sock.c b/net/core/sock.c index a3eea6e0b30a..9671c32e6ef5 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -226,6 +226,7 @@ static struct lock_class_key af_family_kern_slock_keys[AF_MAX]; x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \ + x "AF_MCTP" , \ x "AF_MAX" static const char *const af_family_key_strings[AF_MAX+1] = { diff --git a/net/mctp/Kconfig b/net/mctp/Kconfig new file mode 100644 index 000000000000..2cdf3d0a28c9 --- /dev/null +++ b/net/mctp/Kconfig @@ -0,0 +1,13 @@ + +menuconfig MCTP + depends on NET + tristate "MCTP core protocol support" + help + Management Component Transport Protocol (MCTP) is an in-system + protocol for communicating between management controllers and + their managed devices (peripherals, host processors, etc.). The + protocol is defined by DMTF specification DSP0236. + + This option enables core MCTP support. For communicating with other + devices, you'll want to enable a driver for a specific hardware + channel. 
diff --git a/net/mctp/Makefile b/net/mctp/Makefile new file mode 100644 index 000000000000..7c056b1b7939 --- /dev/null +++ b/net/mctp/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_MCTP) += mctp.o +mctp-objs := af_mctp.o diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c new file mode 100644 index 000000000000..8f9c77e97357 --- /dev/null +++ b/net/mctp/af_mctp.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Management Component Transport Protocol (MCTP) + * + * Copyright (c) 2021 Code Construct + * Copyright (c) 2021 Google + */ + +#include + +MODULE_DESCRIPTION("MCTP core"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Jeremy Kerr "); diff --git a/net/socket.c b/net/socket.c index 42665bd99ea4..3c10504e46d9 100644 --- a/net/socket.c +++ b/net/socket.c @@ -212,6 +212,7 @@ static const char * const pf_family_names[] = { [PF_QIPCRTR] = "PF_QIPCRTR", [PF_SMC] = "PF_SMC", [PF_XDP] = "PF_XDP", + [PF_MCTP] = "PF_MCTP", }; /* diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index b0032c42333e..2143f590e3d6 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1330,7 +1330,9 @@ static inline u16 socket_type_to_security_class(int family, int type, int protoc return SECCLASS_SMC_SOCKET; case PF_XDP: return SECCLASS_XDP_SOCKET; -#if PF_MAX > 45 + case PF_MCTP: + return SECCLASS_MCTP_SOCKET; +#if PF_MAX > 46 #error New address family defined, please update this function. #endif } diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index 62d19bccf3de..084757ff4390 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -246,6 +246,8 @@ struct security_class_mapping secclass_map[] = { NULL } }, { "xdp_socket", { COMMON_SOCK_PERMS, NULL } }, + { "mctp_socket", + { COMMON_SOCK_PERMS, NULL } }, { "perf_event", { "open", "cpu", "kernel", "tracepoint", "read", "write", NULL } }, { "lockdown", @@ -255,6 +257,6 @@ struct security_class_mapping secclass_map[] = { { NULL } }; -#if PF_MAX > 45 +#if PF_MAX > 46 #error New address family defined, please update secclass_map. #endif -- cgit v1.2.3-71-gd317 From 60fc63981693f807baa0e404104dedea0e8b4e61 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Thu, 29 Jul 2021 10:20:42 +0800 Subject: mctp: Add sockaddr_mctp to uapi This change introduces the user-visible MCTP header, containing the protocol-specific addressing definitions. Signed-off-by: Jeremy Kerr Signed-off-by: David S. Miller --- include/uapi/linux/mctp.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/mctp.h b/include/uapi/linux/mctp.h index 2640a589c14c..52b54d13f385 100644 --- a/include/uapi/linux/mctp.h +++ b/include/uapi/linux/mctp.h @@ -9,7 +9,28 @@ #ifndef __UAPI_MCTP_H #define __UAPI_MCTP_H +#include + +typedef __u8 mctp_eid_t; + +struct mctp_addr { + mctp_eid_t s_addr; +}; + struct sockaddr_mctp { + unsigned short int smctp_family; + int smctp_network; + struct mctp_addr smctp_addr; + __u8 smctp_type; + __u8 smctp_tag; }; +#define MCTP_NET_ANY 0x0 + +#define MCTP_ADDR_NULL 0x00 +#define MCTP_ADDR_ANY 0xff + +#define MCTP_TAG_MASK 0x07 +#define MCTP_TAG_OWNER 0x08 + #endif /* __UAPI_MCTP_H */ -- cgit v1.2.3-71-gd317 From 4b2e69305cbbc7c32ecbd946110b505c4ff6071a Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Thu, 29 Jul 2021 10:20:43 +0800 Subject: mctp: Add initial driver infrastructure Add an empty drivers/net/mctp/, for future interface drivers. 
Signed-off-by: Jeremy Kerr Signed-off-by: David S. Miller --- MAINTAINERS | 1 + drivers/net/Kconfig | 2 ++ drivers/net/Makefile | 1 + drivers/net/mctp/Kconfig | 8 ++++++++ drivers/net/mctp/Makefile | 0 include/uapi/linux/if_arp.h | 1 + 6 files changed, 13 insertions(+) create mode 100644 drivers/net/mctp/Kconfig create mode 100644 drivers/net/mctp/Makefile (limited to 'include/uapi') diff --git a/MAINTAINERS b/MAINTAINERS index 770b986d10f0..e95eb3b00cd2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11037,6 +11037,7 @@ M: Jeremy Kerr M: Matt Johnston L: netdev@vger.kernel.org S: Maintained +F: drivers/net/mctp/ F: include/net/mctp.h F: net/mctp/ diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 6977f8248df7..56213a8a1ec5 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -483,6 +483,8 @@ config NET_SB1000 source "drivers/net/phy/Kconfig" +source "drivers/net/mctp/Kconfig" + source "drivers/net/mdio/Kconfig" source "drivers/net/pcs/Kconfig" diff --git a/drivers/net/Makefile b/drivers/net/Makefile index 7ffd2d03efaf..a48a664605a3 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -69,6 +69,7 @@ obj-$(CONFIG_WAN) += wan/ obj-$(CONFIG_WLAN) += wireless/ obj-$(CONFIG_IEEE802154) += ieee802154/ obj-$(CONFIG_WWAN) += wwan/ +obj-$(CONFIG_MCTP) += mctp/ obj-$(CONFIG_VMXNET3) += vmxnet3/ obj-$(CONFIG_XEN_NETDEV_FRONTEND) += xen-netfront.o diff --git a/drivers/net/mctp/Kconfig b/drivers/net/mctp/Kconfig new file mode 100644 index 000000000000..d8f966cedc89 --- /dev/null +++ b/drivers/net/mctp/Kconfig @@ -0,0 +1,8 @@ + +if MCTP + +menu "MCTP Device Drivers" + +endmenu + +endif diff --git a/drivers/net/mctp/Makefile b/drivers/net/mctp/Makefile new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/include/uapi/linux/if_arp.h b/include/uapi/linux/if_arp.h index c3cc5a9e5eaf..4783af9fe520 100644 --- a/include/uapi/linux/if_arp.h +++ b/include/uapi/linux/if_arp.h @@ -54,6 +54,7 @@ #define ARPHRD_X25 271 /* CCITT X.25 */ #define ARPHRD_HWX25 272 /* Boards with X.25 in firmware */ #define ARPHRD_CAN 280 /* Controller Area Network */ +#define ARPHRD_MCTP 290 #define ARPHRD_PPP 512 #define ARPHRD_CISCO 513 /* Cisco HDLC */ #define ARPHRD_HDLC ARPHRD_CISCO -- cgit v1.2.3-71-gd317 From 583be982d93479ea3d85091b0fd0b01201ede87d Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Thu, 29 Jul 2021 10:20:44 +0800 Subject: mctp: Add device handling and netlink interface This change adds the infrastructure for managing MCTP netdevices; we add a pointer to the AF_MCTP-specific data to struct netdevice, and hook up the rtnetlink operations for adding and removing addresses. Includes changes from Matt Johnston . Signed-off-by: Jeremy Kerr Signed-off-by: David S. 
Miller --- MAINTAINERS | 1 + include/linux/netdevice.h | 4 + include/net/mctp.h | 14 ++ include/net/mctpdevice.h | 35 ++++ include/uapi/linux/if_ether.h | 3 + include/uapi/linux/if_link.h | 10 + include/uapi/linux/mctp.h | 1 + net/mctp/Makefile | 2 +- net/mctp/af_mctp.c | 8 + net/mctp/device.c | 414 ++++++++++++++++++++++++++++++++++++++++++ 10 files changed, 491 insertions(+), 1 deletion(-) create mode 100644 include/net/mctpdevice.h create mode 100644 net/mctp/device.c (limited to 'include/uapi') diff --git a/MAINTAINERS b/MAINTAINERS index e95eb3b00cd2..c2943c227ee6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11039,6 +11039,7 @@ L: netdev@vger.kernel.org S: Maintained F: drivers/net/mctp/ F: include/net/mctp.h +F: include/net/mctpdevice.h F: net/mctp/ MAN-PAGES: MANUAL PAGES FOR LINUX -- Sections 2, 3, 4, 5, and 7 diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 226bbee06730..d63a94ecbf3b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1823,6 +1823,7 @@ enum netdev_ml_priv_type { * @ieee802154_ptr: IEEE 802.15.4 low-rate Wireless Personal Area Network * device struct * @mpls_ptr: mpls_dev struct pointer + * @mctp_ptr: MCTP specific data * * @dev_addr: Hw address (before bcast, * because most packets are unicast) @@ -2110,6 +2111,9 @@ struct net_device { #if IS_ENABLED(CONFIG_MPLS_ROUTING) struct mpls_dev __rcu *mpls_ptr; #endif +#if IS_ENABLED(CONFIG_MCTP) + struct mctp_dev __rcu *mctp_ptr; +#endif /* * Cache lines mostly used on receive path (including eth_type_trans()) diff --git a/include/net/mctp.h b/include/net/mctp.h index 4c01e083be45..61452e03aa85 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -10,6 +10,7 @@ #define __NET_MCTP_H #include +#include /* MCTP packet definitions */ struct mctp_hdr { @@ -32,4 +33,17 @@ struct mctp_hdr { #define MCTP_HDR_TAG_SHIFT 0 #define MCTP_HDR_TAG_MASK GENMASK(2, 0) +static inline bool mctp_address_ok(mctp_eid_t eid) +{ + return eid >= 8 && eid < 255; +} + +static inline struct mctp_hdr *mctp_hdr(struct sk_buff *skb) +{ + return (struct mctp_hdr *)skb_network_header(skb); +} + +void mctp_device_init(void); +void mctp_device_exit(void); + #endif /* __NET_MCTP_H */ diff --git a/include/net/mctpdevice.h b/include/net/mctpdevice.h new file mode 100644 index 000000000000..71a11012fac7 --- /dev/null +++ b/include/net/mctpdevice.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Management Component Transport Protocol (MCTP) - device + * definitions. + * + * Copyright (c) 2021 Code Construct + * Copyright (c) 2021 Google + */ + +#ifndef __NET_MCTPDEVICE_H +#define __NET_MCTPDEVICE_H + +#include +#include +#include + +struct mctp_dev { + struct net_device *dev; + + unsigned int net; + + /* Only modified under RTNL. Reads have addrs_lock held */ + u8 *addrs; + size_t num_addrs; + spinlock_t addrs_lock; + + struct rcu_head rcu; +}; + +#define MCTP_INITIAL_DEFAULT_NET 1 + +struct mctp_dev *mctp_dev_get_rtnl(const struct net_device *dev); +struct mctp_dev *__mctp_dev_get(const struct net_device *dev); + +#endif /* __NET_MCTPDEVICE_H */ diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index a0b637911d3c..5f589c7a8382 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -151,6 +151,9 @@ #define ETH_P_MAP 0x00F9 /* Qualcomm multiplexing and * aggregation protocol */ +#define ETH_P_MCTP 0x00FA /* Management component transport + * protocol packets + */ /* * This is an Ethernet frame header. 
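As an illustration only (not part of the patch), the sketch below ties the sockaddr_mctp layout and constants added to include/uapi/linux/mctp.h earlier in the series to the addressing rules above: it fills in a hypothetical request address for EID 9 and lets the stack pick the network. The message type value is an arbitrary example, the AF_MCTP fallback is an assumption for older libc headers, and actually sending to this address requires the routing and socket patches later in the series.

/* Hypothetical sockaddr_mctp setup for a request to EID 9. */
#include <string.h>
#include <sys/socket.h>
#include <linux/mctp.h>

#ifndef AF_MCTP
#define AF_MCTP 45	/* matches the value added to include/linux/socket.h */
#endif

static void mctp_fill_request_addr(struct sockaddr_mctp *addr)
{
	memset(addr, 0, sizeof(*addr));
	addr->smctp_family = AF_MCTP;
	addr->smctp_network = MCTP_NET_ANY;	/* no specific network requested */
	addr->smctp_addr.s_addr = 9;		/* destination endpoint ID, valid per mctp_address_ok() */
	addr->smctp_type = 1;			/* example message type only */
	addr->smctp_tag = MCTP_TAG_OWNER;	/* the sender owns the tag for a request */
}
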
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 4882e81514b6..49b22afab78f 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1260,4 +1260,14 @@ struct ifla_rmnet_flags { __u32 mask; }; +/* MCTP section */ + +enum { + IFLA_MCTP_UNSPEC, + IFLA_MCTP_NET, + __IFLA_MCTP_MAX, +}; + +#define IFLA_MCTP_MAX (__IFLA_MCTP_MAX - 1) + #endif /* _UAPI_LINUX_IF_LINK_H */ diff --git a/include/uapi/linux/mctp.h b/include/uapi/linux/mctp.h index 52b54d13f385..a9d8edb3402b 100644 --- a/include/uapi/linux/mctp.h +++ b/include/uapi/linux/mctp.h @@ -26,6 +26,7 @@ struct sockaddr_mctp { }; #define MCTP_NET_ANY 0x0 +#define MCTP_NET_DEFAULT 0x0 #define MCTP_ADDR_NULL 0x00 #define MCTP_ADDR_ANY 0xff diff --git a/net/mctp/Makefile b/net/mctp/Makefile index 7c056b1b7939..2ea98c27b262 100644 --- a/net/mctp/Makefile +++ b/net/mctp/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_MCTP) += mctp.o -mctp-objs := af_mctp.o +mctp-objs := af_mctp.o device.o diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index b3aeca6486e3..401b4fa141a5 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -6,13 +6,18 @@ * Copyright (c) 2021 Google */ +#include #include #include #include #include +#include +#include #include +/* socket implementation */ + struct mctp_sock { struct sock sk; }; @@ -152,6 +157,8 @@ static __init int mctp_init(void) if (rc) goto err_unreg_sock; + mctp_device_init(); + return 0; err_unreg_sock: @@ -162,6 +169,7 @@ err_unreg_sock: static __exit void mctp_exit(void) { + mctp_device_exit(); proto_unregister(&mctp_proto); sock_unregister(PF_MCTP); } diff --git a/net/mctp/device.c b/net/mctp/device.c new file mode 100644 index 000000000000..877abe5312cd --- /dev/null +++ b/net/mctp/device.c @@ -0,0 +1,414 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Management Component Transport Protocol (MCTP) - device implementation. 
+ * + * Copyright (c) 2021 Code Construct + * Copyright (c) 2021 Google + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +struct mctp_dump_cb { + int h; + int idx; + size_t a_idx; +}; + +/* unlocked: caller must hold rcu_read_lock */ +struct mctp_dev *__mctp_dev_get(const struct net_device *dev) +{ + return rcu_dereference(dev->mctp_ptr); +} + +struct mctp_dev *mctp_dev_get_rtnl(const struct net_device *dev) +{ + return rtnl_dereference(dev->mctp_ptr); +} + +static void mctp_dev_destroy(struct mctp_dev *mdev) +{ + struct net_device *dev = mdev->dev; + + dev_put(dev); + kfree_rcu(mdev, rcu); +} + +static int mctp_fill_addrinfo(struct sk_buff *skb, struct netlink_callback *cb, + struct mctp_dev *mdev, mctp_eid_t eid) +{ + struct ifaddrmsg *hdr; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + RTM_NEWADDR, sizeof(*hdr), NLM_F_MULTI); + if (!nlh) + return -EMSGSIZE; + + hdr = nlmsg_data(nlh); + hdr->ifa_family = AF_MCTP; + hdr->ifa_prefixlen = 0; + hdr->ifa_flags = 0; + hdr->ifa_scope = 0; + hdr->ifa_index = mdev->dev->ifindex; + + if (nla_put_u8(skb, IFA_LOCAL, eid)) + goto cancel; + + if (nla_put_u8(skb, IFA_ADDRESS, eid)) + goto cancel; + + nlmsg_end(skb, nlh); + + return 0; + +cancel: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int mctp_dump_dev_addrinfo(struct mctp_dev *mdev, struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct mctp_dump_cb *mcb = (void *)cb->ctx; + int rc = 0; + + for (; mcb->a_idx < mdev->num_addrs; mcb->a_idx++) { + rc = mctp_fill_addrinfo(skb, cb, mdev, mdev->addrs[mcb->a_idx]); + if (rc < 0) + break; + } + + return rc; +} + +static int mctp_dump_addrinfo(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct mctp_dump_cb *mcb = (void *)cb->ctx; + struct net *net = sock_net(skb->sk); + struct hlist_head *head; + struct net_device *dev; + struct ifaddrmsg *hdr; + struct mctp_dev *mdev; + int ifindex; + int idx, rc; + + hdr = nlmsg_data(cb->nlh); + // filter by ifindex if requested + ifindex = hdr->ifa_index; + + rcu_read_lock(); + for (; mcb->h < NETDEV_HASHENTRIES; mcb->h++, mcb->idx = 0) { + idx = 0; + head = &net->dev_index_head[mcb->h]; + hlist_for_each_entry_rcu(dev, head, index_hlist) { + if (idx >= mcb->idx && + (ifindex == 0 || ifindex == dev->ifindex)) { + mdev = __mctp_dev_get(dev); + if (mdev) { + rc = mctp_dump_dev_addrinfo(mdev, + skb, cb); + // Error indicates full buffer, this + // callback will get retried. 
+ if (rc < 0) + goto out; + } + } + idx++; + // reset for next iteration + mcb->a_idx = 0; + } + } +out: + rcu_read_unlock(); + mcb->idx = idx; + + return skb->len; +} + +static const struct nla_policy ifa_mctp_policy[IFA_MAX + 1] = { + [IFA_ADDRESS] = { .type = NLA_U8 }, + [IFA_LOCAL] = { .type = NLA_U8 }, +}; + +static int mctp_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tb[IFA_MAX + 1]; + struct net_device *dev; + struct mctp_addr *addr; + struct mctp_dev *mdev; + struct ifaddrmsg *ifm; + unsigned long flags; + u8 *tmp_addrs; + int rc; + + rc = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_mctp_policy, + extack); + if (rc < 0) + return rc; + + ifm = nlmsg_data(nlh); + + if (tb[IFA_LOCAL]) + addr = nla_data(tb[IFA_LOCAL]); + else if (tb[IFA_ADDRESS]) + addr = nla_data(tb[IFA_ADDRESS]); + else + return -EINVAL; + + /* find device */ + dev = __dev_get_by_index(net, ifm->ifa_index); + if (!dev) + return -ENODEV; + + mdev = mctp_dev_get_rtnl(dev); + if (!mdev) + return -ENODEV; + + if (!mctp_address_ok(addr->s_addr)) + return -EINVAL; + + /* Prevent duplicates. Under RTNL so don't need to lock for reading */ + if (memchr(mdev->addrs, addr->s_addr, mdev->num_addrs)) + return -EEXIST; + + tmp_addrs = kmalloc(mdev->num_addrs + 1, GFP_KERNEL); + if (!tmp_addrs) + return -ENOMEM; + memcpy(tmp_addrs, mdev->addrs, mdev->num_addrs); + tmp_addrs[mdev->num_addrs] = addr->s_addr; + + /* Lock to write */ + spin_lock_irqsave(&mdev->addrs_lock, flags); + mdev->num_addrs++; + swap(mdev->addrs, tmp_addrs); + spin_unlock_irqrestore(&mdev->addrs_lock, flags); + + kfree(tmp_addrs); + + return 0; +} + +static int mctp_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tb[IFA_MAX + 1]; + struct net_device *dev; + struct mctp_addr *addr; + struct mctp_dev *mdev; + struct ifaddrmsg *ifm; + unsigned long flags; + u8 *pos; + int rc; + + rc = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_mctp_policy, + extack); + if (rc < 0) + return rc; + + ifm = nlmsg_data(nlh); + + if (tb[IFA_LOCAL]) + addr = nla_data(tb[IFA_LOCAL]); + else if (tb[IFA_ADDRESS]) + addr = nla_data(tb[IFA_ADDRESS]); + else + return -EINVAL; + + /* find device */ + dev = __dev_get_by_index(net, ifm->ifa_index); + if (!dev) + return -ENODEV; + + mdev = mctp_dev_get_rtnl(dev); + if (!mdev) + return -ENODEV; + + pos = memchr(mdev->addrs, addr->s_addr, mdev->num_addrs); + if (!pos) + return -ENOENT; + + spin_lock_irqsave(&mdev->addrs_lock, flags); + memmove(pos, pos + 1, mdev->num_addrs - 1 - (pos - mdev->addrs)); + mdev->num_addrs--; + spin_unlock_irqrestore(&mdev->addrs_lock, flags); + + return 0; +} + +static struct mctp_dev *mctp_add_dev(struct net_device *dev) +{ + struct mctp_dev *mdev; + + ASSERT_RTNL(); + + mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); + if (!mdev) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&mdev->addrs_lock); + + mdev->net = MCTP_INITIAL_DEFAULT_NET; + + /* associate to net_device */ + rcu_assign_pointer(dev->mctp_ptr, mdev); + dev_hold(dev); + mdev->dev = dev; + + return mdev; +} + +static int mctp_fill_link_af(struct sk_buff *skb, + const struct net_device *dev, u32 ext_filter_mask) +{ + struct mctp_dev *mdev; + + mdev = mctp_dev_get_rtnl(dev); + if (!mdev) + return -ENODATA; + if (nla_put_u32(skb, IFLA_MCTP_NET, mdev->net)) + return -EMSGSIZE; + return 0; +} + +static size_t mctp_get_link_af_size(const struct net_device *dev, 
+ u32 ext_filter_mask) +{ + struct mctp_dev *mdev; + unsigned int ret; + + /* caller holds RCU */ + mdev = __mctp_dev_get(dev); + if (!mdev) + return 0; + ret = nla_total_size(4); /* IFLA_MCTP_NET */ + return ret; +} + +static const struct nla_policy ifla_af_mctp_policy[IFLA_MCTP_MAX + 1] = { + [IFLA_MCTP_NET] = { .type = NLA_U32 }, +}; + +static int mctp_set_link_af(struct net_device *dev, const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_MCTP_MAX + 1]; + struct mctp_dev *mdev; + int rc; + + rc = nla_parse_nested(tb, IFLA_MCTP_MAX, attr, ifla_af_mctp_policy, + NULL); + if (rc) + return rc; + + mdev = mctp_dev_get_rtnl(dev); + if (!mdev) + return 0; + + if (tb[IFLA_MCTP_NET]) + WRITE_ONCE(mdev->net, nla_get_u32(tb[IFLA_MCTP_NET])); + + return 0; +} + +static void mctp_unregister(struct net_device *dev) +{ + struct mctp_dev *mdev; + + mdev = mctp_dev_get_rtnl(dev); + + if (!mdev) + return; + + RCU_INIT_POINTER(mdev->dev->mctp_ptr, NULL); + + kfree(mdev->addrs); + + mctp_dev_destroy(mdev); +} + +static int mctp_register(struct net_device *dev) +{ + struct mctp_dev *mdev; + + /* Already registered? */ + if (rtnl_dereference(dev->mctp_ptr)) + return 0; + + /* only register specific types; MCTP-specific and loopback for now */ + if (dev->type != ARPHRD_MCTP && dev->type != ARPHRD_LOOPBACK) + return 0; + + mdev = mctp_add_dev(dev); + if (IS_ERR(mdev)) + return PTR_ERR(mdev); + + return 0; +} + +static int mctp_dev_notify(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + int rc; + + switch (event) { + case NETDEV_REGISTER: + rc = mctp_register(dev); + if (rc) + return notifier_from_errno(rc); + break; + case NETDEV_UNREGISTER: + mctp_unregister(dev); + break; + } + + return NOTIFY_OK; +} + +static struct rtnl_af_ops mctp_af_ops = { + .family = AF_MCTP, + .fill_link_af = mctp_fill_link_af, + .get_link_af_size = mctp_get_link_af_size, + .set_link_af = mctp_set_link_af, +}; + +static struct notifier_block mctp_dev_nb = { + .notifier_call = mctp_dev_notify, + .priority = ADDRCONF_NOTIFY_PRIORITY, +}; + +void __init mctp_device_init(void) +{ + register_netdevice_notifier(&mctp_dev_nb); + + rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_GETADDR, + NULL, mctp_dump_addrinfo, 0); + rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_NEWADDR, + mctp_rtm_newaddr, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_DELADDR, + mctp_rtm_deladdr, NULL, 0); + rtnl_af_register(&mctp_af_ops); +} + +void __exit mctp_device_exit(void) +{ + rtnl_af_unregister(&mctp_af_ops); + rtnl_unregister(PF_MCTP, RTM_DELADDR); + rtnl_unregister(PF_MCTP, RTM_NEWADDR); + rtnl_unregister(PF_MCTP, RTM_GETADDR); + + unregister_netdevice_notifier(&mctp_dev_nb); +} -- cgit v1.2.3-71-gd317 From 03f2bbc4ee57ca53b2fa1d9caabc5006e0b8f375 Mon Sep 17 00:00:00 2001 From: Matt Johnston Date: Thu, 29 Jul 2021 10:20:52 +0800 Subject: mctp: Allow per-netns default networks Currently we have a compile-time default network (MCTP_INITIAL_DEFAULT_NET). This change introduces a default_net field on the net namespace, allowing future configuration for new interfaces. Signed-off-by: Matt Johnston Signed-off-by: David S. 
Miller --- include/net/mctp.h | 2 ++ include/net/netns/mctp.h | 3 +++ include/uapi/linux/mctp.h | 1 - net/mctp/af_mctp.c | 3 +++ net/mctp/device.c | 2 +- net/mctp/route.c | 14 ++++++++++++++ 6 files changed, 23 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/net/mctp.h b/include/net/mctp.h index 0a460ba185b8..54bbe042c973 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -37,6 +37,8 @@ struct mctp_hdr { #define MCTP_HEADER_MAXLEN 4 +#define MCTP_INITIAL_DEFAULT_NET 1 + static inline bool mctp_address_ok(mctp_eid_t eid) { return eid >= 8 && eid < 255; diff --git a/include/net/netns/mctp.h b/include/net/netns/mctp.h index 14ae6d37e52a..acedef12a35e 100644 --- a/include/net/netns/mctp.h +++ b/include/net/netns/mctp.h @@ -25,6 +25,9 @@ struct netns_mctp { spinlock_t keys_lock; struct hlist_head keys; + /* MCTP network */ + unsigned int default_net; + /* neighbour table */ struct mutex neigh_lock; struct list_head neighbours; diff --git a/include/uapi/linux/mctp.h b/include/uapi/linux/mctp.h index a9d8edb3402b..52b54d13f385 100644 --- a/include/uapi/linux/mctp.h +++ b/include/uapi/linux/mctp.h @@ -26,7 +26,6 @@ struct sockaddr_mctp { }; #define MCTP_NET_ANY 0x0 -#define MCTP_NET_DEFAULT 0x0 #define MCTP_ADDR_NULL 0x00 #define MCTP_ADDR_ANY 0xff diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index 9ca836df19d0..84f722d31fd7 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -94,6 +94,9 @@ static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) if (!capable(CAP_NET_RAW)) return -EACCES; + if (addr->smctp_network == MCTP_NET_ANY) + addr->smctp_network = mctp_default_net(sock_net(sk)); + rt = mctp_route_lookup(sock_net(sk), addr->smctp_network, addr->smctp_addr.s_addr); if (!rt) diff --git a/net/mctp/device.c b/net/mctp/device.c index aa049590acda..b9f38e765f61 100644 --- a/net/mctp/device.c +++ b/net/mctp/device.c @@ -267,7 +267,7 @@ static struct mctp_dev *mctp_add_dev(struct net_device *dev) spin_lock_init(&mdev->addrs_lock); - mdev->net = MCTP_INITIAL_DEFAULT_NET; + mdev->net = mctp_default_net(dev_net(dev)); /* associate to net_device */ rcu_assign_pointer(dev->mctp_ptr, mdev); diff --git a/net/mctp/route.c b/net/mctp/route.c index 38f0a7278520..b3101375c8e7 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -433,6 +433,19 @@ static struct mctp_route *mctp_route_alloc(void) return rt; } +unsigned int mctp_default_net(struct net *net) +{ + return READ_ONCE(net->mctp.default_net); +} + +int mctp_default_net_set(struct net *net, unsigned int index) +{ + if (index == 0) + return -EINVAL; + WRITE_ONCE(net->mctp.default_net, index); + return 0; +} + /* tag management */ static void mctp_reserve_tag(struct net *net, struct mctp_sk_key *key, struct mctp_sock *msk) @@ -1045,6 +1058,7 @@ static int __net_init mctp_routes_net_init(struct net *net) mutex_init(&ns->bind_lock); INIT_HLIST_HEAD(&ns->keys); spin_lock_init(&ns->keys_lock); + WARN_ON(mctp_default_net_set(net, MCTP_INITIAL_DEFAULT_NET)); return 0; } -- cgit v1.2.3-71-gd317 From 695176bfe5dec2051f950bdac0ae0b21e29e6de3 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 29 Jul 2021 16:12:14 -0700 Subject: net_sched: refactor TC action init API TC action ->init() API has 10 parameters, it becomes harder to read. Some of them are just boolean and can be replaced by flags. Similarly for the internal API tcf_action_init() and tcf_exts_validate(). 
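Before the conversion details below, a small reading aid that is not part of the patch: the sketch shows how an action ->init() implementation recovers the old boolean parameters from the single flags word, mirroring the pattern repeated across net/sched/act_*.c in the hunks that follow. The TCA_ACT_FLAGS_* values are copied from the include/net/act_api.h hunk below.

/* Sketch: decoding the folded kernel-internal action flags. */
#include <stdbool.h>

#define TCA_ACT_FLAGS_USER_BITS	16
#define TCA_ACT_FLAGS_POLICE	(1U << TCA_ACT_FLAGS_USER_BITS)
#define TCA_ACT_FLAGS_BIND	(1U << (TCA_ACT_FLAGS_USER_BITS + 1))
#define TCA_ACT_FLAGS_REPLACE	(1U << (TCA_ACT_FLAGS_USER_BITS + 2))
#define TCA_ACT_FLAGS_NO_RTNL	(1U << (TCA_ACT_FLAGS_USER_BITS + 3))

static void decode_act_flags(unsigned int flags)
{
	bool bind      = flags & TCA_ACT_FLAGS_BIND;		/* replaces the old 'bind' argument */
	bool replace   = flags & TCA_ACT_FLAGS_REPLACE;		/* replaces the old 'ovr' argument */
	bool rtnl_held = !(flags & TCA_ACT_FLAGS_NO_RTNL);	/* note the inverted meaning */
	bool police    = flags & TCA_ACT_FLAGS_POLICE;		/* replaces the old name == "police" special case */

	(void)bind; (void)replace; (void)rtnl_held; (void)police;
}
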
This patch converts them to flags and fold them into the upper 16 bits of "flags", whose lower 16 bits are still reserved for user-space. More specifically, the following kernel flags are introduced: TCA_ACT_FLAGS_POLICE replace 'name' in a few contexts, to distinguish whether it is compatible with policer. TCA_ACT_FLAGS_BIND replaces 'bind', to indicate whether this action is bound to a filter. TCA_ACT_FLAGS_REPLACE replaces 'ovr' in most contexts, means we are replacing an existing action. TCA_ACT_FLAGS_NO_RTNL replaces 'rtnl_held' but has the opposite meaning, because we still hold RTNL in most cases. The only user-space flag TCA_ACT_FLAGS_NO_PERCPU_STATS is untouched and still stored as before. I have tested this patch with tdc and I do not see any failure related to this patch. Tested-by: Vlad Buslov Acked-by: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/act_api.h | 22 ++++++++++------ include/net/pkt_cls.h | 2 +- include/net/sch_generic.h | 2 +- include/uapi/linux/pkt_cls.h | 1 + net/sched/act_api.c | 61 ++++++++++++++++++++++---------------------- net/sched/act_bpf.c | 4 +-- net/sched/act_connmark.c | 4 +-- net/sched/act_csum.c | 7 ++--- net/sched/act_ct.c | 4 +-- net/sched/act_ctinfo.c | 4 +-- net/sched/act_gact.c | 4 +-- net/sched/act_gate.c | 4 +-- net/sched/act_ife.c | 9 ++++--- net/sched/act_ipt.c | 21 +++++++-------- net/sched/act_mirred.c | 4 +-- net/sched/act_mpls.c | 4 +-- net/sched/act_nat.c | 6 ++--- net/sched/act_pedit.c | 4 +-- net/sched/act_police.c | 4 +-- net/sched/act_sample.c | 7 ++--- net/sched/act_simple.c | 4 +-- net/sched/act_skbedit.c | 4 +-- net/sched/act_skbmod.c | 3 ++- net/sched/act_tunnel_key.c | 4 +-- net/sched/act_vlan.c | 4 +-- net/sched/cls_api.c | 28 ++++++++++++-------- net/sched/cls_basic.c | 10 ++++---- net/sched/cls_bpf.c | 8 +++--- net/sched/cls_cgroup.c | 6 ++--- net/sched/cls_flow.c | 6 ++--- net/sched/cls_flower.c | 18 ++++++------- net/sched/cls_fw.c | 13 +++++----- net/sched/cls_matchall.c | 17 ++++++------ net/sched/cls_route.c | 10 ++++---- net/sched/cls_rsvp.h | 7 +++-- net/sched/cls_tcindex.c | 10 ++++---- net/sched/cls_u32.c | 24 ++++++++--------- 37 files changed, 185 insertions(+), 169 deletions(-) (limited to 'include/uapi') diff --git a/include/net/act_api.h b/include/net/act_api.h index 086b291e9530..f19f7f4a463c 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -58,6 +58,14 @@ struct tc_action { #define TCA_ACT_HW_STATS_ANY (TCA_ACT_HW_STATS_IMMEDIATE | \ TCA_ACT_HW_STATS_DELAYED) +/* Reserve 16 bits for user-space. See TCA_ACT_FLAGS_NO_PERCPU_STATS. */ +#define TCA_ACT_FLAGS_USER_BITS 16 +#define TCA_ACT_FLAGS_USER_MASK 0xffff +#define TCA_ACT_FLAGS_POLICE (1U << TCA_ACT_FLAGS_USER_BITS) +#define TCA_ACT_FLAGS_BIND (1U << (TCA_ACT_FLAGS_USER_BITS + 1)) +#define TCA_ACT_FLAGS_REPLACE (1U << (TCA_ACT_FLAGS_USER_BITS + 2)) +#define TCA_ACT_FLAGS_NO_RTNL (1U << (TCA_ACT_FLAGS_USER_BITS + 3)) + /* Update lastuse only if needed, to avoid dirtying a cache line. * We use a temp variable to avoid fetching jiffies twice. 
*/ @@ -99,8 +107,8 @@ struct tc_action_ops { void (*cleanup)(struct tc_action *); int (*lookup)(struct net *net, struct tc_action **a, u32 index); int (*init)(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action **act, int ovr, - int bind, bool rtnl_held, struct tcf_proto *tp, + struct nlattr *est, struct tc_action **act, + struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack); int (*walk)(struct net *, struct sk_buff *, struct netlink_callback *, int, @@ -179,18 +187,16 @@ int tcf_action_destroy(struct tc_action *actions[], int bind); int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions, int nr_actions, struct tcf_result *res); int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, - struct nlattr *est, char *name, int ovr, int bind, + struct nlattr *est, struct tc_action *actions[], int init_res[], size_t *attr_size, - bool rtnl_held, struct netlink_ext_ack *extack); -struct tc_action_ops *tc_action_load_ops(char *name, struct nlattr *nla, + u32 flags, struct netlink_ext_ack *extack); +struct tc_action_ops *tc_action_load_ops(struct nlattr *nla, bool police, bool rtnl_held, struct netlink_ext_ack *extack); struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, - char *name, int ovr, int bind, struct tc_action_ops *a_o, int *init_res, - bool rtnl_held, - struct netlink_ext_ack *extack); + u32 flags, struct netlink_ext_ack *extack); int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind, int ref, bool terse); int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int); diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index dc28fcb6f0a2..64de26b7ad39 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -319,7 +319,7 @@ tcf_exts_exec(struct sk_buff *skb, struct tcf_exts *exts, int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *rate_tlv, - struct tcf_exts *exts, bool ovr, bool rtnl_held, + struct tcf_exts *exts, u32 flags, struct netlink_ext_ack *extack); void tcf_exts_destroy(struct tcf_exts *exts); void tcf_exts_change(struct tcf_exts *dst, struct tcf_exts *src); diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 9ed33e6840bd..c0069ac00e62 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -357,7 +357,7 @@ struct tcf_proto_ops { int (*change)(struct net *net, struct sk_buff *, struct tcf_proto*, unsigned long, u32 handle, struct nlattr **, - void **, bool, bool, + void **, u32, struct netlink_ext_ack *); int (*delete)(struct tcf_proto *tp, void *arg, bool *last, bool rtnl_held, diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 025c40fef93d..6836ccb9c45d 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -22,6 +22,7 @@ enum { __TCA_ACT_MAX }; +/* See other TCA_ACT_FLAGS_ * flags in include/net/act_api.h. */ #define TCA_ACT_FLAGS_NO_PERCPU_STATS 1 /* Don't use percpu allocator for * actions stats. 
*/ diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 998a2374f7ae..7dd3a2dc5fa4 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -495,7 +495,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, p->tcfa_tm.install = jiffies; p->tcfa_tm.lastuse = jiffies; p->tcfa_tm.firstuse = 0; - p->tcfa_flags = flags; + p->tcfa_flags = flags & TCA_ACT_FLAGS_USER_MASK; if (est) { err = gen_new_estimator(&p->tcfa_bstats, p->cpu_bstats, &p->tcfa_rate_est, @@ -941,7 +941,7 @@ void tcf_idr_insert_many(struct tc_action *actions[]) } } -struct tc_action_ops *tc_action_load_ops(char *name, struct nlattr *nla, +struct tc_action_ops *tc_action_load_ops(struct nlattr *nla, bool police, bool rtnl_held, struct netlink_ext_ack *extack) { @@ -951,7 +951,7 @@ struct tc_action_ops *tc_action_load_ops(char *name, struct nlattr *nla, struct nlattr *kind; int err; - if (name == NULL) { + if (!police) { err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, tcf_action_policy, extack); if (err < 0) @@ -967,7 +967,7 @@ struct tc_action_ops *tc_action_load_ops(char *name, struct nlattr *nla, return ERR_PTR(err); } } else { - if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ) { + if (strlcpy(act_name, "police", IFNAMSIZ) >= IFNAMSIZ) { NL_SET_ERR_MSG(extack, "TC action name too long"); return ERR_PTR(-EINVAL); } @@ -1004,12 +1004,11 @@ struct tc_action_ops *tc_action_load_ops(char *name, struct nlattr *nla, struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, - char *name, int ovr, int bind, struct tc_action_ops *a_o, int *init_res, - bool rtnl_held, - struct netlink_ext_ack *extack) + u32 flags, struct netlink_ext_ack *extack) { - struct nla_bitfield32 flags = { 0, 0 }; + bool police = flags & TCA_ACT_FLAGS_POLICE; + struct nla_bitfield32 userflags = { 0, 0 }; u8 hw_stats = TCA_ACT_HW_STATS_ANY; struct nlattr *tb[TCA_ACT_MAX + 1]; struct tc_cookie *cookie = NULL; @@ -1017,7 +1016,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, int err; /* backward compatibility for policer */ - if (name == NULL) { + if (!police) { err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, tcf_action_policy, extack); if (err < 0) @@ -1032,22 +1031,22 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, } hw_stats = tcf_action_hw_stats_get(tb[TCA_ACT_HW_STATS]); if (tb[TCA_ACT_FLAGS]) - flags = nla_get_bitfield32(tb[TCA_ACT_FLAGS]); + userflags = nla_get_bitfield32(tb[TCA_ACT_FLAGS]); - err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind, - rtnl_held, tp, flags.value, extack); + err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, tp, + userflags.value | flags, extack); } else { - err = a_o->init(net, nla, est, &a, ovr, bind, rtnl_held, - tp, flags.value, extack); + err = a_o->init(net, nla, est, &a, tp, userflags.value | flags, + extack); } if (err < 0) goto err_out; *init_res = err; - if (!name && tb[TCA_ACT_COOKIE]) + if (!police && tb[TCA_ACT_COOKIE]) tcf_set_action_cookie(&a->act_cookie, cookie); - if (!name) + if (!police) a->hw_stats = hw_stats; return a; @@ -1063,9 +1062,9 @@ err_out: /* Returns numbers of initialized actions or negative error. 
*/ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, - struct nlattr *est, char *name, int ovr, int bind, - struct tc_action *actions[], int init_res[], size_t *attr_size, - bool rtnl_held, struct netlink_ext_ack *extack) + struct nlattr *est, struct tc_action *actions[], + int init_res[], size_t *attr_size, u32 flags, + struct netlink_ext_ack *extack) { struct tc_action_ops *ops[TCA_ACT_MAX_PRIO] = {}; struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; @@ -1082,7 +1081,9 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { struct tc_action_ops *a_o; - a_o = tc_action_load_ops(name, tb[i], rtnl_held, extack); + a_o = tc_action_load_ops(tb[i], flags & TCA_ACT_FLAGS_POLICE, + !(flags & TCA_ACT_FLAGS_NO_RTNL), + extack); if (IS_ERR(a_o)) { err = PTR_ERR(a_o); goto err_mod; @@ -1091,9 +1092,8 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, } for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { - act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind, - ops[i - 1], &init_res[i - 1], rtnl_held, - extack); + act = tcf_action_init_1(net, tp, tb[i], est, ops[i - 1], + &init_res[i - 1], flags, extack); if (IS_ERR(act)) { err = PTR_ERR(act); goto err; @@ -1113,7 +1113,7 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, goto err_mod; err: - tcf_action_destroy(actions, bind); + tcf_action_destroy(actions, flags & TCA_ACT_FLAGS_BIND); err_mod: for (i = 0; i < TCA_ACT_MAX_PRIO; i++) { if (ops[i]) @@ -1495,7 +1495,7 @@ tcf_add_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], } static int tcf_action_add(struct net *net, struct nlattr *nla, - struct nlmsghdr *n, u32 portid, int ovr, + struct nlmsghdr *n, u32 portid, u32 flags, struct netlink_ext_ack *extack) { size_t attr_size = 0; @@ -1504,8 +1504,8 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, int init_res[TCA_ACT_MAX_PRIO] = {}; for (loop = 0; loop < 10; loop++) { - ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, - actions, init_res, &attr_size, true, extack); + ret = tcf_action_init(net, NULL, nla, NULL, actions, init_res, + &attr_size, flags, extack); if (ret != -EAGAIN) break; } @@ -1535,7 +1535,8 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_ROOT_MAX + 1]; u32 portid = NETLINK_CB(skb).portid; - int ret = 0, ovr = 0; + u32 flags = 0; + int ret = 0; if ((n->nlmsg_type != RTM_GETACTION) && !netlink_capable(skb, CAP_NET_ADMIN)) @@ -1561,8 +1562,8 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, * is zero) then just set this */ if (n->nlmsg_flags & NLM_F_REPLACE) - ovr = 1; - ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr, + flags = TCA_ACT_FLAGS_REPLACE; + ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, flags, extack); break; case RTM_DELACTION: diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index e409a0005717..040807aa15b9 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -275,11 +275,11 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog, static int tcf_bpf_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **act, - int replace, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, bpf_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_ACT_BPF_MAX + 
1]; struct tcf_chain *goto_ch = NULL; struct tcf_bpf_cfg cfg, old; @@ -317,7 +317,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, if (bind) return 0; - if (!replace) { + if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*act, bind); return -EEXIST; } diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index e19885d7fe2c..94e78ac7a748 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -96,12 +96,12 @@ static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = { static int tcf_connmark_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, connmark_net_id); struct nlattr *tb[TCA_CONNMARK_MAX + 1]; + bool bind = flags & TCA_ACT_FLAGS_BIND; struct tcf_chain *goto_ch = NULL; struct tcf_connmark_info *ci; struct tc_connmark *parm; @@ -144,7 +144,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, ci = to_connmark(*a); if (bind) return 0; - if (!ovr) { + if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 4fa4fcb842ba..a15ec95e69c3 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -41,11 +41,12 @@ static unsigned int csum_net_id; static struct tc_action_ops act_csum_ops; static int tcf_csum_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action **a, int ovr, - int bind, bool rtnl_held, struct tcf_proto *tp, + struct nlattr *est, struct tc_action **a, + struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, csum_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct tcf_csum_params *params_new; struct nlattr *tb[TCA_CSUM_MAX + 1]; struct tcf_chain *goto_ch = NULL; @@ -78,7 +79,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, } else if (err > 0) { if (bind)/* dont override defaults */ return 0; - if (!ovr) { + if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 1b4b3514c94f..ad9df0cb4b98 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -1235,11 +1235,11 @@ static int tcf_ct_fill_params(struct net *net, static int tcf_ct_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int replace, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ct_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct tcf_ct_params *params = NULL; struct nlattr *tb[TCA_CT_MAX + 1]; struct tcf_chain *goto_ch = NULL; @@ -1279,7 +1279,7 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, if (bind) return 0; - if (!replace) { + if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index b20c8ce59905..549374a2d008 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -154,11 +154,11 @@ static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = { static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ctinfo_net_id); + bool 
bind = flags & TCA_ACT_FLAGS_BIND; u32 dscpmask = 0, dscpstatemask, index; struct nlattr *tb[TCA_CTINFO_MAX + 1]; struct tcf_ctinfo_params *cp_new; @@ -221,7 +221,7 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, } else if (err > 0) { if (bind) /* don't override defaults */ return 0; - if (!ovr) { + if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 73c3926358a0..d8dce173df37 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -52,11 +52,11 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = { static int tcf_gact_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, gact_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_GACT_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tc_gact *parm; @@ -109,7 +109,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, } else if (err > 0) { if (bind)/* dont override defaults */ return 0; - if (!ovr) { + if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c index a78cb7965718..7df72a4197a3 100644 --- a/net/sched/act_gate.c +++ b/net/sched/act_gate.c @@ -295,12 +295,12 @@ static void gate_setup_timer(struct tcf_gate *gact, u64 basetime, static int tcf_gate_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, gate_net_id); enum tk_offsets tk_offset = TK_OFFS_TAI; + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_GATE_MAX + 1]; struct tcf_chain *goto_ch = NULL; u64 cycletime = 0, basetime = 0; @@ -364,7 +364,7 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, } ret = ACT_P_CREATED; - } else if (!ovr) { + } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index a2ddea04183a..7064a365a1a9 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -479,11 +479,11 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, static int tcf_ife_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ife_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_IFE_MAX + 1]; struct nlattr *tb2[IFE_META_MAX + 1]; struct tcf_chain *goto_ch = NULL; @@ -532,7 +532,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, kfree(p); return err; } - err = load_metalist(tb2, rtnl_held); + err = load_metalist(tb2, !(flags & TCA_ACT_FLAGS_NO_RTNL)); if (err) { kfree(p); return err; @@ -560,7 +560,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, return ret; } ret = ACT_P_CREATED; - } else if (!ovr) { + } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); kfree(p); return -EEXIST; @@ -600,7 +600,8 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, } if (tb[TCA_IFE_METALST]) { - err = populate_metalist(ife, tb2, exists, rtnl_held); + err = populate_metalist(ife, tb2, exists, + 
!(flags & TCA_ACT_FLAGS_NO_RTNL)); if (err) goto metadata_parse_err; } else { diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index ac7297f42355..265b1443e252 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -94,10 +94,11 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = { static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - const struct tc_action_ops *ops, int ovr, int bind, + const struct tc_action_ops *ops, struct tcf_proto *tp, u32 flags) { struct tc_action_net *tn = net_generic(net, id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_IPT_MAX + 1]; struct tcf_ipt *ipt; struct xt_entry_target *td, *t; @@ -154,7 +155,7 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, if (bind)/* dont override defaults */ return 0; - if (!ovr) { + if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } @@ -201,21 +202,21 @@ err1: } static int tcf_ipt_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action **a, int ovr, - int bind, bool rtnl_held, struct tcf_proto *tp, + struct nlattr *est, struct tc_action **a, + struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { - return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, ovr, - bind, tp, flags); + return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, + tp, flags); } static int tcf_xt_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action **a, int ovr, - int bind, bool unlocked, struct tcf_proto *tp, + struct nlattr *est, struct tc_action **a, + struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { - return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, ovr, - bind, tp, flags); + return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, + tp, flags); } static int tcf_ipt_act(struct sk_buff *skb, const struct tc_action *a, diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 7153c67f641e..77ee80e3effc 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -91,11 +91,11 @@ static struct tc_action_ops act_mirred_ops; static int tcf_mirred_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, mirred_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_MIRRED_MAX + 1]; struct tcf_chain *goto_ch = NULL; bool mac_header_xmit = false; @@ -155,7 +155,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, return ret; } ret = ACT_P_CREATED; - } else if (!ovr) { + } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index d1486ea496a2..e4529b428cf4 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -152,11 +152,11 @@ static const struct nla_policy mpls_policy[TCA_MPLS_MAX + 1] = { static int tcf_mpls_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, mpls_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_MPLS_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tcf_mpls_params *p; @@ -255,7 +255,7 @@ static int tcf_mpls_init(struct 
net *net, struct nlattr *nla, } ret = ACT_P_CREATED; - } else if (!ovr) { + } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 1ebd2a86d980..7dd6b586ba7f 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -34,11 +34,11 @@ static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { }; static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, - struct tc_action **a, int ovr, int bind, - bool rtnl_held, struct tcf_proto *tp, + struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, nat_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_NAT_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tc_nat *parm; @@ -70,7 +70,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, } else if (err > 0) { if (bind) return 0; - if (!ovr) { + if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index b45304446e13..c6c862c459cc 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -136,11 +136,11 @@ nla_failure: static int tcf_pedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, pedit_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_PEDIT_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tc_pedit_key *keys = NULL; @@ -198,7 +198,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, } else if (err > 0) { if (bind) goto out_free; - if (!ovr) { + if (!(flags & TCA_ACT_FLAGS_REPLACE)) { ret = -EEXIST; goto out_release; } diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 0fab8de176d2..832157a840fc 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -48,11 +48,11 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { static int tcf_police_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { int ret = 0, tcfp_result = TC_ACT_OK, err, size; + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_POLICE_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tc_police *parm; @@ -97,7 +97,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, } ret = ACT_P_CREATED; spin_lock_init(&(to_police(*a)->tcfp_lock)); - } else if (!ovr) { + } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 6a0c16e4351d..230501eb9e06 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -34,11 +34,12 @@ static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = { }; static int tcf_sample_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action **a, int ovr, - int bind, bool rtnl_held, struct tcf_proto *tp, + struct nlattr *est, struct tc_action **a, + struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, sample_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_SAMPLE_MAX + 1]; struct psample_group *psample_group; u32 
psample_group_num, rate, index; @@ -75,7 +76,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, return ret; } ret = ACT_P_CREATED; - } else if (!ovr) { + } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 726cc956d06f..cbbe1861d3a2 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -85,11 +85,11 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = { static int tcf_simp_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, simp_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_DEF_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tc_defact *parm; @@ -147,7 +147,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, tcf_action_set_ctrlact(*a, parm->action, goto_ch); ret = ACT_P_CREATED; } else { - if (!ovr) { + if (!(flags & TCA_ACT_FLAGS_REPLACE)) { err = -EEXIST; goto release_idr; } diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index e5f3fb8b00e3..605418538347 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -96,11 +96,11 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 act_flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); + bool bind = act_flags & TCA_ACT_FLAGS_BIND; struct tcf_skbedit_params *params_new; struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; struct tcf_chain *goto_ch = NULL; @@ -186,7 +186,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, ret = ACT_P_CREATED; } else { d = to_skbedit(*a); - if (!ovr) { + if (!(act_flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index 762ceec3e6f6..ecb9ee666095 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -100,11 +100,12 @@ static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = { static int tcf_skbmod_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbmod_net_id); + bool ovr = flags & TCA_ACT_FLAGS_REPLACE; + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_SKBMOD_MAX + 1]; struct tcf_skbmod_params *p, *p_old; struct tcf_chain *goto_ch = NULL; diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 85c0d0d5b9da..d9cd174eecb7 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -355,11 +355,11 @@ static void tunnel_key_release_params(struct tcf_tunnel_key_params *p) static int tunnel_key_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 act_flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); + bool bind = act_flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1]; struct tcf_tunnel_key_params *params_new; struct metadata_dst *metadata = NULL; @@ -504,7 +504,7 @@ static 
int tunnel_key_init(struct net *net, struct nlattr *nla, } ret = ACT_P_CREATED; - } else if (!ovr) { + } else if (!(act_flags & TCA_ACT_FLAGS_REPLACE)) { NL_SET_ERR_MSG(extack, "TC IDR already exists"); ret = -EEXIST; goto release_tun_meta; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 71f2015c70ca..e4dc5a555bd8 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -114,11 +114,11 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = { static int tcf_vlan_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, vlan_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_VLAN_MAX + 1]; struct tcf_chain *goto_ch = NULL; bool push_prio_exists = false; @@ -223,7 +223,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, } ret = ACT_P_CREATED; - } else if (!ovr) { + } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 7be5b9d2aead..69185e311422 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1949,6 +1949,7 @@ static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n, int err; int tp_created; bool rtnl_held = false; + u32 flags = 0; if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) return -EPERM; @@ -2112,9 +2113,12 @@ replay: goto errout; } + if (!(n->nlmsg_flags & NLM_F_CREATE)) + flags |= TCA_ACT_FLAGS_REPLACE; + if (!rtnl_held) + flags |= TCA_ACT_FLAGS_NO_RTNL; err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh, - n->nlmsg_flags & NLM_F_CREATE ? TCA_ACT_NOREPLACE : TCA_ACT_REPLACE, - rtnl_held, extack); + flags, extack); if (err == 0) { tfilter_notify(net, skb, n, tp, block, q, parent, fh, RTM_NEWTFILTER, false, rtnl_held); @@ -3020,8 +3024,8 @@ void tcf_exts_destroy(struct tcf_exts *exts) EXPORT_SYMBOL(tcf_exts_destroy); int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, - struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr, - bool rtnl_held, struct netlink_ext_ack *extack) + struct nlattr *rate_tlv, struct tcf_exts *exts, + u32 flags, struct netlink_ext_ack *extack) { #ifdef CONFIG_NET_CLS_ACT { @@ -3032,13 +3036,15 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, if (exts->police && tb[exts->police]) { struct tc_action_ops *a_o; - a_o = tc_action_load_ops("police", tb[exts->police], rtnl_held, extack); + a_o = tc_action_load_ops(tb[exts->police], true, + !(flags & TCA_ACT_FLAGS_NO_RTNL), + extack); if (IS_ERR(a_o)) return PTR_ERR(a_o); + flags |= TCA_ACT_FLAGS_POLICE | TCA_ACT_FLAGS_BIND; act = tcf_action_init_1(net, tp, tb[exts->police], - rate_tlv, "police", ovr, - TCA_ACT_BIND, a_o, init_res, - rtnl_held, extack); + rate_tlv, a_o, init_res, flags, + extack); module_put(a_o->owner); if (IS_ERR(act)) return PTR_ERR(act); @@ -3050,10 +3056,10 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, } else if (exts->action && tb[exts->action]) { int err; + flags |= TCA_ACT_FLAGS_BIND; err = tcf_action_init(net, tp, tb[exts->action], - rate_tlv, NULL, ovr, TCA_ACT_BIND, - exts->actions, init_res, - &attr_size, rtnl_held, extack); + rate_tlv, exts->actions, init_res, + &attr_size, flags, extack); if (err < 0) return err; exts->nr_actions = err; diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 
f256a7c69093..8158fc9ee1ab 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -145,12 +145,12 @@ static const struct nla_policy basic_policy[TCA_BASIC_MAX + 1] = { static int basic_set_parms(struct net *net, struct tcf_proto *tp, struct basic_filter *f, unsigned long base, struct nlattr **tb, - struct nlattr *est, bool ovr, + struct nlattr *est, u32 flags, struct netlink_ext_ack *extack) { int err; - err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, true, extack); + err = tcf_exts_validate(net, tp, tb, est, &f->exts, flags, extack); if (err < 0) return err; @@ -169,8 +169,8 @@ static int basic_set_parms(struct net *net, struct tcf_proto *tp, static int basic_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, void **arg, bool ovr, - bool rtnl_held, struct netlink_ext_ack *extack) + struct nlattr **tca, void **arg, + u32 flags, struct netlink_ext_ack *extack) { int err; struct basic_head *head = rtnl_dereference(tp->root); @@ -216,7 +216,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, goto errout; } - err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], ovr, + err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], flags, extack); if (err < 0) { if (!fold) diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index fa739efa59f4..3b472bafdc9d 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -404,7 +404,7 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog, static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp, struct cls_bpf_prog *prog, unsigned long base, - struct nlattr **tb, struct nlattr *est, bool ovr, + struct nlattr **tb, struct nlattr *est, u32 flags, struct netlink_ext_ack *extack) { bool is_bpf, is_ebpf, have_exts = false; @@ -416,7 +416,7 @@ static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp, if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf)) return -EINVAL; - ret = tcf_exts_validate(net, tp, tb, est, &prog->exts, ovr, true, + ret = tcf_exts_validate(net, tp, tb, est, &prog->exts, flags, extack); if (ret < 0) return ret; @@ -455,7 +455,7 @@ static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp, static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - void **arg, bool ovr, bool rtnl_held, + void **arg, u32 flags, struct netlink_ext_ack *extack) { struct cls_bpf_head *head = rtnl_dereference(tp->root); @@ -500,7 +500,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, goto errout; prog->handle = handle; - ret = cls_bpf_set_parms(net, tp, prog, base, tb, tca[TCA_RATE], ovr, + ret = cls_bpf_set_parms(net, tp, prog, base, tb, tca[TCA_RATE], flags, extack); if (ret < 0) goto errout_idr; diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index fb881144fa01..ed00001b528a 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -76,7 +76,7 @@ static void cls_cgroup_destroy_work(struct work_struct *work) static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - void **arg, bool ovr, bool rtnl_held, + void **arg, u32 flags, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_CGROUP_MAX + 1]; @@ -108,8 +108,8 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, if (err < 0) goto errout; - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], 
&new->exts, ovr, - true, extack); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &new->exts, flags, + extack); if (err < 0) goto errout; diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 87398af2715a..972303aa8edd 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -387,7 +387,7 @@ static void flow_destroy_filter_work(struct work_struct *work) static int flow_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - void **arg, bool ovr, bool rtnl_held, + void **arg, u32 flags, struct netlink_ext_ack *extack) { struct flow_head *head = rtnl_dereference(tp->root); @@ -442,8 +442,8 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, if (err < 0) goto err2; - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &fnew->exts, ovr, - true, extack); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &fnew->exts, flags, + extack); if (err < 0) goto err2; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index d7869a984881..23b21253b3c3 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1915,23 +1915,22 @@ errout_cleanup: static int fl_set_parms(struct net *net, struct tcf_proto *tp, struct cls_fl_filter *f, struct fl_flow_mask *mask, unsigned long base, struct nlattr **tb, - struct nlattr *est, bool ovr, - struct fl_flow_tmplt *tmplt, bool rtnl_held, + struct nlattr *est, + struct fl_flow_tmplt *tmplt, u32 flags, struct netlink_ext_ack *extack) { int err; - err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, rtnl_held, - extack); + err = tcf_exts_validate(net, tp, tb, est, &f->exts, flags, extack); if (err < 0) return err; if (tb[TCA_FLOWER_CLASSID]) { f->res.classid = nla_get_u32(tb[TCA_FLOWER_CLASSID]); - if (!rtnl_held) + if (flags & TCA_ACT_FLAGS_NO_RTNL) rtnl_lock(); tcf_bind_filter(tp, &f->res, base); - if (!rtnl_held) + if (flags & TCA_ACT_FLAGS_NO_RTNL) rtnl_unlock(); } @@ -1975,10 +1974,11 @@ static int fl_ht_insert_unique(struct cls_fl_filter *fnew, static int fl_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - void **arg, bool ovr, bool rtnl_held, + void **arg, u32 flags, struct netlink_ext_ack *extack) { struct cls_fl_head *head = fl_head_dereference(tp); + bool rtnl_held = !(flags & TCA_ACT_FLAGS_NO_RTNL); struct cls_fl_filter *fold = *arg; struct cls_fl_filter *fnew; struct fl_flow_mask *mask; @@ -2034,8 +2034,8 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, } } - err = fl_set_parms(net, tp, fnew, mask, base, tb, tca[TCA_RATE], ovr, - tp->chain->tmplt_priv, rtnl_held, extack); + err = fl_set_parms(net, tp, fnew, mask, base, tb, tca[TCA_RATE], + tp->chain->tmplt_priv, flags, extack); if (err) goto errout; diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index ec945294626a..8654b0ce997c 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -198,15 +198,15 @@ static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = { static int fw_set_parms(struct net *net, struct tcf_proto *tp, struct fw_filter *f, struct nlattr **tb, - struct nlattr **tca, unsigned long base, bool ovr, + struct nlattr **tca, unsigned long base, u32 flags, struct netlink_ext_ack *extack) { struct fw_head *head = rtnl_dereference(tp->root); u32 mask; int err; - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &f->exts, ovr, - true, extack); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &f->exts, flags, + extack); if (err < 0) return err; @@ -237,8 +237,7 @@ 
static int fw_set_parms(struct net *net, struct tcf_proto *tp, static int fw_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, void **arg, - bool ovr, bool rtnl_held, - struct netlink_ext_ack *extack) + u32 flags, struct netlink_ext_ack *extack) { struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f = *arg; @@ -277,7 +276,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, return err; } - err = fw_set_parms(net, tp, fnew, tb, tca, base, ovr, extack); + err = fw_set_parms(net, tp, fnew, tb, tca, base, flags, extack); if (err < 0) { tcf_exts_destroy(&fnew->exts); kfree(fnew); @@ -326,7 +325,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, f->id = handle; f->tp = tp; - err = fw_set_parms(net, tp, f, tb, tca, base, ovr, extack); + err = fw_set_parms(net, tp, f, tb, tca, base, flags, extack); if (err < 0) goto errout; diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index cafb84480bab..24f0046ce0b3 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -163,13 +163,12 @@ static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = { static int mall_set_parms(struct net *net, struct tcf_proto *tp, struct cls_mall_head *head, unsigned long base, struct nlattr **tb, - struct nlattr *est, bool ovr, + struct nlattr *est, u32 flags, struct netlink_ext_ack *extack) { int err; - err = tcf_exts_validate(net, tp, tb, est, &head->exts, ovr, true, - extack); + err = tcf_exts_validate(net, tp, tb, est, &head->exts, flags, extack); if (err < 0) return err; @@ -183,13 +182,13 @@ static int mall_set_parms(struct net *net, struct tcf_proto *tp, static int mall_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - void **arg, bool ovr, bool rtnl_held, + void **arg, u32 flags, struct netlink_ext_ack *extack) { struct cls_mall_head *head = rtnl_dereference(tp->root); struct nlattr *tb[TCA_MATCHALL_MAX + 1]; struct cls_mall_head *new; - u32 flags = 0; + u32 userflags = 0; int err; if (!tca[TCA_OPTIONS]) @@ -204,8 +203,8 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, return err; if (tb[TCA_MATCHALL_FLAGS]) { - flags = nla_get_u32(tb[TCA_MATCHALL_FLAGS]); - if (!tc_flags_valid(flags)) + userflags = nla_get_u32(tb[TCA_MATCHALL_FLAGS]); + if (!tc_flags_valid(userflags)) return -EINVAL; } @@ -220,14 +219,14 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, if (!handle) handle = 1; new->handle = handle; - new->flags = flags; + new->flags = userflags; new->pf = alloc_percpu(struct tc_matchall_pcnt); if (!new->pf) { err = -ENOMEM; goto err_alloc_percpu; } - err = mall_set_parms(net, tp, new, base, tb, tca[TCA_RATE], ovr, + err = mall_set_parms(net, tp, new, base, tb, tca[TCA_RATE], flags, extack); if (err) goto err_set_parms; diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 5efa3e7ace15..a35ab8c27866 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -382,7 +382,7 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, struct route4_filter *f, u32 handle, struct route4_head *head, struct nlattr **tb, struct nlattr *est, int new, - bool ovr, struct netlink_ext_ack *extack) + u32 flags, struct netlink_ext_ack *extack) { u32 id = 0, to = 0, nhandle = 0x8000; struct route4_filter *fp; @@ -390,7 +390,7 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp, struct route4_bucket *b; 
int err; - err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, true, extack); + err = tcf_exts_validate(net, tp, tb, est, &f->exts, flags, extack); if (err < 0) return err; @@ -464,8 +464,8 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp, static int route4_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, void **arg, bool ovr, - bool rtnl_held, struct netlink_ext_ack *extack) + struct nlattr **tca, void **arg, u32 flags, + struct netlink_ext_ack *extack) { struct route4_head *head = rtnl_dereference(tp->root); struct route4_filter __rcu **fp; @@ -510,7 +510,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, } err = route4_set_parms(net, tp, base, f, handle, head, tb, - tca[TCA_RATE], new, ovr, extack); + tca[TCA_RATE], new, flags, extack); if (err < 0) goto errout; diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 27a4b6dbcf57..5cd9d6b143c4 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -470,9 +470,8 @@ static const struct nla_policy rsvp_policy[TCA_RSVP_MAX + 1] = { static int rsvp_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, - u32 handle, - struct nlattr **tca, - void **arg, bool ovr, bool rtnl_held, + u32 handle, struct nlattr **tca, + void **arg, u32 flags, struct netlink_ext_ack *extack) { struct rsvp_head *data = rtnl_dereference(tp->root); @@ -499,7 +498,7 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, err = tcf_exts_init(&e, net, TCA_RSVP_ACT, TCA_RSVP_POLICE); if (err < 0) return err; - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr, true, + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, flags, extack); if (err < 0) goto errout2; diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index e9a8a2c86bbd..742c7d49a958 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -330,7 +330,7 @@ static int tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, u32 handle, struct tcindex_data *p, struct tcindex_filter_result *r, struct nlattr **tb, - struct nlattr *est, bool ovr, struct netlink_ext_ack *extack) + struct nlattr *est, u32 flags, struct netlink_ext_ack *extack) { struct tcindex_filter_result new_filter_result, *old_r = r; struct tcindex_data *cp = NULL, *oldp; @@ -342,7 +342,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, err = tcf_exts_init(&e, net, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); if (err < 0) return err; - err = tcf_exts_validate(net, tp, tb, est, &e, ovr, true, extack); + err = tcf_exts_validate(net, tp, tb, est, &e, flags, extack); if (err < 0) goto errout; @@ -529,8 +529,8 @@ errout: static int tcindex_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, void **arg, bool ovr, - bool rtnl_held, struct netlink_ext_ack *extack) + struct nlattr **tca, void **arg, u32 flags, + struct netlink_ext_ack *extack) { struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_TCINDEX_MAX + 1]; @@ -551,7 +551,7 @@ tcindex_change(struct net *net, struct sk_buff *in_skb, return err; return tcindex_set_parms(net, tp, base, handle, p, r, tb, - tca[TCA_RATE], ovr, extack); + tca[TCA_RATE], flags, extack); } static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker, diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 6e1abe805448..4272814487f0 100644 --- a/net/sched/cls_u32.c 
+++ b/net/sched/cls_u32.c @@ -709,12 +709,12 @@ static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = { static int u32_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, struct tc_u_knode *n, struct nlattr **tb, - struct nlattr *est, bool ovr, + struct nlattr *est, u32 flags, struct netlink_ext_ack *extack) { int err; - err = tcf_exts_validate(net, tp, tb, est, &n->exts, ovr, true, extack); + err = tcf_exts_validate(net, tp, tb, est, &n->exts, flags, extack); if (err < 0) return err; @@ -840,7 +840,7 @@ static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp, static int u32_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, void **arg, bool ovr, bool rtnl_held, + struct nlattr **tca, void **arg, u32 flags, struct netlink_ext_ack *extack) { struct tc_u_common *tp_c = tp->data; @@ -849,7 +849,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, struct tc_u32_sel *s; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_U32_MAX + 1]; - u32 htid, flags = 0; + u32 htid, userflags = 0; size_t sel_size; int err; @@ -868,8 +868,8 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, return err; if (tb[TCA_U32_FLAGS]) { - flags = nla_get_u32(tb[TCA_U32_FLAGS]); - if (!tc_flags_valid(flags)) { + userflags = nla_get_u32(tb[TCA_U32_FLAGS]); + if (!tc_flags_valid(userflags)) { NL_SET_ERR_MSG_MOD(extack, "Invalid filter flags"); return -EINVAL; } @@ -884,7 +884,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, return -EINVAL; } - if ((n->flags ^ flags) & + if ((n->flags ^ userflags) & ~(TCA_CLS_FLAGS_IN_HW | TCA_CLS_FLAGS_NOT_IN_HW)) { NL_SET_ERR_MSG_MOD(extack, "Key node flags do not match passed flags"); return -EINVAL; @@ -895,7 +895,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, return -ENOMEM; err = u32_set_parms(net, tp, base, new, tb, - tca[TCA_RATE], ovr, extack); + tca[TCA_RATE], flags, extack); if (err) { u32_destroy_key(new, false); @@ -955,9 +955,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, ht->handle = handle; ht->prio = tp->prio; idr_init(&ht->handle_idr); - ht->flags = flags; + ht->flags = userflags; - err = u32_replace_hw_hnode(tp, ht, flags, extack); + err = u32_replace_hw_hnode(tp, ht, userflags, extack); if (err) { idr_remove(&tp_c->handle_idr, handle); kfree(ht); @@ -1038,7 +1038,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, RCU_INIT_POINTER(n->ht_up, ht); n->handle = handle; n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0; - n->flags = flags; + n->flags = userflags; err = tcf_exts_init(&n->exts, net, TCA_U32_ACT, TCA_U32_POLICE); if (err < 0) @@ -1060,7 +1060,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, } #endif - err = u32_set_parms(net, tp, base, n, tb, tca[TCA_RATE], ovr, + err = u32_set_parms(net, tp, base, n, tb, tca[TCA_RATE], flags, extack); if (err == 0) { struct tc_u_knode __rcu **ins; -- cgit v1.2.3-71-gd317 From 6e5b47a4f1dde38d42b054cc6d16b6840de08bd2 Mon Sep 17 00:00:00 2001 From: Simon Ser Date: Mon, 2 Aug 2021 07:28:35 +0000 Subject: drm: document drm_mode_get_property It's not obvious what the fields mean and how they should be used. The most important detail is the link to drm_property.flags, which describes how property types work. 
v2: document enum drm_mode_property_enum, add ref to "Modeset Base Object Abstraction" (Daniel) Signed-off-by: Simon Ser Acked-by: Pekka Paalanen Acked-by: Daniel Vetter Cc: Leandro Ribeiro Link: https://patchwork.freedesktop.org/patch/msgid/20210802072826.500078-1-contact@emersion.fr --- Documentation/gpu/drm-kms.rst | 2 ++ include/uapi/drm/drm_mode.h | 60 ++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 58 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/gpu/drm-kms.rst b/Documentation/gpu/drm-kms.rst index 0cc21f6aaef5..1ef7951ded5e 100644 --- a/Documentation/gpu/drm-kms.rst +++ b/Documentation/gpu/drm-kms.rst @@ -159,6 +159,8 @@ KMS Core Structures and Functions .. kernel-doc:: drivers/gpu/drm/drm_mode_config.c :export: +.. _kms_base_object_abstraction: + Modeset Base Object Abstraction =============================== diff --git a/include/uapi/drm/drm_mode.h b/include/uapi/drm/drm_mode.h index 98bf130feda5..90c55383f1ee 100644 --- a/include/uapi/drm/drm_mode.h +++ b/include/uapi/drm/drm_mode.h @@ -541,22 +541,74 @@ struct drm_mode_get_connector { */ #define DRM_MODE_PROP_ATOMIC 0x80000000 +/** + * struct drm_mode_property_enum - Description for an enum/bitfield entry. + * @value: numeric value for this enum entry. + * @name: symbolic name for this enum entry. + * + * See struct drm_property_enum for details. + */ struct drm_mode_property_enum { __u64 value; char name[DRM_PROP_NAME_LEN]; }; +/** + * struct drm_mode_get_property - Get property metadata. + * + * User-space can perform a GETPROPERTY ioctl to retrieve information about a + * property. The same property may be attached to multiple objects, see + * "Modeset Base Object Abstraction". + * + * The meaning of the @values_ptr field changes depending on the property type. + * See &drm_property.flags for more details. + * + * The @enum_blob_ptr and @count_enum_blobs fields are only meaningful when the + * property has the type &DRM_MODE_PROP_ENUM or &DRM_MODE_PROP_BITMASK. For + * backwards compatibility, the kernel will always set @count_enum_blobs to + * zero when the property has the type &DRM_MODE_PROP_BLOB. User-space must + * ignore these two fields if the property has a different type. + * + * User-space is expected to retrieve values and enums by performing this ioctl + * at least twice: the first time to retrieve the number of elements, the + * second time to retrieve the elements themselves. + * + * To retrieve the number of elements, set @count_values and @count_enum_blobs + * to zero, then call the ioctl. @count_values will be updated with the number + * of elements. If the property has the type &DRM_MODE_PROP_ENUM or + * &DRM_MODE_PROP_BITMASK, @count_enum_blobs will be updated as well. + * + * To retrieve the elements themselves, allocate an array for @values_ptr and + * set @count_values to its capacity. If the property has the type + * &DRM_MODE_PROP_ENUM or &DRM_MODE_PROP_BITMASK, allocate an array for + * @enum_blob_ptr and set @count_enum_blobs to its capacity. Calling the ioctl + * again will fill the arrays. + */ struct drm_mode_get_property { - __u64 values_ptr; /* values and blob lengths */ - __u64 enum_blob_ptr; /* enum and blob id ptrs */ + /** @values_ptr: Pointer to a ``__u64`` array. */ + __u64 values_ptr; + /** @enum_blob_ptr: Pointer to a struct drm_mode_property_enum array. */ + __u64 enum_blob_ptr; + /** + * @prop_id: Object ID of the property which should be retrieved. Set + * by the caller. 
+ */ __u32 prop_id; + /** + * @flags: ``DRM_MODE_PROP_*`` bitfield. See &drm_property.flags for + * a definition of the flags. + */ __u32 flags; + /** + * @name: Symbolic property name. User-space should use this field to + * recognize properties. + */ char name[DRM_PROP_NAME_LEN]; + /** @count_values: Number of elements in @values_ptr. */ __u32 count_values; - /* This is only used to count enum values, not blobs. The _blobs is - * simply because of a historical reason, i.e. backwards compat. */ + /** @count_enum_blobs: Number of elements in @enum_blob_ptr. */ __u32 count_enum_blobs; }; -- cgit v1.2.3-71-gd317 From 2d3e5caf96b9449af951e63476657acd759c1a30 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sat, 31 Jul 2021 12:08:30 -0500 Subject: net/ipv4: Replace one-element array with flexible-array member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. Use an anonymous union with a couple of anonymous structs in order to keep userspace unchanged: $ pahole -C ip_msfilter net/ipv4/ip_sockglue.o struct ip_msfilter { union { struct { __be32 imsf_multiaddr_aux; /* 0 4 */ __be32 imsf_interface_aux; /* 4 4 */ __u32 imsf_fmode_aux; /* 8 4 */ __u32 imsf_numsrc_aux; /* 12 4 */ __be32 imsf_slist[1]; /* 16 4 */ }; /* 0 20 */ struct { __be32 imsf_multiaddr; /* 0 4 */ __be32 imsf_interface; /* 4 4 */ __u32 imsf_fmode; /* 8 4 */ __u32 imsf_numsrc; /* 12 4 */ __be32 imsf_slist_flex[0]; /* 16 0 */ }; /* 0 16 */ }; /* 0 20 */ /* size: 20, cachelines: 1, members: 1 */ /* last cacheline: 20 bytes */ }; Also, refactor the code accordingly and make use of the struct_size() and flex_array_size() helpers. This helps with the ongoing efforts to globally enable -Warray-bounds and get us closer to being able to tighten the FORTIFY_SOURCE routines on memcpy(). [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.10/process/deprecated.html#zero-length-and-one-element-arrays Link: https://github.com/KSPP/linux/issues/79 Link: https://github.com/KSPP/linux/issues/109 Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. 
Miller --- include/uapi/linux/in.h | 21 ++++++++++++++++----- net/ipv4/igmp.c | 12 ++++++------ net/ipv4/ip_sockglue.c | 15 ++++++++------- 3 files changed, 30 insertions(+), 18 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index d1b327036ae4..193b7cf1f0ac 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -188,11 +188,22 @@ struct ip_mreq_source { }; struct ip_msfilter { - __be32 imsf_multiaddr; - __be32 imsf_interface; - __u32 imsf_fmode; - __u32 imsf_numsrc; - __be32 imsf_slist[1]; + union { + struct { + __be32 imsf_multiaddr_aux; + __be32 imsf_interface_aux; + __u32 imsf_fmode_aux; + __u32 imsf_numsrc_aux; + __be32 imsf_slist[1]; + }; + struct { + __be32 imsf_multiaddr; + __be32 imsf_interface; + __u32 imsf_fmode; + __u32 imsf_numsrc; + __be32 imsf_slist_flex[]; + }; + }; }; #define IP_MSFILTER_SIZE(numsrc) \ diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 03589a04f9aa..a5f4ecb02e97 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2475,8 +2475,8 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) goto done; } newpsl->sl_max = newpsl->sl_count = msf->imsf_numsrc; - memcpy(newpsl->sl_addr, msf->imsf_slist, - msf->imsf_numsrc * sizeof(msf->imsf_slist[0])); + memcpy(newpsl->sl_addr, msf->imsf_slist_flex, + flex_array_size(msf, imsf_slist_flex, msf->imsf_numsrc)); err = ip_mc_add_src(in_dev, &msf->imsf_multiaddr, msf->imsf_fmode, newpsl->sl_count, newpsl->sl_addr, 0); if (err) { @@ -2551,14 +2551,14 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, count = psl->sl_count; } copycount = count < msf->imsf_numsrc ? count : msf->imsf_numsrc; - len = copycount * sizeof(psl->sl_addr[0]); + len = flex_array_size(psl, sl_addr, copycount); msf->imsf_numsrc = count; - if (put_user(IP_MSFILTER_SIZE(copycount), optlen) || - copy_to_user(optval, msf, IP_MSFILTER_SIZE(0))) { + if (put_user(struct_size(optval, imsf_slist_flex, copycount), optlen) || + copy_to_user(optval, msf, struct_size(optval, imsf_slist_flex, 0))) { return -EFAULT; } if (len && - copy_to_user(&optval->imsf_slist[0], psl->sl_addr, len)) + copy_to_user(&optval->imsf_slist_flex[0], psl->sl_addr, len)) return -EFAULT; return 0; done: diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index ec6036713e2c..bbe660b84a91 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -663,12 +663,11 @@ static int set_mcast_msfilter(struct sock *sk, int ifindex, struct sockaddr_storage *group, struct sockaddr_storage *list) { - int msize = IP_MSFILTER_SIZE(numsrc); struct ip_msfilter *msf; struct sockaddr_in *psin; int err, i; - msf = kmalloc(msize, GFP_KERNEL); + msf = kmalloc(struct_size(msf, imsf_slist_flex, numsrc), GFP_KERNEL); if (!msf) return -ENOBUFS; @@ -684,7 +683,7 @@ static int set_mcast_msfilter(struct sock *sk, int ifindex, if (psin->sin_family != AF_INET) goto Eaddrnotavail; - msf->imsf_slist[i] = psin->sin_addr.s_addr; + msf->imsf_slist_flex[i] = psin->sin_addr.s_addr; } err = ip_mc_msfilter(sk, msf, ifindex); kfree(msf); @@ -1229,7 +1228,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname, { struct ip_msfilter *msf; - if (optlen < IP_MSFILTER_SIZE(0)) + if (optlen < struct_size(msf, imsf_slist_flex, 0)) goto e_inval; if (optlen > sysctl_optmem_max) { err = -ENOBUFS; @@ -1247,7 +1246,8 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname, err = -ENOBUFS; break; } - if (IP_MSFILTER_SIZE(msf->imsf_numsrc) > optlen) { + if (struct_size(msf, imsf_slist_flex, 
msf->imsf_numsrc) > + optlen) { kfree(msf); err = -EINVAL; break; @@ -1660,11 +1660,12 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, { struct ip_msfilter msf; - if (len < IP_MSFILTER_SIZE(0)) { + if (len < struct_size(&msf, imsf_slist_flex, 0)) { err = -EINVAL; goto out; } - if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) { + if (copy_from_user(&msf, optval, + struct_size(&msf, imsf_slist_flex, 0))) { err = -EFAULT; goto out; } -- cgit v1.2.3-71-gd317 From 06447ae5e33bfbc5a777cc06d9854a31f3912833 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Wed, 14 Jul 2021 21:56:55 +0200 Subject: ioprio: move user space relevant ioprio bits to UAPI includes systemd added a modified copy of include/linux/ioprio.h into its code to get the relevant content definitions for the exposed ioprio_[get|set] system calls. Move the user space relevant ioprio bits to the UAPI includes to be able to use the ioprio_[get|set] syscalls as intended. Cc: Kay Sievers Cc: Greg Kroah-Hartman Cc: Jens Axboe Cc: linux-block@vger.kernel.org Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/r/20210714195655.181943-1-socketcan@hartkopp.net Signed-off-by: Jens Axboe --- include/linux/ioprio.h | 41 +--------------------------------------- include/uapi/linux/ioprio.h | 46 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 40 deletions(-) create mode 100644 include/uapi/linux/ioprio.h (limited to 'include/uapi') diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index e9bfe6972aed..ef9ad4fb245f 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -6,46 +6,7 @@ #include #include -/* - * Gives us 8 prio classes with 13-bits of data for each class - */ -#define IOPRIO_CLASS_SHIFT (13) -#define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1) - -#define IOPRIO_PRIO_CLASS(mask) ((mask) >> IOPRIO_CLASS_SHIFT) -#define IOPRIO_PRIO_DATA(mask) ((mask) & IOPRIO_PRIO_MASK) -#define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | data) - -#define ioprio_valid(mask) (IOPRIO_PRIO_CLASS((mask)) != IOPRIO_CLASS_NONE) - -/* - * These are the io priority groups as implemented by CFQ. RT is the realtime - * class, it always gets premium service. BE is the best-effort scheduling - * class, the default for any process. IDLE is the idle scheduling class, it - * is only served when no one else is using the disk. - */ -enum { - IOPRIO_CLASS_NONE, - IOPRIO_CLASS_RT, - IOPRIO_CLASS_BE, - IOPRIO_CLASS_IDLE, -}; - -/* - * 8 best effort priority levels are supported - */ -#define IOPRIO_BE_NR (8) - -enum { - IOPRIO_WHO_PROCESS = 1, - IOPRIO_WHO_PGRP, - IOPRIO_WHO_USER, -}; - -/* - * Fallback BE priority - */ -#define IOPRIO_NORM (4) +#include /* * if process has set io priority explicitly, use that. 
if not, convert diff --git a/include/uapi/linux/ioprio.h b/include/uapi/linux/ioprio.h new file mode 100644 index 000000000000..77b17e08b0da --- /dev/null +++ b/include/uapi/linux/ioprio.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_IOPRIO_H +#define _UAPI_LINUX_IOPRIO_H + +/* + * Gives us 8 prio classes with 13-bits of data for each class + */ +#define IOPRIO_CLASS_SHIFT (13) +#define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1) + +#define IOPRIO_PRIO_CLASS(mask) ((mask) >> IOPRIO_CLASS_SHIFT) +#define IOPRIO_PRIO_DATA(mask) ((mask) & IOPRIO_PRIO_MASK) +#define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | data) + +/* + * These are the io priority groups as implemented by CFQ. RT is the realtime + * class, it always gets premium service. BE is the best-effort scheduling + * class, the default for any process. IDLE is the idle scheduling class, it + * is only served when no one else is using the disk. + */ +enum { + IOPRIO_CLASS_NONE, + IOPRIO_CLASS_RT, + IOPRIO_CLASS_BE, + IOPRIO_CLASS_IDLE, +}; + +#define ioprio_valid(mask) (IOPRIO_PRIO_CLASS((mask)) != IOPRIO_CLASS_NONE) + +/* + * 8 best effort priority levels are supported + */ +#define IOPRIO_BE_NR (8) + +enum { + IOPRIO_WHO_PROCESS = 1, + IOPRIO_WHO_PGRP, + IOPRIO_WHO_USER, +}; + +/* + * Fallback BE priority + */ +#define IOPRIO_NORM (4) + +#endif /* _UAPI_LINUX_IOPRIO_H */ -- cgit v1.2.3-71-gd317 From 7957d93bf32bc211415827e44fdd9cdf1388df59 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 13 Jul 2021 01:05:27 +0200 Subject: block: add ioctl to read the disk sequence number Add a new BLKGETDISKSEQ ioctl which retrieves the disk sequence number from the genhd structure. # ./getdiskseq /dev/loop* /dev/loop0: 13 /dev/loop0p1: 13 /dev/loop0p2: 13 /dev/loop0p3: 13 /dev/loop1: 14 /dev/loop1p1: 14 /dev/loop1p2: 14 /dev/loop2: 5 /dev/loop3: 6 Reviewed-by: Christoph Hellwig Signed-off-by: Matteo Croce Tested-by: Luca Boccassi Link: https://lore.kernel.org/r/20210712230530.29323-4-mcroce@linux.microsoft.com Signed-off-by: Jens Axboe --- block/ioctl.c | 2 ++ include/uapi/linux/fs.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include/uapi') diff --git a/block/ioctl.c b/block/ioctl.c index 24beec9ca9c9..0c3a4a53fa11 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -469,6 +469,8 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, BLKDEV_DISCARD_SECURE); case BLKZEROOUT: return blk_ioctl_zeroout(bdev, mode, arg); + case BLKGETDISKSEQ: + return put_u64(argp, bdev->bd_disk->diskseq); case BLKREPORTZONE: return blkdev_report_zones_ioctl(bdev, mode, cmd, arg); case BLKRESETZONE: diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 4c32e97dcdf0..bdf7b404b3e7 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -184,6 +184,7 @@ struct fsxattr { #define BLKSECDISCARD _IO(0x12,125) #define BLKROTATIONAL _IO(0x12,126) #define BLKZEROOUT _IO(0x12,127) +#define BLKGETDISKSEQ _IOR(0x12,128,__u64) /* * A jump here: 130-136 are reserved for zoned block devices * (see uapi/linux/blkzoned.h) -- cgit v1.2.3-71-gd317 From 3a755cd8b7c601f756cbbf908b84f7cc8c04a02b Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 2 Aug 2021 11:02:19 +0800 Subject: bonding: add new option lacp_active Add an option lacp_active, which is similar with team's runner.active. This option specifies whether to send LACPDU frames periodically. If set on, the LACPDU frames are sent along with the configured lacp_rate setting. 
If set off, the LACPDU frames acts as "speak when spoken to". Note, the LACPDU state frames still will be sent when init or unbind port. v2: remove module parameter Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- Documentation/networking/bonding.rst | 12 ++++++++++++ drivers/net/bonding/bond_3ad.c | 11 ++++++----- drivers/net/bonding/bond_main.c | 1 + drivers/net/bonding/bond_netlink.c | 16 ++++++++++++++++ drivers/net/bonding/bond_options.c | 27 +++++++++++++++++++++++++++ drivers/net/bonding/bond_procfs.c | 2 ++ drivers/net/bonding/bond_sysfs.c | 25 ++++++++++++++++++++----- include/net/bond_3ad.h | 1 + include/net/bond_options.h | 1 + include/net/bonding.h | 1 + include/uapi/linux/if_link.h | 1 + tools/include/uapi/linux/if_link.h | 1 + 12 files changed, 89 insertions(+), 10 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst index 62f2aab8eaec..31cfd7d674a6 100644 --- a/Documentation/networking/bonding.rst +++ b/Documentation/networking/bonding.rst @@ -501,6 +501,18 @@ fail_over_mac This option was added in bonding version 3.2.0. The "follow" policy was added in bonding version 3.3.0. +lacp_active + Option specifying whether to send LACPDU frames periodically. + + off or 0 + LACPDU frames acts as "speak when spoken to". + + on or 1 + LACPDU frames are sent along the configured links + periodically. See lacp_rate for more details. + + The default is on. + lacp_rate Option specifying the rate in which we'll ask our link partner diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index 6908822d9773..a4a202b9a0a2 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -96,7 +96,7 @@ static int ad_marker_send(struct port *port, struct bond_marker *marker); static void ad_mux_machine(struct port *port, bool *update_slave_arr); static void ad_rx_machine(struct lacpdu *lacpdu, struct port *port); static void ad_tx_machine(struct port *port); -static void ad_periodic_machine(struct port *port); +static void ad_periodic_machine(struct port *port, struct bond_params bond_params); static void ad_port_selection_logic(struct port *port, bool *update_slave_arr); static void ad_agg_selection_logic(struct aggregator *aggregator, bool *update_slave_arr); @@ -1294,10 +1294,11 @@ static void ad_tx_machine(struct port *port) /** * ad_periodic_machine - handle a port's periodic state machine * @port: the port we're looking at + * @bond_params: bond parameters we will use * * Turn ntt flag on priodically to perform periodic transmission of lacpdu's. 
*/ -static void ad_periodic_machine(struct port *port) +static void ad_periodic_machine(struct port *port, struct bond_params bond_params) { periodic_states_t last_state; @@ -1306,8 +1307,8 @@ static void ad_periodic_machine(struct port *port) /* check if port was reinitialized */ if (((port->sm_vars & AD_PORT_BEGIN) || !(port->sm_vars & AD_PORT_LACP_ENABLED) || !port->is_enabled) || - (!(port->actor_oper_port_state & LACP_STATE_LACP_ACTIVITY) && !(port->partner_oper.port_state & LACP_STATE_LACP_ACTIVITY)) - ) { + (!(port->actor_oper_port_state & LACP_STATE_LACP_ACTIVITY) && !(port->partner_oper.port_state & LACP_STATE_LACP_ACTIVITY)) || + !bond_params.lacp_active) { port->sm_periodic_state = AD_NO_PERIODIC; } /* check if state machine should change state */ @@ -2341,7 +2342,7 @@ void bond_3ad_state_machine_handler(struct work_struct *work) } ad_rx_machine(NULL, port); - ad_periodic_machine(port); + ad_periodic_machine(port, bond->params); ad_port_selection_logic(port, &update_slave_arr); ad_mux_machine(port, &update_slave_arr); ad_tx_machine(port); diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 616ebbb08ca6..3ba5f4871162 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -5478,6 +5478,7 @@ static int bond_check_params(struct bond_params *params) params->downdelay = downdelay; params->peer_notif_delay = 0; params->use_carrier = use_carrier; + params->lacp_active = 1; params->lacp_fast = lacp_fast; params->primary[0] = 0; params->primary_reselect = primary_reselect_value; diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c index 0561ece1ba45..5d54e11d18fa 100644 --- a/drivers/net/bonding/bond_netlink.c +++ b/drivers/net/bonding/bond_netlink.c @@ -100,6 +100,7 @@ static const struct nla_policy bond_policy[IFLA_BOND_MAX + 1] = { [IFLA_BOND_MIN_LINKS] = { .type = NLA_U32 }, [IFLA_BOND_LP_INTERVAL] = { .type = NLA_U32 }, [IFLA_BOND_PACKETS_PER_SLAVE] = { .type = NLA_U32 }, + [IFLA_BOND_AD_LACP_ACTIVE] = { .type = NLA_U8 }, [IFLA_BOND_AD_LACP_RATE] = { .type = NLA_U8 }, [IFLA_BOND_AD_SELECT] = { .type = NLA_U8 }, [IFLA_BOND_AD_INFO] = { .type = NLA_NESTED }, @@ -387,6 +388,16 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[], if (err) return err; } + + if (data[IFLA_BOND_AD_LACP_ACTIVE]) { + int lacp_active = nla_get_u8(data[IFLA_BOND_AD_LACP_ACTIVE]); + + bond_opt_initval(&newval, lacp_active); + err = __bond_opt_set(bond, BOND_OPT_LACP_ACTIVE, &newval); + if (err) + return err; + } + if (data[IFLA_BOND_AD_LACP_RATE]) { int lacp_rate = nla_get_u8(data[IFLA_BOND_AD_LACP_RATE]); @@ -490,6 +501,7 @@ static size_t bond_get_size(const struct net_device *bond_dev) nla_total_size(sizeof(u32)) + /* IFLA_BOND_MIN_LINKS */ nla_total_size(sizeof(u32)) + /* IFLA_BOND_LP_INTERVAL */ nla_total_size(sizeof(u32)) + /* IFLA_BOND_PACKETS_PER_SLAVE */ + nla_total_size(sizeof(u8)) + /* IFLA_BOND_AD_LACP_ACTIVE */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_AD_LACP_RATE */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_AD_SELECT */ nla_total_size(sizeof(struct nlattr)) + /* IFLA_BOND_AD_INFO */ @@ -622,6 +634,10 @@ static int bond_fill_info(struct sk_buff *skb, packets_per_slave)) goto nla_put_failure; + if (nla_put_u8(skb, IFLA_BOND_AD_LACP_ACTIVE, + bond->params.lacp_active)) + goto nla_put_failure; + if (nla_put_u8(skb, IFLA_BOND_AD_LACP_RATE, bond->params.lacp_fast)) goto nla_put_failure; diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 
0cf25de6f46d..a8fde3bc458f 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -58,6 +58,8 @@ static int bond_option_lp_interval_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_pps_set(struct bonding *bond, const struct bond_opt_value *newval); +static int bond_option_lacp_active_set(struct bonding *bond, + const struct bond_opt_value *newval); static int bond_option_lacp_rate_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_ad_select_set(struct bonding *bond, @@ -135,6 +137,12 @@ static const struct bond_opt_value bond_intmax_tbl[] = { { NULL, -1, 0} }; +static const struct bond_opt_value bond_lacp_active[] = { + { "off", 0, 0}, + { "on", 1, BOND_VALFLAG_DEFAULT}, + { NULL, -1, 0} +}; + static const struct bond_opt_value bond_lacp_rate_tbl[] = { { "slow", AD_LACP_SLOW, 0}, { "fast", AD_LACP_FAST, 0}, @@ -283,6 +291,15 @@ static const struct bond_option bond_opts[BOND_OPT_LAST] = { .values = bond_intmax_tbl, .set = bond_option_updelay_set }, + [BOND_OPT_LACP_ACTIVE] = { + .id = BOND_OPT_LACP_ACTIVE, + .name = "lacp_active", + .desc = "Send LACPDU frames with configured lacp rate or acts as speak when spoken to", + .flags = BOND_OPTFLAG_IFDOWN, + .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_8023AD)), + .values = bond_lacp_active, + .set = bond_option_lacp_active_set + }, [BOND_OPT_LACP_RATE] = { .id = BOND_OPT_LACP_RATE, .name = "lacp_rate", @@ -1333,6 +1350,16 @@ static int bond_option_pps_set(struct bonding *bond, return 0; } +static int bond_option_lacp_active_set(struct bonding *bond, + const struct bond_opt_value *newval) +{ + netdev_dbg(bond->dev, "Setting LACP active to %s (%llu)\n", + newval->string, newval->value); + bond->params.lacp_active = newval->value; + + return 0; +} + static int bond_option_lacp_rate_set(struct bonding *bond, const struct bond_opt_value *newval) { diff --git a/drivers/net/bonding/bond_procfs.c b/drivers/net/bonding/bond_procfs.c index 0fb1da361bb1..f3e3bfd72556 100644 --- a/drivers/net/bonding/bond_procfs.c +++ b/drivers/net/bonding/bond_procfs.c @@ -133,6 +133,8 @@ static void bond_info_show_master(struct seq_file *seq) struct ad_info ad_info; seq_puts(seq, "\n802.3ad info\n"); + seq_printf(seq, "LACP active: %s\n", + (bond->params.lacp_active) ? "on" : "off"); seq_printf(seq, "LACP rate: %s\n", (bond->params.lacp_fast) ? "fast" : "slow"); seq_printf(seq, "Min links: %d\n", bond->params.min_links); diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c index 5f9e9a240226..b9e9842fed94 100644 --- a/drivers/net/bonding/bond_sysfs.c +++ b/drivers/net/bonding/bond_sysfs.c @@ -339,10 +339,24 @@ static ssize_t bonding_show_peer_notif_delay(struct device *d, static DEVICE_ATTR(peer_notif_delay, 0644, bonding_show_peer_notif_delay, bonding_sysfs_store_option); -/* Show the LACP interval. */ -static ssize_t bonding_show_lacp(struct device *d, - struct device_attribute *attr, - char *buf) +/* Show the LACP activity and interval. 
*/ +static ssize_t bonding_show_lacp_active(struct device *d, + struct device_attribute *attr, + char *buf) +{ + struct bonding *bond = to_bond(d); + const struct bond_opt_value *val; + + val = bond_opt_get_val(BOND_OPT_LACP_ACTIVE, bond->params.lacp_active); + + return sprintf(buf, "%s %d\n", val->string, bond->params.lacp_active); +} +static DEVICE_ATTR(lacp_active, 0644, + bonding_show_lacp_active, bonding_sysfs_store_option); + +static ssize_t bonding_show_lacp_rate(struct device *d, + struct device_attribute *attr, + char *buf) { struct bonding *bond = to_bond(d); const struct bond_opt_value *val; @@ -352,7 +366,7 @@ static ssize_t bonding_show_lacp(struct device *d, return sprintf(buf, "%s %d\n", val->string, bond->params.lacp_fast); } static DEVICE_ATTR(lacp_rate, 0644, - bonding_show_lacp, bonding_sysfs_store_option); + bonding_show_lacp_rate, bonding_sysfs_store_option); static ssize_t bonding_show_min_links(struct device *d, struct device_attribute *attr, @@ -738,6 +752,7 @@ static struct attribute *per_bond_attrs[] = { &dev_attr_downdelay.attr, &dev_attr_updelay.attr, &dev_attr_peer_notif_delay.attr, + &dev_attr_lacp_active.attr, &dev_attr_lacp_rate.attr, &dev_attr_ad_select.attr, &dev_attr_xmit_hash_policy.attr, diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h index c8696a230b7d..38785d48baff 100644 --- a/include/net/bond_3ad.h +++ b/include/net/bond_3ad.h @@ -303,6 +303,7 @@ int __bond_3ad_get_active_agg_info(struct bonding *bond, int bond_3ad_lacpdu_recv(const struct sk_buff *skb, struct bonding *bond, struct slave *slave); int bond_3ad_set_carrier(struct bonding *bond); +void bond_3ad_update_lacp_active(struct bonding *bond); void bond_3ad_update_lacp_rate(struct bonding *bond); void bond_3ad_update_ad_actor_settings(struct bonding *bond); int bond_3ad_stats_fill(struct sk_buff *skb, struct bond_3ad_stats *stats); diff --git a/include/net/bond_options.h b/include/net/bond_options.h index 9d382f2f0bc5..e64833a674eb 100644 --- a/include/net/bond_options.h +++ b/include/net/bond_options.h @@ -64,6 +64,7 @@ enum { BOND_OPT_AD_USER_PORT_KEY, BOND_OPT_NUM_PEER_NOTIF_ALIAS, BOND_OPT_PEER_NOTIF_DELAY, + BOND_OPT_LACP_ACTIVE, BOND_OPT_LAST }; diff --git a/include/net/bonding.h b/include/net/bonding.h index 625d9c72dee3..46df47004803 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -129,6 +129,7 @@ struct bond_params { int updelay; int downdelay; int peer_notif_delay; + int lacp_active; int lacp_fast; unsigned int min_links; int ad_select; diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 49b22afab78f..5310003523ce 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -855,6 +855,7 @@ enum { IFLA_BOND_AD_ACTOR_SYSTEM, IFLA_BOND_TLB_DYNAMIC_LB, IFLA_BOND_PEER_NOTIF_DELAY, + IFLA_BOND_AD_LACP_ACTIVE, __IFLA_BOND_MAX, }; diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index d208b2af697f..eb15f319aa57 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -653,6 +653,7 @@ enum { IFLA_BOND_AD_ACTOR_SYSTEM, IFLA_BOND_TLB_DYNAMIC_LB, IFLA_BOND_PEER_NOTIF_DELAY, + IFLA_BOND_AD_LACP_ACTIVE, __IFLA_BOND_MAX, }; -- cgit v1.2.3-71-gd317 From 018c14911dd7e2feedd96d440f12ea999e459fff Mon Sep 17 00:00:00 2001 From: Bodo Stroesser Date: Tue, 13 Jul 2021 19:50:21 +0200 Subject: scsi: target: tcmu: Add new feature KEEP_BUF When running command pipelining for WRITE direction commands (e.g. 
tape device write), userspace sends cmd completion to cmd ring before processing write data. In that case userspace has to copy data before sending completion, because cmd completion also implicitly releases the data buffer in data area. The new feature KEEP_BUF allows userspace to optionally keep the buffer after completion by setting new bit TCMU_UFLAG_KEEP_BUF in tcmu_cmd_entry_hdr->uflags. In that case buffer has to be released explicitly by writing the cmd_id to new action item free_kept_buf. All kept buffers are released during reset_ring and if userspace closes uio device (tcmu_release). Link: https://lore.kernel.org/r/20210713175021.20103-1-bostroesser@gmail.com Reviewed-by: Mike Christie Signed-off-by: Bodo Stroesser Signed-off-by: Martin K. Petersen --- drivers/target/target_core_user.c | 150 +++++++++++++++++++++++++++++++--- include/uapi/linux/target_core_user.h | 2 + 2 files changed, 141 insertions(+), 11 deletions(-) (limited to 'include/uapi') diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index fbb6ffaddfbe..9f552f48084c 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -191,6 +191,7 @@ struct tcmu_cmd { unsigned long deadline; #define TCMU_CMD_BIT_EXPIRED 0 +#define TCMU_CMD_BIT_KEEP_BUF 1 unsigned long flags; }; @@ -1315,11 +1316,13 @@ unlock: mutex_unlock(&udev->cmdr_lock); } -static void tcmu_handle_completion(struct tcmu_cmd *cmd, struct tcmu_cmd_entry *entry) +static bool tcmu_handle_completion(struct tcmu_cmd *cmd, + struct tcmu_cmd_entry *entry, bool keep_buf) { struct se_cmd *se_cmd = cmd->se_cmd; struct tcmu_dev *udev = cmd->tcmu_dev; bool read_len_valid = false; + bool ret = true; uint32_t read_len; /* @@ -1330,6 +1333,13 @@ static void tcmu_handle_completion(struct tcmu_cmd *cmd, struct tcmu_cmd_entry * WARN_ON_ONCE(se_cmd); goto out; } + if (test_bit(TCMU_CMD_BIT_KEEP_BUF, &cmd->flags)) { + pr_err("cmd_id %u already completed with KEEP_BUF, ring is broken\n", + entry->hdr.cmd_id); + set_bit(TCMU_DEV_BIT_BROKEN, &udev->flags); + ret = false; + goto out; + } list_del_init(&cmd->queue_entry); @@ -1379,8 +1389,22 @@ done: target_complete_cmd(cmd->se_cmd, entry->rsp.scsi_status); out: - tcmu_cmd_free_data(cmd, cmd->dbi_cnt); - tcmu_free_cmd(cmd); + if (!keep_buf) { + tcmu_cmd_free_data(cmd, cmd->dbi_cnt); + tcmu_free_cmd(cmd); + } else { + /* + * Keep this command after completion, since userspace still + * needs the data buffer. Mark it with TCMU_CMD_BIT_KEEP_BUF + * and reset potential TCMU_CMD_BIT_EXPIRED, so we don't accept + * a second completion later. + * Userspace can free the buffer later by writing the cmd_id + * to new action attribute free_kept_buf. + */ + clear_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags); + set_bit(TCMU_CMD_BIT_KEEP_BUF, &cmd->flags); + } + return ret; } static int tcmu_run_tmr_queue(struct tcmu_dev *udev) @@ -1432,6 +1456,7 @@ static bool tcmu_handle_completions(struct tcmu_dev *udev) while (udev->cmdr_last_cleaned != READ_ONCE(mb->cmd_tail)) { struct tcmu_cmd_entry *entry = udev->cmdr + udev->cmdr_last_cleaned; + bool keep_buf; /* * Flush max. 
up to end of cmd ring since current entry might @@ -1453,7 +1478,11 @@ static bool tcmu_handle_completions(struct tcmu_dev *udev) } WARN_ON(tcmu_hdr_get_op(entry->hdr.len_op) != TCMU_OP_CMD); - cmd = xa_erase(&udev->commands, entry->hdr.cmd_id); + keep_buf = !!(entry->hdr.uflags & TCMU_UFLAG_KEEP_BUF); + if (keep_buf) + cmd = xa_load(&udev->commands, entry->hdr.cmd_id); + else + cmd = xa_erase(&udev->commands, entry->hdr.cmd_id); if (!cmd) { pr_err("cmd_id %u not found, ring is broken\n", entry->hdr.cmd_id); @@ -1461,7 +1490,8 @@ static bool tcmu_handle_completions(struct tcmu_dev *udev) return false; } - tcmu_handle_completion(cmd, entry); + if (!tcmu_handle_completion(cmd, entry, keep_buf)) + break; UPDATE_HEAD(udev->cmdr_last_cleaned, tcmu_hdr_get_len(entry->hdr.len_op), @@ -1619,7 +1649,8 @@ static void tcmu_dev_call_rcu(struct rcu_head *p) static int tcmu_check_and_free_pending_cmd(struct tcmu_cmd *cmd) { - if (test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags)) { + if (test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags) || + test_bit(TCMU_CMD_BIT_KEEP_BUF, &cmd->flags)) { kmem_cache_free(tcmu_cmd_cache, cmd); return 0; } @@ -1903,6 +1934,38 @@ static int tcmu_open(struct uio_info *info, struct inode *inode) static int tcmu_release(struct uio_info *info, struct inode *inode) { struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info); + struct tcmu_cmd *cmd; + unsigned long i; + bool freed = false; + + mutex_lock(&udev->cmdr_lock); + + xa_for_each(&udev->commands, i, cmd) { + /* Cmds with KEEP_BUF set are no longer on the ring, but + * userspace still holds the data buffer. If userspace closes + * we implicitly free these cmds and buffers, since after new + * open the (new ?) userspace cannot find the cmd in the ring + * and thus never will release the buffer by writing cmd_id to + * free_kept_buf action attribute. + */ + if (!test_bit(TCMU_CMD_BIT_KEEP_BUF, &cmd->flags)) + continue; + pr_debug("removing KEEP_BUF cmd %u on dev %s from ring\n", + cmd->cmd_id, udev->name); + freed = true; + + xa_erase(&udev->commands, i); + tcmu_cmd_free_data(cmd, cmd->dbi_cnt); + tcmu_free_cmd(cmd); + } + /* + * We only freed data space, not ring space. Therefore we dont call + * run_tmr_queue, but call run_qfull_queue if tmr_list is empty. + */ + if (freed && list_empty(&udev->tmr_queue)) + run_qfull_queue(udev, false); + + mutex_unlock(&udev->cmdr_lock); clear_bit(TCMU_DEV_BIT_OPEN, &udev->flags); @@ -2147,7 +2210,8 @@ static int tcmu_configure_device(struct se_device *dev) mb->version = TCMU_MAILBOX_VERSION; mb->flags = TCMU_MAILBOX_FLAG_CAP_OOOC | TCMU_MAILBOX_FLAG_CAP_READ_LEN | - TCMU_MAILBOX_FLAG_CAP_TMR; + TCMU_MAILBOX_FLAG_CAP_TMR | + TCMU_MAILBOX_FLAG_CAP_KEEP_BUF; mb->cmdr_off = CMDR_OFF; mb->cmdr_size = udev->cmdr_size; @@ -2279,12 +2343,16 @@ static void tcmu_reset_ring(struct tcmu_dev *udev, u8 err_level) mutex_lock(&udev->cmdr_lock); xa_for_each(&udev->commands, i, cmd) { - pr_debug("removing cmd %u on dev %s from ring (is expired %d)\n", - cmd->cmd_id, udev->name, - test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags)); + pr_debug("removing cmd %u on dev %s from ring %s\n", + cmd->cmd_id, udev->name, + test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags) ? + "(is expired)" : + (test_bit(TCMU_CMD_BIT_KEEP_BUF, &cmd->flags) ? 
+ "(is keep buffer)" : "")); xa_erase(&udev->commands, i); - if (!test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags)) { + if (!test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags) && + !test_bit(TCMU_CMD_BIT_KEEP_BUF, &cmd->flags)) { WARN_ON(!cmd->se_cmd); list_del_init(&cmd->queue_entry); cmd->se_cmd->priv = NULL; @@ -2933,6 +3001,65 @@ static ssize_t tcmu_reset_ring_store(struct config_item *item, const char *page, } CONFIGFS_ATTR_WO(tcmu_, reset_ring); +static ssize_t tcmu_free_kept_buf_store(struct config_item *item, const char *page, + size_t count) +{ + struct se_device *se_dev = container_of(to_config_group(item), + struct se_device, + dev_action_group); + struct tcmu_dev *udev = TCMU_DEV(se_dev); + struct tcmu_cmd *cmd; + u16 cmd_id; + int ret; + + if (!target_dev_configured(&udev->se_dev)) { + pr_err("Device is not configured.\n"); + return -EINVAL; + } + + ret = kstrtou16(page, 0, &cmd_id); + if (ret < 0) + return ret; + + mutex_lock(&udev->cmdr_lock); + + { + XA_STATE(xas, &udev->commands, cmd_id); + + xas_lock(&xas); + cmd = xas_load(&xas); + if (!cmd) { + pr_err("free_kept_buf: cmd_id %d not found\n", cmd_id); + count = -EINVAL; + xas_unlock(&xas); + goto out_unlock; + } + if (!test_bit(TCMU_CMD_BIT_KEEP_BUF, &cmd->flags)) { + pr_err("free_kept_buf: cmd_id %d was not completed with KEEP_BUF\n", + cmd_id); + count = -EINVAL; + xas_unlock(&xas); + goto out_unlock; + } + xas_store(&xas, NULL); + xas_unlock(&xas); + } + + tcmu_cmd_free_data(cmd, cmd->dbi_cnt); + tcmu_free_cmd(cmd); + /* + * We only freed data space, not ring space. Therefore we dont call + * run_tmr_queue, but call run_qfull_queue if tmr_list is empty. + */ + if (list_empty(&udev->tmr_queue)) + run_qfull_queue(udev, false); + +out_unlock: + mutex_unlock(&udev->cmdr_lock); + return count; +} +CONFIGFS_ATTR_WO(tcmu_, free_kept_buf); + static struct configfs_attribute *tcmu_attrib_attrs[] = { &tcmu_attr_cmd_time_out, &tcmu_attr_qfull_time_out, @@ -2951,6 +3078,7 @@ static struct configfs_attribute **tcmu_attrs; static struct configfs_attribute *tcmu_action_attrs[] = { &tcmu_attr_block_dev, &tcmu_attr_reset_ring, + &tcmu_attr_free_kept_buf, NULL, }; diff --git a/include/uapi/linux/target_core_user.h b/include/uapi/linux/target_core_user.h index 95b1597f16ae..27ace512babd 100644 --- a/include/uapi/linux/target_core_user.h +++ b/include/uapi/linux/target_core_user.h @@ -46,6 +46,7 @@ #define TCMU_MAILBOX_FLAG_CAP_OOOC (1 << 0) /* Out-of-order completions */ #define TCMU_MAILBOX_FLAG_CAP_READ_LEN (1 << 1) /* Read data length */ #define TCMU_MAILBOX_FLAG_CAP_TMR (1 << 2) /* TMR notifications */ +#define TCMU_MAILBOX_FLAG_CAP_KEEP_BUF (1<<3) /* Keep buf after cmd completion */ struct tcmu_mailbox { __u16 version; @@ -75,6 +76,7 @@ struct tcmu_cmd_entry_hdr { __u8 kflags; #define TCMU_UFLAG_UNKNOWN_OP 0x1 #define TCMU_UFLAG_READ_LEN 0x2 +#define TCMU_UFLAG_KEEP_BUF 0x4 __u8 uflags; } __packed; -- cgit v1.2.3-71-gd317 From 5b9272e93f2efe3f6cda60cc2c26817b2ce49386 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 7 Jul 2021 11:48:54 +0200 Subject: can: j1939: extend UAPI to notify about RX status To be able to create applications with user friendly feedback, we need be able to provide receive status information. Typical ETP transfer may take seconds or even hours. To give user some clue or show a progress bar, the stack should push status updates. Same as for the TX information, the socket error queue will be used with following new signals: - J1939_EE_INFO_RX_RTS - received and accepted request to send signal. 
- J1939_EE_INFO_RX_DPO - received data package offset signal - J1939_EE_INFO_RX_ABORT - RX session was aborted Instead of completion signal, user will get data package. To activate this signals, application should set SOF_TIMESTAMPING_RX_SOFTWARE to the SO_TIMESTAMPING socket option. This will avoid unpredictable application behavior for the old software. Link: https://lore.kernel.org/r/20210707094854.30781-3-o.rempel@pengutronix.de Signed-off-by: Oleksij Rempel Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can/j1939.h | 9 +++ net/can/j1939/j1939-priv.h | 4 ++ net/can/j1939/socket.c | 135 ++++++++++++++++++++++++++++++++--------- net/can/j1939/transport.c | 22 +++++-- 4 files changed, 136 insertions(+), 34 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/can/j1939.h b/include/uapi/linux/can/j1939.h index df6e821075c1..38936460f668 100644 --- a/include/uapi/linux/can/j1939.h +++ b/include/uapi/linux/can/j1939.h @@ -78,11 +78,20 @@ enum { enum { J1939_NLA_PAD, J1939_NLA_BYTES_ACKED, + J1939_NLA_TOTAL_SIZE, + J1939_NLA_PGN, + J1939_NLA_SRC_NAME, + J1939_NLA_DEST_NAME, + J1939_NLA_SRC_ADDR, + J1939_NLA_DEST_ADDR, }; enum { J1939_EE_INFO_NONE, J1939_EE_INFO_TX_ABORT, + J1939_EE_INFO_RX_RTS, + J1939_EE_INFO_RX_DPO, + J1939_EE_INFO_RX_ABORT, }; struct j1939_filter { diff --git a/net/can/j1939/j1939-priv.h b/net/can/j1939/j1939-priv.h index 93b8ad7f7d04..f6df20808f5e 100644 --- a/net/can/j1939/j1939-priv.h +++ b/net/can/j1939/j1939-priv.h @@ -23,6 +23,9 @@ enum j1939_sk_errqueue_type { J1939_ERRQUEUE_TX_ACK, J1939_ERRQUEUE_TX_SCHED, J1939_ERRQUEUE_TX_ABORT, + J1939_ERRQUEUE_RX_RTS, + J1939_ERRQUEUE_RX_DPO, + J1939_ERRQUEUE_RX_ABORT, }; /* j1939 devices */ @@ -87,6 +90,7 @@ struct j1939_priv { struct list_head j1939_socks; struct kref rx_kref; + u32 rx_tskey; }; void j1939_ecu_put(struct j1939_ecu *ecu); diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 6f3b10472f7f..6dff4510687a 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -905,20 +905,33 @@ failure: return NULL; } -static size_t j1939_sk_opt_stats_get_size(void) +static size_t j1939_sk_opt_stats_get_size(enum j1939_sk_errqueue_type type) { - return - nla_total_size(sizeof(u32)) + /* J1939_NLA_BYTES_ACKED */ - 0; + switch (type) { + case J1939_ERRQUEUE_RX_RTS: + return + nla_total_size(sizeof(u32)) + /* J1939_NLA_TOTAL_SIZE */ + nla_total_size(sizeof(u32)) + /* J1939_NLA_PGN */ + nla_total_size(sizeof(u64)) + /* J1939_NLA_SRC_NAME */ + nla_total_size(sizeof(u64)) + /* J1939_NLA_DEST_NAME */ + nla_total_size(sizeof(u8)) + /* J1939_NLA_SRC_ADDR */ + nla_total_size(sizeof(u8)) + /* J1939_NLA_DEST_ADDR */ + 0; + default: + return + nla_total_size(sizeof(u32)) + /* J1939_NLA_BYTES_ACKED */ + 0; + } } static struct sk_buff * -j1939_sk_get_timestamping_opt_stats(struct j1939_session *session) +j1939_sk_get_timestamping_opt_stats(struct j1939_session *session, + enum j1939_sk_errqueue_type type) { struct sk_buff *stats; u32 size; - stats = alloc_skb(j1939_sk_opt_stats_get_size(), GFP_ATOMIC); + stats = alloc_skb(j1939_sk_opt_stats_get_size(type), GFP_ATOMIC); if (!stats) return NULL; @@ -928,32 +941,67 @@ j1939_sk_get_timestamping_opt_stats(struct j1939_session *session) size = min(session->pkt.tx_acked * 7, session->total_message_size); - nla_put_u32(stats, J1939_NLA_BYTES_ACKED, size); + switch (type) { + case J1939_ERRQUEUE_RX_RTS: + nla_put_u32(stats, J1939_NLA_TOTAL_SIZE, + session->total_message_size); + nla_put_u32(stats, J1939_NLA_PGN, + session->skcb.addr.pgn); + 
nla_put_u64_64bit(stats, J1939_NLA_SRC_NAME, + session->skcb.addr.src_name, J1939_NLA_PAD); + nla_put_u64_64bit(stats, J1939_NLA_DEST_NAME, + session->skcb.addr.dst_name, J1939_NLA_PAD); + nla_put_u8(stats, J1939_NLA_SRC_ADDR, + session->skcb.addr.sa); + nla_put_u8(stats, J1939_NLA_DEST_ADDR, + session->skcb.addr.da); + break; + default: + nla_put_u32(stats, J1939_NLA_BYTES_ACKED, size); + } return stats; } -void j1939_sk_errqueue(struct j1939_session *session, - enum j1939_sk_errqueue_type type) +static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk, + enum j1939_sk_errqueue_type type) { struct j1939_priv *priv = session->priv; - struct sock *sk = session->sk; struct j1939_sock *jsk; struct sock_exterr_skb *serr; struct sk_buff *skb; char *state = "UNK"; int err; - /* currently we have no sk for the RX session */ - if (!sk) - return; - jsk = j1939_sk(sk); if (!(jsk->state & J1939_SOCK_ERRQUEUE)) return; - skb = j1939_sk_get_timestamping_opt_stats(session); + switch (type) { + case J1939_ERRQUEUE_TX_ACK: + if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK)) + return; + break; + case J1939_ERRQUEUE_TX_SCHED: + if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_SCHED)) + return; + break; + case J1939_ERRQUEUE_TX_ABORT: + break; + case J1939_ERRQUEUE_RX_RTS: + fallthrough; + case J1939_ERRQUEUE_RX_DPO: + fallthrough; + case J1939_ERRQUEUE_RX_ABORT: + if (!(sk->sk_tsflags & SOF_TIMESTAMPING_RX_SOFTWARE)) + return; + break; + default: + netdev_err(priv->ndev, "Unknown errqueue type %i\n", type); + } + + skb = j1939_sk_get_timestamping_opt_stats(session, type); if (!skb) return; @@ -965,35 +1013,41 @@ void j1939_sk_errqueue(struct j1939_session *session, memset(serr, 0, sizeof(*serr)); switch (type) { case J1939_ERRQUEUE_TX_ACK: - if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK)) { - kfree_skb(skb); - return; - } - serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = SCM_TSTAMP_ACK; - state = "ACK"; + state = "TX ACK"; break; case J1939_ERRQUEUE_TX_SCHED: - if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_SCHED)) { - kfree_skb(skb); - return; - } - serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = SCM_TSTAMP_SCHED; - state = "SCH"; + state = "TX SCH"; break; case J1939_ERRQUEUE_TX_ABORT: serr->ee.ee_errno = session->err; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_TX_ABORT; - state = "ABT"; + state = "TX ABT"; + break; + case J1939_ERRQUEUE_RX_RTS: + serr->ee.ee_errno = ENOMSG; + serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; + serr->ee.ee_info = J1939_EE_INFO_RX_RTS; + state = "RX RTS"; + break; + case J1939_ERRQUEUE_RX_DPO: + serr->ee.ee_errno = ENOMSG; + serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; + serr->ee.ee_info = J1939_EE_INFO_RX_DPO; + state = "RX DPO"; + break; + case J1939_ERRQUEUE_RX_ABORT: + serr->ee.ee_errno = session->err; + serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; + serr->ee.ee_info = J1939_EE_INFO_RX_ABORT; + state = "RX ABT"; break; - default: - netdev_err(priv->ndev, "Unknown errqueue type %i\n", type); } serr->opt_stats = true; @@ -1008,6 +1062,27 @@ void j1939_sk_errqueue(struct j1939_session *session, kfree_skb(skb); }; +void j1939_sk_errqueue(struct j1939_session *session, + enum j1939_sk_errqueue_type type) +{ + struct j1939_priv *priv = session->priv; + struct j1939_sock *jsk; + + if (session->sk) { + /* send TX notifications to the socket of origin */ + __j1939_sk_errqueue(session, session->sk, type); + return; + } + + /* spread RX notifications to all 
sockets subscribed to this session */ + spin_lock_bh(&priv->j1939_socks_lock); + list_for_each_entry(jsk, &priv->j1939_socks, list) { + if (j1939_sk_recv_match_one(jsk, &session->skcb)) + __j1939_sk_errqueue(session, &jsk->sk, type); + } + spin_unlock_bh(&priv->j1939_socks_lock); +}; + void j1939_sk_send_loop_abort(struct sock *sk, int err) { sk->sk_err = err; diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 801e700eaba6..bb5c4b8979be 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -260,10 +260,14 @@ static void __j1939_session_drop(struct j1939_session *session) static void j1939_session_destroy(struct j1939_session *session) { - if (session->err) - j1939_sk_errqueue(session, J1939_ERRQUEUE_TX_ABORT); - else - j1939_sk_errqueue(session, J1939_ERRQUEUE_TX_ACK); + if (session->transmission) { + if (session->err) + j1939_sk_errqueue(session, J1939_ERRQUEUE_TX_ABORT); + else + j1939_sk_errqueue(session, J1939_ERRQUEUE_TX_ACK); + } else if (session->err) { + j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_ABORT); + } netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); @@ -1116,6 +1120,8 @@ static void __j1939_session_cancel(struct j1939_session *session, if (session->sk) j1939_sk_send_loop_abort(session->sk, session->err); + else + j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_ABORT); } static void j1939_session_cancel(struct j1939_session *session, @@ -1330,6 +1336,8 @@ static void j1939_xtp_rx_abort_one(struct j1939_priv *priv, struct sk_buff *skb, session->err = j1939_xtp_abort_to_errno(priv, abort); if (session->sk) j1939_sk_send_loop_abort(session->sk, session->err); + else + j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_ABORT); j1939_session_deactivate_activate_next(session); abort_put: @@ -1630,6 +1638,9 @@ j1939_session *j1939_xtp_rx_rts_session_new(struct j1939_priv *priv, session->pkt.rx = 0; session->pkt.tx = 0; + session->tskey = priv->rx_tskey++; + j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_RTS); + WARN_ON_ONCE(j1939_session_activate(session)); return session; @@ -1752,6 +1763,9 @@ static void j1939_xtp_rx_dpo_one(struct j1939_session *session, session->pkt.dpo = j1939_etp_ctl_to_packet(skb->data); session->last_cmd = dat[0]; j1939_tp_set_rxtimeout(session, 750); + + if (!session->transmission) + j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_DPO); } static void j1939_xtp_rx_dpo(struct j1939_priv *priv, struct sk_buff *skb, -- cgit v1.2.3-71-gd317 From 04190bf8944deb7e3ac165a1a494db23aa0160a9 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 4 Aug 2021 10:55:56 +0300 Subject: sock: allow reading and changing sk_userlocks with setsockopt SOCK_SNDBUF_LOCK and SOCK_RCVBUF_LOCK flags disable automatic socket buffers adjustment done by kernel (see tcp_fixup_rcvbuf() and tcp_sndbuf_expand()). If we've just created a new socket this adjustment is enabled on it, but if one changes the socket buffer size by setsockopt(SO_{SND,RCV}BUF*) it becomes disabled. CRIU needs to call setsockopt(SO_{SND,RCV}BUF*) on each socket on restore as it first needs to increase buffer sizes for packet queues restore and second it needs to restore back original buffer sizes. So after CRIU restore all sockets become non-auto-adjustable, which can decrease network performance of restored applications significantly. CRIU need to be able to restore sockets with enabled/disabled adjustment to the same state it was before dump, so let's add special setsockopt for it. 
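As a rough illustration, a restore path could use the new option like this (a minimal sketch, assuming uapi headers that already carry SO_BUF_LOCK from this patch; the fallback define uses the asm-generic value and differs on some architectures):

#include <sys/socket.h>

#ifndef SO_BUF_LOCK
#define SO_BUF_LOCK 72          /* asm-generic value from this patch */
#endif

/* Resize a buffer for queue restore, then put the original
 * auto-adjustment state back. */
static int resize_then_restore_locks(int fd, int sndbuf)
{
        int locks = 0;
        socklen_t len = sizeof(locks);

        /* remember whether auto-adjustment was already locked */
        if (getsockopt(fd, SOL_SOCKET, SO_BUF_LOCK, &locks, &len) < 0)
                return -1;

        /* SO_SNDBUF sets SOCK_SNDBUF_LOCK as a side effect ... */
        if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf, sizeof(sndbuf)) < 0)
                return -1;

        /* ... so restore the saved lock bits afterwards */
        return setsockopt(fd, SOL_SOCKET, SO_BUF_LOCK, &locks, sizeof(locks));
}
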
Let's also export SOCK_SNDBUF_LOCK and SOCK_RCVBUF_LOCK flags to uAPI so that using these interface one can reenable automatic socket buffer adjustment on their sockets. Signed-off-by: Pavel Tikhomirov Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- arch/alpha/include/uapi/asm/socket.h | 2 ++ arch/mips/include/uapi/asm/socket.h | 2 ++ arch/parisc/include/uapi/asm/socket.h | 2 ++ arch/sparc/include/uapi/asm/socket.h | 2 ++ include/net/sock.h | 3 +-- include/uapi/asm-generic/socket.h | 2 ++ include/uapi/linux/socket.h | 5 +++++ net/core/sock.c | 13 +++++++++++++ 8 files changed, 29 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 6b3daba60987..1dd9baf4a6c2 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -129,6 +129,8 @@ #define SO_NETNS_COOKIE 71 +#define SO_BUF_LOCK 72 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index cdf404a831b2..1eaf6a1ca561 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -140,6 +140,8 @@ #define SO_NETNS_COOKIE 71 +#define SO_BUF_LOCK 72 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index 5b5351cdcb33..8baaad52d799 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -121,6 +121,8 @@ #define SO_NETNS_COOKIE 0x4045 +#define SO_BUF_LOCK 0x4046 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 92675dc380fa..e80ee8641ac3 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -122,6 +122,8 @@ #define SO_NETNS_COOKIE 0x0050 +#define SO_BUF_LOCK 0x0051 + #if !defined(__KERNEL__) diff --git a/include/net/sock.h b/include/net/sock.h index ff1be7e7e90b..6e761451c927 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -68,6 +68,7 @@ #include #include #include +#include /* * This structure really needs to be cleaned up. 
@@ -1438,8 +1439,6 @@ static inline int __sk_prot_rehash(struct sock *sk) #define RCV_SHUTDOWN 1 #define SEND_SHUTDOWN 2 -#define SOCK_SNDBUF_LOCK 1 -#define SOCK_RCVBUF_LOCK 2 #define SOCK_BINDADDR_LOCK 4 #define SOCK_BINDPORT_LOCK 8 diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index d588c244ec2f..1f0a2b4864e4 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -124,6 +124,8 @@ #define SO_NETNS_COOKIE 71 +#define SO_BUF_LOCK 72 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) diff --git a/include/uapi/linux/socket.h b/include/uapi/linux/socket.h index c3409c8ec0dd..eb0a9a5b6e71 100644 --- a/include/uapi/linux/socket.h +++ b/include/uapi/linux/socket.h @@ -26,4 +26,9 @@ struct __kernel_sockaddr_storage { }; }; +#define SOCK_SNDBUF_LOCK 1 +#define SOCK_RCVBUF_LOCK 2 + +#define SOCK_BUF_LOCK_MASK (SOCK_SNDBUF_LOCK | SOCK_RCVBUF_LOCK) + #endif /* _UAPI_LINUX_SOCKET_H */ diff --git a/net/core/sock.c b/net/core/sock.c index 9671c32e6ef5..aada649e07e8 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1358,6 +1358,15 @@ set_sndbuf: ret = sock_bindtoindex_locked(sk, val); break; + case SO_BUF_LOCK: + if (val & ~SOCK_BUF_LOCK_MASK) { + ret = -EINVAL; + break; + } + sk->sk_userlocks = val | (sk->sk_userlocks & + ~SOCK_BUF_LOCK_MASK); + break; + default: ret = -ENOPROTOOPT; break; @@ -1720,6 +1729,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val64 = sock_net(sk)->net_cookie; break; + case SO_BUF_LOCK: + v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK; + break; + default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). -- cgit v1.2.3-71-gd317 From 9d5adeecc409365e45cd89657f0392d0dd5c217f Mon Sep 17 00:00:00 2001 From: Stanimir Varbanov Date: Tue, 22 Jun 2021 13:39:57 +0200 Subject: media: v4l2-ctrls: Add intra-refresh period control Add a control to set intra-refresh period. Acked-by: Hans Verkuil Signed-off-by: Stanimir Varbanov Signed-off-by: Mauro Carvalho Chehab --- .../userspace-api/media/v4l/ext-ctrls-codec.rst | 17 ++++++++++++++++- drivers/media/v4l2-core/v4l2-ctrls-defs.c | 2 ++ include/uapi/linux/v4l2-controls.h | 1 + 3 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst index 999b2360cef8..976d34445a24 100644 --- a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst +++ b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst @@ -1175,9 +1175,24 @@ enum v4l2_mpeg_video_h264_entropy_mode - macroblocks refreshed every frame. Each frame a successive set of macroblocks is refreshed until the cycle completes and starts from the top of the frame. Setting this control to zero means that - macroblocks will not be refreshed. + macroblocks will not be refreshed. Note that this control will not + take effect when ``V4L2_CID_MPEG_VIDEO_INTRA_REFRESH_PERIOD`` control + is set to non zero value. Applicable to H264, H263 and MPEG4 encoder. +``V4L2_CID_MPEG_VIDEO_INTRA_REFRESH_PERIOD (integer)`` + Intra macroblock refresh period. This sets the period to refresh + the whole frame. In other words, this defines the number of frames + for which the whole frame will be intra-refreshed. 
An example: + setting period to 1 means that the whole frame will be refreshed, + setting period to 2 means that the half of macroblocks will be + intra-refreshed on frameX and the other half of macroblocks + will be refreshed in frameX + 1 and so on. Setting the period to + zero means no period is specified. + Note that if the client sets this control to non zero value the + ``V4L2_CID_MPEG_VIDEO_CYCLIC_INTRA_REFRESH_MB`` control shall be + ignored. Applicable to H264 and HEVC encoders. + ``V4L2_CID_MPEG_VIDEO_FRAME_RC_ENABLE (boolean)`` Frame level rate control enable. If this control is disabled then the quantization parameter for each frame type is constant and set diff --git a/drivers/media/v4l2-core/v4l2-ctrls-defs.c b/drivers/media/v4l2-core/v4l2-ctrls-defs.c index b6344bbf1e00..421300e13a41 100644 --- a/drivers/media/v4l2-core/v4l2-ctrls-defs.c +++ b/drivers/media/v4l2-core/v4l2-ctrls-defs.c @@ -833,6 +833,7 @@ const char *v4l2_ctrl_get_name(u32 id) case V4L2_CID_MPEG_VIDEO_DECODER_SLICE_INTERFACE: return "Decoder Slice Interface"; case V4L2_CID_MPEG_VIDEO_DECODER_MPEG4_DEBLOCK_FILTER: return "MPEG4 Loop Filter Enable"; case V4L2_CID_MPEG_VIDEO_CYCLIC_INTRA_REFRESH_MB: return "Number of Intra Refresh MBs"; + case V4L2_CID_MPEG_VIDEO_INTRA_REFRESH_PERIOD: return "Intra Refresh Period"; case V4L2_CID_MPEG_VIDEO_FRAME_RC_ENABLE: return "Frame Level Rate Control Enable"; case V4L2_CID_MPEG_VIDEO_MB_RC_ENABLE: return "H264 MB Level Rate Control"; case V4L2_CID_MPEG_VIDEO_HEADER_MODE: return "Sequence Header Mode"; @@ -1258,6 +1259,7 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type, case V4L2_CID_MPEG_VIDEO_MV_H_SEARCH_RANGE: case V4L2_CID_MPEG_VIDEO_MV_V_SEARCH_RANGE: case V4L2_CID_MPEG_VIDEO_DEC_DISPLAY_DELAY: + case V4L2_CID_MPEG_VIDEO_INTRA_REFRESH_PERIOD: *type = V4L2_CTRL_TYPE_INTEGER; break; case V4L2_CID_MPEG_VIDEO_LTR_COUNT: diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index fdf97a6d7d18..5532b5f68493 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -435,6 +435,7 @@ enum v4l2_mpeg_video_multi_slice_mode { #define V4L2_CID_MPEG_VIDEO_FRAME_LTR_INDEX (V4L2_CID_CODEC_BASE+233) #define V4L2_CID_MPEG_VIDEO_USE_LTR_FRAMES (V4L2_CID_CODEC_BASE+234) #define V4L2_CID_MPEG_VIDEO_DEC_CONCEAL_COLOR (V4L2_CID_CODEC_BASE+235) +#define V4L2_CID_MPEG_VIDEO_INTRA_REFRESH_PERIOD (V4L2_CID_CODEC_BASE+236) /* CIDs for the MPEG-2 Part 2 (H.262) codec */ #define V4L2_CID_MPEG_VIDEO_MPEG2_LEVEL (V4L2_CID_CODEC_BASE+270) -- cgit v1.2.3-71-gd317 From 46abe13b5e3db187e52cd0de06c07bbce010726c Mon Sep 17 00:00:00 2001 From: Igor Skalkin Date: Tue, 3 Aug 2021 14:10:24 +0100 Subject: firmware: arm_scmi: Add virtio transport This transport enables communications with an SCMI platform through virtio; the SCMI platform will be represented by a virtio device. Implement an SCMI virtio driver according to the virtio SCMI device spec [1]. Virtio device id 32 has been reserved for the SCMI device [2]. The virtio transport has one Tx channel (virtio cmdq, A2P channel) and at most one Rx channel (virtio eventq, P2A channel). The following feature bit defined in [1] is not implemented: VIRTIO_SCMI_F_SHARED_MEMORY. The number of messages which can be pending simultaneously is restricted according to the virtqueue capacity negotiated at probing time. As soon as Rx channel message buffers are allocated or have been read out by the arm-scmi driver, feed them back to the virtio device. 
Since some virtio devices may not have the short response time exhibited by SCMI platforms using other transports, set a generous response timeout. SCMI polling mode is not supported by this virtio transport since deemed meaningless: polling mode operation is offered by the SCMI core to those transports that could not provide a completion interrupt on the TX path, which is never the case for virtio whose core callbacks can easily call into core scmi_rx_callback upon messages reception. [1] https://github.com/oasis-tcs/virtio-spec/blob/master/virtio-scmi.tex [2] https://www.oasis-open.org/committees/ballot.php?id=3496 Link: https://lore.kernel.org/r/20210803131024.40280-16-cristian.marussi@arm.com Cc: "Michael S. Tsirkin" Cc: Jason Wang Co-developed-by: Peter Hilber Co-developed-by: Cristian Marussi Signed-off-by: Igor Skalkin [ Peter: Adapted patch for submission to upstream. ] Signed-off-by: Peter Hilber [ Cristian: simplified driver logic, changed link_supplier and channel available/setup logic, removed dummy callbacks ] Signed-off-by: Cristian Marussi Signed-off-by: Sudeep Holla --- MAINTAINERS | 1 + drivers/firmware/arm_scmi/Kconfig | 11 + drivers/firmware/arm_scmi/Makefile | 1 + drivers/firmware/arm_scmi/common.h | 3 + drivers/firmware/arm_scmi/driver.c | 3 + drivers/firmware/arm_scmi/virtio.c | 491 +++++++++++++++++++++++++++++++++++++ include/uapi/linux/virtio_ids.h | 1 + include/uapi/linux/virtio_scmi.h | 24 ++ 8 files changed, 535 insertions(+) create mode 100644 drivers/firmware/arm_scmi/virtio.c create mode 100644 include/uapi/linux/virtio_scmi.h (limited to 'include/uapi') diff --git a/MAINTAINERS b/MAINTAINERS index a61f4f3b78a9..db1c7b74642e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17940,6 +17940,7 @@ F: drivers/regulator/scmi-regulator.c F: drivers/reset/reset-scmi.c F: include/linux/sc[mp]i_protocol.h F: include/trace/events/scmi.h +F: include/uapi/linux/virtio_scmi.h SYSTEM RESET/SHUTDOWN DRIVERS M: Sebastian Reichel diff --git a/drivers/firmware/arm_scmi/Kconfig b/drivers/firmware/arm_scmi/Kconfig index 24fed705b02c..7f4d2435503b 100644 --- a/drivers/firmware/arm_scmi/Kconfig +++ b/drivers/firmware/arm_scmi/Kconfig @@ -66,6 +66,17 @@ config ARM_SCMI_TRANSPORT_SMC If you want the ARM SCMI PROTOCOL stack to include support for a transport based on SMC, answer Y. +config ARM_SCMI_TRANSPORT_VIRTIO + bool "SCMI transport based on VirtIO" + depends on VIRTIO + select ARM_SCMI_HAVE_TRANSPORT + select ARM_SCMI_HAVE_MSG + help + This enables the virtio based transport for SCMI. + + If you want the ARM SCMI PROTOCOL stack to include support for a + transport based on VirtIO, answer Y. 
+ endif #ARM_SCMI_PROTOCOL config ARM_SCMI_POWER_DOMAIN diff --git a/drivers/firmware/arm_scmi/Makefile b/drivers/firmware/arm_scmi/Makefile index aaad9f6589aa..1dcf123d64ab 100644 --- a/drivers/firmware/arm_scmi/Makefile +++ b/drivers/firmware/arm_scmi/Makefile @@ -5,6 +5,7 @@ scmi-transport-$(CONFIG_ARM_SCMI_HAVE_SHMEM) = shmem.o scmi-transport-$(CONFIG_ARM_SCMI_TRANSPORT_MAILBOX) += mailbox.o scmi-transport-$(CONFIG_ARM_SCMI_TRANSPORT_SMC) += smc.o scmi-transport-$(CONFIG_ARM_SCMI_HAVE_MSG) += msg.o +scmi-transport-$(CONFIG_ARM_SCMI_TRANSPORT_VIRTIO) += virtio.o scmi-protocols-y = base.o clock.o perf.o power.o reset.o sensors.o system.o voltage.o scmi-module-objs := $(scmi-bus-y) $(scmi-driver-y) $(scmi-protocols-y) \ $(scmi-transport-y) diff --git a/drivers/firmware/arm_scmi/common.h b/drivers/firmware/arm_scmi/common.h index 7864c21269b0..dea1bfbe1052 100644 --- a/drivers/firmware/arm_scmi/common.h +++ b/drivers/firmware/arm_scmi/common.h @@ -418,6 +418,9 @@ extern const struct scmi_desc scmi_mailbox_desc; #ifdef CONFIG_ARM_SCMI_TRANSPORT_SMC extern const struct scmi_desc scmi_smc_desc; #endif +#ifdef CONFIG_ARM_SCMI_TRANSPORT_VIRTIO +extern const struct scmi_desc scmi_virtio_desc; +#endif void scmi_rx_callback(struct scmi_chan_info *cinfo, u32 msg_hdr, void *priv); void scmi_free_channel(struct scmi_chan_info *cinfo, struct idr *idr, int id); diff --git a/drivers/firmware/arm_scmi/driver.c b/drivers/firmware/arm_scmi/driver.c index aaca01a4d752..00fcacd06562 100644 --- a/drivers/firmware/arm_scmi/driver.c +++ b/drivers/firmware/arm_scmi/driver.c @@ -1983,6 +1983,9 @@ static const struct of_device_id scmi_of_match[] = { #endif #ifdef CONFIG_ARM_SCMI_TRANSPORT_SMC { .compatible = "arm,scmi-smc", .data = &scmi_smc_desc}, +#endif +#ifdef CONFIG_ARM_SCMI_TRANSPORT_VIRTIO + { .compatible = "arm,scmi-virtio", .data = &scmi_virtio_desc}, #endif { /* Sentinel */ }, }; diff --git a/drivers/firmware/arm_scmi/virtio.c b/drivers/firmware/arm_scmi/virtio.c new file mode 100644 index 000000000000..3dacf794b177 --- /dev/null +++ b/drivers/firmware/arm_scmi/virtio.c @@ -0,0 +1,491 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Virtio Transport driver for Arm System Control and Management Interface + * (SCMI). + * + * Copyright (C) 2020-2021 OpenSynergy. + * Copyright (C) 2021 ARM Ltd. + */ + +/** + * DOC: Theory of Operation + * + * The scmi-virtio transport implements a driver for the virtio SCMI device. + * + * There is one Tx channel (virtio cmdq, A2P channel) and at most one Rx + * channel (virtio eventq, P2A channel). Each channel is implemented through a + * virtqueue. Access to each virtqueue is protected by spinlocks. + */ + +#include +#include +#include +#include + +#include +#include + +#include "common.h" + +#define VIRTIO_SCMI_MAX_MSG_SIZE 128 /* Value may be increased. */ +#define VIRTIO_SCMI_MAX_PDU_SIZE \ + (VIRTIO_SCMI_MAX_MSG_SIZE + SCMI_MSG_MAX_PROT_OVERHEAD) +#define DESCRIPTORS_PER_TX_MSG 2 + +/** + * struct scmi_vio_channel - Transport channel information + * + * @vqueue: Associated virtqueue + * @cinfo: SCMI Tx or Rx channel + * @free_list: List of unused scmi_vio_msg, maintained for Tx channels only + * @is_rx: Whether channel is an Rx channel + * @ready: Whether transport user is ready to hear about channel + * @max_msg: Maximum number of pending messages for this channel. + * @lock: Protects access to all members except ready. + * @ready_lock: Protects access to ready. If required, it must be taken before + * lock. 
+ */ +struct scmi_vio_channel { + struct virtqueue *vqueue; + struct scmi_chan_info *cinfo; + struct list_head free_list; + bool is_rx; + bool ready; + unsigned int max_msg; + /* lock to protect access to all members except ready. */ + spinlock_t lock; + /* lock to rotects access to ready flag. */ + spinlock_t ready_lock; +}; + +/** + * struct scmi_vio_msg - Transport PDU information + * + * @request: SDU used for commands + * @input: SDU used for (delayed) responses and notifications + * @list: List which scmi_vio_msg may be part of + * @rx_len: Input SDU size in bytes, once input has been received + */ +struct scmi_vio_msg { + struct scmi_msg_payld *request; + struct scmi_msg_payld *input; + struct list_head list; + unsigned int rx_len; +}; + +/* Only one SCMI VirtIO device can possibly exist */ +static struct virtio_device *scmi_vdev; + +static bool scmi_vio_have_vq_rx(struct virtio_device *vdev) +{ + return virtio_has_feature(vdev, VIRTIO_SCMI_F_P2A_CHANNELS); +} + +static int scmi_vio_feed_vq_rx(struct scmi_vio_channel *vioch, + struct scmi_vio_msg *msg) +{ + struct scatterlist sg_in; + int rc; + unsigned long flags; + + sg_init_one(&sg_in, msg->input, VIRTIO_SCMI_MAX_PDU_SIZE); + + spin_lock_irqsave(&vioch->lock, flags); + + rc = virtqueue_add_inbuf(vioch->vqueue, &sg_in, 1, msg, GFP_ATOMIC); + if (rc) + dev_err_once(vioch->cinfo->dev, + "failed to add to virtqueue (%d)\n", rc); + else + virtqueue_kick(vioch->vqueue); + + spin_unlock_irqrestore(&vioch->lock, flags); + + return rc; +} + +static void scmi_finalize_message(struct scmi_vio_channel *vioch, + struct scmi_vio_msg *msg) +{ + if (vioch->is_rx) { + scmi_vio_feed_vq_rx(vioch, msg); + } else { + unsigned long flags; + + spin_lock_irqsave(&vioch->lock, flags); + list_add(&msg->list, &vioch->free_list); + spin_unlock_irqrestore(&vioch->lock, flags); + } +} + +static void scmi_vio_complete_cb(struct virtqueue *vqueue) +{ + unsigned long ready_flags; + unsigned long flags; + unsigned int length; + struct scmi_vio_channel *vioch; + struct scmi_vio_msg *msg; + bool cb_enabled = true; + + if (WARN_ON_ONCE(!vqueue->vdev->priv)) + return; + vioch = &((struct scmi_vio_channel *)vqueue->vdev->priv)[vqueue->index]; + + for (;;) { + spin_lock_irqsave(&vioch->ready_lock, ready_flags); + + if (!vioch->ready) { + if (!cb_enabled) + (void)virtqueue_enable_cb(vqueue); + goto unlock_ready_out; + } + + spin_lock_irqsave(&vioch->lock, flags); + if (cb_enabled) { + virtqueue_disable_cb(vqueue); + cb_enabled = false; + } + msg = virtqueue_get_buf(vqueue, &length); + if (!msg) { + if (virtqueue_enable_cb(vqueue)) + goto unlock_out; + cb_enabled = true; + } + spin_unlock_irqrestore(&vioch->lock, flags); + + if (msg) { + msg->rx_len = length; + scmi_rx_callback(vioch->cinfo, + msg_read_header(msg->input), msg); + + scmi_finalize_message(vioch, msg); + } + + spin_unlock_irqrestore(&vioch->ready_lock, ready_flags); + } + +unlock_out: + spin_unlock_irqrestore(&vioch->lock, flags); +unlock_ready_out: + spin_unlock_irqrestore(&vioch->ready_lock, ready_flags); +} + +static const char *const scmi_vio_vqueue_names[] = { "tx", "rx" }; + +static vq_callback_t *scmi_vio_complete_callbacks[] = { + scmi_vio_complete_cb, + scmi_vio_complete_cb +}; + +static unsigned int virtio_get_max_msg(struct scmi_chan_info *base_cinfo) +{ + struct scmi_vio_channel *vioch = base_cinfo->transport_info; + + return vioch->max_msg; +} + +static int virtio_link_supplier(struct device *dev) +{ + if (!scmi_vdev) { + dev_notice_once(dev, + "Deferring probe after not finding a bound 
scmi-virtio device\n"); + return -EPROBE_DEFER; + } + + if (!device_link_add(dev, &scmi_vdev->dev, + DL_FLAG_AUTOREMOVE_CONSUMER)) { + dev_err(dev, "Adding link to supplier virtio device failed\n"); + return -ECANCELED; + } + + return 0; +} + +static bool virtio_chan_available(struct device *dev, int idx) +{ + struct scmi_vio_channel *channels, *vioch = NULL; + + if (WARN_ON_ONCE(!scmi_vdev)) + return false; + + channels = (struct scmi_vio_channel *)scmi_vdev->priv; + + switch (idx) { + case VIRTIO_SCMI_VQ_TX: + vioch = &channels[VIRTIO_SCMI_VQ_TX]; + break; + case VIRTIO_SCMI_VQ_RX: + if (scmi_vio_have_vq_rx(scmi_vdev)) + vioch = &channels[VIRTIO_SCMI_VQ_RX]; + break; + default: + return false; + } + + return vioch && !vioch->cinfo ? true : false; +} + +static int virtio_chan_setup(struct scmi_chan_info *cinfo, struct device *dev, + bool tx) +{ + unsigned long flags; + struct scmi_vio_channel *vioch; + int index = tx ? VIRTIO_SCMI_VQ_TX : VIRTIO_SCMI_VQ_RX; + int i; + + if (!scmi_vdev) + return -EPROBE_DEFER; + + vioch = &((struct scmi_vio_channel *)scmi_vdev->priv)[index]; + + for (i = 0; i < vioch->max_msg; i++) { + struct scmi_vio_msg *msg; + + msg = devm_kzalloc(cinfo->dev, sizeof(*msg), GFP_KERNEL); + if (!msg) + return -ENOMEM; + + if (tx) { + msg->request = devm_kzalloc(cinfo->dev, + VIRTIO_SCMI_MAX_PDU_SIZE, + GFP_KERNEL); + if (!msg->request) + return -ENOMEM; + } + + msg->input = devm_kzalloc(cinfo->dev, VIRTIO_SCMI_MAX_PDU_SIZE, + GFP_KERNEL); + if (!msg->input) + return -ENOMEM; + + if (tx) { + spin_lock_irqsave(&vioch->lock, flags); + list_add_tail(&msg->list, &vioch->free_list); + spin_unlock_irqrestore(&vioch->lock, flags); + } else { + scmi_vio_feed_vq_rx(vioch, msg); + } + } + + spin_lock_irqsave(&vioch->lock, flags); + cinfo->transport_info = vioch; + /* Indirectly setting channel not available any more */ + vioch->cinfo = cinfo; + spin_unlock_irqrestore(&vioch->lock, flags); + + spin_lock_irqsave(&vioch->ready_lock, flags); + vioch->ready = true; + spin_unlock_irqrestore(&vioch->ready_lock, flags); + + return 0; +} + +static int virtio_chan_free(int id, void *p, void *data) +{ + unsigned long flags; + struct scmi_chan_info *cinfo = p; + struct scmi_vio_channel *vioch = cinfo->transport_info; + + spin_lock_irqsave(&vioch->ready_lock, flags); + vioch->ready = false; + spin_unlock_irqrestore(&vioch->ready_lock, flags); + + scmi_free_channel(cinfo, data, id); + + spin_lock_irqsave(&vioch->lock, flags); + vioch->cinfo = NULL; + spin_unlock_irqrestore(&vioch->lock, flags); + + return 0; +} + +static int virtio_send_message(struct scmi_chan_info *cinfo, + struct scmi_xfer *xfer) +{ + struct scmi_vio_channel *vioch = cinfo->transport_info; + struct scatterlist sg_out; + struct scatterlist sg_in; + struct scatterlist *sgs[DESCRIPTORS_PER_TX_MSG] = { &sg_out, &sg_in }; + unsigned long flags; + int rc; + struct scmi_vio_msg *msg; + + spin_lock_irqsave(&vioch->lock, flags); + + if (list_empty(&vioch->free_list)) { + spin_unlock_irqrestore(&vioch->lock, flags); + return -EBUSY; + } + + msg = list_first_entry(&vioch->free_list, typeof(*msg), list); + list_del(&msg->list); + + msg_tx_prepare(msg->request, xfer); + + sg_init_one(&sg_out, msg->request, msg_command_size(xfer)); + sg_init_one(&sg_in, msg->input, msg_response_size(xfer)); + + rc = virtqueue_add_sgs(vioch->vqueue, sgs, 1, 1, msg, GFP_ATOMIC); + if (rc) { + list_add(&msg->list, &vioch->free_list); + dev_err_once(vioch->cinfo->dev, + "%s() failed to add to virtqueue (%d)\n", __func__, + rc); + } else { + 
virtqueue_kick(vioch->vqueue); + } + + spin_unlock_irqrestore(&vioch->lock, flags); + + return rc; +} + +static void virtio_fetch_response(struct scmi_chan_info *cinfo, + struct scmi_xfer *xfer) +{ + struct scmi_vio_msg *msg = xfer->priv; + + if (msg) { + msg_fetch_response(msg->input, msg->rx_len, xfer); + xfer->priv = NULL; + } +} + +static void virtio_fetch_notification(struct scmi_chan_info *cinfo, + size_t max_len, struct scmi_xfer *xfer) +{ + struct scmi_vio_msg *msg = xfer->priv; + + if (msg) { + msg_fetch_notification(msg->input, msg->rx_len, max_len, xfer); + xfer->priv = NULL; + } +} + +static const struct scmi_transport_ops scmi_virtio_ops = { + .link_supplier = virtio_link_supplier, + .chan_available = virtio_chan_available, + .chan_setup = virtio_chan_setup, + .chan_free = virtio_chan_free, + .get_max_msg = virtio_get_max_msg, + .send_message = virtio_send_message, + .fetch_response = virtio_fetch_response, + .fetch_notification = virtio_fetch_notification, +}; + +static int scmi_vio_probe(struct virtio_device *vdev) +{ + struct device *dev = &vdev->dev; + struct scmi_vio_channel *channels; + bool have_vq_rx; + int vq_cnt; + int i; + int ret; + struct virtqueue *vqs[VIRTIO_SCMI_VQ_MAX_CNT]; + + /* Only one SCMI VirtiO device allowed */ + if (scmi_vdev) + return -EINVAL; + + have_vq_rx = scmi_vio_have_vq_rx(vdev); + vq_cnt = have_vq_rx ? VIRTIO_SCMI_VQ_MAX_CNT : 1; + + channels = devm_kcalloc(dev, vq_cnt, sizeof(*channels), GFP_KERNEL); + if (!channels) + return -ENOMEM; + + if (have_vq_rx) + channels[VIRTIO_SCMI_VQ_RX].is_rx = true; + + ret = virtio_find_vqs(vdev, vq_cnt, vqs, scmi_vio_complete_callbacks, + scmi_vio_vqueue_names, NULL); + if (ret) { + dev_err(dev, "Failed to get %d virtqueue(s)\n", vq_cnt); + return ret; + } + + for (i = 0; i < vq_cnt; i++) { + unsigned int sz; + + spin_lock_init(&channels[i].lock); + spin_lock_init(&channels[i].ready_lock); + INIT_LIST_HEAD(&channels[i].free_list); + channels[i].vqueue = vqs[i]; + + sz = virtqueue_get_vring_size(channels[i].vqueue); + /* Tx messages need multiple descriptors. */ + if (!channels[i].is_rx) + sz /= DESCRIPTORS_PER_TX_MSG; + + if (sz > MSG_TOKEN_MAX) { + dev_info_once(dev, + "%s virtqueue could hold %d messages. Only %ld allowed to be pending.\n", + channels[i].is_rx ? 
"rx" : "tx", + sz, MSG_TOKEN_MAX); + sz = MSG_TOKEN_MAX; + } + channels[i].max_msg = sz; + } + + vdev->priv = channels; + scmi_vdev = vdev; + + return 0; +} + +static void scmi_vio_remove(struct virtio_device *vdev) +{ + vdev->config->reset(vdev); + vdev->config->del_vqs(vdev); + scmi_vdev = NULL; +} + +static int scmi_vio_validate(struct virtio_device *vdev) +{ + if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) { + dev_err(&vdev->dev, + "device does not comply with spec version 1.x\n"); + return -EINVAL; + } + + return 0; +} + +static unsigned int features[] = { + VIRTIO_SCMI_F_P2A_CHANNELS, +}; + +static const struct virtio_device_id id_table[] = { + { VIRTIO_ID_SCMI, VIRTIO_DEV_ANY_ID }, + { 0 } +}; + +static struct virtio_driver virtio_scmi_driver = { + .driver.name = "scmi-virtio", + .driver.owner = THIS_MODULE, + .feature_table = features, + .feature_table_size = ARRAY_SIZE(features), + .id_table = id_table, + .probe = scmi_vio_probe, + .remove = scmi_vio_remove, + .validate = scmi_vio_validate, +}; + +static int __init virtio_scmi_init(void) +{ + return register_virtio_driver(&virtio_scmi_driver); +} + +static void __exit virtio_scmi_exit(void) +{ + unregister_virtio_driver(&virtio_scmi_driver); +} + +const struct scmi_desc scmi_virtio_desc = { + .transport_init = virtio_scmi_init, + .transport_exit = virtio_scmi_exit, + .ops = &scmi_virtio_ops, + .max_rx_timeout_ms = 60000, /* for non-realtime virtio devices */ + .max_msg = 0, /* overridden by virtio_get_max_msg() */ + .max_msg_size = VIRTIO_SCMI_MAX_MSG_SIZE, +}; diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index 70a8057ad4bb..f74155f6882d 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -55,6 +55,7 @@ #define VIRTIO_ID_FS 26 /* virtio filesystem */ #define VIRTIO_ID_PMEM 27 /* virtio pmem */ #define VIRTIO_ID_MAC80211_HWSIM 29 /* virtio mac80211-hwsim */ +#define VIRTIO_ID_SCMI 32 /* virtio SCMI */ #define VIRTIO_ID_BT 40 /* virtio bluetooth */ /* diff --git a/include/uapi/linux/virtio_scmi.h b/include/uapi/linux/virtio_scmi.h new file mode 100644 index 000000000000..f8ddd04a3ace --- /dev/null +++ b/include/uapi/linux/virtio_scmi.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* + * Copyright (C) 2020-2021 OpenSynergy GmbH + * Copyright (C) 2021 ARM Ltd. + */ + +#ifndef _UAPI_LINUX_VIRTIO_SCMI_H +#define _UAPI_LINUX_VIRTIO_SCMI_H + +#include + +/* Device implements some SCMI notifications, or delayed responses. */ +#define VIRTIO_SCMI_F_P2A_CHANNELS 0 + +/* Device implements any SCMI statistics shared memory region */ +#define VIRTIO_SCMI_F_SHARED_MEMORY 1 + +/* Virtqueues */ + +#define VIRTIO_SCMI_VQ_TX 0 /* cmdq */ +#define VIRTIO_SCMI_VQ_RX 1 /* eventq */ +#define VIRTIO_SCMI_VQ_MAX_CNT 2 + +#endif /* _UAPI_LINUX_VIRTIO_SCMI_H */ -- cgit v1.2.3-71-gd317 From b65a9489730a2494f7a2a33a6eb0a12b8f1dd193 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 23 Jul 2021 12:34:05 +0100 Subject: drm/i915/userptr: Probe existence of backing struct pages upon creation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Jason Ekstrand requested a more efficient method than userptr+set-domain to determine if the userptr object was backed by a complete set of pages upon creation. 
To be more efficient than simply populating the userptr using get_user_pages() (as done by the call to set-domain or execbuf), we can walk the tree of vm_area_struct and check for gaps or vma not backed by struct page (VM_PFNMAP). The question is how to handle VM_MIXEDMAP which may be either struct page or pfn backed... With discrete we are going to drop support for set_domain(), so offering a way to probe the pages, without having to resort to dummy batches has been requested. v2: - add new query param for the PROBE flag, so userspace can easily check if the kernel supports it(Jason). - use mmap_read_{lock, unlock}. - add some kernel-doc. v3: - In the docs also mention that PROBE doesn't guarantee that the pages will remain valid by the time they are actually used(Tvrtko). - Add a small comment for the hole finding logic(Jason). - Move the param next to all the other params which just return true. Testcase: igt/gem_userptr_blits/probe Signed-off-by: Chris Wilson Signed-off-by: Matthew Auld Cc: Thomas Hellström Cc: Maarten Lankhorst Cc: Tvrtko Ursulin Cc: Jordan Justen Cc: Kenneth Graunke Cc: Jason Ekstrand Cc: Daniel Vetter Cc: Ramalingam C Reviewed-by: Tvrtko Ursulin Acked-by: Kenneth Graunke Reviewed-by: Jason Ekstrand Acked-by: Maarten Lankhorst Signed-off-by: Maarten Lankhorst Link: https://patchwork.freedesktop.org/patch/msgid/20210723113405.427004-1-matthew.auld@intel.com --- drivers/gpu/drm/i915/gem/i915_gem_userptr.c | 41 ++++++++++++++++++++++++++++- drivers/gpu/drm/i915/i915_getparam.c | 1 + include/uapi/drm/i915_drm.h | 20 ++++++++++++++ 3 files changed, 61 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c index 56edfeff8c02..468a7a617fbf 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c @@ -422,6 +422,34 @@ static const struct drm_i915_gem_object_ops i915_gem_userptr_ops = { #endif +static int +probe_range(struct mm_struct *mm, unsigned long addr, unsigned long len) +{ + const unsigned long end = addr + len; + struct vm_area_struct *vma; + int ret = -EFAULT; + + mmap_read_lock(mm); + for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) { + /* Check for holes, note that we also update the addr below */ + if (vma->vm_start > addr) + break; + + if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) + break; + + if (vma->vm_end >= end) { + ret = 0; + break; + } + + addr = vma->vm_end; + } + mmap_read_unlock(mm); + + return ret; +} + /* * Creates a new mm object that wraps some normal memory from the process * context - user memory. @@ -477,7 +505,8 @@ i915_gem_userptr_ioctl(struct drm_device *dev, } if (args->flags & ~(I915_USERPTR_READ_ONLY | - I915_USERPTR_UNSYNCHRONIZED)) + I915_USERPTR_UNSYNCHRONIZED | + I915_USERPTR_PROBE)) return -EINVAL; if (i915_gem_object_size_2big(args->user_size)) @@ -504,6 +533,16 @@ i915_gem_userptr_ioctl(struct drm_device *dev, return -ENODEV; } + if (args->flags & I915_USERPTR_PROBE) { + /* + * Check that the range pointed to represents real struct + * pages and not iomappings (at this moment in time!) 
+ */ + ret = probe_range(current->mm, args->user_ptr, args->user_size); + if (ret) + return ret; + } + #ifdef CONFIG_MMU_NOTIFIER obj = i915_gem_object_alloc(); if (obj == NULL) diff --git a/drivers/gpu/drm/i915/i915_getparam.c b/drivers/gpu/drm/i915/i915_getparam.c index e289397d9178..77490cb5ff9c 100644 --- a/drivers/gpu/drm/i915/i915_getparam.c +++ b/drivers/gpu/drm/i915/i915_getparam.c @@ -134,6 +134,7 @@ int i915_getparam_ioctl(struct drm_device *dev, void *data, case I915_PARAM_HAS_EXEC_FENCE_ARRAY: case I915_PARAM_HAS_EXEC_SUBMIT_FENCE: case I915_PARAM_HAS_EXEC_TIMELINE_FENCES: + case I915_PARAM_HAS_USERPTR_PROBE: /* For the time being all of these are always true; * if some supported hardware does not have one of these * features this value needs to be provided from diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 7f13d241417f..bde5860b3686 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -683,6 +683,9 @@ typedef struct drm_i915_irq_wait { */ #define I915_PARAM_HAS_EXEC_TIMELINE_FENCES 55 +/* Query if the kernel supports the I915_USERPTR_PROBE flag. */ +#define I915_PARAM_HAS_USERPTR_PROBE 56 + /* Must be kept compact -- no holes and well documented */ typedef struct drm_i915_getparam { @@ -2231,12 +2234,29 @@ struct drm_i915_gem_userptr { * through the GTT. If the HW can't support readonly access, an error is * returned. * + * I915_USERPTR_PROBE: + * + * Probe the provided @user_ptr range and validate that the @user_ptr is + * indeed pointing to normal memory and that the range is also valid. + * For example if some garbage address is given to the kernel, then this + * should complain. + * + * Returns -EFAULT if the probe failed. + * + * Note that this doesn't populate the backing pages, and also doesn't + * guarantee that the object will remain valid when the object is + * eventually used. + * + * The kernel supports this feature if I915_PARAM_HAS_USERPTR_PROBE + * returns a non-zero value. + * * I915_USERPTR_UNSYNCHRONIZED: * * NOT USED. Setting this flag will result in an error. */ __u32 flags; #define I915_USERPTR_READ_ONLY 0x1 +#define I915_USERPTR_PROBE 0x2 #define I915_USERPTR_UNSYNCHRONIZED 0x80000000 /** * @handle: Returned handle for the object. -- cgit v1.2.3-71-gd317 From db243b796439c0caba47865564d8acd18a301d18 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 4 Aug 2021 15:45:36 -0500 Subject: net/ipv4/ipv6: Replace one-element arraya with flexible-array members MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. 
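Stripped down to a hypothetical pair of structs (names invented for illustration), the conversion pattern used below keeps the legacy one-element array for ABI compatibility while new code indexes a true flexible-array member:

#include <linux/types.h>

/* before: sizeof() silently includes one trailing element */
struct old_rec {
        __u32 count;
        __u32 vals[1];
};

/* after: old name and layout preserved for userspace,
 * new kernel code uses vals_flex[] */
struct new_rec {
        union {
                struct {
                        __u32 count_aux;
                        __u32 vals[1];          /* legacy layout, kept for ABI */
                };
                struct {
                        __u32 count;
                        __u32 vals_flex[];      /* what new code indexes */
                };
        };
};
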
Use an anonymous union with a couple of anonymous structs in order to keep userspace unchanged and refactor the related code accordingly: $ pahole -C group_filter net/ipv4/ip_sockglue.o struct group_filter { union { struct { __u32 gf_interface_aux; /* 0 4 */ /* XXX 4 bytes hole, try to pack */ struct __kernel_sockaddr_storage gf_group_aux; /* 8 128 */ /* --- cacheline 2 boundary (128 bytes) was 8 bytes ago --- */ __u32 gf_fmode_aux; /* 136 4 */ __u32 gf_numsrc_aux; /* 140 4 */ struct __kernel_sockaddr_storage gf_slist[1]; /* 144 128 */ }; /* 0 272 */ struct { __u32 gf_interface; /* 0 4 */ /* XXX 4 bytes hole, try to pack */ struct __kernel_sockaddr_storage gf_group; /* 8 128 */ /* --- cacheline 2 boundary (128 bytes) was 8 bytes ago --- */ __u32 gf_fmode; /* 136 4 */ __u32 gf_numsrc; /* 140 4 */ struct __kernel_sockaddr_storage gf_slist_flex[0]; /* 144 0 */ }; /* 0 144 */ }; /* 0 272 */ /* size: 272, cachelines: 5, members: 1 */ /* last cacheline: 16 bytes */ }; $ pahole -C compat_group_filter net/ipv4/ip_sockglue.o struct compat_group_filter { union { struct { __u32 gf_interface_aux; /* 0 4 */ struct __kernel_sockaddr_storage gf_group_aux __attribute__((__aligned__(4))); /* 4 128 */ /* --- cacheline 2 boundary (128 bytes) was 4 bytes ago --- */ __u32 gf_fmode_aux; /* 132 4 */ __u32 gf_numsrc_aux; /* 136 4 */ struct __kernel_sockaddr_storage gf_slist[1] __attribute__((__aligned__(4))); /* 140 128 */ } __attribute__((__packed__)) __attribute__((__aligned__(4))); /* 0 268 */ struct { __u32 gf_interface; /* 0 4 */ struct __kernel_sockaddr_storage gf_group __attribute__((__aligned__(4))); /* 4 128 */ /* --- cacheline 2 boundary (128 bytes) was 4 bytes ago --- */ __u32 gf_fmode; /* 132 4 */ __u32 gf_numsrc; /* 136 4 */ struct __kernel_sockaddr_storage gf_slist_flex[0] __attribute__((__aligned__(4))); /* 140 0 */ } __attribute__((__packed__)) __attribute__((__aligned__(4))); /* 0 140 */ } __attribute__((__aligned__(1))); /* 0 268 */ /* size: 268, cachelines: 5, members: 1 */ /* forced alignments: 1 */ /* last cacheline: 12 bytes */ } __attribute__((__packed__)); This helps with the ongoing efforts to globally enable -Warray-bounds and get us closer to being able to tighten the FORTIFY_SOURCE routines on memcpy(). [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.10/process/deprecated.html#zero-length-and-one-element-arrays Link: https://github.com/KSPP/linux/issues/79 Link: https://github.com/KSPP/linux/issues/109 Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. 
Miller --- include/net/compat.h | 27 ++++++++++++++++++++------- include/uapi/linux/in.h | 21 ++++++++++++++++----- net/ipv4/ip_sockglue.c | 19 ++++++++++--------- net/ipv6/ipv6_sockglue.c | 18 +++++++++--------- 4 files changed, 55 insertions(+), 30 deletions(-) (limited to 'include/uapi') diff --git a/include/net/compat.h b/include/net/compat.h index 84805bdc4435..595fee069b82 100644 --- a/include/net/compat.h +++ b/include/net/compat.h @@ -71,13 +71,26 @@ struct compat_group_source_req { } __packed; struct compat_group_filter { - __u32 gf_interface; - struct __kernel_sockaddr_storage gf_group - __aligned(4); - __u32 gf_fmode; - __u32 gf_numsrc; - struct __kernel_sockaddr_storage gf_slist[1] - __aligned(4); + union { + struct { + __u32 gf_interface_aux; + struct __kernel_sockaddr_storage gf_group_aux + __aligned(4); + __u32 gf_fmode_aux; + __u32 gf_numsrc_aux; + struct __kernel_sockaddr_storage gf_slist[1] + __aligned(4); + } __packed; + struct { + __u32 gf_interface; + struct __kernel_sockaddr_storage gf_group + __aligned(4); + __u32 gf_fmode; + __u32 gf_numsrc; + struct __kernel_sockaddr_storage gf_slist_flex[] + __aligned(4); + } __packed; + }; } __packed; #endif /* NET_COMPAT_H */ diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index 193b7cf1f0ac..14168225cecd 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -222,11 +222,22 @@ struct group_source_req { }; struct group_filter { - __u32 gf_interface; /* interface index */ - struct __kernel_sockaddr_storage gf_group; /* multicast address */ - __u32 gf_fmode; /* filter mode */ - __u32 gf_numsrc; /* number of sources */ - struct __kernel_sockaddr_storage gf_slist[1]; /* interface index */ + union { + struct { + __u32 gf_interface_aux; /* interface index */ + struct __kernel_sockaddr_storage gf_group_aux; /* multicast address */ + __u32 gf_fmode_aux; /* filter mode */ + __u32 gf_numsrc_aux; /* number of sources */ + struct __kernel_sockaddr_storage gf_slist[1]; /* interface index */ + }; + struct { + __u32 gf_interface; /* interface index */ + struct __kernel_sockaddr_storage gf_group; /* multicast address */ + __u32 gf_fmode; /* filter mode */ + __u32 gf_numsrc; /* number of sources */ + struct __kernel_sockaddr_storage gf_slist_flex[]; /* interface index */ + }; + }; }; #define GROUP_FILTER_SIZE(numsrc) \ diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 468969c75708..b297bb28556e 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -790,7 +790,8 @@ static int ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) goto out_free_gsf; err = set_mcast_msfilter(sk, gsf->gf_interface, gsf->gf_numsrc, - gsf->gf_fmode, &gsf->gf_group, gsf->gf_slist); + gsf->gf_fmode, &gsf->gf_group, + gsf->gf_slist_flex); out_free_gsf: kfree(gsf); return err; @@ -799,7 +800,7 @@ out_free_gsf: static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) { - const int size0 = offsetof(struct compat_group_filter, gf_slist); + const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter *gf32; unsigned int n; void *p; @@ -813,7 +814,7 @@ static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, p = kmalloc(optlen + 4, GFP_KERNEL); if (!p) return -ENOMEM; - gf32 = p + 4; /* we want ->gf_group and ->gf_slist aligned */ + gf32 = p + 4; /* we want ->gf_group and ->gf_slist_flex aligned */ err = -EFAULT; if (copy_from_sockptr(gf32, optval, optlen)) @@ -826,7 +827,7 @@ static int 
compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, goto out_free_gsf; err = -EINVAL; - if (offsetof(struct compat_group_filter, gf_slist[n]) > optlen) + if (offsetof(struct compat_group_filter, gf_slist_flex[n]) > optlen) goto out_free_gsf; /* numsrc >= (4G-140)/128 overflow in 32 bits */ @@ -834,7 +835,7 @@ static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, if (n > sock_net(sk)->ipv4.sysctl_igmp_max_msf) goto out_free_gsf; err = set_mcast_msfilter(sk, gf32->gf_interface, n, gf32->gf_fmode, - &gf32->gf_group, gf32->gf_slist); + &gf32->gf_group, gf32->gf_slist_flex); out_free_gsf: kfree(p); return err; @@ -1455,7 +1456,7 @@ static bool getsockopt_needs_rtnl(int optname) static int ip_get_mcast_msfilter(struct sock *sk, void __user *optval, int __user *optlen, int len) { - const int size0 = offsetof(struct group_filter, gf_slist); + const int size0 = offsetof(struct group_filter, gf_slist_flex); struct group_filter __user *p = optval; struct group_filter gsf; int num; @@ -1467,7 +1468,7 @@ static int ip_get_mcast_msfilter(struct sock *sk, void __user *optval, return -EFAULT; num = gsf.gf_numsrc; - err = ip_mc_gsfget(sk, &gsf, p->gf_slist); + err = ip_mc_gsfget(sk, &gsf, p->gf_slist_flex); if (err) return err; if (gsf.gf_numsrc < num) @@ -1481,7 +1482,7 @@ static int ip_get_mcast_msfilter(struct sock *sk, void __user *optval, static int compat_ip_get_mcast_msfilter(struct sock *sk, void __user *optval, int __user *optlen, int len) { - const int size0 = offsetof(struct compat_group_filter, gf_slist); + const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter __user *p = optval; struct compat_group_filter gf32; struct group_filter gf; @@ -1498,7 +1499,7 @@ static int compat_ip_get_mcast_msfilter(struct sock *sk, void __user *optval, num = gf.gf_numsrc = gf32.gf_numsrc; gf.gf_group = gf32.gf_group; - err = ip_mc_gsfget(sk, &gf, p->gf_slist); + err = ip_mc_gsfget(sk, &gf, p->gf_slist_flex); if (err) return err; if (gf.gf_numsrc < num) diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index a6804a7e34c1..e4bdb09c5586 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -225,7 +225,7 @@ static int ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) goto out_free_gsf; - ret = ip6_mc_msfilter(sk, gsf, gsf->gf_slist); + ret = ip6_mc_msfilter(sk, gsf, gsf->gf_slist_flex); out_free_gsf: kfree(gsf); return ret; @@ -234,7 +234,7 @@ out_free_gsf: static int compat_ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) { - const int size0 = offsetof(struct compat_group_filter, gf_slist); + const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter *gf32; void *p; int ret; @@ -249,7 +249,7 @@ static int compat_ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, if (!p) return -ENOMEM; - gf32 = p + 4; /* we want ->gf_group and ->gf_slist aligned */ + gf32 = p + 4; /* we want ->gf_group and ->gf_slist_flex aligned */ ret = -EFAULT; if (copy_from_sockptr(gf32, optval, optlen)) goto out_free_p; @@ -261,14 +261,14 @@ static int compat_ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, goto out_free_p; ret = -EINVAL; - if (offsetof(struct compat_group_filter, gf_slist[n]) > optlen) + if (offsetof(struct compat_group_filter, gf_slist_flex[n]) > optlen) goto out_free_p; ret = ip6_mc_msfilter(sk, &(struct group_filter){ .gf_interface = gf32->gf_interface, .gf_group = gf32->gf_group, 
.gf_fmode = gf32->gf_fmode, - .gf_numsrc = gf32->gf_numsrc}, gf32->gf_slist); + .gf_numsrc = gf32->gf_numsrc}, gf32->gf_slist_flex); out_free_p: kfree(p); @@ -1048,7 +1048,7 @@ static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt, static int ipv6_get_msfilter(struct sock *sk, void __user *optval, int __user *optlen, int len) { - const int size0 = offsetof(struct group_filter, gf_slist); + const int size0 = offsetof(struct group_filter, gf_slist_flex); struct group_filter __user *p = optval; struct group_filter gsf; int num; @@ -1062,7 +1062,7 @@ static int ipv6_get_msfilter(struct sock *sk, void __user *optval, return -EADDRNOTAVAIL; num = gsf.gf_numsrc; lock_sock(sk); - err = ip6_mc_msfget(sk, &gsf, p->gf_slist); + err = ip6_mc_msfget(sk, &gsf, p->gf_slist_flex); if (!err) { if (num > gsf.gf_numsrc) num = gsf.gf_numsrc; @@ -1077,7 +1077,7 @@ static int ipv6_get_msfilter(struct sock *sk, void __user *optval, static int compat_ipv6_get_msfilter(struct sock *sk, void __user *optval, int __user *optlen) { - const int size0 = offsetof(struct compat_group_filter, gf_slist); + const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter __user *p = optval; struct compat_group_filter gf32; struct group_filter gf; @@ -1100,7 +1100,7 @@ static int compat_ipv6_get_msfilter(struct sock *sk, void __user *optval, return -EADDRNOTAVAIL; lock_sock(sk); - err = ip6_mc_msfget(sk, &gf, p->gf_slist); + err = ip6_mc_msfget(sk, &gf, p->gf_slist_flex); release_sock(sk); if (err) return err; -- cgit v1.2.3-71-gd317 From 9344988d2979ce9eefe136a69efcf692615ebba8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 30 Jul 2021 15:14:22 +0200 Subject: netfilter: ctnetlink: allow to filter dump by status bits If CTA_STATUS is present, but CTA_STATUS_MASK is not, then the mask is automatically set to 'status', so that kernel returns those entries that have all of the requested bits set. This makes more sense than using a all-one mask since we'd hardly ever find a match. There are no other checks for status bits, so if e.g. userspace sets impossible combinations it will get an empty dump. If kernel would reject unknown status bits, then a program that works on a future kernel that has IPS_FOO bit fails on old kernels. Same for 'impossible' combinations: Kernel never sets ASSURED without first having set SEEN_REPLY, but its possible that a future kernel could do so. Therefore no sanity tests other than a 0-mask. 
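A sketch of a matching dump request from userspace, assuming libmnl; the attribute names come from this patch, and sending the message over the netfilter netlink socket is omitted:

#include <arpa/inet.h>
#include <sys/socket.h>
#include <libmnl/libmnl.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>
#include <linux/netfilter/nf_conntrack_common.h>

/* Build a conntrack dump that only returns ASSURED entries. */
static struct nlmsghdr *build_status_dump(char *buf)
{
        struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf);
        struct nfgenmsg *nfh;

        nlh->nlmsg_type = (NFNL_SUBSYS_CTNETLINK << 8) | IPCTNL_MSG_CT_GET;
        nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;

        nfh = mnl_nlmsg_put_extra_header(nlh, sizeof(*nfh));
        nfh->nfgen_family = AF_UNSPEC;
        nfh->version = NFNETLINK_V0;

        /* entries must have all requested bits set to be dumped */
        mnl_attr_put_u32(nlh, CTA_STATUS, htonl(IPS_ASSURED));
        /* optional: omitting CTA_STATUS_MASK makes the kernel reuse the value */
        mnl_attr_put_u32(nlh, CTA_STATUS_MASK, htonl(IPS_ASSURED));

        return nlh;
}
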
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nfnetlink_conntrack.h | 1 + net/netfilter/nf_conntrack_netlink.c | 34 +++++++++++++++++++++- 2 files changed, 34 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h index d8484be72fdc..c6e6d7d7d538 100644 --- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h +++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h @@ -56,6 +56,7 @@ enum ctattr_type { CTA_LABELS_MASK, CTA_SYNPROXY, CTA_FILTER, + CTA_STATUS_MASK, __CTA_MAX }; #define CTA_MAX (__CTA_MAX - 1) diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index e8368e66b0f5..eb35c6151fb0 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -868,6 +868,7 @@ struct ctnetlink_filter { struct nf_conntrack_zone zone; struct ctnetlink_filter_u32 mark; + struct ctnetlink_filter_u32 status; }; static const struct nla_policy cta_filter_nla_policy[CTA_FILTER_MAX + 1] = { @@ -927,6 +928,28 @@ static int ctnetlink_filter_parse_mark(struct ctnetlink_filter_u32 *mark, return 0; } +static int ctnetlink_filter_parse_status(struct ctnetlink_filter_u32 *status, + const struct nlattr * const cda[]) +{ + if (cda[CTA_STATUS]) { + status->val = ntohl(nla_get_be32(cda[CTA_STATUS])); + if (cda[CTA_STATUS_MASK]) + status->mask = ntohl(nla_get_be32(cda[CTA_STATUS_MASK])); + else + status->mask = status->val; + + /* status->val == 0? always true, else always false. */ + if (status->mask == 0) + return -EINVAL; + } else if (cda[CTA_STATUS_MASK]) { + return -EINVAL; + } + + /* CTA_STATUS is NLA_U32, if this fires UAPI needs to be extended */ + BUILD_BUG_ON(__IPS_MAX_BIT >= 32); + return 0; +} + static struct ctnetlink_filter * ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family) { @@ -948,6 +971,10 @@ ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family) if (err) goto err_filter; + err = ctnetlink_filter_parse_status(&filter->status, cda); + if (err) + goto err_filter; + if (!cda[CTA_FILTER]) return filter; @@ -1001,7 +1028,7 @@ err_filter: static bool ctnetlink_needs_filter(u8 family, const struct nlattr * const *cda) { - return family || cda[CTA_MARK] || cda[CTA_FILTER]; + return family || cda[CTA_MARK] || cda[CTA_FILTER] || cda[CTA_STATUS]; } static int ctnetlink_start(struct netlink_callback *cb) @@ -1094,6 +1121,7 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data) { struct ctnetlink_filter *filter = data; struct nf_conntrack_tuple *tuple; + u32 status; if (filter == NULL) goto out; @@ -1125,6 +1153,9 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data) if ((ct->mark & filter->mark.mask) != filter->mark.val) goto ignore_entry; #endif + status = (u32)READ_ONCE(ct->status); + if ((status & filter->status.mask) != filter->status.val) + goto ignore_entry; out: return 1; @@ -1507,6 +1538,7 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = { [CTA_LABELS_MASK] = { .type = NLA_BINARY, .len = NF_CT_LABELS_MAX_SIZE }, [CTA_FILTER] = { .type = NLA_NESTED }, + [CTA_STATUS_MASK] = { .type = NLA_U32 }, }; static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data) -- cgit v1.2.3-71-gd317 From a6e57c4af12bbacf927d7321c3aa894948653688 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 3 Aug 2021 00:15:54 +0200 Subject: netfilter: nfnetlink_hook: missing chain family The family is relevant 
for pseudo-families like NFPROTO_INET otherwise the user needs to rely on the hook function name to differentiate it from NFPROTO_IPV4 and NFPROTO_IPV6 names. Add nfnl_hook_chain_desc_attributes instead of using the existing NFTA_CHAIN_* attributes, since these do not provide a family number. Fixes: e2cf17d3774c ("netfilter: add new hook nfnl subsystem") Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nfnetlink_hook.h | 9 +++++++++ net/netfilter/nfnetlink_hook.c | 8 ++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nfnetlink_hook.h b/include/uapi/linux/netfilter/nfnetlink_hook.h index 912ec60b26b0..bbcd285b22e1 100644 --- a/include/uapi/linux/netfilter/nfnetlink_hook.h +++ b/include/uapi/linux/netfilter/nfnetlink_hook.h @@ -43,6 +43,15 @@ enum nfnl_hook_chain_info_attributes { }; #define NFNLA_HOOK_INFO_MAX (__NFNLA_HOOK_INFO_MAX - 1) +enum nfnl_hook_chain_desc_attributes { + NFNLA_CHAIN_UNSPEC, + NFNLA_CHAIN_TABLE, + NFNLA_CHAIN_FAMILY, + NFNLA_CHAIN_NAME, + __NFNLA_CHAIN_MAX, +}; +#define NFNLA_CHAIN_MAX (__NFNLA_CHAIN_MAX - 1) + /** * enum nfnl_hook_chaintype - chain type * diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c index ca453c61dbdf..e0ff2973fd14 100644 --- a/net/netfilter/nfnetlink_hook.c +++ b/net/netfilter/nfnetlink_hook.c @@ -89,11 +89,15 @@ static int nfnl_hook_put_nft_chain_info(struct sk_buff *nlskb, if (!nest2) goto cancel_nest; - ret = nla_put_string(nlskb, NFTA_CHAIN_TABLE, chain->table->name); + ret = nla_put_string(nlskb, NFNLA_CHAIN_TABLE, chain->table->name); if (ret) goto cancel_nest; - ret = nla_put_string(nlskb, NFTA_CHAIN_NAME, chain->name); + ret = nla_put_string(nlskb, NFNLA_CHAIN_NAME, chain->name); + if (ret) + goto cancel_nest; + + ret = nla_put_u8(nlskb, NFNLA_CHAIN_FAMILY, chain->table->family); if (ret) goto cancel_nest; -- cgit v1.2.3-71-gd317 From a43e2a0e11491b73e2acaa27ee74d6c3b86deac0 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Fri, 16 Jul 2021 22:46:21 -0400 Subject: drm/amdkfd: Allow querying SVM attributes that are clear Currently the SVM get_attr call allows querying, which flags are set in the entire address range. Add the opposite query, which flags are clear in the entire address range. Both queries can be combined in a single get_attr call, which allows answering questions such as, "is this address range coherent, non-coherent, or a mix of both"? 
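A hedged sketch of such a combined query against /dev/kfd; the struct layout and attribute names follow the uapi below, while KFD_IOCTL_SVM_FLAG_COHERENT and the surrounding open/allocation plumbing are assumed from the existing SVM API:

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kfd_ioctl.h>

/* 1 = coherent everywhere, 0 = non-coherent everywhere, 2 = mixed, -1 = error */
static int query_coherency(int kfd_fd, __u64 addr, __u64 size)
{
        size_t sz = sizeof(struct kfd_ioctl_svm_args) +
                    2 * sizeof(struct kfd_ioctl_svm_attribute);
        struct kfd_ioctl_svm_args *args = calloc(1, sz);
        int ret = -1;

        if (!args)
                return -1;

        args->start_addr = addr;
        args->size = size;
        args->op = KFD_IOCTL_SVM_OP_GET_ATTR;
        args->nattr = 2;
        args->attrs[0].type = KFD_IOCTL_SVM_ATTR_SET_FLAGS;  /* set on all pages */
        args->attrs[1].type = KFD_IOCTL_SVM_ATTR_CLR_FLAGS;  /* clear on all pages */

        if (ioctl(kfd_fd, AMDKFD_IOC_SVM, args) == 0) {
                if (args->attrs[0].value & KFD_IOCTL_SVM_FLAG_COHERENT)
                        ret = 1;
                else if (args->attrs[1].value & KFD_IOCTL_SVM_FLAG_COHERENT)
                        ret = 0;
                else
                        ret = 2;
        }

        free(args);
        return ret;
}
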
Proposed userspace for UAPI: https://github.com/RadeonOpenCompute/ROCR-Runtime/tree/memory_model_queries Signed-off-by: Felix Kuehling Reviewed-by: Philip Yand Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 19 +++++++++++++------ include/uapi/linux/kfd_ioctl.h | 16 +++++++++------- 2 files changed, 22 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index c7b364e4a287..7df69b77bc5c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -3019,7 +3019,8 @@ svm_range_get_attr(struct kfd_process *p, uint64_t start, uint64_t size, struct svm_range *prange; uint32_t prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED; uint32_t location = KFD_IOCTL_SVM_LOCATION_UNDEFINED; - uint32_t flags = 0xffffffff; + uint32_t flags_and = 0xffffffff; + uint32_t flags_or = 0; int gpuidx; uint32_t i; @@ -3046,12 +3047,12 @@ svm_range_get_attr(struct kfd_process *p, uint64_t start, uint64_t size, get_accessible = true; break; case KFD_IOCTL_SVM_ATTR_SET_FLAGS: + case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: get_flags = true; break; case KFD_IOCTL_SVM_ATTR_GRANULARITY: get_granularity = true; break; - case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: case KFD_IOCTL_SVM_ATTR_NO_ACCESS: fallthrough; @@ -3069,7 +3070,8 @@ svm_range_get_attr(struct kfd_process *p, uint64_t start, uint64_t size, if (!node) { pr_debug("range attrs not found return default values\n"); svm_range_set_default_attributes(&location, &prefetch_loc, - &granularity, &flags); + &granularity, &flags_and); + flags_or = flags_and; if (p->xnack_enabled) bitmap_copy(bitmap_access, svms->bitmap_supported, MAX_GPU_INSTANCE); @@ -3115,8 +3117,10 @@ svm_range_get_attr(struct kfd_process *p, uint64_t start, uint64_t size, bitmap_and(bitmap_aip, bitmap_aip, prange->bitmap_aip, MAX_GPU_INSTANCE); } - if (get_flags) - flags &= prange->flags; + if (get_flags) { + flags_and &= prange->flags; + flags_or |= prange->flags; + } if (get_granularity && prange->granularity < granularity) granularity = prange->granularity; @@ -3150,7 +3154,10 @@ fill_values: attrs[i].type = KFD_IOCTL_SVM_ATTR_NO_ACCESS; break; case KFD_IOCTL_SVM_ATTR_SET_FLAGS: - attrs[i].value = flags; + attrs[i].value = flags_and; + break; + case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: + attrs[i].value = ~flags_or; break; case KFD_IOCTL_SVM_ATTR_GRANULARITY: attrs[i].value = (uint32_t)granularity; diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 3cb5b5dd9f77..af96af174dc4 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -31,9 +31,10 @@ * - 1.3 - Add SMI events support * - 1.4 - Indicate new SRAM EDC bit in device properties * - 1.5 - Add SVM API + * - 1.6 - Query clear flags in SVM get_attr API */ #define KFD_IOCTL_MAJOR_VERSION 1 -#define KFD_IOCTL_MINOR_VERSION 5 +#define KFD_IOCTL_MINOR_VERSION 6 struct kfd_ioctl_get_version_args { __u32 major_version; /* from KFD */ @@ -575,18 +576,19 @@ struct kfd_ioctl_svm_attribute { * @KFD_IOCTL_SVM_ATTR_PREFERRED_LOC or * @KFD_IOCTL_SVM_ATTR_PREFETCH_LOC resepctively. For * @KFD_IOCTL_SVM_ATTR_SET_FLAGS, flags of all pages will be - * aggregated by bitwise AND. The minimum migration granularity - * throughout the range will be returned for - * @KFD_IOCTL_SVM_ATTR_GRANULARITY. + * aggregated by bitwise AND. That means, a flag will be set in the + * output, if that flag is set for all pages in the range. 
For + * @KFD_IOCTL_SVM_ATTR_CLR_FLAGS, flags of all pages will be + * aggregated by bitwise NOR. That means, a flag will be set in the + * output, if that flag is clear for all pages in the range. + * The minimum migration granularity throughout the range will be + * returned for @KFD_IOCTL_SVM_ATTR_GRANULARITY. * * Querying of accessibility attributes works by initializing the * attribute type to @KFD_IOCTL_SVM_ATTR_ACCESS and the value to the * GPUID being queried. Multiple attributes can be given to allow * querying multiple GPUIDs. The ioctl function overwrites the * attribute type to indicate the access for the specified GPU. - * - * @KFD_IOCTL_SVM_ATTR_CLR_FLAGS is invalid for - * @KFD_IOCTL_SVM_OP_GET_ATTR. */ struct kfd_ioctl_svm_args { __u64 start_addr; -- cgit v1.2.3-71-gd317 From af579beb666aefb17e9a335c12c788c92932baf1 Mon Sep 17 00:00:00 2001 From: Matthew Bobrowski Date: Sun, 8 Aug 2021 15:26:25 +1000 Subject: fanotify: add pidfd support to the fanotify API Introduce a new flag FAN_REPORT_PIDFD for fanotify_init(2) which allows userspace applications to control whether a pidfd information record containing a pidfd is to be returned alongside the generic event metadata for each event. If FAN_REPORT_PIDFD is enabled for a notification group, an additional struct fanotify_event_info_pidfd object type will be supplied alongside the generic struct fanotify_event_metadata for a single event. This functionality is analogous to that of FAN_REPORT_FID in terms of how the event structure is supplied to a userspace application. Usage of FAN_REPORT_PIDFD with FAN_REPORT_FID/FAN_REPORT_DFID_NAME is permitted, and in this case a struct fanotify_event_info_pidfd object will likely follow any struct fanotify_event_info_fid object. Currently, the usage of the FAN_REPORT_TID flag is not permitted along with FAN_REPORT_PIDFD as the pidfd API currently only supports the creation of pidfds for thread-group leaders. Additionally, usage of the FAN_REPORT_PIDFD flag is limited to privileged processes only i.e. event listeners that are running with the CAP_SYS_ADMIN capability. Attempting to supply the FAN_REPORT_TID initialization flags with FAN_REPORT_PIDFD or creating a notification group without CAP_SYS_ADMIN will result with -EINVAL being returned to the caller. In the event of a pidfd creation error, there are two types of error values that can be reported back to the listener. There is FAN_NOPIDFD, which will be reported in cases where the process responsible for generating the event has terminated prior to the event listener being able to read the event. Then there is FAN_EPIDFD, which will be reported when a more generic pidfd creation error has occurred when fanotify calls pidfd_create(). 
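For illustration, a minimal privileged listener using the new flag might look like the
sketch below. It is not part of this patch; the watched path and what is done with the
pidfd are assumptions, and error handling is trimmed:

    /* Illustrative sketch; assumes <sys/fanotify.h>, <fcntl.h>, <unistd.h>, <stdio.h>. */
    int fan_fd = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_PIDFD, O_RDONLY);

    fanotify_mark(fan_fd, FAN_MARK_ADD, FAN_OPEN | FAN_EVENT_ON_CHILD,
                  AT_FDCWD, "/tmp");                /* watched path is an example */

    char buf[4096];
    ssize_t len = read(fan_fd, buf, sizeof(buf));
    struct fanotify_event_metadata *md = (struct fanotify_event_metadata *)buf;

    for (; FAN_EVENT_OK(md, len); md = FAN_EVENT_NEXT(md, len)) {
        char *rec = (char *)md + md->metadata_len;  /* info records follow the metadata */
        char *end = (char *)md + md->event_len;

        while (rec < end) {
            struct fanotify_event_info_header *hdr = (void *)rec;

            if (hdr->info_type == FAN_EVENT_INFO_TYPE_PIDFD) {
                struct fanotify_event_info_pidfd *p = (void *)hdr;

                if (p->pidfd == FAN_NOPIDFD || p->pidfd == FAN_EPIDFD)
                    fprintf(stderr, "no pidfd for this event (%d)\n", p->pidfd);
                else
                    close(p->pidfd);                /* or poll it, pidfd_send_signal(), ... */
            }
            rec += hdr->len;
        }
        if (md->fd >= 0)
            close(md->fd);
    }
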
Link: https://lore.kernel.org/r/5f9e09cff7ed62bfaa51c1369e0f7ea5f16a91aa.1628398044.git.repnop@google.com Signed-off-by: Matthew Bobrowski Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify_user.c | 85 ++++++++++++++++++++++++++++++++++++-- include/linux/fanotify.h | 3 +- include/uapi/linux/fanotify.h | 13 ++++++ 3 files changed, 96 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 99d145eaab49..084a83b1214d 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include #include #include @@ -106,6 +107,8 @@ struct kmem_cache *fanotify_perm_event_cachep __read_mostly; #define FANOTIFY_EVENT_ALIGN 4 #define FANOTIFY_FID_INFO_HDR_LEN \ (sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle)) +#define FANOTIFY_PIDFD_INFO_HDR_LEN \ + sizeof(struct fanotify_event_info_pidfd) static int fanotify_fid_info_len(int fh_len, int name_len) { @@ -138,6 +141,9 @@ static int fanotify_event_info_len(unsigned int info_mode, dot_len = 1; } + if (info_mode & FAN_REPORT_PIDFD) + info_len += FANOTIFY_PIDFD_INFO_HDR_LEN; + if (fh_len) info_len += fanotify_fid_info_len(fh_len, dot_len); @@ -401,13 +407,34 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, return info_len; } +static int copy_pidfd_info_to_user(int pidfd, + char __user *buf, + size_t count) +{ + struct fanotify_event_info_pidfd info = { }; + size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN; + + if (WARN_ON_ONCE(info_len > count)) + return -EFAULT; + + info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD; + info.hdr.len = info_len; + info.pidfd = pidfd; + + if (copy_to_user(buf, &info, info_len)) + return -EFAULT; + + return info_len; +} + static int copy_info_records_to_user(struct fanotify_event *event, struct fanotify_info *info, - unsigned int info_mode, + unsigned int info_mode, int pidfd, char __user *buf, size_t count) { int ret, total_bytes = 0, info_type = 0; unsigned int fid_mode = info_mode & FANOTIFY_FID_BITS; + unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD; /* * Event info records order is as follows: dir fid + name, child fid. @@ -478,6 +505,16 @@ static int copy_info_records_to_user(struct fanotify_event *event, total_bytes += ret; } + if (pidfd_mode) { + ret = copy_pidfd_info_to_user(pidfd, buf, count); + if (ret < 0) + return ret; + + buf += ret; + count -= ret; + total_bytes += ret; + } + return total_bytes; } @@ -489,8 +526,9 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, struct path *path = fanotify_event_path(event); struct fanotify_info *info = fanotify_event_info(event); unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES); + unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD; struct file *f = NULL; - int ret, fd = FAN_NOFD; + int ret, pidfd = FAN_NOPIDFD, fd = FAN_NOFD; pr_debug("%s: group=%p event=%p\n", __func__, group, event); @@ -524,6 +562,33 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, } metadata.fd = fd; + if (pidfd_mode) { + /* + * Complain if the FAN_REPORT_PIDFD and FAN_REPORT_TID mutual + * exclusion is ever lifted. At the time of incoporating pidfd + * support within fanotify, the pidfd API only supported the + * creation of pidfds for thread-group leaders. 
+ */ + WARN_ON_ONCE(FAN_GROUP_FLAG(group, FAN_REPORT_TID)); + + /* + * The PIDTYPE_TGID check for an event->pid is performed + * preemptively in an attempt to catch out cases where the event + * listener reads events after the event generating process has + * already terminated. Report FAN_NOPIDFD to the event listener + * in those cases, with all other pidfd creation errors being + * reported as FAN_EPIDFD. + */ + if (metadata.pid == 0 || + !pid_has_task(event->pid, PIDTYPE_TGID)) { + pidfd = FAN_NOPIDFD; + } else { + pidfd = pidfd_create(event->pid, 0); + if (pidfd < 0) + pidfd = FAN_EPIDFD; + } + } + ret = -EFAULT; /* * Sanity check copy size in case get_one_event() and @@ -545,7 +610,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, fd_install(fd, f); if (info_mode) { - ret = copy_info_records_to_user(event, info, info_mode, + ret = copy_info_records_to_user(event, info, info_mode, pidfd, buf, count); if (ret < 0) goto out_close_fd; @@ -558,6 +623,10 @@ out_close_fd: put_unused_fd(fd); fput(f); } + + if (pidfd >= 0) + close_fd(pidfd); + return ret; } @@ -1103,6 +1172,14 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) #endif return -EINVAL; + /* + * A pidfd can only be returned for a thread-group leader; thus + * FAN_REPORT_PIDFD and FAN_REPORT_TID need to remain mutually + * exclusive. + */ + if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID)) + return -EINVAL; + if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS) return -EINVAL; @@ -1504,7 +1581,7 @@ static int __init fanotify_user_setup(void) FANOTIFY_DEFAULT_MAX_USER_MARKS); BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS); - BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10); + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 11); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9); fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index 10a7e26ddba6..eec3b7c40811 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -27,7 +27,7 @@ extern struct ctl_table fanotify_table[]; /* for sysctl */ #define FANOTIFY_FID_BITS (FAN_REPORT_FID | FAN_REPORT_DFID_NAME) -#define FANOTIFY_INFO_MODES (FANOTIFY_FID_BITS) +#define FANOTIFY_INFO_MODES (FANOTIFY_FID_BITS | FAN_REPORT_PIDFD) /* * fanotify_init() flags that require CAP_SYS_ADMIN. 
@@ -37,6 +37,7 @@ extern struct ctl_table fanotify_table[]; /* for sysctl */ */ #define FANOTIFY_ADMIN_INIT_FLAGS (FANOTIFY_PERM_CLASSES | \ FAN_REPORT_TID | \ + FAN_REPORT_PIDFD | \ FAN_UNLIMITED_QUEUE | \ FAN_UNLIMITED_MARKS) diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index fbf9c5c7dd59..64553df9d735 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -51,6 +51,7 @@ #define FAN_ENABLE_AUDIT 0x00000040 /* Flags to determine fanotify event format */ +#define FAN_REPORT_PIDFD 0x00000080 /* Report pidfd for event->pid */ #define FAN_REPORT_TID 0x00000100 /* event->pid is thread id */ #define FAN_REPORT_FID 0x00000200 /* Report unique file id */ #define FAN_REPORT_DIR_FID 0x00000400 /* Report unique directory id */ @@ -123,6 +124,7 @@ struct fanotify_event_metadata { #define FAN_EVENT_INFO_TYPE_FID 1 #define FAN_EVENT_INFO_TYPE_DFID_NAME 2 #define FAN_EVENT_INFO_TYPE_DFID 3 +#define FAN_EVENT_INFO_TYPE_PIDFD 4 /* Variable length info record following event metadata */ struct fanotify_event_info_header { @@ -148,6 +150,15 @@ struct fanotify_event_info_fid { unsigned char handle[0]; }; +/* + * This structure is used for info records of type FAN_EVENT_INFO_TYPE_PIDFD. + * It holds a pidfd for the pid that was responsible for generating an event. + */ +struct fanotify_event_info_pidfd { + struct fanotify_event_info_header hdr; + __s32 pidfd; +}; + struct fanotify_response { __s32 fd; __u32 response; @@ -160,6 +171,8 @@ struct fanotify_response { /* No fd set in event */ #define FAN_NOFD -1 +#define FAN_NOPIDFD FAN_NOFD +#define FAN_EPIDFD -2 /* Helper functions to deal with fanotify_event_metadata buffers */ #define FAN_EVENT_METADATA_LEN (sizeof(struct fanotify_event_metadata)) -- cgit v1.2.3-71-gd317 From 57e203953d150e6304ab6936bd2d9aa2daa687f4 Mon Sep 17 00:00:00 2001 From: Mikko Perttunen Date: Thu, 10 Jun 2021 14:04:48 +0300 Subject: drm/tegra: Add new UAPI to header Update the tegra_drm.h UAPI header, adding the new proposed UAPI. The old staging UAPI is left in for now, with minor modification to avoid name collisions. Signed-off-by: Mikko Perttunen Signed-off-by: Thierry Reding --- include/uapi/drm/tegra_drm.h | 425 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 402 insertions(+), 23 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/drm/tegra_drm.h b/include/uapi/drm/tegra_drm.h index c4df3c3668b3..94cfc306d50a 100644 --- a/include/uapi/drm/tegra_drm.h +++ b/include/uapi/drm/tegra_drm.h @@ -1,24 +1,5 @@ -/* - * Copyright (c) 2012-2013, NVIDIA CORPORATION. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ +/* SPDX-License-Identifier: MIT */ +/* Copyright (c) 2012-2020 NVIDIA Corporation */ #ifndef _UAPI_TEGRA_DRM_H_ #define _UAPI_TEGRA_DRM_H_ @@ -29,6 +10,8 @@ extern "C" { #endif +/* Tegra DRM legacy UAPI. Only enabled with STAGING */ + #define DRM_TEGRA_GEM_CREATE_TILED (1 << 0) #define DRM_TEGRA_GEM_CREATE_BOTTOM_UP (1 << 1) @@ -649,8 +632,8 @@ struct drm_tegra_gem_get_flags { #define DRM_TEGRA_SYNCPT_READ 0x02 #define DRM_TEGRA_SYNCPT_INCR 0x03 #define DRM_TEGRA_SYNCPT_WAIT 0x04 -#define DRM_TEGRA_OPEN_CHANNEL 0x05 -#define DRM_TEGRA_CLOSE_CHANNEL 0x06 +#define DRM_TEGRA_OPEN_CHANNEL 0x05 +#define DRM_TEGRA_CLOSE_CHANNEL 0x06 #define DRM_TEGRA_GET_SYNCPT 0x07 #define DRM_TEGRA_SUBMIT 0x08 #define DRM_TEGRA_GET_SYNCPT_BASE 0x09 @@ -674,6 +657,402 @@ struct drm_tegra_gem_get_flags { #define DRM_IOCTL_TEGRA_GEM_SET_FLAGS DRM_IOWR(DRM_COMMAND_BASE + DRM_TEGRA_GEM_SET_FLAGS, struct drm_tegra_gem_set_flags) #define DRM_IOCTL_TEGRA_GEM_GET_FLAGS DRM_IOWR(DRM_COMMAND_BASE + DRM_TEGRA_GEM_GET_FLAGS, struct drm_tegra_gem_get_flags) +/* New Tegra DRM UAPI */ + +/* + * Reported by the driver in the `capabilities` field. + * + * DRM_TEGRA_CHANNEL_CAP_CACHE_COHERENT: If set, the engine is cache coherent + * with regard to the system memory. + */ +#define DRM_TEGRA_CHANNEL_CAP_CACHE_COHERENT (1 << 0) + +struct drm_tegra_channel_open { + /** + * @host1x_class: [in] + * + * Host1x class of the engine that will be programmed using this + * channel. + */ + __u32 host1x_class; + + /** + * @flags: [in] + * + * Flags. + */ + __u32 flags; + + /** + * @context: [out] + * + * Opaque identifier corresponding to the opened channel. + */ + __u32 context; + + /** + * @version: [out] + * + * Version of the engine hardware. This can be used by userspace + * to determine how the engine needs to be programmed. + */ + __u32 version; + + /** + * @capabilities: [out] + * + * Flags describing the hardware capabilities. + */ + __u32 capabilities; + __u32 padding; +}; + +struct drm_tegra_channel_close { + /** + * @context: [in] + * + * Identifier of the channel to close. + */ + __u32 context; + __u32 padding; +}; + +/* + * Mapping flags that can be used to influence how the mapping is created. + * + * DRM_TEGRA_CHANNEL_MAP_READ: create mapping that allows HW read access + * DRM_TEGRA_CHANNEL_MAP_WRITE: create mapping that allows HW write access + */ +#define DRM_TEGRA_CHANNEL_MAP_READ (1 << 0) +#define DRM_TEGRA_CHANNEL_MAP_WRITE (1 << 1) +#define DRM_TEGRA_CHANNEL_MAP_READ_WRITE (DRM_TEGRA_CHANNEL_MAP_READ | \ + DRM_TEGRA_CHANNEL_MAP_WRITE) + +struct drm_tegra_channel_map { + /** + * @context: [in] + * + * Identifier of the channel to which make memory available for. + */ + __u32 context; + + /** + * @handle: [in] + * + * GEM handle of the memory to map. + */ + __u32 handle; + + /** + * @flags: [in] + * + * Flags. + */ + __u32 flags; + + /** + * @mapping: [out] + * + * Identifier corresponding to the mapping, to be used for + * relocations or unmapping later. + */ + __u32 mapping; +}; + +struct drm_tegra_channel_unmap { + /** + * @context: [in] + * + * Channel identifier of the channel to unmap memory from. + */ + __u32 context; + + /** + * @mapping: [in] + * + * Mapping identifier of the memory mapping to unmap. 
+ */ + __u32 mapping; +}; + +/* Submission */ + +/** + * Specify that bit 39 of the patched-in address should be set to switch + * swizzling between Tegra and non-Tegra sector layout on systems that store + * surfaces in system memory in non-Tegra sector layout. + */ +#define DRM_TEGRA_SUBMIT_RELOC_SECTOR_LAYOUT (1 << 0) + +struct drm_tegra_submit_buf { + /** + * @mapping: [in] + * + * Identifier of the mapping to use in the submission. + */ + __u32 mapping; + + /** + * @flags: [in] + * + * Flags. + */ + __u32 flags; + + /** + * Information for relocation patching. + */ + struct { + /** + * @target_offset: [in] + * + * Offset from the start of the mapping of the data whose + * address is to be patched into the gather. + */ + __u64 target_offset; + + /** + * @gather_offset_words: [in] + * + * Offset in words from the start of the gather data to + * where the address should be patched into. + */ + __u32 gather_offset_words; + + /** + * @shift: [in] + * + * Number of bits the address should be shifted right before + * patching in. + */ + __u32 shift; + } reloc; +}; + +/** + * Execute `words` words of Host1x opcodes specified in the `gather_data_ptr` + * buffer. Each GATHER_UPTR command uses successive words from the buffer. + */ +#define DRM_TEGRA_SUBMIT_CMD_GATHER_UPTR 0 +/** + * Wait for a syncpoint to reach a value before continuing with further + * commands. + */ +#define DRM_TEGRA_SUBMIT_CMD_WAIT_SYNCPT 1 +/** + * Wait for a syncpoint to reach a value before continuing with further + * commands. The threshold is calculated relative to the start of the job. + */ +#define DRM_TEGRA_SUBMIT_CMD_WAIT_SYNCPT_RELATIVE 2 + +struct drm_tegra_submit_cmd_gather_uptr { + __u32 words; + __u32 reserved[3]; +}; + +struct drm_tegra_submit_cmd_wait_syncpt { + __u32 id; + __u32 value; + __u32 reserved[2]; +}; + +struct drm_tegra_submit_cmd { + /** + * @type: [in] + * + * Command type to execute. One of the DRM_TEGRA_SUBMIT_CMD* + * defines. + */ + __u32 type; + + /** + * @flags: [in] + * + * Flags. + */ + __u32 flags; + + union { + struct drm_tegra_submit_cmd_gather_uptr gather_uptr; + struct drm_tegra_submit_cmd_wait_syncpt wait_syncpt; + __u32 reserved[4]; + }; +}; + +struct drm_tegra_submit_syncpt { + /** + * @id: [in] + * + * ID of the syncpoint that the job will increment. + */ + __u32 id; + + /** + * @flags: [in] + * + * Flags. + */ + __u32 flags; + + /** + * @increments: [in] + * + * Number of times the job will increment this syncpoint. + */ + __u32 increments; + + /** + * @value: [out] + * + * Value the syncpoint will have once the job has completed all + * its specified syncpoint increments. + * + * Note that the kernel may increment the syncpoint before or after + * the job. These increments are not reflected in this field. + * + * If the job hangs or times out, not all of the increments may + * get executed. + */ + __u32 value; +}; + +struct drm_tegra_channel_submit { + /** + * @context: [in] + * + * Identifier of the channel to submit this job to. + */ + __u32 context; + + /** + * @num_bufs: [in] + * + * Number of elements in the `bufs_ptr` array. + */ + __u32 num_bufs; + + /** + * @num_cmds: [in] + * + * Number of elements in the `cmds_ptr` array. + */ + __u32 num_cmds; + + /** + * @gather_data_words: [in] + * + * Number of 32-bit words in the `gather_data_ptr` array. + */ + __u32 gather_data_words; + + /** + * @bufs_ptr: [in] + * + * Pointer to an array of drm_tegra_submit_buf structures. 
+ */ + __u64 bufs_ptr; + + /** + * @cmds_ptr: [in] + * + * Pointer to an array of drm_tegra_submit_cmd structures. + */ + __u64 cmds_ptr; + + /** + * @gather_data_ptr: [in] + * + * Pointer to an array of Host1x opcodes to be used by GATHER_UPTR + * commands. + */ + __u64 gather_data_ptr; + + /** + * @syncobj_in: [in] + * + * Handle for DRM syncobj that will be waited before submission. + * Ignored if zero. + */ + __u32 syncobj_in; + + /** + * @syncobj_out: [in] + * + * Handle for DRM syncobj that will have its fence replaced with + * the job's completion fence. Ignored if zero. + */ + __u32 syncobj_out; + + /** + * @syncpt_incr: [in,out] + * + * Information about the syncpoint the job will increment. + */ + struct drm_tegra_submit_syncpt syncpt; +}; + +struct drm_tegra_syncpoint_allocate { + /** + * @id: [out] + * + * ID of allocated syncpoint. + */ + __u32 id; + __u32 padding; +}; + +struct drm_tegra_syncpoint_free { + /** + * @id: [in] + * + * ID of syncpoint to free. + */ + __u32 id; + __u32 padding; +}; + +struct drm_tegra_syncpoint_wait { + /** + * @timeout: [in] + * + * Absolute timestamp at which the wait will time out. + */ + __s64 timeout_ns; + + /** + * @id: [in] + * + * ID of syncpoint to wait on. + */ + __u32 id; + + /** + * @threshold: [in] + * + * Threshold to wait for. + */ + __u32 threshold; + + /** + * @value: [out] + * + * Value of the syncpoint upon wait completion. + */ + __u32 value; + + __u32 padding; +}; + +#define DRM_IOCTL_TEGRA_CHANNEL_OPEN DRM_IOWR(DRM_COMMAND_BASE + 0x10, struct drm_tegra_channel_open) +#define DRM_IOCTL_TEGRA_CHANNEL_CLOSE DRM_IOWR(DRM_COMMAND_BASE + 0x11, struct drm_tegra_channel_close) +#define DRM_IOCTL_TEGRA_CHANNEL_MAP DRM_IOWR(DRM_COMMAND_BASE + 0x12, struct drm_tegra_channel_map) +#define DRM_IOCTL_TEGRA_CHANNEL_UNMAP DRM_IOWR(DRM_COMMAND_BASE + 0x13, struct drm_tegra_channel_unmap) +#define DRM_IOCTL_TEGRA_CHANNEL_SUBMIT DRM_IOWR(DRM_COMMAND_BASE + 0x14, struct drm_tegra_channel_submit) + +#define DRM_IOCTL_TEGRA_SYNCPOINT_ALLOCATE DRM_IOWR(DRM_COMMAND_BASE + 0x20, struct drm_tegra_syncpoint_allocate) +#define DRM_IOCTL_TEGRA_SYNCPOINT_FREE DRM_IOWR(DRM_COMMAND_BASE + 0x21, struct drm_tegra_syncpoint_free) +#define DRM_IOCTL_TEGRA_SYNCPOINT_WAIT DRM_IOWR(DRM_COMMAND_BASE + 0x22, struct drm_tegra_syncpoint_wait) + #if defined(__cplusplus) } #endif -- cgit v1.2.3-71-gd317 From 91ccbbac1747eea155632a1c6bb100052309b215 Mon Sep 17 00:00:00 2001 From: Tushar Sugandhi Date: Mon, 12 Jul 2021 17:48:58 -0700 Subject: dm ima: measure data on table load MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DM configures a block device with various target specific attributes passed to it as a table. DM loads the table, and calls each target’s respective constructors with the attributes as input parameters. Some of these attributes are critical to ensure the device meets certain security bar. Thus, IMA should measure these attributes, to ensure they are not tampered with, during the lifetime of the device. So that the external services can have high confidence in the configuration of the block-devices on a given system. Some devices may have large tables. And a given device may change its state (table-load, suspend, resume, rename, remove, table-clear etc.) many times. Measuring these attributes each time when the device changes its state will significantly increase the size of the IMA logs. 
Further, once configured, these attributes are not expected to change unless a new table is loaded, or a device is removed and recreated. Therefore the clear-text of the attributes should only be measured during table load, and the hash of the active/inactive table should be measured for the remaining device state changes. Export IMA function ima_measure_critical_data() to allow measurement of DM device parameters, as well as target specific attributes, during table load. Compute the hash of the inactive table and store it for measurements during future state change. If a load is called multiple times, update the inactive table hash with the hash of the latest populated table. So that the correct inactive table hash is measured when the device transitions to different states like resume, remove, rename, etc. Signed-off-by: Tushar Sugandhi Signed-off-by: Colin Ian King # leak fix Signed-off-by: Mike Snitzer --- drivers/md/Makefile | 4 + drivers/md/dm-core.h | 5 + drivers/md/dm-ima.c | 330 ++++++++++++++++++++++++++++++++++++++ drivers/md/dm-ima.h | 59 +++++++ drivers/md/dm-ioctl.c | 7 +- drivers/md/dm.c | 3 + include/linux/device-mapper.h | 2 +- include/uapi/linux/dm-ioctl.h | 6 + security/integrity/ima/ima_main.c | 1 + 9 files changed, 415 insertions(+), 2 deletions(-) create mode 100644 drivers/md/dm-ima.c create mode 100644 drivers/md/dm-ima.h (limited to 'include/uapi') diff --git a/drivers/md/Makefile b/drivers/md/Makefile index a74aaf8b1445..816945eeed7f 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -96,6 +96,10 @@ ifeq ($(CONFIG_BLK_DEV_ZONED),y) dm-mod-objs += dm-zone.o endif +ifeq ($(CONFIG_IMA),y) +dm-mod-objs += dm-ima.o +endif + ifeq ($(CONFIG_DM_VERITY_FEC),y) dm-verity-objs += dm-verity-fec.o endif diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index edc1553c4eea..55dccdfbcb22 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -18,6 +18,7 @@ #include #include "dm.h" +#include "dm-ima.h" #define DM_RESERVED_MAX_IOS 1024 @@ -119,6 +120,10 @@ struct mapped_device { unsigned int nr_zones; unsigned int *zwp_offset; #endif + +#ifdef CONFIG_IMA + struct dm_ima_measurements ima; +#endif }; /* diff --git a/drivers/md/dm-ima.c b/drivers/md/dm-ima.c new file mode 100644 index 000000000000..c8f54d9f6c8d --- /dev/null +++ b/drivers/md/dm-ima.c @@ -0,0 +1,330 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 Microsoft Corporation + * + * Author: Tushar Sugandhi + * + * File: dm-ima.c + * Enables IMA measurements for DM targets + */ + +#include "dm-core.h" +#include "dm-ima.h" + +#include +#include +#include +#include + +#define DM_MSG_PREFIX "ima" + +/* + * Internal function to prefix separator characters in input buffer with escape + * character, so that they don't interfere with the construction of key-value pairs, + * and clients can split the key1=val1,key2=val2,key3=val3; pairs properly. + */ +static void fix_separator_chars(char **buf) +{ + int l = strlen(*buf); + int i, j, sp = 0; + + for (i = 0; i < l; i++) + if ((*buf)[i] == '\\' || (*buf)[i] == ';' || (*buf)[i] == '=' || (*buf)[i] == ',') + sp++; + + if (!sp) + return; + + for (i = l-1, j = i+sp; i >= 0; i--) { + (*buf)[j--] = (*buf)[i]; + if ((*buf)[i] == '\\' || (*buf)[i] == ';' || (*buf)[i] == '=' || (*buf)[i] == ',') + (*buf)[j--] = '\\'; + } +} + +/* + * Internal function to allocate memory for IMA measurements. 
+ */ +static void *dm_ima_alloc(size_t len, gfp_t flags, bool noio) +{ + unsigned int noio_flag; + void *ptr; + + if (noio) + noio_flag = memalloc_noio_save(); + + ptr = kzalloc(len, flags); + + if (noio) + memalloc_noio_restore(noio_flag); + + return ptr; +} + +/* + * Internal function to allocate and copy name and uuid for IMA measurements. + */ +static int dm_ima_alloc_and_copy_name_uuid(struct mapped_device *md, char **dev_name, + char **dev_uuid, bool noio) +{ + int r; + *dev_name = dm_ima_alloc(DM_NAME_LEN*2, GFP_KERNEL, noio); + if (!(*dev_name)) { + r = -ENOMEM; + goto error; + } + + *dev_uuid = dm_ima_alloc(DM_UUID_LEN*2, GFP_KERNEL, noio); + if (!(*dev_uuid)) { + r = -ENOMEM; + goto error; + } + + r = dm_copy_name_and_uuid(md, *dev_name, *dev_uuid); + if (r) + goto error; + + fix_separator_chars(dev_name); + fix_separator_chars(dev_uuid); + + return 0; +error: + kfree(*dev_name); + kfree(*dev_uuid); + *dev_name = NULL; + *dev_uuid = NULL; + return r; +} + +/* + * Internal function to allocate and copy device data for IMA measurements. + */ +static int dm_ima_alloc_and_copy_device_data(struct mapped_device *md, char **device_data, + unsigned int num_targets, bool noio) +{ + char *dev_name = NULL, *dev_uuid = NULL; + int r; + + r = dm_ima_alloc_and_copy_name_uuid(md, &dev_name, &dev_uuid, noio); + if (r) + return r; + + *device_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, GFP_KERNEL, noio); + if (!(*device_data)) { + r = -ENOMEM; + goto error; + } + + scnprintf(*device_data, DM_IMA_DEVICE_BUF_LEN, + "name=%s,uuid=%s,major=%d,minor=%d,minor_count=%d,num_targets=%u;", + dev_name, dev_uuid, md->disk->major, md->disk->first_minor, + md->disk->minors, num_targets); +error: + kfree(dev_name); + kfree(dev_uuid); + return r; +} + +/* + * Internal wrapper function to call IMA to measure DM data. + */ +static void dm_ima_measure_data(const char *event_name, const void *buf, size_t buf_len, + bool noio) +{ + unsigned int noio_flag; + + if (noio) + noio_flag = memalloc_noio_save(); + + ima_measure_critical_data(DM_NAME, event_name, buf, buf_len, false); + + if (noio) + memalloc_noio_restore(noio_flag); +} + +/* + * Initialize/reset the dm ima related data structure variables. + */ +void dm_ima_reset_data(struct mapped_device *md) +{ + memset(&(md->ima), 0, sizeof(md->ima)); +} + +/* + * Build up the IMA data for each target, and finally measure. 
+ */ +void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_flags) +{ + size_t device_data_buf_len, target_metadata_buf_len, target_data_buf_len, l = 0; + char *target_metadata_buf = NULL, *target_data_buf = NULL, *digest_buf = NULL; + char *ima_buf = NULL, *device_data_buf = NULL; + int digest_size, last_target_measured = -1, r; + status_type_t type = STATUSTYPE_IMA; + size_t cur_total_buf_len = 0; + unsigned int num_targets, i; + SHASH_DESC_ON_STACK(shash, NULL); + struct crypto_shash *tfm = NULL; + u8 *digest = NULL; + bool noio = false; + + ima_buf = dm_ima_alloc(DM_IMA_MEASUREMENT_BUF_LEN, GFP_KERNEL, noio); + if (!ima_buf) + return; + + target_metadata_buf = dm_ima_alloc(DM_IMA_TARGET_METADATA_BUF_LEN, GFP_KERNEL, noio); + if (!target_metadata_buf) + goto error; + + target_data_buf = dm_ima_alloc(DM_IMA_TARGET_DATA_BUF_LEN, GFP_KERNEL, noio); + if (!target_data_buf) + goto error; + + num_targets = dm_table_get_num_targets(table); + + if (dm_ima_alloc_and_copy_device_data(table->md, &device_data_buf, num_targets, noio)) + goto error; + + tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(tfm)) + goto error; + + shash->tfm = tfm; + digest_size = crypto_shash_digestsize(tfm); + digest = dm_ima_alloc(digest_size, GFP_KERNEL, noio); + if (!digest) + goto error; + + r = crypto_shash_init(shash); + if (r) + goto error; + + device_data_buf_len = strlen(device_data_buf); + memcpy(ima_buf + l, device_data_buf, device_data_buf_len); + l += device_data_buf_len; + + for (i = 0; i < num_targets; i++) { + struct dm_target *ti = dm_table_get_target(table, i); + + if (!ti) + goto error; + + last_target_measured = 0; + + /* + * First retrieve the target metadata. + */ + scnprintf(target_metadata_buf, DM_IMA_TARGET_METADATA_BUF_LEN, + "target_index=%d,target_begin=%llu,target_len=%llu,", + i, ti->begin, ti->len); + target_metadata_buf_len = strlen(target_metadata_buf); + + /* + * Then retrieve the actual target data. + */ + if (ti->type->status) + ti->type->status(ti, type, status_flags, target_data_buf, + DM_IMA_TARGET_DATA_BUF_LEN); + else + target_data_buf[0] = '\0'; + + target_data_buf_len = strlen(target_data_buf); + + /* + * Check if the total data can fit into the IMA buffer. + */ + cur_total_buf_len = l + target_metadata_buf_len + target_data_buf_len; + + /* + * IMA measurements for DM targets are best-effort. + * If the total data buffered so far, including the current target, + * is too large to fit into DM_IMA_MEASUREMENT_BUF_LEN, measure what + * we have in the current buffer, and continue measuring the remaining + * targets by prefixing the device metadata again. + */ + if (unlikely(cur_total_buf_len >= DM_IMA_MEASUREMENT_BUF_LEN)) { + dm_ima_measure_data("table_load", ima_buf, l, noio); + r = crypto_shash_update(shash, (const u8 *)ima_buf, l); + if (r < 0) + goto error; + + memset(ima_buf, 0, DM_IMA_MEASUREMENT_BUF_LEN); + l = 0; + + /* + * Each new "table_load" entry in IMA log should have device data + * prefix, so that multiple records from the same table_load for + * a given device can be linked together. + */ + memcpy(ima_buf + l, device_data_buf, device_data_buf_len); + l += device_data_buf_len; + + /* + * If this iteration of the for loop turns out to be the last target + * in the table, dm_ima_measure_data("table_load", ...) doesn't need + * to be called again, just the hash needs to be finalized. + * "last_target_measured" tracks this state. 
+ */ + last_target_measured = 1; + } + + /* + * Fill-in all the target metadata, so that multiple targets for the same + * device can be linked together. + */ + memcpy(ima_buf + l, target_metadata_buf, target_metadata_buf_len); + l += target_metadata_buf_len; + + memcpy(ima_buf + l, target_data_buf, target_data_buf_len); + l += target_data_buf_len; + } + + if (!last_target_measured) { + dm_ima_measure_data("table_load", ima_buf, l, noio); + + r = crypto_shash_update(shash, (const u8 *)ima_buf, l); + if (r < 0) + goto error; + } + + /* + * Finalize the table hash, and store it in table->md->ima.inactive_table.hash, + * so that the table data can be verified against the future device state change + * events, e.g. resume, rename, remove, table-clear etc. + */ + r = crypto_shash_final(shash, digest); + if (r < 0) + goto error; + + digest_buf = dm_ima_alloc((digest_size*2)+1, GFP_KERNEL, noio); + if (!digest_buf) + goto error; + + for (i = 0; i < digest_size; i++) + snprintf((digest_buf+(i*2)), 3, "%02x", digest[i]); + + if (table->md->ima.active_table.hash != table->md->ima.inactive_table.hash) + kfree(table->md->ima.inactive_table.hash); + + table->md->ima.inactive_table.hash = digest_buf; + table->md->ima.inactive_table.hash_len = strlen(digest_buf); + table->md->ima.inactive_table.num_targets = num_targets; + + if (table->md->ima.active_table.device_metadata != + table->md->ima.inactive_table.device_metadata) + kfree(table->md->ima.inactive_table.device_metadata); + + table->md->ima.inactive_table.device_metadata = device_data_buf; + table->md->ima.inactive_table.device_metadata_len = device_data_buf_len; + + goto exit; +error: + kfree(digest_buf); + kfree(device_data_buf); +exit: + kfree(digest); + if (tfm) + crypto_free_shash(tfm); + kfree(ima_buf); + kfree(target_metadata_buf); + kfree(target_data_buf); +} diff --git a/drivers/md/dm-ima.h b/drivers/md/dm-ima.h new file mode 100644 index 000000000000..16afd9a8c0b2 --- /dev/null +++ b/drivers/md/dm-ima.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (C) 2021 Microsoft Corporation + * + * Author: Tushar Sugandhi + * + * File: dm-ima.h + * Header file for device mapper IMA measurements. + */ + +#ifndef DM_IMA_H +#define DM_IMA_H + +#define DM_IMA_MEASUREMENT_BUF_LEN 4096 +#define DM_IMA_DEVICE_BUF_LEN 1024 +#define DM_IMA_TARGET_METADATA_BUF_LEN 128 +#define DM_IMA_TARGET_DATA_BUF_LEN 2048 + +#ifdef CONFIG_IMA + +struct dm_ima_device_table_metadata { + /* + * Contains data specific to the device which is common across + * all the targets in the table (e.g. name, uuid, major, minor, etc). + * The values are stored in comma separated list of key1=val1,key2=val2; + * pairs delimited by a semicolon at the end of the list. + */ + char *device_metadata; + unsigned int device_metadata_len; + unsigned int num_targets; + + /* + * Contains the sha256 hashes of the IMA measurements of the target + * attributes' key-value pairs from the active/inactive tables. + */ + char *hash; + unsigned int hash_len; +}; + +/* + * This structure contains device metadata, and table hash for + * active and inactive tables for ima measurements. 
+ */ +struct dm_ima_measurements { + struct dm_ima_device_table_metadata active_table; + struct dm_ima_device_table_metadata inactive_table; +}; + +void dm_ima_reset_data(struct mapped_device *md); +void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_flags); + +#else + +static inline void dm_ima_reset_data(struct mapped_device *md) {} +static inline void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_flags) {} + +#endif /* CONFIG_IMA */ + +#endif /* DM_IMA_H */ diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 2575074a2204..d7c3456bf858 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -6,7 +6,7 @@ */ #include "dm-core.h" - +#include "dm-ima.h" #include #include #include @@ -20,6 +20,7 @@ #include #include +#include #define DM_MSG_PREFIX "ioctl" #define DM_DRIVER_EMAIL "dm-devel@redhat.com" @@ -1224,6 +1225,8 @@ static void retrieve_status(struct dm_table *table, if (param->flags & DM_STATUS_TABLE_FLAG) type = STATUSTYPE_TABLE; + else if (param->flags & DM_IMA_MEASUREMENT_FLAG) + type = STATUSTYPE_IMA; else type = STATUSTYPE_INFO; @@ -1425,6 +1428,8 @@ static int table_load(struct file *filp, struct dm_ioctl *param, size_t param_si if (r) goto err_unlock_md_type; + dm_ima_measure_on_table_load(t, STATUSTYPE_IMA); + immutable_target_type = dm_get_immutable_target_type(md); if (immutable_target_type && (immutable_target_type != dm_table_get_immutable_target_type(t)) && diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7981b7287628..33e55ac9ec65 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -8,6 +8,7 @@ #include "dm-core.h" #include "dm-rq.h" #include "dm-uevent.h" +#include "dm-ima.h" #include #include @@ -1997,6 +1998,8 @@ int dm_create(int minor, struct mapped_device **result) if (!md) return -ENXIO; + dm_ima_reset_data(md); + *result = md; return 0; } diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 7457d49acf9a..74486c332946 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -31,7 +31,7 @@ enum dm_queue_mode { DM_TYPE_DAX_BIO_BASED = 3, }; -typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t; +typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE, STATUSTYPE_IMA } status_type_t; union map_info { void *ptr; diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index e5c6e458bdf7..c12ce30b52df 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -376,4 +376,10 @@ enum { */ #define DM_INTERNAL_SUSPEND_FLAG (1 << 18) /* Out */ +/* + * If set, returns in the in buffer passed by UM, the raw table information + * that would be measured by IMA subsystem on device state change. 
+ */ +#define DM_IMA_MEASUREMENT_FLAG (1 << 19) /* In */ + #endif /* _LINUX_DM_IOCTL_H */ diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 287b90509006..673833f94069 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -985,6 +985,7 @@ void ima_measure_critical_data(const char *event_label, CRITICAL_DATA, 0, event_label, hash); } +EXPORT_SYMBOL_GPL(ima_measure_critical_data); static int __init init_ima(void) { -- cgit v1.2.3-71-gd317 From 45a687879b31caae4032abd1c2402e289d2b8083 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 10 Aug 2021 14:00:10 +0300 Subject: net: bridge: fix flags interpretation for extern learn fdb entries Ignore fdb flags when adding port extern learn entries and always set BR_FDB_LOCAL flag when adding bridge extern learn entries. This is closest to the behaviour we had before and avoids breaking any use cases which were allowed. This patch fixes iproute2 calls which assume NUD_PERMANENT and were allowed before, example: $ bridge fdb add 00:11:22:33:44:55 dev swp1 extern_learn Extern learn entries are allowed to roam, but do not expire, so static or dynamic flags make no sense for them. Also add a comment for future reference. Fixes: eb100e0e24a2 ("net: bridge: allow to add externally learned entries from user-space") Fixes: 0541a6293298 ("net: bridge: validate the NUD_PERMANENT bit when adding an extern_learn FDB entry") Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Signed-off-by: Nikolay Aleksandrov Reviewed-by: Vladimir Oltean Link: https://lore.kernel.org/r/20210810110010.43859-1-razor@blackwall.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/neighbour.h | 7 +++++-- net/bridge/br.c | 3 +-- net/bridge/br_fdb.c | 11 ++++------- net/bridge/br_private.h | 2 +- 4 files changed, 11 insertions(+), 12 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index dc8b72201f6c..00a60695fa53 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -66,8 +66,11 @@ enum { #define NUD_NONE 0x00 /* NUD_NOARP & NUD_PERMANENT are pseudostates, they never change - and make no address resolution or NUD. - NUD_PERMANENT also cannot be deleted by garbage collectors. + * and make no address resolution or NUD. + * NUD_PERMANENT also cannot be deleted by garbage collectors. + * When NTF_EXT_LEARNED is set for a bridge fdb entry the different cache entry + * states don't make sense and thus are ignored. Such entries don't age and + * can roam. 
*/ struct nda_cacheinfo { diff --git a/net/bridge/br.c b/net/bridge/br.c index bbab9984f24e..ef743f94254d 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -166,8 +166,7 @@ static int br_switchdev_event(struct notifier_block *unused, case SWITCHDEV_FDB_ADD_TO_BRIDGE: fdb_info = ptr; err = br_fdb_external_learn_add(br, p, fdb_info->addr, - fdb_info->vid, - fdb_info->is_local, false); + fdb_info->vid, false); if (err) { err = notifier_from_errno(err); break; diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 835cec1e5a03..5dee30966ed3 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -1044,10 +1044,7 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, "FDB entry towards bridge must be permanent"); return -EINVAL; } - - err = br_fdb_external_learn_add(br, p, addr, vid, - ndm->ndm_state & NUD_PERMANENT, - true); + err = br_fdb_external_learn_add(br, p, addr, vid, true); } else { spin_lock_bh(&br->hash_lock); err = fdb_add_entry(br, p, addr, ndm, nlh_flags, vid, nfea_tb); @@ -1275,7 +1272,7 @@ void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p) } int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, - const unsigned char *addr, u16 vid, bool is_local, + const unsigned char *addr, u16 vid, bool swdev_notify) { struct net_bridge_fdb_entry *fdb; @@ -1293,7 +1290,7 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, if (swdev_notify) flags |= BIT(BR_FDB_ADDED_BY_USER); - if (is_local) + if (!p) flags |= BIT(BR_FDB_LOCAL); fdb = fdb_create(br, p, addr, vid, flags); @@ -1322,7 +1319,7 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, if (swdev_notify) set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); - if (is_local) + if (!p) set_bit(BR_FDB_LOCAL, &fdb->flags); if (modified) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index aa64d8d63ca3..2b48b204205e 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -711,7 +711,7 @@ int br_fdb_get(struct sk_buff *skb, struct nlattr *tb[], struct net_device *dev, int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p); void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p); int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, - const unsigned char *addr, u16 vid, bool is_local, + const unsigned char *addr, u16 vid, bool swdev_notify); int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid, -- cgit v1.2.3-71-gd317 From df271cd641f101decaa4f7c1dd5c62939900bd4c Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 10 Aug 2021 18:29:19 +0300 Subject: net: bridge: vlan: add support for mcast igmp/mld version global options Add support to change and retrieve global vlan IGMP/MLD versions. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- include/uapi/linux/if_bridge.h | 2 ++ net/bridge/br_multicast.c | 18 ++++++++++-------- net/bridge/br_netlink.c | 6 ++++-- net/bridge/br_private.h | 26 ++++++++++++++++++++++++-- net/bridge/br_sysfs_br.c | 4 ++-- net/bridge/br_vlan_options.c | 40 ++++++++++++++++++++++++++++++++++++++-- 6 files changed, 80 insertions(+), 16 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 5aca85874447..5188b9f6da28 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -549,6 +549,8 @@ enum { BRIDGE_VLANDB_GOPTS_ID, BRIDGE_VLANDB_GOPTS_RANGE, BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING, + BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION, + BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION, __BRIDGE_VLANDB_GOPTS_MAX }; #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1) diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 470f1ec3b579..643b69d767f7 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -4327,7 +4327,8 @@ unlock: return 0; } -int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val) +int br_multicast_set_igmp_version(struct net_bridge_mcast *brmctx, + unsigned long val) { /* Currently we support only version 2 and 3 */ switch (val) { @@ -4338,15 +4339,16 @@ int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val) return -EINVAL; } - spin_lock_bh(&br->multicast_lock); - br->multicast_ctx.multicast_igmp_version = val; - spin_unlock_bh(&br->multicast_lock); + spin_lock_bh(&brmctx->br->multicast_lock); + brmctx->multicast_igmp_version = val; + spin_unlock_bh(&brmctx->br->multicast_lock); return 0; } #if IS_ENABLED(CONFIG_IPV6) -int br_multicast_set_mld_version(struct net_bridge *br, unsigned long val) +int br_multicast_set_mld_version(struct net_bridge_mcast *brmctx, + unsigned long val) { /* Currently we support version 1 and 2 */ switch (val) { @@ -4357,9 +4359,9 @@ int br_multicast_set_mld_version(struct net_bridge *br, unsigned long val) return -EINVAL; } - spin_lock_bh(&br->multicast_lock); - br->multicast_ctx.multicast_mld_version = val; - spin_unlock_bh(&br->multicast_lock); + spin_lock_bh(&brmctx->br->multicast_lock); + brmctx->multicast_mld_version = val; + spin_unlock_bh(&brmctx->br->multicast_lock); return 0; } diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 616a1b6dec3c..ded1b244dfcd 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1380,7 +1380,8 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], __u8 igmp_version; igmp_version = nla_get_u8(data[IFLA_BR_MCAST_IGMP_VERSION]); - err = br_multicast_set_igmp_version(br, igmp_version); + err = br_multicast_set_igmp_version(&br->multicast_ctx, + igmp_version); if (err) return err; } @@ -1390,7 +1391,8 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], __u8 mld_version; mld_version = nla_get_u8(data[IFLA_BR_MCAST_MLD_VERSION]); - err = br_multicast_set_mld_version(br, mld_version); + err = br_multicast_set_mld_version(&br->multicast_ctx, + mld_version); if (err) return err; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 51991f1b3e5a..b0b1e1aa4db4 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -883,9 +883,11 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack); int br_multicast_set_querier(struct net_bridge *br, unsigned long val); int br_multicast_set_hash_max(struct net_bridge *br, 
unsigned long val); -int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val); +int br_multicast_set_igmp_version(struct net_bridge_mcast *brmctx, + unsigned long val); #if IS_ENABLED(CONFIG_IPV6) -int br_multicast_set_mld_version(struct net_bridge *br, unsigned long val); +int br_multicast_set_mld_version(struct net_bridge_mcast *brmctx, + unsigned long val); #endif struct net_bridge_mdb_entry * br_mdb_ip_get(struct net_bridge *br, struct br_ip *dst); @@ -1165,6 +1167,19 @@ br_multicast_port_ctx_state_stopped(const struct net_bridge_mcast_port *pmctx) (br_multicast_port_ctx_is_vlan(pmctx) && pmctx->vlan->state == BR_STATE_BLOCKING); } + +static inline bool +br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1, + const struct net_bridge_mcast *brmctx2) +{ + return brmctx1->multicast_igmp_version == + brmctx2->multicast_igmp_version && +#if IS_ENABLED(CONFIG_IPV6) + brmctx1->multicast_mld_version == + brmctx2->multicast_mld_version && +#endif + true; +} #else static inline int br_multicast_rcv(struct net_bridge_mcast **brmctx, struct net_bridge_mcast_port **pmctx, @@ -1330,6 +1345,13 @@ static inline int br_mdb_replay(struct net_device *br_dev, { return -EOPNOTSUPP; } + +static inline bool +br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1, + const struct net_bridge_mcast *brmctx2) +{ + return true; +} #endif /* br_vlan.c */ diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 953d544663d5..08e31debd6f2 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -520,7 +520,7 @@ static ssize_t multicast_igmp_version_show(struct device *d, static int set_multicast_igmp_version(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - return br_multicast_set_igmp_version(br, val); + return br_multicast_set_igmp_version(&br->multicast_ctx, val); } static ssize_t multicast_igmp_version_store(struct device *d, @@ -757,7 +757,7 @@ static ssize_t multicast_mld_version_show(struct device *d, static int set_multicast_mld_version(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - return br_multicast_set_mld_version(br, val); + return br_multicast_set_mld_version(&br->multicast_ctx, val); } static ssize_t multicast_mld_version_store(struct device *d, diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c index 4ef975b20185..ac32fb40b7ba 100644 --- a/net/bridge/br_vlan_options.c +++ b/net/bridge/br_vlan_options.c @@ -264,7 +264,9 @@ bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr, { return v_curr->vid - r_end->vid == 1 && ((v_curr->priv_flags ^ r_end->priv_flags) & - BR_VLFLAG_GLOBAL_MCAST_ENABLED) == 0; + BR_VLFLAG_GLOBAL_MCAST_ENABLED) == 0 && + br_multicast_ctx_options_equal(&v_curr->br_mcast_ctx, + &r_end->br_mcast_ctx); } bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, @@ -285,8 +287,16 @@ bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING, - !!(v_opts->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED))) + !!(v_opts->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED)) || + nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION, + v_opts->br_mcast_ctx.multicast_igmp_version)) + goto out_err; + +#if IS_ENABLED(CONFIG_IPV6) + if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION, + v_opts->br_mcast_ctx.multicast_mld_version)) goto out_err; +#endif #endif nla_nest_end(skb, nest); @@ -305,6 
+315,8 @@ static size_t rtnl_vlan_global_opts_nlmsg_size(void) + nla_total_size(sizeof(u16)) /* BRIDGE_VLANDB_GOPTS_ID */ #ifdef CONFIG_BRIDGE_IGMP_SNOOPING + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING */ + + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION */ + + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION */ #endif + nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */ } @@ -359,6 +371,8 @@ static int br_vlan_process_global_one_opts(const struct net_bridge *br, bool *changed, struct netlink_ext_ack *extack) { + int err __maybe_unused; + *changed = false; #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (tb[BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING]) { @@ -368,6 +382,26 @@ static int br_vlan_process_global_one_opts(const struct net_bridge *br, if (br_multicast_toggle_global_vlan(v, !!mc_snooping)) *changed = true; } + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION]) { + u8 ver; + + ver = nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION]); + err = br_multicast_set_igmp_version(&v->br_mcast_ctx, ver); + if (err) + return err; + *changed = true; + } +#if IS_ENABLED(CONFIG_IPV6) + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]) { + u8 ver; + + ver = nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]); + err = br_multicast_set_mld_version(&v->br_mcast_ctx, ver); + if (err) + return err; + *changed = true; + } +#endif #endif return 0; @@ -377,6 +411,8 @@ static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = { [BRIDGE_VLANDB_GOPTS_ID] = { .type = NLA_U16 }, [BRIDGE_VLANDB_GOPTS_RANGE] = { .type = NLA_U16 }, [BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING] = { .type = NLA_U8 }, + [BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION] = { .type = NLA_U8 }, + [BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION] = { .type = NLA_U8 }, }; int br_vlan_rtm_process_global_options(struct net_device *dev, -- cgit v1.2.3-71-gd317 From 931ba87d2017f3869d656f3c705883549bfeb97f Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 10 Aug 2021 18:29:20 +0300 Subject: net: bridge: vlan: add support for mcast last member count global option Add support to change and retrieve global vlan multicast last member count option. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- include/uapi/linux/if_bridge.h | 1 + net/bridge/br_private.h | 2 ++ net/bridge/br_vlan_options.c | 13 ++++++++++++- 3 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 5188b9f6da28..d7a150034376 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -551,6 +551,7 @@ enum { BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING, BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION, BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION, + BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT, __BRIDGE_VLANDB_GOPTS_MAX }; #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index b0b1e1aa4db4..53627a841ae1 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1174,6 +1174,8 @@ br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1, { return brmctx1->multicast_igmp_version == brmctx2->multicast_igmp_version && + brmctx1->multicast_last_member_count == + brmctx2->multicast_last_member_count && #if IS_ENABLED(CONFIG_IPV6) brmctx1->multicast_mld_version == brmctx2->multicast_mld_version && diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c index ac32fb40b7ba..75733b5bc6f4 100644 --- a/net/bridge/br_vlan_options.c +++ b/net/bridge/br_vlan_options.c @@ -289,7 +289,9 @@ bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING, !!(v_opts->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED)) || nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION, - v_opts->br_mcast_ctx.multicast_igmp_version)) + v_opts->br_mcast_ctx.multicast_igmp_version) || + nla_put_u32(skb, BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT, + v_opts->br_mcast_ctx.multicast_last_member_count)) goto out_err; #if IS_ENABLED(CONFIG_IPV6) @@ -317,6 +319,7 @@ static size_t rtnl_vlan_global_opts_nlmsg_size(void) + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING */ + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION */ + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION */ + + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT */ #endif + nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */ } @@ -391,6 +394,13 @@ static int br_vlan_process_global_one_opts(const struct net_bridge *br, return err; *changed = true; } + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT]) { + u32 cnt; + + cnt = nla_get_u32(tb[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT]); + v->br_mcast_ctx.multicast_last_member_count = cnt; + *changed = true; + } #if IS_ENABLED(CONFIG_IPV6) if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]) { u8 ver; @@ -413,6 +423,7 @@ static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = { [BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING] = { .type = NLA_U8 }, [BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION] = { .type = NLA_U8 }, [BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION] = { .type = NLA_U8 }, + [BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT] = { .type = NLA_U32 }, }; int br_vlan_rtm_process_global_options(struct net_device *dev, -- cgit v1.2.3-71-gd317 From 50725f6e6b217e7661ca696b7cc1f1b9aa7bda84 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 10 Aug 2021 18:29:21 +0300 Subject: net: bridge: vlan: add support for mcast startup query count global option Add support to change and retrieve global vlan multicast startup query count option. 
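The option is configured through the per-vlan global options interface added earlier in
this series, i.e. an RTM_NEWVLAN request carrying a nested BRIDGE_VLANDB_GLOBAL_OPTIONS
attribute. A hypothetical libmnl sketch follows; it is not part of this patch, the vlan
id, count and bridge ifindex are made up, and error handling is omitted:

    /* Hypothetical; assumes <libmnl/libmnl.h>, <linux/if_bridge.h>,
     * <linux/rtnetlink.h>, <sys/socket.h>. */
    char buf[MNL_SOCKET_BUFFER_SIZE];
    struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf);
    struct br_vlan_msg *bvm;
    struct nlattr *nest;

    nlh->nlmsg_type = RTM_NEWVLAN;
    nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;

    bvm = mnl_nlmsg_put_extra_header(nlh, sizeof(*bvm));
    bvm->family = AF_BRIDGE;
    bvm->ifindex = br_ifindex;                      /* e.g. if_nametoindex("br0") */

    nest = mnl_attr_nest_start(nlh, BRIDGE_VLANDB_GLOBAL_OPTIONS);
    mnl_attr_put_u16(nlh, BRIDGE_VLANDB_GOPTS_ID, 10);     /* vlan 10 */
    mnl_attr_put_u32(nlh, BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT, 2);
    mnl_attr_nest_end(nlh, nest);

    struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);
    mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID);
    mnl_socket_sendto(nl, nlh, nlh->nlmsg_len);
    mnl_socket_close(nl);
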
Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + net/bridge/br_private.h | 2 ++ net/bridge/br_vlan_options.c | 13 ++++++++++++- 3 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index d7a150034376..082b413e1342 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -552,6 +552,7 @@ enum { BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION, BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION, BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT, + BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT, __BRIDGE_VLANDB_GOPTS_MAX }; #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 53627a841ae1..9d59b60046a4 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1176,6 +1176,8 @@ br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1, brmctx2->multicast_igmp_version && brmctx1->multicast_last_member_count == brmctx2->multicast_last_member_count && + brmctx1->multicast_startup_query_count == + brmctx2->multicast_startup_query_count && #if IS_ENABLED(CONFIG_IPV6) brmctx1->multicast_mld_version == brmctx2->multicast_mld_version && diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c index 75733b5bc6f4..81a0988b97c1 100644 --- a/net/bridge/br_vlan_options.c +++ b/net/bridge/br_vlan_options.c @@ -291,7 +291,9 @@ bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION, v_opts->br_mcast_ctx.multicast_igmp_version) || nla_put_u32(skb, BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT, - v_opts->br_mcast_ctx.multicast_last_member_count)) + v_opts->br_mcast_ctx.multicast_last_member_count) || + nla_put_u32(skb, BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT, + v_opts->br_mcast_ctx.multicast_startup_query_count)) goto out_err; #if IS_ENABLED(CONFIG_IPV6) @@ -320,6 +322,7 @@ static size_t rtnl_vlan_global_opts_nlmsg_size(void) + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION */ + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION */ + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT */ + + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT */ #endif + nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */ } @@ -401,6 +404,13 @@ static int br_vlan_process_global_one_opts(const struct net_bridge *br, v->br_mcast_ctx.multicast_last_member_count = cnt; *changed = true; } + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT]) { + u32 cnt; + + cnt = nla_get_u32(tb[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT]); + v->br_mcast_ctx.multicast_startup_query_count = cnt; + *changed = true; + } #if IS_ENABLED(CONFIG_IPV6) if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]) { u8 ver; @@ -424,6 +434,7 @@ static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = { [BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION] = { .type = NLA_U8 }, [BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION] = { .type = NLA_U8 }, [BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT] = { .type = NLA_U32 }, + [BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT] = { .type = NLA_U32 }, }; int br_vlan_rtm_process_global_options(struct net_device *dev, -- cgit v1.2.3-71-gd317 From 77f6ababa299112092a264cac96bedf1a87015ef Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 10 Aug 2021 18:29:22 +0300 Subject: net: bridge: vlan: add support 
for mcast last member interval global option Add support to change and retrieve global vlan multicast last member interval option. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 2 ++ net/bridge/br_private.h | 2 ++ net/bridge/br_vlan_options.c | 15 +++++++++++++++ 3 files changed, 19 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 082b413e1342..950ad175610e 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -553,6 +553,8 @@ enum { BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION, BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT, BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT, + BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL, + BRIDGE_VLANDB_GOPTS_PAD, __BRIDGE_VLANDB_GOPTS_MAX }; #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 9d59b60046a4..07b2241859ae 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1178,6 +1178,8 @@ br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1, brmctx2->multicast_last_member_count && brmctx1->multicast_startup_query_count == brmctx2->multicast_startup_query_count && + brmctx1->multicast_last_member_interval == + brmctx2->multicast_last_member_interval && #if IS_ENABLED(CONFIG_IPV6) brmctx1->multicast_mld_version == brmctx2->multicast_mld_version && diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c index 81a0988b97c1..26f242acef75 100644 --- a/net/bridge/br_vlan_options.c +++ b/net/bridge/br_vlan_options.c @@ -272,6 +272,7 @@ bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr, bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, const struct net_bridge_vlan *v_opts) { + u64 clockval __maybe_unused; struct nlattr *nest; nest = nla_nest_start(skb, BRIDGE_VLANDB_GLOBAL_OPTIONS); @@ -296,6 +297,11 @@ bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, v_opts->br_mcast_ctx.multicast_startup_query_count)) goto out_err; + clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_last_member_interval); + if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL, + clockval, BRIDGE_VLANDB_GOPTS_PAD)) + goto out_err; + #if IS_ENABLED(CONFIG_IPV6) if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION, v_opts->br_mcast_ctx.multicast_mld_version)) @@ -323,6 +329,7 @@ static size_t rtnl_vlan_global_opts_nlmsg_size(void) + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION */ + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT */ + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT */ + + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL */ #endif + nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */ } @@ -411,6 +418,13 @@ static int br_vlan_process_global_one_opts(const struct net_bridge *br, v->br_mcast_ctx.multicast_startup_query_count = cnt; *changed = true; } + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL]) { + u64 val; + + val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL]); + v->br_mcast_ctx.multicast_last_member_interval = clock_t_to_jiffies(val); + *changed = true; + } #if IS_ENABLED(CONFIG_IPV6) if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]) { u8 ver; @@ -435,6 +449,7 @@ static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = { 
[BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION] = { .type = NLA_U8 }, [BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT] = { .type = NLA_U32 }, [BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT] = { .type = NLA_U32 }, + [BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL] = { .type = NLA_U64 }, }; int br_vlan_rtm_process_global_options(struct net_device *dev, -- cgit v1.2.3-71-gd317 From 2da0aea21f1c40d003af6680551eaa5471103164 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 10 Aug 2021 18:29:23 +0300 Subject: net: bridge: vlan: add support for mcast membership interval global option Add support to change and retrieve global vlan multicast membership interval option. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + net/bridge/br_private.h | 2 ++ net/bridge/br_vlan_options.c | 13 +++++++++++++ 3 files changed, 16 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 950ad175610e..93f1f16617c8 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -555,6 +555,7 @@ enum { BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT, BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL, BRIDGE_VLANDB_GOPTS_PAD, + BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL, __BRIDGE_VLANDB_GOPTS_MAX }; #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 07b2241859ae..6508340fd5fc 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1180,6 +1180,8 @@ br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1, brmctx2->multicast_startup_query_count && brmctx1->multicast_last_member_interval == brmctx2->multicast_last_member_interval && + brmctx1->multicast_membership_interval == + brmctx2->multicast_membership_interval && #if IS_ENABLED(CONFIG_IPV6) brmctx1->multicast_mld_version == brmctx2->multicast_mld_version && diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c index 26f242acef75..9d695a63732c 100644 --- a/net/bridge/br_vlan_options.c +++ b/net/bridge/br_vlan_options.c @@ -301,6 +301,10 @@ bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL, clockval, BRIDGE_VLANDB_GOPTS_PAD)) goto out_err; + clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_membership_interval); + if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL, + clockval, BRIDGE_VLANDB_GOPTS_PAD)) + goto out_err; #if IS_ENABLED(CONFIG_IPV6) if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION, @@ -330,6 +334,7 @@ static size_t rtnl_vlan_global_opts_nlmsg_size(void) + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT */ + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT */ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL */ + + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL */ #endif + nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */ } @@ -425,6 +430,13 @@ static int br_vlan_process_global_one_opts(const struct net_bridge *br, v->br_mcast_ctx.multicast_last_member_interval = clock_t_to_jiffies(val); *changed = true; } + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL]) { + u64 val; + + val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL]); + v->br_mcast_ctx.multicast_membership_interval = clock_t_to_jiffies(val); + *changed = true; + } #if 
IS_ENABLED(CONFIG_IPV6) if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]) { u8 ver; @@ -450,6 +462,7 @@ static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = { [BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT] = { .type = NLA_U32 }, [BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT] = { .type = NLA_U32 }, [BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL] = { .type = NLA_U64 }, + [BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL] = { .type = NLA_U64 }, }; int br_vlan_rtm_process_global_options(struct net_device *dev, -- cgit v1.2.3-71-gd317 From cd9269d463107bc4a53a0965d90a57efeee9ae11 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 10 Aug 2021 18:29:24 +0300 Subject: net: bridge: vlan: add support for mcast querier interval global option Add support to change and retrieve global vlan multicast querier interval option. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + net/bridge/br_private.h | 2 ++ net/bridge/br_vlan_options.c | 13 +++++++++++++ 3 files changed, 16 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 93f1f16617c8..fdc264c57009 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -556,6 +556,7 @@ enum { BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL, BRIDGE_VLANDB_GOPTS_PAD, BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL, + BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL, __BRIDGE_VLANDB_GOPTS_MAX }; #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 6508340fd5fc..94487e019e96 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1182,6 +1182,8 @@ br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1, brmctx2->multicast_last_member_interval && brmctx1->multicast_membership_interval == brmctx2->multicast_membership_interval && + brmctx1->multicast_querier_interval == + brmctx2->multicast_querier_interval && #if IS_ENABLED(CONFIG_IPV6) brmctx1->multicast_mld_version == brmctx2->multicast_mld_version && diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c index 9d695a63732c..58ed4277cd1b 100644 --- a/net/bridge/br_vlan_options.c +++ b/net/bridge/br_vlan_options.c @@ -305,6 +305,10 @@ bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL, clockval, BRIDGE_VLANDB_GOPTS_PAD)) goto out_err; + clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_querier_interval); + if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL, + clockval, BRIDGE_VLANDB_GOPTS_PAD)) + goto out_err; #if IS_ENABLED(CONFIG_IPV6) if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION, @@ -335,6 +339,7 @@ static size_t rtnl_vlan_global_opts_nlmsg_size(void) + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT */ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL */ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL */ + + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL */ #endif + nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */ } @@ -437,6 +442,13 @@ static int br_vlan_process_global_one_opts(const struct net_bridge *br, v->br_mcast_ctx.multicast_membership_interval = clock_t_to_jiffies(val); *changed = true; } + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL]) { + u64 val; + + val = 
nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL]); + v->br_mcast_ctx.multicast_querier_interval = clock_t_to_jiffies(val); + *changed = true; + } #if IS_ENABLED(CONFIG_IPV6) if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]) { u8 ver; @@ -463,6 +475,7 @@ static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = { [BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT] = { .type = NLA_U32 }, [BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL] = { .type = NLA_U64 }, [BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL] = { .type = NLA_U64 }, + [BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL] = { .type = NLA_U64 }, }; int br_vlan_rtm_process_global_options(struct net_device *dev, -- cgit v1.2.3-71-gd317 From d6c08aba4f29f606769939eb6156efceb7dbb790 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 10 Aug 2021 18:29:25 +0300 Subject: net: bridge: vlan: add support for mcast query interval global option Add support to change and retrieve global vlan multicast query interval option. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + net/bridge/br_private.h | 2 ++ net/bridge/br_vlan_options.c | 13 +++++++++++++ 3 files changed, 16 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index fdc264c57009..1517aea738f4 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -557,6 +557,7 @@ enum { BRIDGE_VLANDB_GOPTS_PAD, BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL, + BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL, __BRIDGE_VLANDB_GOPTS_MAX }; #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 94487e019e96..1f353e2ddaec 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1184,6 +1184,8 @@ br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1, brmctx2->multicast_membership_interval && brmctx1->multicast_querier_interval == brmctx2->multicast_querier_interval && + brmctx1->multicast_query_interval == + brmctx2->multicast_query_interval && #if IS_ENABLED(CONFIG_IPV6) brmctx1->multicast_mld_version == brmctx2->multicast_mld_version && diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c index 58ed4277cd1b..10ed84336fd7 100644 --- a/net/bridge/br_vlan_options.c +++ b/net/bridge/br_vlan_options.c @@ -309,6 +309,10 @@ bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL, clockval, BRIDGE_VLANDB_GOPTS_PAD)) goto out_err; + clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_query_interval); + if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL, + clockval, BRIDGE_VLANDB_GOPTS_PAD)) + goto out_err; #if IS_ENABLED(CONFIG_IPV6) if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION, @@ -340,6 +344,7 @@ static size_t rtnl_vlan_global_opts_nlmsg_size(void) + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL */ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL */ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL */ + + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL */ #endif + nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */ } @@ -449,6 +454,13 @@ static int br_vlan_process_global_one_opts(const struct net_bridge *br, v->br_mcast_ctx.multicast_querier_interval = 
clock_t_to_jiffies(val); *changed = true; } + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL]) { + u64 val; + + val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL]); + v->br_mcast_ctx.multicast_query_interval = clock_t_to_jiffies(val); + *changed = true; + } #if IS_ENABLED(CONFIG_IPV6) if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]) { u8 ver; @@ -470,6 +482,7 @@ static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = { [BRIDGE_VLANDB_GOPTS_RANGE] = { .type = NLA_U16 }, [BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING] = { .type = NLA_U8 }, [BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION] = { .type = NLA_U8 }, + [BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL] = { .type = NLA_U64 }, [BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION] = { .type = NLA_U8 }, [BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT] = { .type = NLA_U32 }, [BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT] = { .type = NLA_U32 }, -- cgit v1.2.3-71-gd317 From 425214508b1bd3596edb31da8d9aedee30f2b4f5 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 10 Aug 2021 18:29:26 +0300 Subject: net: bridge: vlan: add support for mcast query response interval global option Add support to change and retrieve global vlan multicast query response interval option. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + net/bridge/br_private.h | 2 ++ net/bridge/br_vlan_options.c | 13 +++++++++++++ 3 files changed, 16 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 1517aea738f4..2627a657c3b3 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -558,6 +558,7 @@ enum { BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL, BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL, + BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL, __BRIDGE_VLANDB_GOPTS_MAX }; #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 1f353e2ddaec..10b4eaf78f54 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1186,6 +1186,8 @@ br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1, brmctx2->multicast_querier_interval && brmctx1->multicast_query_interval == brmctx2->multicast_query_interval && + brmctx1->multicast_query_response_interval == + brmctx2->multicast_query_response_interval && #if IS_ENABLED(CONFIG_IPV6) brmctx1->multicast_mld_version == brmctx2->multicast_mld_version && diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c index 10ed84336fd7..cd8320b22a89 100644 --- a/net/bridge/br_vlan_options.c +++ b/net/bridge/br_vlan_options.c @@ -313,6 +313,10 @@ bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL, clockval, BRIDGE_VLANDB_GOPTS_PAD)) goto out_err; + clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_query_response_interval); + if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL, + clockval, BRIDGE_VLANDB_GOPTS_PAD)) + goto out_err; #if IS_ENABLED(CONFIG_IPV6) if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION, @@ -345,6 +349,7 @@ static size_t rtnl_vlan_global_opts_nlmsg_size(void) + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL */ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL */ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL */ + + nla_total_size(sizeof(u64)) /* 
BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL */ #endif + nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */ } @@ -461,6 +466,13 @@ static int br_vlan_process_global_one_opts(const struct net_bridge *br, v->br_mcast_ctx.multicast_query_interval = clock_t_to_jiffies(val); *changed = true; } + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL]) { + u64 val; + + val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL]); + v->br_mcast_ctx.multicast_query_response_interval = clock_t_to_jiffies(val); + *changed = true; + } #if IS_ENABLED(CONFIG_IPV6) if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]) { u8 ver; @@ -489,6 +501,7 @@ static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = { [BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL] = { .type = NLA_U64 }, [BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL] = { .type = NLA_U64 }, [BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL] = { .type = NLA_U64 }, + [BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL] = { .type = NLA_U64 }, }; int br_vlan_rtm_process_global_options(struct net_device *dev, -- cgit v1.2.3-71-gd317 From 941121ee22a69935252473f03976f1f1200b9ae9 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 10 Aug 2021 18:29:27 +0300 Subject: net: bridge: vlan: add support for mcast startup query interval global option Add support to change and retrieve global vlan multicast startup query interval option. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + net/bridge/br_private.h | 2 ++ net/bridge/br_vlan_options.c | 13 +++++++++++++ 3 files changed, 16 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 2627a657c3b3..b5d01538acd4 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -559,6 +559,7 @@ enum { BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL, BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL, BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL, + BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL, __BRIDGE_VLANDB_GOPTS_MAX }; #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 10b4eaf78f54..8dde124d0464 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1188,6 +1188,8 @@ br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1, brmctx2->multicast_query_interval && brmctx1->multicast_query_response_interval == brmctx2->multicast_query_response_interval && + brmctx1->multicast_startup_query_interval == + brmctx2->multicast_startup_query_interval && #if IS_ENABLED(CONFIG_IPV6) brmctx1->multicast_mld_version == brmctx2->multicast_mld_version && diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c index cd8320b22a89..7b8dfd138045 100644 --- a/net/bridge/br_vlan_options.c +++ b/net/bridge/br_vlan_options.c @@ -317,6 +317,10 @@ bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL, clockval, BRIDGE_VLANDB_GOPTS_PAD)) goto out_err; + clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_startup_query_interval); + if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL, + clockval, BRIDGE_VLANDB_GOPTS_PAD)) + goto out_err; #if IS_ENABLED(CONFIG_IPV6) if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION, @@ -350,6 +354,7 @@ static size_t rtnl_vlan_global_opts_nlmsg_size(void) + nla_total_size(sizeof(u64)) /* 
BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL */ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL */ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL */ + + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL */ #endif + nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */ } @@ -473,6 +478,13 @@ static int br_vlan_process_global_one_opts(const struct net_bridge *br, v->br_mcast_ctx.multicast_query_response_interval = clock_t_to_jiffies(val); *changed = true; } + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL]) { + u64 val; + + val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL]); + v->br_mcast_ctx.multicast_startup_query_interval = clock_t_to_jiffies(val); + *changed = true; + } #if IS_ENABLED(CONFIG_IPV6) if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]) { u8 ver; @@ -501,6 +513,7 @@ static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = { [BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL] = { .type = NLA_U64 }, [BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL] = { .type = NLA_U64 }, [BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL] = { .type = NLA_U64 }, + [BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL] = { .type = NLA_U64 }, [BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL] = { .type = NLA_U64 }, }; -- cgit v1.2.3-71-gd317 From 62938182c35906c0ed4beb7845b93b8ffb937597 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 10 Aug 2021 18:29:30 +0300 Subject: net: bridge: vlan: add support for mcast querier global option Add support to change and retrieve global vlan multicast querier state. We just need to pass multicast context to br_multicast_set_querier instead of bridge device and the rest of the logic remains the same. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- include/uapi/linux/if_bridge.h | 1 + net/bridge/br_multicast.c | 7 +++---- net/bridge/br_netlink.c | 5 +++-- net/bridge/br_private.h | 5 +++-- net/bridge/br_sysfs_br.c | 4 ++-- net/bridge/br_vlan_options.c | 15 ++++++++++++++- 6 files changed, 26 insertions(+), 11 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index b5d01538acd4..03fd14a4e377 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -560,6 +560,7 @@ enum { BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL, BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL, BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL, + BRIDGE_VLANDB_GOPTS_MCAST_QUERIER, __BRIDGE_VLANDB_GOPTS_MAX }; #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1) diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index f30c2e5d3142..a780ad8aca37 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -4297,14 +4297,13 @@ bool br_multicast_router(const struct net_device *dev) } EXPORT_SYMBOL_GPL(br_multicast_router); -int br_multicast_set_querier(struct net_bridge *br, unsigned long val) +int br_multicast_set_querier(struct net_bridge_mcast *brmctx, unsigned long val) { - struct net_bridge_mcast *brmctx = &br->multicast_ctx; unsigned long max_delay; val = !!val; - spin_lock_bh(&br->multicast_lock); + spin_lock_bh(&brmctx->br->multicast_lock); if (brmctx->multicast_querier == val) goto unlock; @@ -4327,7 +4326,7 @@ int br_multicast_set_querier(struct net_bridge *br, unsigned long val) #endif unlock: - spin_unlock_bh(&br->multicast_lock); + spin_unlock_bh(&brmctx->br->multicast_lock); return 0; } diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 12616e9bdd11..60616a4380b3 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1309,7 +1309,8 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (data[IFLA_BR_MCAST_QUERIER]) { u8 mcast_querier = nla_get_u8(data[IFLA_BR_MCAST_QUERIER]); - err = br_multicast_set_querier(br, mcast_querier); + err = br_multicast_set_querier(&br->multicast_ctx, + mcast_querier); if (err) return err; } @@ -1575,7 +1576,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) nla_put_u8(skb, IFLA_BR_MCAST_QUERY_USE_IFADDR, br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR)) || nla_put_u8(skb, IFLA_BR_MCAST_QUERIER, - READ_ONCE(br->multicast_ctx.multicast_querier)) || + br->multicast_ctx.multicast_querier) || nla_put_u8(skb, IFLA_BR_MCAST_STATS_ENABLED, br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)) || nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY, RHT_ELASTICITY) || diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index a1529f0bafcc..40477885fb2a 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -881,7 +881,7 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val); int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val); int br_multicast_toggle(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack); -int br_multicast_set_querier(struct net_bridge *br, unsigned long val); +int br_multicast_set_querier(struct net_bridge_mcast *brmctx, unsigned long val); int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val); int br_multicast_set_igmp_version(struct net_bridge_mcast *brmctx, unsigned long val); @@ -1028,7 +1028,7 @@ __br_multicast_querier_exists(struct net_bridge_mcast *brmctx, { bool own_querier_enabled; - if 
(READ_ONCE(brmctx->multicast_querier)) { + if (brmctx->multicast_querier) { if (is_ipv6 && !br_opt_get(brmctx->br, BROPT_HAS_IPV6_ADDR)) own_querier_enabled = false; else @@ -1190,6 +1190,7 @@ br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1, brmctx2->multicast_query_response_interval && brmctx1->multicast_startup_query_interval == brmctx2->multicast_startup_query_interval && + brmctx1->multicast_querier == brmctx2->multicast_querier && #if IS_ENABLED(CONFIG_IPV6) brmctx1->multicast_mld_version == brmctx2->multicast_mld_version && diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 570edfd391f4..e1234bd8d5a0 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -447,13 +447,13 @@ static ssize_t multicast_querier_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%d\n", READ_ONCE(br->multicast_ctx.multicast_querier)); + return sprintf(buf, "%d\n", br->multicast_ctx.multicast_querier); } static int set_multicast_querier(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - return br_multicast_set_querier(br, val); + return br_multicast_set_querier(&br->multicast_ctx, val); } static ssize_t multicast_querier_store(struct device *d, diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c index 7b8dfd138045..0d0db8ddae45 100644 --- a/net/bridge/br_vlan_options.c +++ b/net/bridge/br_vlan_options.c @@ -294,7 +294,9 @@ bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, nla_put_u32(skb, BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT, v_opts->br_mcast_ctx.multicast_last_member_count) || nla_put_u32(skb, BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT, - v_opts->br_mcast_ctx.multicast_startup_query_count)) + v_opts->br_mcast_ctx.multicast_startup_query_count) || + nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER, + v_opts->br_mcast_ctx.multicast_querier)) goto out_err; clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_last_member_interval); @@ -355,6 +357,7 @@ static size_t rtnl_vlan_global_opts_nlmsg_size(void) + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL */ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL */ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL */ + + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER */ #endif + nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */ } @@ -485,6 +488,15 @@ static int br_vlan_process_global_one_opts(const struct net_bridge *br, v->br_mcast_ctx.multicast_startup_query_interval = clock_t_to_jiffies(val); *changed = true; } + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER]) { + u8 val; + + val = nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER]); + err = br_multicast_set_querier(&v->br_mcast_ctx, val); + if (err) + return err; + *changed = true; + } #if IS_ENABLED(CONFIG_IPV6) if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]) { u8 ver; @@ -507,6 +519,7 @@ static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = { [BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING] = { .type = NLA_U8 }, [BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION] = { .type = NLA_U8 }, [BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL] = { .type = NLA_U64 }, + [BRIDGE_VLANDB_GOPTS_MCAST_QUERIER] = { .type = NLA_U8 }, [BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION] = { .type = NLA_U8 }, [BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT] = { .type = NLA_U32 }, [BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT] = { .type = NLA_U32 }, -- cgit v1.2.3-71-gd317 
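With the querier attribute in place, the per-vlan global options added so far can be driven from userspace with an RTM_NEWVLAN message carrying a BRIDGE_VLANDB_GLOBAL_OPTIONS nest. The fragment below is a rough, minimally error-checked sketch: the bridge name "br0", vlan id 10, the add_attr() helper and the buffer size are illustrative choices, not anything mandated by the uapi, and ACK parsing is omitted.

/*
 * Rough userspace sketch: enable the global multicast querier for vlan 10
 * on bridge "br0" via RTM_NEWVLAN + BRIDGE_VLANDB_GLOBAL_OPTIONS.
 */
#include <linux/if_bridge.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <net/if.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static void add_attr(struct nlmsghdr *nlh, unsigned short type,
                     const void *data, unsigned short len)
{
    struct nlattr *nla = (struct nlattr *)((char *)nlh + NLMSG_ALIGN(nlh->nlmsg_len));

    nla->nla_type = type;
    nla->nla_len = NLA_HDRLEN + len;
    memcpy((char *)nla + NLA_HDRLEN, data, len);
    nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + NLA_ALIGN(nla->nla_len);
}

int main(void)
{
    struct {
        struct nlmsghdr nlh;
        struct br_vlan_msg bvm;
        char attrs[128];
    } req = {};
    struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
    struct nlattr *nest;
    __u16 vid = 10;
    __u8 querier = 1;
    int fd;

    req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(req.bvm));
    req.nlh.nlmsg_type = RTM_NEWVLAN;
    req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
    req.bvm.family = PF_BRIDGE;
    req.bvm.ifindex = if_nametoindex("br0");

    /* open the BRIDGE_VLANDB_GLOBAL_OPTIONS nest by hand */
    nest = (struct nlattr *)((char *)&req.nlh + NLMSG_ALIGN(req.nlh.nlmsg_len));
    nest->nla_type = BRIDGE_VLANDB_GLOBAL_OPTIONS | NLA_F_NESTED;
    req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + NLA_HDRLEN;

    add_attr(&req.nlh, BRIDGE_VLANDB_GOPTS_ID, &vid, sizeof(vid));
    add_attr(&req.nlh, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER, &querier, sizeof(querier));
    nest->nla_len = (__u16)((char *)&req.nlh + req.nlh.nlmsg_len - (char *)nest);

    fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
    sendto(fd, &req, req.nlh.nlmsg_len, 0,
           (struct sockaddr *)&kernel, sizeof(kernel)); /* ACK read omitted */
    close(fd);
    return 0;
}

Newer iproute2 exposes roughly the same thing through its "bridge vlan global set" command; the sketch only shows what ends up on the wire.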
From a97df080b6a86c105f98052ca3a9d66149b461b3 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 10 Aug 2021 18:29:31 +0300 Subject: net: bridge: vlan: add support for mcast router global option Add support to change and retrieve global vlan multicast router state which is used for the bridge itself. We just need to pass multicast context to br_multicast_set_router instead of bridge device and the rest of the logic remains the same. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + net/bridge/br_multicast.c | 11 +++++------ net/bridge/br_netlink.c | 3 ++- net/bridge/br_private.h | 3 ++- net/bridge/br_sysfs_br.c | 2 +- net/bridge/br_vlan_options.c | 15 ++++++++++++++- 6 files changed, 25 insertions(+), 10 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 03fd14a4e377..2104dd3557b4 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -561,6 +561,7 @@ enum { BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL, BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER, + BRIDGE_VLANDB_GOPTS_MCAST_ROUTER, __BRIDGE_VLANDB_GOPTS_MAX }; #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1) diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index a780ad8aca37..df6bf6a237aa 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -4053,17 +4053,16 @@ void br_multicast_dev_del(struct net_bridge *br) rcu_barrier(); } -int br_multicast_set_router(struct net_bridge *br, unsigned long val) +int br_multicast_set_router(struct net_bridge_mcast *brmctx, unsigned long val) { - struct net_bridge_mcast *brmctx = &br->multicast_ctx; int err = -EINVAL; - spin_lock_bh(&br->multicast_lock); + spin_lock_bh(&brmctx->br->multicast_lock); switch (val) { case MDB_RTR_TYPE_DISABLED: case MDB_RTR_TYPE_PERM: - br_mc_router_state_change(br, val == MDB_RTR_TYPE_PERM); + br_mc_router_state_change(brmctx->br, val == MDB_RTR_TYPE_PERM); del_timer(&brmctx->ip4_mc_router_timer); #if IS_ENABLED(CONFIG_IPV6) del_timer(&brmctx->ip6_mc_router_timer); @@ -4073,13 +4072,13 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val) break; case MDB_RTR_TYPE_TEMP_QUERY: if (brmctx->multicast_router != MDB_RTR_TYPE_TEMP_QUERY) - br_mc_router_state_change(br, false); + br_mc_router_state_change(brmctx->br, false); brmctx->multicast_router = val; err = 0; break; } - spin_unlock_bh(&br->multicast_lock); + spin_unlock_bh(&brmctx->br->multicast_lock); return err; } diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 60616a4380b3..8ae026fa2ad7 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1286,7 +1286,8 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (data[IFLA_BR_MCAST_ROUTER]) { u8 multicast_router = nla_get_u8(data[IFLA_BR_MCAST_ROUTER]); - err = br_multicast_set_router(br, multicast_router); + err = br_multicast_set_router(&br->multicast_ctx, + multicast_router); if (err) return err; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 40477885fb2a..6d5681ca8d2f 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -877,7 +877,7 @@ void br_multicast_dev_del(struct net_bridge *br); void br_multicast_flood(struct net_bridge_mdb_entry *mdst, struct sk_buff *skb, struct net_bridge_mcast *brmctx, bool local_rcv, bool local_orig); -int br_multicast_set_router(struct net_bridge *br, unsigned long 
val); +int br_multicast_set_router(struct net_bridge_mcast *brmctx, unsigned long val); int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val); int br_multicast_toggle(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack); @@ -1191,6 +1191,7 @@ br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1, brmctx1->multicast_startup_query_interval == brmctx2->multicast_startup_query_interval && brmctx1->multicast_querier == brmctx2->multicast_querier && + brmctx1->multicast_router == brmctx2->multicast_router && #if IS_ENABLED(CONFIG_IPV6) brmctx1->multicast_mld_version == brmctx2->multicast_mld_version && diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index e1234bd8d5a0..d9a89ddd0331 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -390,7 +390,7 @@ static ssize_t multicast_router_show(struct device *d, static int set_multicast_router(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - return br_multicast_set_router(br, val); + return br_multicast_set_router(&br->multicast_ctx, val); } static ssize_t multicast_router_store(struct device *d, diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c index 0d0db8ddae45..6ba45b73931f 100644 --- a/net/bridge/br_vlan_options.c +++ b/net/bridge/br_vlan_options.c @@ -296,7 +296,9 @@ bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, nla_put_u32(skb, BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT, v_opts->br_mcast_ctx.multicast_startup_query_count) || nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER, - v_opts->br_mcast_ctx.multicast_querier)) + v_opts->br_mcast_ctx.multicast_querier) || + nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_ROUTER, + v_opts->br_mcast_ctx.multicast_router)) goto out_err; clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_last_member_interval); @@ -358,6 +360,7 @@ static size_t rtnl_vlan_global_opts_nlmsg_size(void) + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL */ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL */ + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER */ + + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_ROUTER */ #endif + nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */ } @@ -497,6 +500,15 @@ static int br_vlan_process_global_one_opts(const struct net_bridge *br, return err; *changed = true; } + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_ROUTER]) { + u8 val; + + val = nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_ROUTER]); + err = br_multicast_set_router(&v->br_mcast_ctx, val); + if (err) + return err; + *changed = true; + } #if IS_ENABLED(CONFIG_IPV6) if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]) { u8 ver; @@ -520,6 +532,7 @@ static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = { [BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION] = { .type = NLA_U8 }, [BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL] = { .type = NLA_U64 }, [BRIDGE_VLANDB_GOPTS_MCAST_QUERIER] = { .type = NLA_U8 }, + [BRIDGE_VLANDB_GOPTS_MCAST_ROUTER] = { .type = NLA_U8 }, [BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION] = { .type = NLA_U8 }, [BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT] = { .type = NLA_U32 }, [BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT] = { .type = NLA_U32 }, -- cgit v1.2.3-71-gd317 From dc002875c22b56c795ec24dc987ac2dd2081588e Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 10 Aug 2021 18:29:33 +0300 Subject: net: bridge: vlan: use br_rports_fill_info() to export mcast 
router ports Embed the standard multicast router port export by br_rports_fill_info() into a new global vlan attribute BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS. In order to have the same format for the global bridge mcast context and the per-vlan mcast context we need a double-nesting: - BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS - MDBA_ROUTER Currently we don't compare router lists, if any router port exists in the bridge mcast contexts we consider their option sets as different and export them separately. In addition we export the router port vlan id when dumping similar to the router port notification format. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + net/bridge/br_mdb.c | 17 ++++------------- net/bridge/br_private.h | 15 +++++++++++++++ net/bridge/br_vlan_options.c | 18 ++++++++++++++++++ 4 files changed, 38 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 2104dd3557b4..620d86e825b8 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -562,6 +562,7 @@ enum { BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER, BRIDGE_VLANDB_GOPTS_MCAST_ROUTER, + BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS, __BRIDGE_VLANDB_GOPTS_MAX }; #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1) diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 7c16e2c76220..389ff3c1e9d9 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -16,16 +16,6 @@ #include "br_private.h" -static bool br_rports_have_mc_router(const struct net_bridge_mcast *brmctx) -{ -#if IS_ENABLED(CONFIG_IPV6) - return !hlist_empty(&brmctx->ip4_mc_router_list) || - !hlist_empty(&brmctx->ip6_mc_router_list); -#else - return !hlist_empty(&brmctx->ip4_mc_router_list); -#endif -} - static bool br_ip4_rports_get_timer(struct net_bridge_mcast_port *pmctx, unsigned long *timer) @@ -47,8 +37,8 @@ br_ip6_rports_get_timer(struct net_bridge_mcast_port *pmctx, #endif } -static int br_rports_fill_info(struct sk_buff *skb, - const struct net_bridge_mcast *brmctx) +int br_rports_fill_info(struct sk_buff *skb, + const struct net_bridge_mcast *brmctx) { u16 vid = brmctx->vlan ? 
brmctx->vlan->vid : 0; bool have_ip4_mc_rtr, have_ip6_mc_rtr; @@ -97,7 +87,8 @@ static int br_rports_fill_info(struct sk_buff *skb, ip4_timer)) || (have_ip6_mc_rtr && nla_put_u32(skb, MDBA_ROUTER_PATTR_INET6_TIMER, - ip6_timer))) { + ip6_timer)) || + (vid && nla_put_u16(skb, MDBA_ROUTER_PATTR_VID, vid))) { nla_nest_cancel(skb, port_nest); goto fail; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 6d5681ca8d2f..32c218aa3f36 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -945,6 +945,8 @@ bool br_multicast_toggle_global_vlan(struct net_bridge_vlan *vlan, bool on); int br_mdb_replay(struct net_device *br_dev, struct net_device *dev, const void *ctx, bool adding, struct notifier_block *nb, struct netlink_ext_ack *extack); +int br_rports_fill_info(struct sk_buff *skb, + const struct net_bridge_mcast *brmctx); static inline bool br_group_is_l2(const struct br_ip *group) { @@ -1168,6 +1170,17 @@ br_multicast_port_ctx_state_stopped(const struct net_bridge_mcast_port *pmctx) pmctx->vlan->state == BR_STATE_BLOCKING); } +static inline bool +br_rports_have_mc_router(const struct net_bridge_mcast *brmctx) +{ +#if IS_ENABLED(CONFIG_IPV6) + return !hlist_empty(&brmctx->ip4_mc_router_list) || + !hlist_empty(&brmctx->ip6_mc_router_list); +#else + return !hlist_empty(&brmctx->ip4_mc_router_list); +#endif +} + static inline bool br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1, const struct net_bridge_mcast *brmctx2) @@ -1192,6 +1205,8 @@ br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1, brmctx2->multicast_startup_query_interval && brmctx1->multicast_querier == brmctx2->multicast_querier && brmctx1->multicast_router == brmctx2->multicast_router && + !br_rports_have_mc_router(brmctx1) && + !br_rports_have_mc_router(brmctx2) && #if IS_ENABLED(CONFIG_IPV6) brmctx1->multicast_mld_version == brmctx2->multicast_mld_version && diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c index 6ba45b73931f..b4fd5fa441b7 100644 --- a/net/bridge/br_vlan_options.c +++ b/net/bridge/br_vlan_options.c @@ -272,6 +272,7 @@ bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr, bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, const struct net_bridge_vlan *v_opts) { + struct nlattr *nest2 __maybe_unused; u64 clockval __maybe_unused; struct nlattr *nest; @@ -326,6 +327,23 @@ bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, clockval, BRIDGE_VLANDB_GOPTS_PAD)) goto out_err; + if (br_rports_have_mc_router(&v_opts->br_mcast_ctx)) { + nest2 = nla_nest_start(skb, + BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS); + if (!nest2) + goto out_err; + + rcu_read_lock(); + if (br_rports_fill_info(skb, &v_opts->br_mcast_ctx)) { + rcu_read_unlock(); + nla_nest_cancel(skb, nest2); + goto out_err; + } + rcu_read_unlock(); + + nla_nest_end(skb, nest2); + } + #if IS_ENABLED(CONFIG_IPV6) if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION, v_opts->br_mcast_ctx.multicast_mld_version)) -- cgit v1.2.3-71-gd317 From 3d2a2544eae93987f0688c2d6ec06c76f9e1477b Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Fri, 25 Jun 2021 11:17:16 +0300 Subject: nl80211: vendor-cmd: add Intel vendor commands for iwlmei usage iwlmei allows to integrate with the CSME firmware. There are flows that are prioprietary for this purpose: * Get the information of the AP the CSME firmware is connected to. 
This is useful when we need to speed up the connection process in case the CSME firmware has a TCP connection that must be kept alive across the ownership transition. * Forbid roaming, which will happen when the CSME firmware wants to tell the user space not to disrupt the connection. * Request ownership, upon driver boot when the CSME firmware owns the device. This is a notification sent by the kernel. All those commands are expected to be used by any software managing the connection (mainly NetworkManager). Those commands are expected to be used only in case the CSME firmware owns the device and doesn't want to release the device unless the host made sure that it can keep the connectivity. Here are the steps of the expected flow: 1) The machine boots while AMT has an active TCP connection 2) iwlwifi starts and tries to access the device 3) The device is not available because of the active TCP connection. (If there are no active connections, the CSME firmware would have allowed iwlwifi to use the device) Note that all the steps up to here don't involve iwlmei. All this happens in iwlwifi (in iwl_pcie_prepare_card_hw). 4) iwlmei establishes a connection to the CSME firmware (through SAP) Here iwlwifi uses iwlmei to access the device's capabilities (since it can't touch the device), but this is not relevant for the vendor commands. 5) The CSME firmware tells iwlmei that it uses the NIC and that there is an active TCP connection, and hence, the host needs to think twice before asking the CSME firmware to release the device 6) iwlmei tells iwlwifi to report HW RFKILL with a special reason Up to here, there was no user space involved. 7) The user space (NetworkManager) boots and sees that the device is in RFKILL because the host doesn't own the device 8) The user space asks the kernel what AP the CSME firmware is connected to (with the first vendor command mentioned above) 9) The user space checks if it has a profile that matches the reply from the CSME firmware 10) The user space installs a network in wpa_supplicant with a specific BSSID and a specific frequency 11) The user space prevents any type of full scan 12) The user space asks iwlmei to request ownership on the device (with the third vendor command; see the sketch after this flow) 13) iwlmei requests ownership from the CSME firmware 14) The CSME firmware grants ownership 15) iwlmei tells iwlwifi to lift the RFKILL 16) RFKILL OFF is reported to userspace 17) The host boots the device, loads the firmware, and connects to a specific BSSID without scanning, including getting an IP address, in less than 600ms (this is what I measured; of course it depends on many factors) 18) The host reports to the CSME firmware that there is a connection 19) The TCP connection is preserved and the host now has connectivity 20) Later, the TCP connection to the CSME firmware is terminated 21) The CSME firmware tells iwlmei that it is now free to do whatever it likes 22) iwlwifi sends the second vendor command to tell the user space that it can remove the special network configuration and pick any SSID / BSSID it likes.
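Step 12 above corresponds to a single nl80211 vendor command from userspace. The following libnl-3 sketch shows roughly how such a request could be issued; "wlan0" is a placeholder, no vendor data payload is attached in this sketch, and error/ACK handling is left out, so treat it as an illustration rather than the actual NetworkManager integration.

/*
 * Minimal libnl-3 sketch of step 12: asking iwlmei to request device
 * ownership from the CSME firmware via the Intel vendor subcommand.
 */
#include <linux/nl80211.h>
#include <linux/nl80211-vnd-intel.h>
#include <net/if.h>
#include <netlink/attr.h>
#include <netlink/genl/ctrl.h>
#include <netlink/genl/genl.h>
#include <netlink/msg.h>
#include <netlink/netlink.h>

int main(void)
{
    struct nl_sock *sk = nl_socket_alloc();
    struct nl_msg *msg = nlmsg_alloc();
    int family;

    genl_connect(sk);
    family = genl_ctrl_resolve(sk, "nl80211");

    genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
                NL80211_CMD_VENDOR, 0);
    nla_put_u32(msg, NL80211_ATTR_IFINDEX, if_nametoindex("wlan0"));
    nla_put_u32(msg, NL80211_ATTR_VENDOR_ID, INTEL_OUI);
    nla_put_u32(msg, NL80211_ATTR_VENDOR_SUBCMD,
                IWL_MVM_VENDOR_CMD_HOST_GET_OWNERSHIP);

    nl_send_auto(sk, msg); /* RFKILL is lifted once ownership is granted */

    nlmsg_free(msg);
    nl_socket_free(sk);
    return 0;
}

The GET_CSME_CONN_INFO query from step 8 would be issued the same way, with its reply carrying the SSID, channel and auth-mode attributes listed in the new header.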
Co-Developed-by: Ayala Beker Signed-off-by: Emmanuel Grumbach Link: https://lore.kernel.org/r/20210625081717.7680-4-emmanuel.grumbach@intel.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211-vnd-intel.h | 77 ++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 include/uapi/linux/nl80211-vnd-intel.h (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211-vnd-intel.h b/include/uapi/linux/nl80211-vnd-intel.h new file mode 100644 index 000000000000..0bf177b84fd9 --- /dev/null +++ b/include/uapi/linux/nl80211-vnd-intel.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright (C) 2012-2014, 2018-2021 Intel Corporation + * Copyright (C) 2013-2015 Intel Mobile Communications GmbH + * Copyright (C) 2016-2017 Intel Deutschland GmbH + */ +#ifndef __VENDOR_CMD_INTEL_H__ +#define __VENDOR_CMD_INTEL_H__ + +#define INTEL_OUI 0x001735 + +/** + * enum iwl_mvm_vendor_cmd - supported vendor commands + * @IWL_MVM_VENDOR_CMD_GET_CSME_CONN_INFO: reports CSME connection info. + * @IWL_MVM_VENDOR_CMD_HOST_GET_OWNERSHIP: asks for ownership on the device. + * @IWL_MVM_VENDOR_CMD_ROAMING_FORBIDDEN_EVENT: notifies if roaming is allowed. + * It contains a &IWL_MVM_VENDOR_ATTR_ROAMING_FORBIDDEN and a + * &IWL_MVM_VENDOR_ATTR_VIF_ADDR attributes. + */ + +enum iwl_mvm_vendor_cmd { + IWL_MVM_VENDOR_CMD_GET_CSME_CONN_INFO = 0x2d, + IWL_MVM_VENDOR_CMD_HOST_GET_OWNERSHIP = 0x30, + IWL_MVM_VENDOR_CMD_ROAMING_FORBIDDEN_EVENT = 0x32, +}; + +enum iwl_vendor_auth_akm_mode { + IWL_VENDOR_AUTH_OPEN, + IWL_VENDOR_AUTH_RSNA = 0x6, + IWL_VENDOR_AUTH_RSNA_PSK, + IWL_VENDOR_AUTH_SAE = 0x9, + IWL_VENDOR_AUTH_MAX, +}; + +/** + * enum iwl_mvm_vendor_attr - attributes used in vendor commands + * @__IWL_MVM_VENDOR_ATTR_INVALID: attribute 0 is invalid + * @IWL_MVM_VENDOR_ATTR_VIF_ADDR: interface MAC address + * @IWL_MVM_VENDOR_ATTR_ADDR: MAC address + * @IWL_MVM_VENDOR_ATTR_SSID: SSID (binary attribute, 0..32 octets) + * @IWL_MVM_VENDOR_ATTR_STA_CIPHER: the cipher to use for the station with the + * mac address specified in &IWL_MVM_VENDOR_ATTR_ADDR. + * @IWL_MVM_VENDOR_ATTR_ROAMING_FORBIDDEN: u8 attribute. Indicates whether + * roaming is forbidden or not. Value 1 means roaming is forbidden, + * 0 mean roaming is allowed. + * @IWL_MVM_VENDOR_ATTR_AUTH_MODE: u32 attribute. Authentication mode type + * as specified in &enum iwl_vendor_auth_akm_mode. + * @IWL_MVM_VENDOR_ATTR_CHANNEL_NUM: u8 attribute. Contains channel number. + * @IWL_MVM_VENDOR_ATTR_BAND: u8 attribute. + * 0 for 2.4 GHz band, 1 for 5.2GHz band and 2 for 6GHz band. + * @IWL_MVM_VENDOR_ATTR_COLLOC_CHANNEL: u32 attribute. Channel number of + * collocated AP. Relevant for 6GHz AP info. + * @IWL_MVM_VENDOR_ATTR_COLLOC_ADDR: MAC address of a collocated AP. + * Relevant for 6GHz AP info. 
+ * + * @NUM_IWL_MVM_VENDOR_ATTR: number of vendor attributes + * @MAX_IWL_MVM_VENDOR_ATTR: highest vendor attribute number + + */ +enum iwl_mvm_vendor_attr { + __IWL_MVM_VENDOR_ATTR_INVALID = 0x00, + IWL_MVM_VENDOR_ATTR_VIF_ADDR = 0x02, + IWL_MVM_VENDOR_ATTR_ADDR = 0x0a, + IWL_MVM_VENDOR_ATTR_SSID = 0x3d, + IWL_MVM_VENDOR_ATTR_STA_CIPHER = 0x51, + IWL_MVM_VENDOR_ATTR_ROAMING_FORBIDDEN = 0x64, + IWL_MVM_VENDOR_ATTR_AUTH_MODE = 0x65, + IWL_MVM_VENDOR_ATTR_CHANNEL_NUM = 0x66, + IWL_MVM_VENDOR_ATTR_BAND = 0x69, + IWL_MVM_VENDOR_ATTR_COLLOC_CHANNEL = 0x70, + IWL_MVM_VENDOR_ATTR_COLLOC_ADDR = 0x71, + + NUM_IWL_MVM_VENDOR_ATTR, + MAX_IWL_MVM_VENDOR_ATTR = NUM_IWL_MVM_VENDOR_ATTR - 1, +}; + +#endif /* __VENDOR_CMD_INTEL_H__ */ -- cgit v1.2.3-71-gd317 From 81be10934949da8b12ca4db3de1511a4220fa9b4 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 13 Aug 2021 10:21:42 +0200 Subject: ALSA: pcm: Add SNDRV_PCM_INFO_EXPLICIT_SYNC flag ALSA PCM core has an optimized way to communicate with user-space for its control and status data via mmap on the supported architectures like x86. Depending on the situation, however, we'd rather want to enforce user-space notifying the applptr or hwptr change explicitly via ioctl. For example, the upcoming non-contig and non-coherent buffer handling would need an explicit sync, and this needs to catch the applptr and hwptr changes. Also, ASoC SOF driver will have the SPIB support that has the similar requirement for the explicit control of the applptr and hwptr. This patch adds the new PCM hardware info flag, SNDRV_PCM_INFO_EXPLICIT_SYNC. When this flag is set, PCM core disables both the control and the status mmap, which enforces user-space to update via SYNC_PTR ioctl. In that way, drivers can catch the applptr and hwptr update and apply the sync operation if needed. 
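For a driver, opting into this behaviour is just a matter of advertising the flag in its hardware description. A hypothetical fragment (example_pcm_hw and example_pcm_open are made-up names, not part of any existing driver) could look like this:

/* Hypothetical driver fragment: setting SNDRV_PCM_INFO_EXPLICIT_SYNC makes
 * PCM core refuse the status/control mmap, so userspace falls back to the
 * SYNC_PTR ioctl and the driver can observe every appl_ptr/hw_ptr update.
 */
#include <sound/pcm.h>

static const struct snd_pcm_hardware example_pcm_hw = {
    .info = SNDRV_PCM_INFO_MMAP | SNDRV_PCM_INFO_MMAP_VALID |
            SNDRV_PCM_INFO_INTERLEAVED | SNDRV_PCM_INFO_EXPLICIT_SYNC,
    /* rates, formats, channels and buffer limits omitted */
};

static int example_pcm_open(struct snd_pcm_substream *substream)
{
    substream->runtime->hw = example_pcm_hw;
    return 0;
}

Note that the flag only disables the status/control mmap; the data mmap itself is unaffected, which is why SNDRV_PCM_INFO_MMAP can still be advertised alongside it.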
Link: https://lore.kernel.org/r/20210812113818.6479-1-tiwai@suse.de Link: https://lore.kernel.org/r/20210610205326.1176400-1-pierre-louis.bossart@linux.intel.com Link: https://lore.kernel.org/r/20210813082142.5375-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- include/uapi/sound/asound.h | 1 + sound/core/pcm_native.c | 9 +++++++++ 2 files changed, 10 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/sound/asound.h b/include/uapi/sound/asound.h index d17c061950df..1d84ec9db93b 100644 --- a/include/uapi/sound/asound.h +++ b/include/uapi/sound/asound.h @@ -299,6 +299,7 @@ typedef int __bitwise snd_pcm_subformat_t; #define SNDRV_PCM_INFO_HAS_LINK_ABSOLUTE_ATIME 0x02000000 /* report absolute hardware link audio time, not reset on startup */ #define SNDRV_PCM_INFO_HAS_LINK_ESTIMATED_ATIME 0x04000000 /* report estimated link audio time */ #define SNDRV_PCM_INFO_HAS_LINK_SYNCHRONIZED_ATIME 0x08000000 /* report synchronized audio/system time */ +#define SNDRV_PCM_INFO_EXPLICIT_SYNC 0x10000000 /* needs explicit sync of pointers and data */ #define SNDRV_PCM_INFO_DRAIN_TRIGGER 0x40000000 /* internal kernel flag - trigger in drain */ #define SNDRV_PCM_INFO_FIFO_IN_FRAMES 0x80000000 /* internal kernel flag - FIFO size is in frames */ diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index dc9fa312fadd..d233cb3b41d8 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -3621,6 +3621,12 @@ static int snd_pcm_mmap_control(struct snd_pcm_substream *substream, struct file static bool pcm_status_mmap_allowed(struct snd_pcm_file *pcm_file) { + /* If drivers require the explicit sync (typically for non-coherent + * pages), we have to disable the mmap of status and control data + * to enforce the control via SYNC_PTR ioctl. + */ + if (pcm_file->substream->runtime->hw.info & SNDRV_PCM_INFO_EXPLICIT_SYNC) + return false; /* See pcm_control_mmap_allowed() below. * Since older alsa-lib requires both status and control mmaps to be * coupled, we have to disable the status mmap for old alsa-lib, too. @@ -3635,6 +3641,9 @@ static bool pcm_control_mmap_allowed(struct snd_pcm_file *pcm_file) { if (pcm_file->no_compat_mmap) return false; + /* see above */ + if (pcm_file->substream->runtime->hw.info & SNDRV_PCM_INFO_EXPLICIT_SYNC) + return false; /* Disallow the control mmap when SYNC_APPLPTR flag is set; * it enforces the user-space to fall back to snd_pcm_sync_ptr(), * thus it effectively assures the manual update of appl_ptr. -- cgit v1.2.3-71-gd317 From c7fa1d9b1fb179375e889ff076a1566ecc997bfc Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 13 Aug 2021 18:00:00 +0300 Subject: net: bridge: mcast: dump ipv4 querier state Add support for dumping global IPv4 querier state, we dump the state only if our own querier is enabled or there has been another external querier which has won the election. For the bridge global state we use a new attribute IFLA_BR_MCAST_QUERIER_STATE and embed the state inside. The structure is: [IFLA_BR_MCAST_QUERIER_STATE] `[BRIDGE_QUERIER_IP_ADDRESS] - ip address of the querier `[BRIDGE_QUERIER_IP_PORT] - bridge port ifindex where the querier was seen (set only if external querier) `[BRIDGE_QUERIER_IP_OTHER_TIMER] - other querier timeout Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- include/uapi/linux/if_bridge.h | 10 ++++++ include/uapi/linux/if_link.h | 1 + net/bridge/br_multicast.c | 73 ++++++++++++++++++++++++++++++++++++++++++ net/bridge/br_netlink.c | 5 ++- net/bridge/br_private.h | 4 +++ 5 files changed, 92 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 620d86e825b8..e0fff67fcd88 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -770,4 +770,14 @@ struct br_boolopt_multi { __u32 optval; __u32 optmask; }; + +enum { + BRIDGE_QUERIER_UNSPEC, + BRIDGE_QUERIER_IP_ADDRESS, + BRIDGE_QUERIER_IP_PORT, + BRIDGE_QUERIER_IP_OTHER_TIMER, + BRIDGE_QUERIER_PAD, + __BRIDGE_QUERIER_MAX +}; +#define BRIDGE_QUERIER_MAX (__BRIDGE_QUERIER_MAX - 1) #endif /* _UAPI_LINUX_IF_BRIDGE_H */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 5310003523ce..8aad65b69054 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -479,6 +479,7 @@ enum { IFLA_BR_MCAST_MLD_VERSION, IFLA_BR_VLAN_STATS_PER_PORT, IFLA_BR_MULTI_BOOLOPT, + IFLA_BR_MCAST_QUERIER_STATE, __IFLA_BR_MAX, }; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 3705b7ace62d..4513bc13b6d3 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -2905,6 +2905,79 @@ update: return true; } +static struct net_bridge_port * +__br_multicast_get_querier_port(struct net_bridge *br, + const struct bridge_mcast_querier *querier) +{ + int port_ifidx = READ_ONCE(querier->port_ifidx); + struct net_bridge_port *p; + struct net_device *dev; + + if (port_ifidx == 0) + return NULL; + + dev = dev_get_by_index_rcu(dev_net(br->dev), port_ifidx); + if (!dev) + return NULL; + p = br_port_get_rtnl_rcu(dev); + if (!p || p->br != br) + return NULL; + + return p; +} + +size_t br_multicast_querier_state_size(void) +{ + return nla_total_size(sizeof(0)) + /* nest attribute */ + nla_total_size(sizeof(__be32)) + /* BRIDGE_QUERIER_IP_ADDRESS */ + nla_total_size(sizeof(int)) + /* BRIDGE_QUERIER_IP_PORT */ + nla_total_size_64bit(sizeof(u64)); /* BRIDGE_QUERIER_IP_OTHER_TIMER */ +} + +/* protected by rtnl or rcu */ +int br_multicast_dump_querier_state(struct sk_buff *skb, + const struct net_bridge_mcast *brmctx, + int nest_attr) +{ + struct bridge_mcast_querier querier = {}; + struct net_bridge_port *p; + struct nlattr *nest; + + if (!brmctx->multicast_querier && + !timer_pending(&brmctx->ip4_other_query.timer)) + return 0; + + nest = nla_nest_start(skb, nest_attr); + if (!nest) + return -EMSGSIZE; + + rcu_read_lock(); + br_multicast_read_querier(&brmctx->ip4_querier, &querier); + if (nla_put_in_addr(skb, BRIDGE_QUERIER_IP_ADDRESS, + querier.addr.src.ip4)) { + rcu_read_unlock(); + goto out_err; + } + + p = __br_multicast_get_querier_port(brmctx->br, &querier); + if (timer_pending(&brmctx->ip4_other_query.timer) && + (nla_put_u64_64bit(skb, BRIDGE_QUERIER_IP_OTHER_TIMER, + br_timer_value(&brmctx->ip4_other_query.timer), + BRIDGE_QUERIER_PAD) || + (p && nla_put_u32(skb, BRIDGE_QUERIER_IP_PORT, p->dev->ifindex)))) { + rcu_read_unlock(); + goto out_err; + } + rcu_read_unlock(); + nla_nest_end(skb, nest); + + return 0; + +out_err: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + static void br_multicast_update_query_timer(struct net_bridge_mcast *brmctx, struct bridge_mcast_other_query *query, diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 8ae026fa2ad7..2f184ad8ae29 100644 --- a/net/bridge/br_netlink.c +++ 
b/net/bridge/br_netlink.c @@ -1501,6 +1501,7 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_STARTUP_QUERY_INTVL */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_IGMP_VERSION */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_MLD_VERSION */ + br_multicast_querier_state_size() + /* IFLA_BR_MCAST_QUERIER_STATE */ #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IPTABLES */ @@ -1587,7 +1588,9 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) nla_put_u32(skb, IFLA_BR_MCAST_STARTUP_QUERY_CNT, br->multicast_ctx.multicast_startup_query_count) || nla_put_u8(skb, IFLA_BR_MCAST_IGMP_VERSION, - br->multicast_ctx.multicast_igmp_version)) + br->multicast_ctx.multicast_igmp_version) || + br_multicast_dump_querier_state(skb, &br->multicast_ctx, + IFLA_BR_MCAST_QUERIER_STATE)) return -EMSGSIZE; #if IS_ENABLED(CONFIG_IPV6) if (nla_put_u8(skb, IFLA_BR_MCAST_MLD_VERSION, diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 70e2d9a7f105..9b1bf98a2c5a 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -948,6 +948,10 @@ int br_mdb_replay(struct net_device *br_dev, struct net_device *dev, struct netlink_ext_ack *extack); int br_rports_fill_info(struct sk_buff *skb, const struct net_bridge_mcast *brmctx); +int br_multicast_dump_querier_state(struct sk_buff *skb, + const struct net_bridge_mcast *brmctx, + int nest_attr); +size_t br_multicast_querier_state_size(void); static inline bool br_group_is_l2(const struct br_ip *group) { -- cgit v1.2.3-71-gd317 From 85b4108211742c5dd4f9f56c1d0704b4e0d4c98e Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 13 Aug 2021 18:00:01 +0300 Subject: net: bridge: mcast: dump ipv6 querier state Add support for dumping global IPv6 querier state, we dump the state only if our own querier is enabled or there has been another external querier which has won the election. For the bridge global state we use a new attribute IFLA_BR_MCAST_QUERIER_STATE and embed the state inside. The structure is: [IFLA_BR_MCAST_QUERIER_STATE] `[BRIDGE_QUERIER_IPV6_ADDRESS] - ip address of the querier `[BRIDGE_QUERIER_IPV6_PORT] - bridge port ifindex where the querier was seen (set only if external querier) `[BRIDGE_QUERIER_IPV6_OTHER_TIMER] - other querier timeout IPv4 and IPv6 attributes are embedded at the same level of IFLA_BR_MCAST_QUERIER_STATE. If we didn't dump anything we cancel the nest and return. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- include/uapi/linux/if_bridge.h | 3 +++ net/bridge/br_multicast.c | 36 ++++++++++++++++++++++++++++++++---- 2 files changed, 35 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index e0fff67fcd88..eceaad200bf6 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -777,6 +777,9 @@ enum { BRIDGE_QUERIER_IP_PORT, BRIDGE_QUERIER_IP_OTHER_TIMER, BRIDGE_QUERIER_PAD, + BRIDGE_QUERIER_IPV6_ADDRESS, + BRIDGE_QUERIER_IPV6_PORT, + BRIDGE_QUERIER_IPV6_OTHER_TIMER, __BRIDGE_QUERIER_MAX }; #define BRIDGE_QUERIER_MAX (__BRIDGE_QUERIER_MAX - 1) diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 4513bc13b6d3..0e5d6ba03457 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -2943,15 +2943,15 @@ int br_multicast_dump_querier_state(struct sk_buff *skb, struct net_bridge_port *p; struct nlattr *nest; - if (!brmctx->multicast_querier && - !timer_pending(&brmctx->ip4_other_query.timer)) - return 0; - nest = nla_nest_start(skb, nest_attr); if (!nest) return -EMSGSIZE; rcu_read_lock(); + if (!brmctx->multicast_querier && + !timer_pending(&brmctx->ip4_other_query.timer)) + goto out_v6; + br_multicast_read_querier(&brmctx->ip4_querier, &querier); if (nla_put_in_addr(skb, BRIDGE_QUERIER_IP_ADDRESS, querier.addr.src.ip4)) { @@ -2968,8 +2968,36 @@ int br_multicast_dump_querier_state(struct sk_buff *skb, rcu_read_unlock(); goto out_err; } + +out_v6: +#if IS_ENABLED(CONFIG_IPV6) + if (!brmctx->multicast_querier && + !timer_pending(&brmctx->ip6_other_query.timer)) + goto out; + + br_multicast_read_querier(&brmctx->ip6_querier, &querier); + if (nla_put_in6_addr(skb, BRIDGE_QUERIER_IPV6_ADDRESS, + &querier.addr.src.ip6)) { + rcu_read_unlock(); + goto out_err; + } + + p = __br_multicast_get_querier_port(brmctx->br, &querier); + if (timer_pending(&brmctx->ip6_other_query.timer) && + (nla_put_u64_64bit(skb, BRIDGE_QUERIER_IPV6_OTHER_TIMER, + br_timer_value(&brmctx->ip6_other_query.timer), + BRIDGE_QUERIER_PAD) || + (p && nla_put_u32(skb, BRIDGE_QUERIER_IPV6_PORT, + p->dev->ifindex)))) { + rcu_read_unlock(); + goto out_err; + } +out: +#endif rcu_read_unlock(); nla_nest_end(skb, nest); + if (!nla_len(nest)) + nla_nest_cancel(skb, nest); return 0; -- cgit v1.2.3-71-gd317 From ddc649d158c560c6685be1701900a6e456ecceac Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 13 Aug 2021 18:00:02 +0300 Subject: net: bridge: vlan: dump mcast ctx querier state Use the new mcast querier state dump infrastructure and export vlans' mcast context querier state embedded in attribute BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_STATE. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- include/uapi/linux/if_bridge.h | 1 + net/bridge/br_vlan_options.c | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index eceaad200bf6..f71a81fdbbc6 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -563,6 +563,7 @@ enum { BRIDGE_VLANDB_GOPTS_MCAST_QUERIER, BRIDGE_VLANDB_GOPTS_MCAST_ROUTER, BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS, + BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_STATE, __BRIDGE_VLANDB_GOPTS_MAX }; #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1) diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c index b4fd5fa441b7..49dec53a4a74 100644 --- a/net/bridge/br_vlan_options.c +++ b/net/bridge/br_vlan_options.c @@ -299,7 +299,9 @@ bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER, v_opts->br_mcast_ctx.multicast_querier) || nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_ROUTER, - v_opts->br_mcast_ctx.multicast_router)) + v_opts->br_mcast_ctx.multicast_router) || + br_multicast_dump_querier_state(skb, &v_opts->br_mcast_ctx, + BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_STATE)) goto out_err; clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_last_member_interval); @@ -379,6 +381,7 @@ static size_t rtnl_vlan_global_opts_nlmsg_size(void) + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL */ + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER */ + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_ROUTER */ + + br_multicast_querier_state_size() /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_STATE */ #endif + nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */ } -- cgit v1.2.3-71-gd317 From 6c9b40844751ea30c72f7a2f92f4d704bc6b2927 Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Fri, 13 Aug 2021 20:08:02 +0800 Subject: net: Remove net/ipx.h and uapi/linux/ipx.h header files commit <47595e32869f> ("") indicated the ipx network layer as obsolete in Jan 2018, updated in the MAINTAINERS file now, after being exposed for 3 years to refactoring, so to delete uapi/linux/ipx.h and net/ipx.h header files for good. additionally, there is no module that depends on ipx.h except a broken staging driver(r8188eu) Signed-off-by: Cai Huoqing Signed-off-by: David S. Miller --- include/net/ipx.h | 171 ----------------------------------------------- include/uapi/linux/ipx.h | 87 ------------------------ 2 files changed, 258 deletions(-) delete mode 100644 include/net/ipx.h delete mode 100644 include/uapi/linux/ipx.h (limited to 'include/uapi') diff --git a/include/net/ipx.h b/include/net/ipx.h deleted file mode 100644 index 9d1342807b59..000000000000 --- a/include/net/ipx.h +++ /dev/null @@ -1,171 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NET_INET_IPX_H_ -#define _NET_INET_IPX_H_ -/* - * The following information is in its entirety obtained from: - * - * Novell 'IPX Router Specification' Version 1.10 - * Part No. 
107-000029-001 - * - * Which is available from ftp.novell.com - */ - -#include -#include -#include -#include -#include -#include - -struct ipx_address { - __be32 net; - __u8 node[IPX_NODE_LEN]; - __be16 sock; -}; - -#define ipx_broadcast_node "\377\377\377\377\377\377" -#define ipx_this_node "\0\0\0\0\0\0" - -#define IPX_MAX_PPROP_HOPS 8 - -struct ipxhdr { - __be16 ipx_checksum __packed; -#define IPX_NO_CHECKSUM cpu_to_be16(0xFFFF) - __be16 ipx_pktsize __packed; - __u8 ipx_tctrl; - __u8 ipx_type; -#define IPX_TYPE_UNKNOWN 0x00 -#define IPX_TYPE_RIP 0x01 /* may also be 0 */ -#define IPX_TYPE_SAP 0x04 /* may also be 0 */ -#define IPX_TYPE_SPX 0x05 /* SPX protocol */ -#define IPX_TYPE_NCP 0x11 /* $lots for docs on this (SPIT) */ -#define IPX_TYPE_PPROP 0x14 /* complicated flood fill brdcast */ - struct ipx_address ipx_dest __packed; - struct ipx_address ipx_source __packed; -}; - -/* From af_ipx.c */ -extern int sysctl_ipx_pprop_broadcasting; - -struct ipx_interface { - /* IPX address */ - __be32 if_netnum; - unsigned char if_node[IPX_NODE_LEN]; - refcount_t refcnt; - - /* physical device info */ - struct net_device *if_dev; - struct datalink_proto *if_dlink; - __be16 if_dlink_type; - - /* socket support */ - unsigned short if_sknum; - struct hlist_head if_sklist; - spinlock_t if_sklist_lock; - - /* administrative overhead */ - int if_ipx_offset; - unsigned char if_internal; - unsigned char if_primary; - - struct list_head node; /* node in ipx_interfaces list */ -}; - -struct ipx_route { - __be32 ir_net; - struct ipx_interface *ir_intrfc; - unsigned char ir_routed; - unsigned char ir_router_node[IPX_NODE_LEN]; - struct list_head node; /* node in ipx_routes list */ - refcount_t refcnt; -}; - -struct ipx_cb { - u8 ipx_tctrl; - __be32 ipx_dest_net; - __be32 ipx_source_net; - struct { - __be32 netnum; - int index; - } last_hop; -}; - -#include - -struct ipx_sock { - /* struct sock has to be the first member of ipx_sock */ - struct sock sk; - struct ipx_address dest_addr; - struct ipx_interface *intrfc; - __be16 port; -#ifdef CONFIG_IPX_INTERN - unsigned char node[IPX_NODE_LEN]; -#endif - unsigned short type; - /* - * To handle special ncp connection-handling sockets for mars_nwe, - * the connection number must be stored in the socket. 
- */ - unsigned short ipx_ncp_conn; -}; - -static inline struct ipx_sock *ipx_sk(struct sock *sk) -{ - return (struct ipx_sock *)sk; -} - -#define IPX_SKB_CB(__skb) ((struct ipx_cb *)&((__skb)->cb[0])) - -#define IPX_MIN_EPHEMERAL_SOCKET 0x4000 -#define IPX_MAX_EPHEMERAL_SOCKET 0x7fff - -extern struct list_head ipx_routes; -extern rwlock_t ipx_routes_lock; - -extern struct list_head ipx_interfaces; -struct ipx_interface *ipx_interfaces_head(void); -extern spinlock_t ipx_interfaces_lock; - -extern struct ipx_interface *ipx_primary_net; - -int ipx_proc_init(void); -void ipx_proc_exit(void); - -const char *ipx_frame_name(__be16); -const char *ipx_device_name(struct ipx_interface *intrfc); - -static __inline__ void ipxitf_hold(struct ipx_interface *intrfc) -{ - refcount_inc(&intrfc->refcnt); -} - -void ipxitf_down(struct ipx_interface *intrfc); -struct ipx_interface *ipxitf_find_using_net(__be32 net); -int ipxitf_send(struct ipx_interface *intrfc, struct sk_buff *skb, char *node); -__be16 ipx_cksum(struct ipxhdr *packet, int length); -int ipxrtr_add_route(__be32 network, struct ipx_interface *intrfc, - unsigned char *node); -void ipxrtr_del_routes(struct ipx_interface *intrfc); -int ipxrtr_route_packet(struct sock *sk, struct sockaddr_ipx *usipx, - struct msghdr *msg, size_t len, int noblock); -int ipxrtr_route_skb(struct sk_buff *skb); -struct ipx_route *ipxrtr_lookup(__be32 net); -int ipxrtr_ioctl(unsigned int cmd, void __user *arg); - -static __inline__ void ipxitf_put(struct ipx_interface *intrfc) -{ - if (refcount_dec_and_test(&intrfc->refcnt)) - ipxitf_down(intrfc); -} - -static __inline__ void ipxrtr_hold(struct ipx_route *rt) -{ - refcount_inc(&rt->refcnt); -} - -static __inline__ void ipxrtr_put(struct ipx_route *rt) -{ - if (refcount_dec_and_test(&rt->refcnt)) - kfree(rt); -} -#endif /* _NET_INET_IPX_H_ */ diff --git a/include/uapi/linux/ipx.h b/include/uapi/linux/ipx.h deleted file mode 100644 index 3168137adae8..000000000000 --- a/include/uapi/linux/ipx.h +++ /dev/null @@ -1,87 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _IPX_H_ -#define _IPX_H_ -#include /* for compatibility with glibc netipx/ipx.h */ -#include -#include -#include -#define IPX_NODE_LEN 6 -#define IPX_MTU 576 - -#if __UAPI_DEF_SOCKADDR_IPX -struct sockaddr_ipx { - __kernel_sa_family_t sipx_family; - __be16 sipx_port; - __be32 sipx_network; - unsigned char sipx_node[IPX_NODE_LEN]; - __u8 sipx_type; - unsigned char sipx_zero; /* 16 byte fill */ -}; -#endif /* __UAPI_DEF_SOCKADDR_IPX */ - -/* - * So we can fit the extra info for SIOCSIFADDR into the address nicely - */ -#define sipx_special sipx_port -#define sipx_action sipx_zero -#define IPX_DLTITF 0 -#define IPX_CRTITF 1 - -#if __UAPI_DEF_IPX_ROUTE_DEFINITION -struct ipx_route_definition { - __be32 ipx_network; - __be32 ipx_router_network; - unsigned char ipx_router_node[IPX_NODE_LEN]; -}; -#endif /* __UAPI_DEF_IPX_ROUTE_DEFINITION */ - -#if __UAPI_DEF_IPX_INTERFACE_DEFINITION -struct ipx_interface_definition { - __be32 ipx_network; - unsigned char ipx_device[16]; - unsigned char ipx_dlink_type; -#define IPX_FRAME_NONE 0 -#define IPX_FRAME_SNAP 1 -#define IPX_FRAME_8022 2 -#define IPX_FRAME_ETHERII 3 -#define IPX_FRAME_8023 4 -#define IPX_FRAME_TR_8022 5 /* obsolete */ - unsigned char ipx_special; -#define IPX_SPECIAL_NONE 0 -#define IPX_PRIMARY 1 -#define IPX_INTERNAL 2 - unsigned char ipx_node[IPX_NODE_LEN]; -}; -#endif /* __UAPI_DEF_IPX_INTERFACE_DEFINITION */ - -#if __UAPI_DEF_IPX_CONFIG_DATA -struct ipx_config_data { - 
unsigned char ipxcfg_auto_select_primary; - unsigned char ipxcfg_auto_create_interfaces; -}; -#endif /* __UAPI_DEF_IPX_CONFIG_DATA */ - -/* - * OLD Route Definition for backward compatibility. - */ - -#if __UAPI_DEF_IPX_ROUTE_DEF -struct ipx_route_def { - __be32 ipx_network; - __be32 ipx_router_network; -#define IPX_ROUTE_NO_ROUTER 0 - unsigned char ipx_router_node[IPX_NODE_LEN]; - unsigned char ipx_device[16]; - unsigned short ipx_flags; -#define IPX_RT_SNAP 8 -#define IPX_RT_8022 4 -#define IPX_RT_BLUEBOOK 2 -#define IPX_RT_ROUTED 1 -}; -#endif /* __UAPI_DEF_IPX_ROUTE_DEF */ - -#define SIOCAIPXITFCRT (SIOCPROTOPRIVATE) -#define SIOCAIPXPRISLT (SIOCPROTOPRIVATE + 1) -#define SIOCIPXCFGDATA (SIOCPROTOPRIVATE + 2) -#define SIOCIPXNCPCONN (SIOCPROTOPRIVATE + 3) -#endif /* _IPX_H_ */ -- cgit v1.2.3-71-gd317 From 9ea9b9c48387edc101d56349492ad9c0492ff78d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 12 Aug 2021 15:23:08 +0200 Subject: remove the lightnvm subsystem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lightnvm supports the OCSSD 1.x and 2.0 specs which were early attempts to produce Open Channel SSDs and never made it into the NVMe spec proper. They have since been superceeded by NVMe enhancements such as ZNS support. Remove the support per the deprecation schedule. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210812132308.38486-1-hch@lst.de Reviewed-by: Matias Bjørling Reviewed-by: Javier González Signed-off-by: Jens Axboe --- Documentation/driver-api/index.rst | 1 - Documentation/driver-api/lightnvm-pblk.rst | 21 - Documentation/userspace-api/ioctl/ioctl-number.rst | 1 - MAINTAINERS | 9 - drivers/Kconfig | 2 - drivers/Makefile | 1 - drivers/lightnvm/Kconfig | 44 - drivers/lightnvm/Makefile | 11 - drivers/lightnvm/core.c | 1440 ------------- drivers/lightnvm/pblk-cache.c | 137 -- drivers/lightnvm/pblk-core.c | 2151 -------------------- drivers/lightnvm/pblk-gc.c | 726 ------- drivers/lightnvm/pblk-init.c | 1324 ------------ drivers/lightnvm/pblk-map.c | 210 -- drivers/lightnvm/pblk-rb.c | 858 -------- drivers/lightnvm/pblk-read.c | 474 ----- drivers/lightnvm/pblk-recovery.c | 874 -------- drivers/lightnvm/pblk-rl.c | 254 --- drivers/lightnvm/pblk-sysfs.c | 728 ------- drivers/lightnvm/pblk-trace.h | 145 -- drivers/lightnvm/pblk-write.c | 665 ------ drivers/lightnvm/pblk.h | 1358 ------------ drivers/nvme/host/Makefile | 1 - drivers/nvme/host/core.c | 13 - drivers/nvme/host/ioctl.c | 4 +- drivers/nvme/host/lightnvm.c | 1274 ------------ drivers/nvme/host/nvme.h | 26 - drivers/nvme/host/pci.c | 6 - include/linux/lightnvm.h | 697 ------- include/uapi/linux/lightnvm.h | 224 -- 30 files changed, 1 insertion(+), 13678 deletions(-) delete mode 100644 Documentation/driver-api/lightnvm-pblk.rst delete mode 100644 drivers/lightnvm/Kconfig delete mode 100644 drivers/lightnvm/Makefile delete mode 100644 drivers/lightnvm/core.c delete mode 100644 drivers/lightnvm/pblk-cache.c delete mode 100644 drivers/lightnvm/pblk-core.c delete mode 100644 drivers/lightnvm/pblk-gc.c delete mode 100644 drivers/lightnvm/pblk-init.c delete mode 100644 drivers/lightnvm/pblk-map.c delete mode 100644 drivers/lightnvm/pblk-rb.c delete mode 100644 drivers/lightnvm/pblk-read.c delete mode 100644 drivers/lightnvm/pblk-recovery.c delete mode 100644 drivers/lightnvm/pblk-rl.c delete mode 100644 drivers/lightnvm/pblk-sysfs.c delete mode 100644 drivers/lightnvm/pblk-trace.h delete mode 100644 drivers/lightnvm/pblk-write.c delete mode 100644 
drivers/lightnvm/pblk.h delete mode 100644 drivers/nvme/host/lightnvm.c delete mode 100644 include/linux/lightnvm.h delete mode 100644 include/uapi/linux/lightnvm.h (limited to 'include/uapi') diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst index f5a3207aa7fa..c57c609ad2eb 100644 --- a/Documentation/driver-api/index.rst +++ b/Documentation/driver-api/index.rst @@ -85,7 +85,6 @@ available subsections can be seen below. io-mapping io_ordering generic-counter - lightnvm-pblk memory-devices/index men-chameleon-bus ntb diff --git a/Documentation/driver-api/lightnvm-pblk.rst b/Documentation/driver-api/lightnvm-pblk.rst deleted file mode 100644 index 1040ed1cec81..000000000000 --- a/Documentation/driver-api/lightnvm-pblk.rst +++ /dev/null @@ -1,21 +0,0 @@ -pblk: Physical Block Device Target -================================== - -pblk implements a fully associative, host-based FTL that exposes a traditional -block I/O interface. Its primary responsibilities are: - - - Map logical addresses onto physical addresses (4KB granularity) in a - logical-to-physical (L2P) table. - - Maintain the integrity and consistency of the L2P table as well as its - recovery from normal tear down and power outage. - - Deal with controller- and media-specific constrains. - - Handle I/O errors. - - Implement garbage collection. - - Maintain consistency across the I/O stack during synchronization points. - -For more information please refer to: - - http://lightnvm.io - -which maintains updated FAQs, manual pages, technical documentation, tools, -contacts, etc. diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index 1409e40e6345..b7070d76f076 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -160,7 +160,6 @@ Code Seq# Include File Comments 'K' all linux/kd.h 'L' 00-1F linux/loop.h conflict! 'L' 10-1F drivers/scsi/mpt3sas/mpt3sas_ctl.h conflict! -'L' 20-2F linux/lightnvm.h 'L' E0-FF linux/ppdd.h encrypted disk device driver 'M' all linux/soundcard.h conflict! diff --git a/MAINTAINERS b/MAINTAINERS index c9467d2839f5..ec3f59a16c9a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10609,15 +10609,6 @@ F: LICENSES/ F: scripts/spdxcheck-test.sh F: scripts/spdxcheck.py -LIGHTNVM PLATFORM SUPPORT -M: Matias Bjorling -L: linux-block@vger.kernel.org -S: Maintained -W: http://github/OpenChannelSSD -F: drivers/lightnvm/ -F: include/linux/lightnvm.h -F: include/uapi/linux/lightnvm.h - LINEAR RANGES HELPERS M: Mark Brown R: Matti Vaittinen diff --git a/drivers/Kconfig b/drivers/Kconfig index 8bad63417a50..30d2db37cc87 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -51,8 +51,6 @@ source "drivers/net/Kconfig" source "drivers/isdn/Kconfig" -source "drivers/lightnvm/Kconfig" - # input before char - char/joystick depends on it. As does USB. 
source "drivers/input/Kconfig" diff --git a/drivers/Makefile b/drivers/Makefile index 27c018bdf4de..be5d40ae1488 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -70,7 +70,6 @@ obj-$(CONFIG_FB_I810) += video/fbdev/i810/ obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ obj-$(CONFIG_PARPORT) += parport/ -obj-$(CONFIG_NVM) += lightnvm/ obj-y += base/ block/ misc/ mfd/ nfc/ obj-$(CONFIG_LIBNVDIMM) += nvdimm/ obj-$(CONFIG_DAX) += dax/ diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig deleted file mode 100644 index 04caa0f2d445..000000000000 --- a/drivers/lightnvm/Kconfig +++ /dev/null @@ -1,44 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# -# Open-Channel SSD NVM configuration -# - -menuconfig NVM - bool "Open-Channel SSD target support (DEPRECATED)" - depends on BLOCK - help - Say Y here to get to enable Open-channel SSDs. - - Open-Channel SSDs implement a set of extension to SSDs, that - exposes direct access to the underlying non-volatile memory. - - If you say N, all options in this submenu will be skipped and disabled - only do this if you know what you are doing. - - This code is deprecated and will be removed in Linux 5.15. - -if NVM - -config NVM_PBLK - tristate "Physical Block Device Open-Channel SSD target" - select CRC32 - help - Allows an open-channel SSD to be exposed as a block device to the - host. The target assumes the device exposes raw flash and must be - explicitly managed by the host. - - Please note the disk format is considered EXPERIMENTAL for now. - -if NVM_PBLK - -config NVM_PBLK_DEBUG - bool "PBlk Debug Support" - default n - help - Enables debug support for pblk. This includes extra checks, more - vocal error messages, and extra tracking fields in the pblk sysfs - entries. - -endif # NVM_PBLK_DEBUG - -endif # NVM diff --git a/drivers/lightnvm/Makefile b/drivers/lightnvm/Makefile deleted file mode 100644 index 97d9d7c71550..000000000000 --- a/drivers/lightnvm/Makefile +++ /dev/null @@ -1,11 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# Makefile for Open-Channel SSDs. -# - -obj-$(CONFIG_NVM) := core.o -obj-$(CONFIG_NVM_PBLK) += pblk.o -pblk-y := pblk-init.o pblk-core.o pblk-rb.o \ - pblk-write.o pblk-cache.o pblk-read.o \ - pblk-gc.o pblk-recovery.o pblk-map.o \ - pblk-rl.o pblk-sysfs.o diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c deleted file mode 100644 index cf8a75494833..000000000000 --- a/drivers/lightnvm/core.c +++ /dev/null @@ -1,1440 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2015 IT University of Copenhagen. All rights reserved. 
- * Initial release: Matias Bjorling - */ - -#define pr_fmt(fmt) "nvm: " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static LIST_HEAD(nvm_tgt_types); -static DECLARE_RWSEM(nvm_tgtt_lock); -static LIST_HEAD(nvm_devices); -static DECLARE_RWSEM(nvm_lock); - -/* Map between virtual and physical channel and lun */ -struct nvm_ch_map { - int ch_off; - int num_lun; - int *lun_offs; -}; - -struct nvm_dev_map { - struct nvm_ch_map *chnls; - int num_ch; -}; - -static void nvm_free(struct kref *ref); - -static struct nvm_target *nvm_find_target(struct nvm_dev *dev, const char *name) -{ - struct nvm_target *tgt; - - list_for_each_entry(tgt, &dev->targets, list) - if (!strcmp(name, tgt->disk->disk_name)) - return tgt; - - return NULL; -} - -static bool nvm_target_exists(const char *name) -{ - struct nvm_dev *dev; - struct nvm_target *tgt; - bool ret = false; - - down_write(&nvm_lock); - list_for_each_entry(dev, &nvm_devices, devices) { - mutex_lock(&dev->mlock); - list_for_each_entry(tgt, &dev->targets, list) { - if (!strcmp(name, tgt->disk->disk_name)) { - ret = true; - mutex_unlock(&dev->mlock); - goto out; - } - } - mutex_unlock(&dev->mlock); - } - -out: - up_write(&nvm_lock); - return ret; -} - -static int nvm_reserve_luns(struct nvm_dev *dev, int lun_begin, int lun_end) -{ - int i; - - for (i = lun_begin; i <= lun_end; i++) { - if (test_and_set_bit(i, dev->lun_map)) { - pr_err("lun %d already allocated\n", i); - goto err; - } - } - - return 0; -err: - while (--i >= lun_begin) - clear_bit(i, dev->lun_map); - - return -EBUSY; -} - -static void nvm_release_luns_err(struct nvm_dev *dev, int lun_begin, - int lun_end) -{ - int i; - - for (i = lun_begin; i <= lun_end; i++) - WARN_ON(!test_and_clear_bit(i, dev->lun_map)); -} - -static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev, int clear) -{ - struct nvm_dev *dev = tgt_dev->parent; - struct nvm_dev_map *dev_map = tgt_dev->map; - int i, j; - - for (i = 0; i < dev_map->num_ch; i++) { - struct nvm_ch_map *ch_map = &dev_map->chnls[i]; - int *lun_offs = ch_map->lun_offs; - int ch = i + ch_map->ch_off; - - if (clear) { - for (j = 0; j < ch_map->num_lun; j++) { - int lun = j + lun_offs[j]; - int lunid = (ch * dev->geo.num_lun) + lun; - - WARN_ON(!test_and_clear_bit(lunid, - dev->lun_map)); - } - } - - kfree(ch_map->lun_offs); - } - - kfree(dev_map->chnls); - kfree(dev_map); - - kfree(tgt_dev->luns); - kfree(tgt_dev); -} - -static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev, - u16 lun_begin, u16 lun_end, - u16 op) -{ - struct nvm_tgt_dev *tgt_dev = NULL; - struct nvm_dev_map *dev_rmap = dev->rmap; - struct nvm_dev_map *dev_map; - struct ppa_addr *luns; - int num_lun = lun_end - lun_begin + 1; - int luns_left = num_lun; - int num_ch = num_lun / dev->geo.num_lun; - int num_ch_mod = num_lun % dev->geo.num_lun; - int bch = lun_begin / dev->geo.num_lun; - int blun = lun_begin % dev->geo.num_lun; - int lunid = 0; - int lun_balanced = 1; - int sec_per_lun, prev_num_lun; - int i, j; - - num_ch = (num_ch_mod == 0) ? num_ch : num_ch + 1; - - dev_map = kmalloc(sizeof(struct nvm_dev_map), GFP_KERNEL); - if (!dev_map) - goto err_dev; - - dev_map->chnls = kcalloc(num_ch, sizeof(struct nvm_ch_map), GFP_KERNEL); - if (!dev_map->chnls) - goto err_chnls; - - luns = kcalloc(num_lun, sizeof(struct ppa_addr), GFP_KERNEL); - if (!luns) - goto err_luns; - - prev_num_lun = (luns_left > dev->geo.num_lun) ? 
- dev->geo.num_lun : luns_left; - for (i = 0; i < num_ch; i++) { - struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[i + bch]; - int *lun_roffs = ch_rmap->lun_offs; - struct nvm_ch_map *ch_map = &dev_map->chnls[i]; - int *lun_offs; - int luns_in_chnl = (luns_left > dev->geo.num_lun) ? - dev->geo.num_lun : luns_left; - - if (lun_balanced && prev_num_lun != luns_in_chnl) - lun_balanced = 0; - - ch_map->ch_off = ch_rmap->ch_off = bch; - ch_map->num_lun = luns_in_chnl; - - lun_offs = kcalloc(luns_in_chnl, sizeof(int), GFP_KERNEL); - if (!lun_offs) - goto err_ch; - - for (j = 0; j < luns_in_chnl; j++) { - luns[lunid].ppa = 0; - luns[lunid].a.ch = i; - luns[lunid++].a.lun = j; - - lun_offs[j] = blun; - lun_roffs[j + blun] = blun; - } - - ch_map->lun_offs = lun_offs; - - /* when starting a new channel, lun offset is reset */ - blun = 0; - luns_left -= luns_in_chnl; - } - - dev_map->num_ch = num_ch; - - tgt_dev = kmalloc(sizeof(struct nvm_tgt_dev), GFP_KERNEL); - if (!tgt_dev) - goto err_ch; - - /* Inherit device geometry from parent */ - memcpy(&tgt_dev->geo, &dev->geo, sizeof(struct nvm_geo)); - - /* Target device only owns a portion of the physical device */ - tgt_dev->geo.num_ch = num_ch; - tgt_dev->geo.num_lun = (lun_balanced) ? prev_num_lun : -1; - tgt_dev->geo.all_luns = num_lun; - tgt_dev->geo.all_chunks = num_lun * dev->geo.num_chk; - - tgt_dev->geo.op = op; - - sec_per_lun = dev->geo.clba * dev->geo.num_chk; - tgt_dev->geo.total_secs = num_lun * sec_per_lun; - - tgt_dev->q = dev->q; - tgt_dev->map = dev_map; - tgt_dev->luns = luns; - tgt_dev->parent = dev; - - return tgt_dev; -err_ch: - while (--i >= 0) - kfree(dev_map->chnls[i].lun_offs); - kfree(luns); -err_luns: - kfree(dev_map->chnls); -err_chnls: - kfree(dev_map); -err_dev: - return tgt_dev; -} - -static struct nvm_tgt_type *__nvm_find_target_type(const char *name) -{ - struct nvm_tgt_type *tt; - - list_for_each_entry(tt, &nvm_tgt_types, list) - if (!strcmp(name, tt->name)) - return tt; - - return NULL; -} - -static struct nvm_tgt_type *nvm_find_target_type(const char *name) -{ - struct nvm_tgt_type *tt; - - down_write(&nvm_tgtt_lock); - tt = __nvm_find_target_type(name); - up_write(&nvm_tgtt_lock); - - return tt; -} - -static int nvm_config_check_luns(struct nvm_geo *geo, int lun_begin, - int lun_end) -{ - if (lun_begin > lun_end || lun_end >= geo->all_luns) { - pr_err("lun out of bound (%u:%u > %u)\n", - lun_begin, lun_end, geo->all_luns - 1); - return -EINVAL; - } - - return 0; -} - -static int __nvm_config_simple(struct nvm_dev *dev, - struct nvm_ioctl_create_simple *s) -{ - struct nvm_geo *geo = &dev->geo; - - if (s->lun_begin == -1 && s->lun_end == -1) { - s->lun_begin = 0; - s->lun_end = geo->all_luns - 1; - } - - return nvm_config_check_luns(geo, s->lun_begin, s->lun_end); -} - -static int __nvm_config_extended(struct nvm_dev *dev, - struct nvm_ioctl_create_extended *e) -{ - if (e->lun_begin == 0xFFFF && e->lun_end == 0xFFFF) { - e->lun_begin = 0; - e->lun_end = dev->geo.all_luns - 1; - } - - /* op not set falls into target's default */ - if (e->op == 0xFFFF) { - e->op = NVM_TARGET_DEFAULT_OP; - } else if (e->op < NVM_TARGET_MIN_OP || e->op > NVM_TARGET_MAX_OP) { - pr_err("invalid over provisioning value\n"); - return -EINVAL; - } - - return nvm_config_check_luns(&dev->geo, e->lun_begin, e->lun_end); -} - -static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) -{ - struct nvm_ioctl_create_extended e; - struct gendisk *tdisk; - struct nvm_tgt_type *tt; - struct nvm_target *t; - struct nvm_tgt_dev 
*tgt_dev; - void *targetdata; - unsigned int mdts; - int ret; - - switch (create->conf.type) { - case NVM_CONFIG_TYPE_SIMPLE: - ret = __nvm_config_simple(dev, &create->conf.s); - if (ret) - return ret; - - e.lun_begin = create->conf.s.lun_begin; - e.lun_end = create->conf.s.lun_end; - e.op = NVM_TARGET_DEFAULT_OP; - break; - case NVM_CONFIG_TYPE_EXTENDED: - ret = __nvm_config_extended(dev, &create->conf.e); - if (ret) - return ret; - - e = create->conf.e; - break; - default: - pr_err("config type not valid\n"); - return -EINVAL; - } - - tt = nvm_find_target_type(create->tgttype); - if (!tt) { - pr_err("target type %s not found\n", create->tgttype); - return -EINVAL; - } - - if ((tt->flags & NVM_TGT_F_HOST_L2P) != (dev->geo.dom & NVM_RSP_L2P)) { - pr_err("device is incompatible with target L2P type.\n"); - return -EINVAL; - } - - if (nvm_target_exists(create->tgtname)) { - pr_err("target name already exists (%s)\n", - create->tgtname); - return -EINVAL; - } - - ret = nvm_reserve_luns(dev, e.lun_begin, e.lun_end); - if (ret) - return ret; - - t = kmalloc(sizeof(struct nvm_target), GFP_KERNEL); - if (!t) { - ret = -ENOMEM; - goto err_reserve; - } - - tgt_dev = nvm_create_tgt_dev(dev, e.lun_begin, e.lun_end, e.op); - if (!tgt_dev) { - pr_err("could not create target device\n"); - ret = -ENOMEM; - goto err_t; - } - - tdisk = blk_alloc_disk(dev->q->node); - if (!tdisk) { - ret = -ENOMEM; - goto err_dev; - } - - strlcpy(tdisk->disk_name, create->tgtname, sizeof(tdisk->disk_name)); - tdisk->major = 0; - tdisk->first_minor = 0; - tdisk->fops = tt->bops; - - targetdata = tt->init(tgt_dev, tdisk, create->flags); - if (IS_ERR(targetdata)) { - ret = PTR_ERR(targetdata); - goto err_init; - } - - tdisk->private_data = targetdata; - tdisk->queue->queuedata = targetdata; - - mdts = (dev->geo.csecs >> 9) * NVM_MAX_VLBA; - if (dev->geo.mdts) { - mdts = min_t(u32, dev->geo.mdts, - (dev->geo.csecs >> 9) * NVM_MAX_VLBA); - } - blk_queue_max_hw_sectors(tdisk->queue, mdts); - - set_capacity(tdisk, tt->capacity(targetdata)); - add_disk(tdisk); - - if (tt->sysfs_init && tt->sysfs_init(tdisk)) { - ret = -ENOMEM; - goto err_sysfs; - } - - t->type = tt; - t->disk = tdisk; - t->dev = tgt_dev; - - mutex_lock(&dev->mlock); - list_add_tail(&t->list, &dev->targets); - mutex_unlock(&dev->mlock); - - __module_get(tt->owner); - - return 0; -err_sysfs: - if (tt->exit) - tt->exit(targetdata, true); -err_init: - blk_cleanup_disk(tdisk); -err_dev: - nvm_remove_tgt_dev(tgt_dev, 0); -err_t: - kfree(t); -err_reserve: - nvm_release_luns_err(dev, e.lun_begin, e.lun_end); - return ret; -} - -static void __nvm_remove_target(struct nvm_target *t, bool graceful) -{ - struct nvm_tgt_type *tt = t->type; - struct gendisk *tdisk = t->disk; - - del_gendisk(tdisk); - - if (tt->sysfs_exit) - tt->sysfs_exit(tdisk); - - if (tt->exit) - tt->exit(tdisk->private_data, graceful); - - nvm_remove_tgt_dev(t->dev, 1); - blk_cleanup_disk(tdisk); - module_put(t->type->owner); - - list_del(&t->list); - kfree(t); -} - -/** - * nvm_remove_tgt - Removes a target from the media manager - * @remove: ioctl structure with target name to remove. 
- * - * Returns: - * 0: on success - * 1: on not found - * <0: on error - */ -static int nvm_remove_tgt(struct nvm_ioctl_remove *remove) -{ - struct nvm_target *t = NULL; - struct nvm_dev *dev; - - down_read(&nvm_lock); - list_for_each_entry(dev, &nvm_devices, devices) { - mutex_lock(&dev->mlock); - t = nvm_find_target(dev, remove->tgtname); - if (t) { - mutex_unlock(&dev->mlock); - break; - } - mutex_unlock(&dev->mlock); - } - up_read(&nvm_lock); - - if (!t) { - pr_err("failed to remove target %s\n", - remove->tgtname); - return 1; - } - - __nvm_remove_target(t, true); - kref_put(&dev->ref, nvm_free); - - return 0; -} - -static int nvm_register_map(struct nvm_dev *dev) -{ - struct nvm_dev_map *rmap; - int i, j; - - rmap = kmalloc(sizeof(struct nvm_dev_map), GFP_KERNEL); - if (!rmap) - goto err_rmap; - - rmap->chnls = kcalloc(dev->geo.num_ch, sizeof(struct nvm_ch_map), - GFP_KERNEL); - if (!rmap->chnls) - goto err_chnls; - - for (i = 0; i < dev->geo.num_ch; i++) { - struct nvm_ch_map *ch_rmap; - int *lun_roffs; - int luns_in_chnl = dev->geo.num_lun; - - ch_rmap = &rmap->chnls[i]; - - ch_rmap->ch_off = -1; - ch_rmap->num_lun = luns_in_chnl; - - lun_roffs = kcalloc(luns_in_chnl, sizeof(int), GFP_KERNEL); - if (!lun_roffs) - goto err_ch; - - for (j = 0; j < luns_in_chnl; j++) - lun_roffs[j] = -1; - - ch_rmap->lun_offs = lun_roffs; - } - - dev->rmap = rmap; - - return 0; -err_ch: - while (--i >= 0) - kfree(rmap->chnls[i].lun_offs); -err_chnls: - kfree(rmap); -err_rmap: - return -ENOMEM; -} - -static void nvm_unregister_map(struct nvm_dev *dev) -{ - struct nvm_dev_map *rmap = dev->rmap; - int i; - - for (i = 0; i < dev->geo.num_ch; i++) - kfree(rmap->chnls[i].lun_offs); - - kfree(rmap->chnls); - kfree(rmap); -} - -static void nvm_map_to_dev(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p) -{ - struct nvm_dev_map *dev_map = tgt_dev->map; - struct nvm_ch_map *ch_map = &dev_map->chnls[p->a.ch]; - int lun_off = ch_map->lun_offs[p->a.lun]; - - p->a.ch += ch_map->ch_off; - p->a.lun += lun_off; -} - -static void nvm_map_to_tgt(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p) -{ - struct nvm_dev *dev = tgt_dev->parent; - struct nvm_dev_map *dev_rmap = dev->rmap; - struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[p->a.ch]; - int lun_roff = ch_rmap->lun_offs[p->a.lun]; - - p->a.ch -= ch_rmap->ch_off; - p->a.lun -= lun_roff; -} - -static void nvm_ppa_tgt_to_dev(struct nvm_tgt_dev *tgt_dev, - struct ppa_addr *ppa_list, int nr_ppas) -{ - int i; - - for (i = 0; i < nr_ppas; i++) { - nvm_map_to_dev(tgt_dev, &ppa_list[i]); - ppa_list[i] = generic_to_dev_addr(tgt_dev->parent, ppa_list[i]); - } -} - -static void nvm_ppa_dev_to_tgt(struct nvm_tgt_dev *tgt_dev, - struct ppa_addr *ppa_list, int nr_ppas) -{ - int i; - - for (i = 0; i < nr_ppas; i++) { - ppa_list[i] = dev_to_generic_addr(tgt_dev->parent, ppa_list[i]); - nvm_map_to_tgt(tgt_dev, &ppa_list[i]); - } -} - -static void nvm_rq_tgt_to_dev(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) -{ - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - - nvm_ppa_tgt_to_dev(tgt_dev, ppa_list, rqd->nr_ppas); -} - -static void nvm_rq_dev_to_tgt(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) -{ - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - - nvm_ppa_dev_to_tgt(tgt_dev, ppa_list, rqd->nr_ppas); -} - -int nvm_register_tgt_type(struct nvm_tgt_type *tt) -{ - int ret = 0; - - down_write(&nvm_tgtt_lock); - if (__nvm_find_target_type(tt->name)) - ret = -EEXIST; - else - list_add(&tt->list, &nvm_tgt_types); - up_write(&nvm_tgtt_lock); - - return ret; -} 
-EXPORT_SYMBOL(nvm_register_tgt_type); - -void nvm_unregister_tgt_type(struct nvm_tgt_type *tt) -{ - if (!tt) - return; - - down_write(&nvm_tgtt_lock); - list_del(&tt->list); - up_write(&nvm_tgtt_lock); -} -EXPORT_SYMBOL(nvm_unregister_tgt_type); - -void *nvm_dev_dma_alloc(struct nvm_dev *dev, gfp_t mem_flags, - dma_addr_t *dma_handler) -{ - return dev->ops->dev_dma_alloc(dev, dev->dma_pool, mem_flags, - dma_handler); -} -EXPORT_SYMBOL(nvm_dev_dma_alloc); - -void nvm_dev_dma_free(struct nvm_dev *dev, void *addr, dma_addr_t dma_handler) -{ - dev->ops->dev_dma_free(dev->dma_pool, addr, dma_handler); -} -EXPORT_SYMBOL(nvm_dev_dma_free); - -static struct nvm_dev *nvm_find_nvm_dev(const char *name) -{ - struct nvm_dev *dev; - - list_for_each_entry(dev, &nvm_devices, devices) - if (!strcmp(name, dev->name)) - return dev; - - return NULL; -} - -static int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, - const struct ppa_addr *ppas, int nr_ppas) -{ - struct nvm_dev *dev = tgt_dev->parent; - struct nvm_geo *geo = &tgt_dev->geo; - int i, plane_cnt, pl_idx; - struct ppa_addr ppa; - - if (geo->pln_mode == NVM_PLANE_SINGLE && nr_ppas == 1) { - rqd->nr_ppas = nr_ppas; - rqd->ppa_addr = ppas[0]; - - return 0; - } - - rqd->nr_ppas = nr_ppas; - rqd->ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL, &rqd->dma_ppa_list); - if (!rqd->ppa_list) { - pr_err("failed to allocate dma memory\n"); - return -ENOMEM; - } - - plane_cnt = geo->pln_mode; - rqd->nr_ppas *= plane_cnt; - - for (i = 0; i < nr_ppas; i++) { - for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) { - ppa = ppas[i]; - ppa.g.pl = pl_idx; - rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppa; - } - } - - return 0; -} - -static void nvm_free_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, - struct nvm_rq *rqd) -{ - if (!rqd->ppa_list) - return; - - nvm_dev_dma_free(tgt_dev->parent, rqd->ppa_list, rqd->dma_ppa_list); -} - -static int nvm_set_flags(struct nvm_geo *geo, struct nvm_rq *rqd) -{ - int flags = 0; - - if (geo->version == NVM_OCSSD_SPEC_20) - return 0; - - if (rqd->is_seq) - flags |= geo->pln_mode >> 1; - - if (rqd->opcode == NVM_OP_PREAD) - flags |= (NVM_IO_SCRAMBLE_ENABLE | NVM_IO_SUSPEND); - else if (rqd->opcode == NVM_OP_PWRITE) - flags |= NVM_IO_SCRAMBLE_ENABLE; - - return flags; -} - -int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, void *buf) -{ - struct nvm_dev *dev = tgt_dev->parent; - int ret; - - if (!dev->ops->submit_io) - return -ENODEV; - - nvm_rq_tgt_to_dev(tgt_dev, rqd); - - rqd->dev = tgt_dev; - rqd->flags = nvm_set_flags(&tgt_dev->geo, rqd); - - /* In case of error, fail with right address format */ - ret = dev->ops->submit_io(dev, rqd, buf); - if (ret) - nvm_rq_dev_to_tgt(tgt_dev, rqd); - return ret; -} -EXPORT_SYMBOL(nvm_submit_io); - -static void nvm_sync_end_io(struct nvm_rq *rqd) -{ - struct completion *waiting = rqd->private; - - complete(waiting); -} - -static int nvm_submit_io_wait(struct nvm_dev *dev, struct nvm_rq *rqd, - void *buf) -{ - DECLARE_COMPLETION_ONSTACK(wait); - int ret = 0; - - rqd->end_io = nvm_sync_end_io; - rqd->private = &wait; - - ret = dev->ops->submit_io(dev, rqd, buf); - if (ret) - return ret; - - wait_for_completion_io(&wait); - - return 0; -} - -int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, - void *buf) -{ - struct nvm_dev *dev = tgt_dev->parent; - int ret; - - if (!dev->ops->submit_io) - return -ENODEV; - - nvm_rq_tgt_to_dev(tgt_dev, rqd); - - rqd->dev = tgt_dev; - rqd->flags = nvm_set_flags(&tgt_dev->geo, rqd); - - ret = nvm_submit_io_wait(dev, rqd, 
buf); - - return ret; -} -EXPORT_SYMBOL(nvm_submit_io_sync); - -void nvm_end_io(struct nvm_rq *rqd) -{ - struct nvm_tgt_dev *tgt_dev = rqd->dev; - - /* Convert address space */ - if (tgt_dev) - nvm_rq_dev_to_tgt(tgt_dev, rqd); - - if (rqd->end_io) - rqd->end_io(rqd); -} -EXPORT_SYMBOL(nvm_end_io); - -static int nvm_submit_io_sync_raw(struct nvm_dev *dev, struct nvm_rq *rqd) -{ - if (!dev->ops->submit_io) - return -ENODEV; - - rqd->dev = NULL; - rqd->flags = nvm_set_flags(&dev->geo, rqd); - - return nvm_submit_io_wait(dev, rqd, NULL); -} - -static int nvm_bb_chunk_sense(struct nvm_dev *dev, struct ppa_addr ppa) -{ - struct nvm_rq rqd = { NULL }; - struct bio bio; - struct bio_vec bio_vec; - struct page *page; - int ret; - - page = alloc_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - - bio_init(&bio, &bio_vec, 1); - bio_add_page(&bio, page, PAGE_SIZE, 0); - bio_set_op_attrs(&bio, REQ_OP_READ, 0); - - rqd.bio = &bio; - rqd.opcode = NVM_OP_PREAD; - rqd.is_seq = 1; - rqd.nr_ppas = 1; - rqd.ppa_addr = generic_to_dev_addr(dev, ppa); - - ret = nvm_submit_io_sync_raw(dev, &rqd); - __free_page(page); - if (ret) - return ret; - - return rqd.error; -} - -/* - * Scans a 1.2 chunk first and last page to determine if its state. - * If the chunk is found to be open, also scan it to update the write - * pointer. - */ -static int nvm_bb_chunk_scan(struct nvm_dev *dev, struct ppa_addr ppa, - struct nvm_chk_meta *meta) -{ - struct nvm_geo *geo = &dev->geo; - int ret, pg, pl; - - /* sense first page */ - ret = nvm_bb_chunk_sense(dev, ppa); - if (ret < 0) /* io error */ - return ret; - else if (ret == 0) /* valid data */ - meta->state = NVM_CHK_ST_OPEN; - else if (ret > 0) { - /* - * If empty page, the chunk is free, else it is an - * actual io error. In that case, mark it offline. - */ - switch (ret) { - case NVM_RSP_ERR_EMPTYPAGE: - meta->state = NVM_CHK_ST_FREE; - return 0; - case NVM_RSP_ERR_FAILCRC: - case NVM_RSP_ERR_FAILECC: - case NVM_RSP_WARN_HIGHECC: - meta->state = NVM_CHK_ST_OPEN; - goto scan; - default: - return -ret; /* other io error */ - } - } - - /* sense last page */ - ppa.g.pg = geo->num_pg - 1; - ppa.g.pl = geo->num_pln - 1; - - ret = nvm_bb_chunk_sense(dev, ppa); - if (ret < 0) /* io error */ - return ret; - else if (ret == 0) { /* Chunk fully written */ - meta->state = NVM_CHK_ST_CLOSED; - meta->wp = geo->clba; - return 0; - } else if (ret > 0) { - switch (ret) { - case NVM_RSP_ERR_EMPTYPAGE: - case NVM_RSP_ERR_FAILCRC: - case NVM_RSP_ERR_FAILECC: - case NVM_RSP_WARN_HIGHECC: - meta->state = NVM_CHK_ST_OPEN; - break; - default: - return -ret; /* other io error */ - } - } - -scan: - /* - * chunk is open, we scan sequentially to update the write pointer. - * We make the assumption that targets write data across all planes - * before moving to the next page. - */ - for (pg = 0; pg < geo->num_pg; pg++) { - for (pl = 0; pl < geo->num_pln; pl++) { - ppa.g.pg = pg; - ppa.g.pl = pl; - - ret = nvm_bb_chunk_sense(dev, ppa); - if (ret < 0) /* io error */ - return ret; - else if (ret == 0) { - meta->wp += geo->ws_min; - } else if (ret > 0) { - switch (ret) { - case NVM_RSP_ERR_EMPTYPAGE: - return 0; - case NVM_RSP_ERR_FAILCRC: - case NVM_RSP_ERR_FAILECC: - case NVM_RSP_WARN_HIGHECC: - meta->wp += geo->ws_min; - break; - default: - return -ret; /* other io error */ - } - } - } - } - - return 0; -} - -/* - * folds a bad block list from its plane representation to its - * chunk representation. - * - * If any of the planes status are bad or grown bad, the chunk is marked - * offline. 
If not bad, the first plane state acts as the chunk state. - */ -static int nvm_bb_to_chunk(struct nvm_dev *dev, struct ppa_addr ppa, - u8 *blks, int nr_blks, struct nvm_chk_meta *meta) -{ - struct nvm_geo *geo = &dev->geo; - int ret, blk, pl, offset, blktype; - - for (blk = 0; blk < geo->num_chk; blk++) { - offset = blk * geo->pln_mode; - blktype = blks[offset]; - - for (pl = 0; pl < geo->pln_mode; pl++) { - if (blks[offset + pl] & - (NVM_BLK_T_BAD|NVM_BLK_T_GRWN_BAD)) { - blktype = blks[offset + pl]; - break; - } - } - - ppa.g.blk = blk; - - meta->wp = 0; - meta->type = NVM_CHK_TP_W_SEQ; - meta->wi = 0; - meta->slba = generic_to_dev_addr(dev, ppa).ppa; - meta->cnlb = dev->geo.clba; - - if (blktype == NVM_BLK_T_FREE) { - ret = nvm_bb_chunk_scan(dev, ppa, meta); - if (ret) - return ret; - } else { - meta->state = NVM_CHK_ST_OFFLINE; - } - - meta++; - } - - return 0; -} - -static int nvm_get_bb_meta(struct nvm_dev *dev, sector_t slba, - int nchks, struct nvm_chk_meta *meta) -{ - struct nvm_geo *geo = &dev->geo; - struct ppa_addr ppa; - u8 *blks; - int ch, lun, nr_blks; - int ret = 0; - - ppa.ppa = slba; - ppa = dev_to_generic_addr(dev, ppa); - - if (ppa.g.blk != 0) - return -EINVAL; - - if ((nchks % geo->num_chk) != 0) - return -EINVAL; - - nr_blks = geo->num_chk * geo->pln_mode; - - blks = kmalloc(nr_blks, GFP_KERNEL); - if (!blks) - return -ENOMEM; - - for (ch = ppa.g.ch; ch < geo->num_ch; ch++) { - for (lun = ppa.g.lun; lun < geo->num_lun; lun++) { - struct ppa_addr ppa_gen, ppa_dev; - - if (!nchks) - goto done; - - ppa_gen.ppa = 0; - ppa_gen.g.ch = ch; - ppa_gen.g.lun = lun; - ppa_dev = generic_to_dev_addr(dev, ppa_gen); - - ret = dev->ops->get_bb_tbl(dev, ppa_dev, blks); - if (ret) - goto done; - - ret = nvm_bb_to_chunk(dev, ppa_gen, blks, nr_blks, - meta); - if (ret) - goto done; - - meta += geo->num_chk; - nchks -= geo->num_chk; - } - } -done: - kfree(blks); - return ret; -} - -int nvm_get_chunk_meta(struct nvm_tgt_dev *tgt_dev, struct ppa_addr ppa, - int nchks, struct nvm_chk_meta *meta) -{ - struct nvm_dev *dev = tgt_dev->parent; - - nvm_ppa_tgt_to_dev(tgt_dev, &ppa, 1); - - if (dev->geo.version == NVM_OCSSD_SPEC_12) - return nvm_get_bb_meta(dev, (sector_t)ppa.ppa, nchks, meta); - - return dev->ops->get_chk_meta(dev, (sector_t)ppa.ppa, nchks, meta); -} -EXPORT_SYMBOL_GPL(nvm_get_chunk_meta); - -int nvm_set_chunk_meta(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, - int nr_ppas, int type) -{ - struct nvm_dev *dev = tgt_dev->parent; - struct nvm_rq rqd; - int ret; - - if (dev->geo.version == NVM_OCSSD_SPEC_20) - return 0; - - if (nr_ppas > NVM_MAX_VLBA) { - pr_err("unable to update all blocks atomically\n"); - return -EINVAL; - } - - memset(&rqd, 0, sizeof(struct nvm_rq)); - - nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas); - nvm_rq_tgt_to_dev(tgt_dev, &rqd); - - ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type); - nvm_free_rqd_ppalist(tgt_dev, &rqd); - if (ret) - return -EINVAL; - - return 0; -} -EXPORT_SYMBOL_GPL(nvm_set_chunk_meta); - -static int nvm_core_init(struct nvm_dev *dev) -{ - struct nvm_geo *geo = &dev->geo; - int ret; - - dev->lun_map = kcalloc(BITS_TO_LONGS(geo->all_luns), - sizeof(unsigned long), GFP_KERNEL); - if (!dev->lun_map) - return -ENOMEM; - - INIT_LIST_HEAD(&dev->area_list); - INIT_LIST_HEAD(&dev->targets); - mutex_init(&dev->mlock); - spin_lock_init(&dev->lock); - - ret = nvm_register_map(dev); - if (ret) - goto err_fmtype; - - return 0; -err_fmtype: - kfree(dev->lun_map); - return ret; -} - -static void nvm_free(struct kref *ref) -{ 
- struct nvm_dev *dev = container_of(ref, struct nvm_dev, ref); - - if (dev->dma_pool) - dev->ops->destroy_dma_pool(dev->dma_pool); - - if (dev->rmap) - nvm_unregister_map(dev); - - kfree(dev->lun_map); - kfree(dev); -} - -static int nvm_init(struct nvm_dev *dev) -{ - struct nvm_geo *geo = &dev->geo; - int ret = -EINVAL; - - if (dev->ops->identity(dev)) { - pr_err("device could not be identified\n"); - goto err; - } - - pr_debug("ver:%u.%u nvm_vendor:%x\n", geo->major_ver_id, - geo->minor_ver_id, geo->vmnt); - - ret = nvm_core_init(dev); - if (ret) { - pr_err("could not initialize core structures.\n"); - goto err; - } - - pr_info("registered %s [%u/%u/%u/%u/%u]\n", - dev->name, dev->geo.ws_min, dev->geo.ws_opt, - dev->geo.num_chk, dev->geo.all_luns, - dev->geo.num_ch); - return 0; -err: - pr_err("failed to initialize nvm\n"); - return ret; -} - -struct nvm_dev *nvm_alloc_dev(int node) -{ - struct nvm_dev *dev; - - dev = kzalloc_node(sizeof(struct nvm_dev), GFP_KERNEL, node); - if (dev) - kref_init(&dev->ref); - - return dev; -} -EXPORT_SYMBOL(nvm_alloc_dev); - -int nvm_register(struct nvm_dev *dev) -{ - int ret, exp_pool_size; - - pr_warn_once("lightnvm support is deprecated and will be removed in Linux 5.15.\n"); - - if (!dev->q || !dev->ops) { - kref_put(&dev->ref, nvm_free); - return -EINVAL; - } - - ret = nvm_init(dev); - if (ret) { - kref_put(&dev->ref, nvm_free); - return ret; - } - - exp_pool_size = max_t(int, PAGE_SIZE, - (NVM_MAX_VLBA * (sizeof(u64) + dev->geo.sos))); - exp_pool_size = round_up(exp_pool_size, PAGE_SIZE); - - dev->dma_pool = dev->ops->create_dma_pool(dev, "ppalist", - exp_pool_size); - if (!dev->dma_pool) { - pr_err("could not create dma pool\n"); - kref_put(&dev->ref, nvm_free); - return -ENOMEM; - } - - /* register device with a supported media manager */ - down_write(&nvm_lock); - list_add(&dev->devices, &nvm_devices); - up_write(&nvm_lock); - - return 0; -} -EXPORT_SYMBOL(nvm_register); - -void nvm_unregister(struct nvm_dev *dev) -{ - struct nvm_target *t, *tmp; - - mutex_lock(&dev->mlock); - list_for_each_entry_safe(t, tmp, &dev->targets, list) { - if (t->dev->parent != dev) - continue; - __nvm_remove_target(t, false); - kref_put(&dev->ref, nvm_free); - } - mutex_unlock(&dev->mlock); - - down_write(&nvm_lock); - list_del(&dev->devices); - up_write(&nvm_lock); - - kref_put(&dev->ref, nvm_free); -} -EXPORT_SYMBOL(nvm_unregister); - -static int __nvm_configure_create(struct nvm_ioctl_create *create) -{ - struct nvm_dev *dev; - int ret; - - down_write(&nvm_lock); - dev = nvm_find_nvm_dev(create->dev); - up_write(&nvm_lock); - - if (!dev) { - pr_err("device not found\n"); - return -EINVAL; - } - - kref_get(&dev->ref); - ret = nvm_create_tgt(dev, create); - if (ret) - kref_put(&dev->ref, nvm_free); - - return ret; -} - -static long nvm_ioctl_info(struct file *file, void __user *arg) -{ - struct nvm_ioctl_info *info; - struct nvm_tgt_type *tt; - int tgt_iter = 0; - - info = memdup_user(arg, sizeof(struct nvm_ioctl_info)); - if (IS_ERR(info)) - return PTR_ERR(info); - - info->version[0] = NVM_VERSION_MAJOR; - info->version[1] = NVM_VERSION_MINOR; - info->version[2] = NVM_VERSION_PATCH; - - down_write(&nvm_tgtt_lock); - list_for_each_entry(tt, &nvm_tgt_types, list) { - struct nvm_ioctl_info_tgt *tgt = &info->tgts[tgt_iter]; - - tgt->version[0] = tt->version[0]; - tgt->version[1] = tt->version[1]; - tgt->version[2] = tt->version[2]; - strncpy(tgt->tgtname, tt->name, NVM_TTYPE_NAME_MAX); - - tgt_iter++; - } - - info->tgtsize = tgt_iter; - up_write(&nvm_tgtt_lock); - - if 
(copy_to_user(arg, info, sizeof(struct nvm_ioctl_info))) { - kfree(info); - return -EFAULT; - } - - kfree(info); - return 0; -} - -static long nvm_ioctl_get_devices(struct file *file, void __user *arg) -{ - struct nvm_ioctl_get_devices *devices; - struct nvm_dev *dev; - int i = 0; - - devices = kzalloc(sizeof(struct nvm_ioctl_get_devices), GFP_KERNEL); - if (!devices) - return -ENOMEM; - - down_write(&nvm_lock); - list_for_each_entry(dev, &nvm_devices, devices) { - struct nvm_ioctl_device_info *info = &devices->info[i]; - - strlcpy(info->devname, dev->name, sizeof(info->devname)); - - /* kept for compatibility */ - info->bmversion[0] = 1; - info->bmversion[1] = 0; - info->bmversion[2] = 0; - strlcpy(info->bmname, "gennvm", sizeof(info->bmname)); - i++; - - if (i >= ARRAY_SIZE(devices->info)) { - pr_err("max %zd devices can be reported.\n", - ARRAY_SIZE(devices->info)); - break; - } - } - up_write(&nvm_lock); - - devices->nr_devices = i; - - if (copy_to_user(arg, devices, - sizeof(struct nvm_ioctl_get_devices))) { - kfree(devices); - return -EFAULT; - } - - kfree(devices); - return 0; -} - -static long nvm_ioctl_dev_create(struct file *file, void __user *arg) -{ - struct nvm_ioctl_create create; - - if (copy_from_user(&create, arg, sizeof(struct nvm_ioctl_create))) - return -EFAULT; - - if (create.conf.type == NVM_CONFIG_TYPE_EXTENDED && - create.conf.e.rsv != 0) { - pr_err("reserved config field in use\n"); - return -EINVAL; - } - - create.dev[DISK_NAME_LEN - 1] = '\0'; - create.tgttype[NVM_TTYPE_NAME_MAX - 1] = '\0'; - create.tgtname[DISK_NAME_LEN - 1] = '\0'; - - if (create.flags != 0) { - __u32 flags = create.flags; - - /* Check for valid flags */ - if (flags & NVM_TARGET_FACTORY) - flags &= ~NVM_TARGET_FACTORY; - - if (flags) { - pr_err("flag not supported\n"); - return -EINVAL; - } - } - - return __nvm_configure_create(&create); -} - -static long nvm_ioctl_dev_remove(struct file *file, void __user *arg) -{ - struct nvm_ioctl_remove remove; - - if (copy_from_user(&remove, arg, sizeof(struct nvm_ioctl_remove))) - return -EFAULT; - - remove.tgtname[DISK_NAME_LEN - 1] = '\0'; - - if (remove.flags != 0) { - pr_err("no flags supported\n"); - return -EINVAL; - } - - return nvm_remove_tgt(&remove); -} - -/* kept for compatibility reasons */ -static long nvm_ioctl_dev_init(struct file *file, void __user *arg) -{ - struct nvm_ioctl_dev_init init; - - if (copy_from_user(&init, arg, sizeof(struct nvm_ioctl_dev_init))) - return -EFAULT; - - if (init.flags != 0) { - pr_err("no flags supported\n"); - return -EINVAL; - } - - return 0; -} - -/* Kept for compatibility reasons */ -static long nvm_ioctl_dev_factory(struct file *file, void __user *arg) -{ - struct nvm_ioctl_dev_factory fact; - - if (copy_from_user(&fact, arg, sizeof(struct nvm_ioctl_dev_factory))) - return -EFAULT; - - fact.dev[DISK_NAME_LEN - 1] = '\0'; - - if (fact.flags & ~(NVM_FACTORY_NR_BITS - 1)) - return -EINVAL; - - return 0; -} - -static long nvm_ctl_ioctl(struct file *file, uint cmd, unsigned long arg) -{ - void __user *argp = (void __user *)arg; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - switch (cmd) { - case NVM_INFO: - return nvm_ioctl_info(file, argp); - case NVM_GET_DEVICES: - return nvm_ioctl_get_devices(file, argp); - case NVM_DEV_CREATE: - return nvm_ioctl_dev_create(file, argp); - case NVM_DEV_REMOVE: - return nvm_ioctl_dev_remove(file, argp); - case NVM_DEV_INIT: - return nvm_ioctl_dev_init(file, argp); - case NVM_DEV_FACTORY: - return nvm_ioctl_dev_factory(file, argp); - } - return 0; -} - -static const 
struct file_operations _ctl_fops = { - .open = nonseekable_open, - .unlocked_ioctl = nvm_ctl_ioctl, - .owner = THIS_MODULE, - .llseek = noop_llseek, -}; - -static struct miscdevice _nvm_misc = { - .minor = MISC_DYNAMIC_MINOR, - .name = "lightnvm", - .nodename = "lightnvm/control", - .fops = &_ctl_fops, -}; -builtin_misc_device(_nvm_misc); diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c deleted file mode 100644 index f185f1a00008..000000000000 --- a/drivers/lightnvm/pblk-cache.c +++ /dev/null @@ -1,137 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * pblk-cache.c - pblk's write cache - */ - -#include "pblk.h" - -void pblk_write_to_cache(struct pblk *pblk, struct bio *bio, - unsigned long flags) -{ - struct pblk_w_ctx w_ctx; - sector_t lba = pblk_get_lba(bio); - unsigned long start_time; - unsigned int bpos, pos; - int nr_entries = pblk_get_secs(bio); - int i, ret; - - start_time = bio_start_io_acct(bio); - - /* Update the write buffer head (mem) with the entries that we can - * write. The write in itself cannot fail, so there is no need to - * rollback from here on. - */ -retry: - ret = pblk_rb_may_write_user(&pblk->rwb, bio, nr_entries, &bpos); - switch (ret) { - case NVM_IO_REQUEUE: - io_schedule(); - goto retry; - case NVM_IO_ERR: - pblk_pipeline_stop(pblk); - bio_io_error(bio); - goto out; - } - - pblk_ppa_set_empty(&w_ctx.ppa); - w_ctx.flags = flags; - if (bio->bi_opf & REQ_PREFLUSH) { - w_ctx.flags |= PBLK_FLUSH_ENTRY; - pblk_write_kick(pblk); - } - - if (unlikely(!bio_has_data(bio))) - goto out; - - for (i = 0; i < nr_entries; i++) { - void *data = bio_data(bio); - - w_ctx.lba = lba + i; - - pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + i); - pblk_rb_write_entry_user(&pblk->rwb, data, w_ctx, pos); - - bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE); - } - - atomic64_add(nr_entries, &pblk->user_wa); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(nr_entries, &pblk->inflight_writes); - atomic_long_add(nr_entries, &pblk->req_writes); -#endif - - pblk_rl_inserted(&pblk->rl, nr_entries); - -out: - bio_end_io_acct(bio, start_time); - pblk_write_should_kick(pblk); - - if (ret == NVM_IO_DONE) - bio_endio(bio); -} - -/* - * On GC the incoming lbas are not necessarily sequential. Also, some of the - * lbas might not be valid entries, which are marked as empty by the GC thread - */ -int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq) -{ - struct pblk_w_ctx w_ctx; - unsigned int bpos, pos; - void *data = gc_rq->data; - int i, valid_entries; - - /* Update the write buffer head (mem) with the entries that we can - * write. The write in itself cannot fail, so there is no need to - * rollback from here on. 
- */ -retry: - if (!pblk_rb_may_write_gc(&pblk->rwb, gc_rq->secs_to_gc, &bpos)) { - io_schedule(); - goto retry; - } - - w_ctx.flags = PBLK_IOTYPE_GC; - pblk_ppa_set_empty(&w_ctx.ppa); - - for (i = 0, valid_entries = 0; i < gc_rq->nr_secs; i++) { - if (gc_rq->lba_list[i] == ADDR_EMPTY) - continue; - - w_ctx.lba = gc_rq->lba_list[i]; - - pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + valid_entries); - pblk_rb_write_entry_gc(&pblk->rwb, data, w_ctx, gc_rq->line, - gc_rq->paddr_list[i], pos); - - data += PBLK_EXPOSED_PAGE_SIZE; - valid_entries++; - } - - WARN_ONCE(gc_rq->secs_to_gc != valid_entries, - "pblk: inconsistent GC write\n"); - - atomic64_add(valid_entries, &pblk->gc_wa); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(valid_entries, &pblk->inflight_writes); - atomic_long_add(valid_entries, &pblk->recov_gc_writes); -#endif - - pblk_write_should_kick(pblk); - return NVM_IO_OK; -} diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c deleted file mode 100644 index 33d39d3dd343..000000000000 --- a/drivers/lightnvm/pblk-core.c +++ /dev/null @@ -1,2151 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * pblk-core.c - pblk's core functionality - * - */ - -#define CREATE_TRACE_POINTS - -#include "pblk.h" -#include "pblk-trace.h" - -static void pblk_line_mark_bb(struct work_struct *work) -{ - struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws, - ws); - struct pblk *pblk = line_ws->pblk; - struct nvm_tgt_dev *dev = pblk->dev; - struct ppa_addr *ppa = line_ws->priv; - int ret; - - ret = nvm_set_chunk_meta(dev, ppa, 1, NVM_BLK_T_GRWN_BAD); - if (ret) { - struct pblk_line *line; - int pos; - - line = pblk_ppa_to_line(pblk, *ppa); - pos = pblk_ppa_to_pos(&dev->geo, *ppa); - - pblk_err(pblk, "failed to mark bb, line:%d, pos:%d\n", - line->id, pos); - } - - kfree(ppa); - mempool_free(line_ws, &pblk->gen_ws_pool); -} - -static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line, - struct ppa_addr ppa_addr) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct ppa_addr *ppa; - int pos = pblk_ppa_to_pos(geo, ppa_addr); - - pblk_debug(pblk, "erase failed: line:%d, pos:%d\n", line->id, pos); - atomic_long_inc(&pblk->erase_failed); - - atomic_dec(&line->blk_in_line); - if (test_and_set_bit(pos, line->blk_bitmap)) - pblk_err(pblk, "attempted to erase bb: line:%d, pos:%d\n", - line->id, pos); - - /* Not necessary to mark bad blocks on 2.0 spec. 
*/ - if (geo->version == NVM_OCSSD_SPEC_20) - return; - - ppa = kmalloc(sizeof(struct ppa_addr), GFP_ATOMIC); - if (!ppa) - return; - - *ppa = ppa_addr; - pblk_gen_run_ws(pblk, NULL, ppa, pblk_line_mark_bb, - GFP_ATOMIC, pblk->bb_wq); -} - -static void __pblk_end_io_erase(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct nvm_chk_meta *chunk; - struct pblk_line *line; - int pos; - - line = pblk_ppa_to_line(pblk, rqd->ppa_addr); - pos = pblk_ppa_to_pos(geo, rqd->ppa_addr); - chunk = &line->chks[pos]; - - atomic_dec(&line->left_seblks); - - if (rqd->error) { - trace_pblk_chunk_reset(pblk_disk_name(pblk), - &rqd->ppa_addr, PBLK_CHUNK_RESET_FAILED); - - chunk->state = NVM_CHK_ST_OFFLINE; - pblk_mark_bb(pblk, line, rqd->ppa_addr); - } else { - trace_pblk_chunk_reset(pblk_disk_name(pblk), - &rqd->ppa_addr, PBLK_CHUNK_RESET_DONE); - - chunk->state = NVM_CHK_ST_FREE; - } - - trace_pblk_chunk_state(pblk_disk_name(pblk), &rqd->ppa_addr, - chunk->state); - - atomic_dec(&pblk->inflight_io); -} - -/* Erase completion assumes that only one block is erased at the time */ -static void pblk_end_io_erase(struct nvm_rq *rqd) -{ - struct pblk *pblk = rqd->private; - - __pblk_end_io_erase(pblk, rqd); - mempool_free(rqd, &pblk->e_rq_pool); -} - -/* - * Get information for all chunks from the device. - * - * The caller is responsible for freeing (vmalloc) the returned structure - */ -struct nvm_chk_meta *pblk_get_chunk_meta(struct pblk *pblk) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct nvm_chk_meta *meta; - struct ppa_addr ppa; - unsigned long len; - int ret; - - ppa.ppa = 0; - - len = geo->all_chunks * sizeof(*meta); - meta = vzalloc(len); - if (!meta) - return ERR_PTR(-ENOMEM); - - ret = nvm_get_chunk_meta(dev, ppa, geo->all_chunks, meta); - if (ret) { - vfree(meta); - return ERR_PTR(-EIO); - } - - return meta; -} - -struct nvm_chk_meta *pblk_chunk_get_off(struct pblk *pblk, - struct nvm_chk_meta *meta, - struct ppa_addr ppa) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - int ch_off = ppa.m.grp * geo->num_chk * geo->num_lun; - int lun_off = ppa.m.pu * geo->num_chk; - int chk_off = ppa.m.chk; - - return meta + ch_off + lun_off + chk_off; -} - -void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line, - u64 paddr) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct list_head *move_list = NULL; - - /* Lines being reclaimed (GC'ed) cannot be invalidated. Before the L2P - * table is modified with reclaimed sectors, a check is done to endure - * that newer updates are not overwritten. 
- */ - spin_lock(&line->lock); - WARN_ON(line->state == PBLK_LINESTATE_FREE); - - if (test_and_set_bit(paddr, line->invalid_bitmap)) { - WARN_ONCE(1, "pblk: double invalidate\n"); - spin_unlock(&line->lock); - return; - } - le32_add_cpu(line->vsc, -1); - - if (line->state == PBLK_LINESTATE_CLOSED) - move_list = pblk_line_gc_list(pblk, line); - spin_unlock(&line->lock); - - if (move_list) { - spin_lock(&l_mg->gc_lock); - spin_lock(&line->lock); - /* Prevent moving a line that has just been chosen for GC */ - if (line->state == PBLK_LINESTATE_GC) { - spin_unlock(&line->lock); - spin_unlock(&l_mg->gc_lock); - return; - } - spin_unlock(&line->lock); - - list_move_tail(&line->list, move_list); - spin_unlock(&l_mg->gc_lock); - } -} - -void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa) -{ - struct pblk_line *line; - u64 paddr; - -#ifdef CONFIG_NVM_PBLK_DEBUG - /* Callers must ensure that the ppa points to a device address */ - BUG_ON(pblk_addr_in_cache(ppa)); - BUG_ON(pblk_ppa_empty(ppa)); -#endif - - line = pblk_ppa_to_line(pblk, ppa); - paddr = pblk_dev_ppa_to_line_addr(pblk, ppa); - - __pblk_map_invalidate(pblk, line, paddr); -} - -static void pblk_invalidate_range(struct pblk *pblk, sector_t slba, - unsigned int nr_secs) -{ - sector_t lba; - - spin_lock(&pblk->trans_lock); - for (lba = slba; lba < slba + nr_secs; lba++) { - struct ppa_addr ppa; - - ppa = pblk_trans_map_get(pblk, lba); - - if (!pblk_addr_in_cache(ppa) && !pblk_ppa_empty(ppa)) - pblk_map_invalidate(pblk, ppa); - - pblk_ppa_set_empty(&ppa); - pblk_trans_map_set(pblk, lba, ppa); - } - spin_unlock(&pblk->trans_lock); -} - -int pblk_alloc_rqd_meta(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct nvm_tgt_dev *dev = pblk->dev; - - rqd->meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, - &rqd->dma_meta_list); - if (!rqd->meta_list) - return -ENOMEM; - - if (rqd->nr_ppas == 1) - return 0; - - rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size(pblk); - rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size(pblk); - - return 0; -} - -void pblk_free_rqd_meta(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct nvm_tgt_dev *dev = pblk->dev; - - if (rqd->meta_list) - nvm_dev_dma_free(dev->parent, rqd->meta_list, - rqd->dma_meta_list); -} - -/* Caller must guarantee that the request is a valid type */ -struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int type) -{ - mempool_t *pool; - struct nvm_rq *rqd; - int rq_size; - - switch (type) { - case PBLK_WRITE: - case PBLK_WRITE_INT: - pool = &pblk->w_rq_pool; - rq_size = pblk_w_rq_size; - break; - case PBLK_READ: - pool = &pblk->r_rq_pool; - rq_size = pblk_g_rq_size; - break; - default: - pool = &pblk->e_rq_pool; - rq_size = pblk_g_rq_size; - } - - rqd = mempool_alloc(pool, GFP_KERNEL); - memset(rqd, 0, rq_size); - - return rqd; -} - -/* Typically used on completion path. 
Cannot guarantee request consistency */ -void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type) -{ - mempool_t *pool; - - switch (type) { - case PBLK_WRITE: - kfree(((struct pblk_c_ctx *)nvm_rq_to_pdu(rqd))->lun_bitmap); - fallthrough; - case PBLK_WRITE_INT: - pool = &pblk->w_rq_pool; - break; - case PBLK_READ: - pool = &pblk->r_rq_pool; - break; - case PBLK_ERASE: - pool = &pblk->e_rq_pool; - break; - default: - pblk_err(pblk, "trying to free unknown rqd type\n"); - return; - } - - pblk_free_rqd_meta(pblk, rqd); - mempool_free(rqd, pool); -} - -void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off, - int nr_pages) -{ - struct bio_vec *bv; - struct page *page; - int i, e, nbv = 0; - - for (i = 0; i < bio->bi_vcnt; i++) { - bv = &bio->bi_io_vec[i]; - page = bv->bv_page; - for (e = 0; e < bv->bv_len; e += PBLK_EXPOSED_PAGE_SIZE, nbv++) - if (nbv >= off) - mempool_free(page++, &pblk->page_bio_pool); - } -} - -int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags, - int nr_pages) -{ - struct request_queue *q = pblk->dev->q; - struct page *page; - int i, ret; - - for (i = 0; i < nr_pages; i++) { - page = mempool_alloc(&pblk->page_bio_pool, flags); - - ret = bio_add_pc_page(q, bio, page, PBLK_EXPOSED_PAGE_SIZE, 0); - if (ret != PBLK_EXPOSED_PAGE_SIZE) { - pblk_err(pblk, "could not add page to bio\n"); - mempool_free(page, &pblk->page_bio_pool); - goto err; - } - } - - return 0; -err: - pblk_bio_free_pages(pblk, bio, (bio->bi_vcnt - i), i); - return -1; -} - -void pblk_write_kick(struct pblk *pblk) -{ - wake_up_process(pblk->writer_ts); - mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(1000)); -} - -void pblk_write_timer_fn(struct timer_list *t) -{ - struct pblk *pblk = from_timer(pblk, t, wtimer); - - /* kick the write thread every tick to flush outstanding data */ - pblk_write_kick(pblk); -} - -void pblk_write_should_kick(struct pblk *pblk) -{ - unsigned int secs_avail = pblk_rb_read_count(&pblk->rwb); - - if (secs_avail >= pblk->min_write_pgs_data) - pblk_write_kick(pblk); -} - -static void pblk_wait_for_meta(struct pblk *pblk) -{ - do { - if (!atomic_read(&pblk->inflight_io)) - break; - - schedule(); - } while (1); -} - -static void pblk_flush_writer(struct pblk *pblk) -{ - pblk_rb_flush(&pblk->rwb); - do { - if (!pblk_rb_sync_count(&pblk->rwb)) - break; - - pblk_write_kick(pblk); - schedule(); - } while (1); -} - -struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct list_head *move_list = NULL; - int packed_meta = (le32_to_cpu(*line->vsc) / pblk->min_write_pgs_data) - * (pblk->min_write_pgs - pblk->min_write_pgs_data); - int vsc = le32_to_cpu(*line->vsc) + packed_meta; - - lockdep_assert_held(&line->lock); - - if (line->w_err_gc->has_write_err) { - if (line->gc_group != PBLK_LINEGC_WERR) { - line->gc_group = PBLK_LINEGC_WERR; - move_list = &l_mg->gc_werr_list; - pblk_rl_werr_line_in(&pblk->rl); - } - } else if (!vsc) { - if (line->gc_group != PBLK_LINEGC_FULL) { - line->gc_group = PBLK_LINEGC_FULL; - move_list = &l_mg->gc_full_list; - } - } else if (vsc < lm->high_thrs) { - if (line->gc_group != PBLK_LINEGC_HIGH) { - line->gc_group = PBLK_LINEGC_HIGH; - move_list = &l_mg->gc_high_list; - } - } else if (vsc < lm->mid_thrs) { - if (line->gc_group != PBLK_LINEGC_MID) { - line->gc_group = PBLK_LINEGC_MID; - move_list = &l_mg->gc_mid_list; - } - } else if (vsc < line->sec_in_line) { - if (line->gc_group != PBLK_LINEGC_LOW) { - 
line->gc_group = PBLK_LINEGC_LOW; - move_list = &l_mg->gc_low_list; - } - } else if (vsc == line->sec_in_line) { - if (line->gc_group != PBLK_LINEGC_EMPTY) { - line->gc_group = PBLK_LINEGC_EMPTY; - move_list = &l_mg->gc_empty_list; - } - } else { - line->state = PBLK_LINESTATE_CORRUPT; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - - line->gc_group = PBLK_LINEGC_NONE; - move_list = &l_mg->corrupt_list; - pblk_err(pblk, "corrupted vsc for line %d, vsc:%d (%d/%d/%d)\n", - line->id, vsc, - line->sec_in_line, - lm->high_thrs, lm->mid_thrs); - } - - return move_list; -} - -void pblk_discard(struct pblk *pblk, struct bio *bio) -{ - sector_t slba = pblk_get_lba(bio); - sector_t nr_secs = pblk_get_secs(bio); - - pblk_invalidate_range(pblk, slba, nr_secs); -} - -void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd) -{ - atomic_long_inc(&pblk->write_failed); -#ifdef CONFIG_NVM_PBLK_DEBUG - pblk_print_failed_rqd(pblk, rqd, rqd->error); -#endif -} - -void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd) -{ - /* Empty page read is not necessarily an error (e.g., L2P recovery) */ - if (rqd->error == NVM_RSP_ERR_EMPTYPAGE) { - atomic_long_inc(&pblk->read_empty); - return; - } - - switch (rqd->error) { - case NVM_RSP_WARN_HIGHECC: - atomic_long_inc(&pblk->read_high_ecc); - break; - case NVM_RSP_ERR_FAILECC: - case NVM_RSP_ERR_FAILCRC: - atomic_long_inc(&pblk->read_failed); - break; - default: - pblk_err(pblk, "unknown read error:%d\n", rqd->error); - } -#ifdef CONFIG_NVM_PBLK_DEBUG - pblk_print_failed_rqd(pblk, rqd, rqd->error); -#endif -} - -void pblk_set_sec_per_write(struct pblk *pblk, int sec_per_write) -{ - pblk->sec_per_write = sec_per_write; -} - -int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd, void *buf) -{ - struct nvm_tgt_dev *dev = pblk->dev; - - atomic_inc(&pblk->inflight_io); - -#ifdef CONFIG_NVM_PBLK_DEBUG - if (pblk_check_io(pblk, rqd)) - return NVM_IO_ERR; -#endif - - return nvm_submit_io(dev, rqd, buf); -} - -void pblk_check_chunk_state_update(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - - int i; - - for (i = 0; i < rqd->nr_ppas; i++) { - struct ppa_addr *ppa = &ppa_list[i]; - struct nvm_chk_meta *chunk = pblk_dev_ppa_to_chunk(pblk, *ppa); - u64 caddr = pblk_dev_ppa_to_chunk_addr(pblk, *ppa); - - if (caddr == 0) - trace_pblk_chunk_state(pblk_disk_name(pblk), - ppa, NVM_CHK_ST_OPEN); - else if (caddr == (chunk->cnlb - 1)) - trace_pblk_chunk_state(pblk_disk_name(pblk), - ppa, NVM_CHK_ST_CLOSED); - } -} - -int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd, void *buf) -{ - struct nvm_tgt_dev *dev = pblk->dev; - int ret; - - atomic_inc(&pblk->inflight_io); - -#ifdef CONFIG_NVM_PBLK_DEBUG - if (pblk_check_io(pblk, rqd)) - return NVM_IO_ERR; -#endif - - ret = nvm_submit_io_sync(dev, rqd, buf); - - if (trace_pblk_chunk_state_enabled() && !ret && - rqd->opcode == NVM_OP_PWRITE) - pblk_check_chunk_state_update(pblk, rqd); - - return ret; -} - -static int pblk_submit_io_sync_sem(struct pblk *pblk, struct nvm_rq *rqd, - void *buf) -{ - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - int ret; - - pblk_down_chunk(pblk, ppa_list[0]); - ret = pblk_submit_io_sync(pblk, rqd, buf); - pblk_up_chunk(pblk, ppa_list[0]); - - return ret; -} - -int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail, - unsigned long secs_to_flush, bool skip_meta) -{ - int max = pblk->sec_per_write; - int min = pblk->min_write_pgs; - int secs_to_sync = 0; - - if (skip_meta && 
pblk->min_write_pgs_data != pblk->min_write_pgs) - min = max = pblk->min_write_pgs_data; - - if (secs_avail >= max) - secs_to_sync = max; - else if (secs_avail >= min) - secs_to_sync = min * (secs_avail / min); - else if (secs_to_flush) - secs_to_sync = min; - - return secs_to_sync; -} - -void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs) -{ - u64 addr; - int i; - - spin_lock(&line->lock); - addr = find_next_zero_bit(line->map_bitmap, - pblk->lm.sec_per_line, line->cur_sec); - line->cur_sec = addr - nr_secs; - - for (i = 0; i < nr_secs; i++, line->cur_sec--) - WARN_ON(!test_and_clear_bit(line->cur_sec, line->map_bitmap)); - spin_unlock(&line->lock); -} - -u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs) -{ - u64 addr; - int i; - - lockdep_assert_held(&line->lock); - - /* logic error: ppa out-of-bounds. Prevent generating bad address */ - if (line->cur_sec + nr_secs > pblk->lm.sec_per_line) { - WARN(1, "pblk: page allocation out of bounds\n"); - nr_secs = pblk->lm.sec_per_line - line->cur_sec; - } - - line->cur_sec = addr = find_next_zero_bit(line->map_bitmap, - pblk->lm.sec_per_line, line->cur_sec); - for (i = 0; i < nr_secs; i++, line->cur_sec++) - WARN_ON(test_and_set_bit(line->cur_sec, line->map_bitmap)); - - return addr; -} - -u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs) -{ - u64 addr; - - /* Lock needed in case a write fails and a recovery needs to remap - * failed write buffer entries - */ - spin_lock(&line->lock); - addr = __pblk_alloc_page(pblk, line, nr_secs); - line->left_msecs -= nr_secs; - WARN(line->left_msecs < 0, "pblk: page allocation out of bounds\n"); - spin_unlock(&line->lock); - - return addr; -} - -u64 pblk_lookup_page(struct pblk *pblk, struct pblk_line *line) -{ - u64 paddr; - - spin_lock(&line->lock); - paddr = find_next_zero_bit(line->map_bitmap, - pblk->lm.sec_per_line, line->cur_sec); - spin_unlock(&line->lock); - - return paddr; -} - -u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - int bit; - - /* This usually only happens on bad lines */ - bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line); - if (bit >= lm->blk_per_line) - return -1; - - return bit * geo->ws_opt; -} - -int pblk_line_smeta_read(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct ppa_addr *ppa_list; - struct nvm_rq rqd; - u64 paddr = pblk_line_smeta_start(pblk, line); - int i, ret; - - memset(&rqd, 0, sizeof(struct nvm_rq)); - - ret = pblk_alloc_rqd_meta(pblk, &rqd); - if (ret) - return ret; - - rqd.opcode = NVM_OP_PREAD; - rqd.nr_ppas = lm->smeta_sec; - rqd.is_seq = 1; - ppa_list = nvm_rq_to_ppa_list(&rqd); - - for (i = 0; i < lm->smeta_sec; i++, paddr++) - ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); - - ret = pblk_submit_io_sync(pblk, &rqd, line->smeta); - if (ret) { - pblk_err(pblk, "smeta I/O submission failed: %d\n", ret); - goto clear_rqd; - } - - atomic_dec(&pblk->inflight_io); - - if (rqd.error && rqd.error != NVM_RSP_WARN_HIGHECC) { - pblk_log_read_err(pblk, &rqd); - ret = -EIO; - } - -clear_rqd: - pblk_free_rqd_meta(pblk, &rqd); - return ret; -} - -static int pblk_line_smeta_write(struct pblk *pblk, struct pblk_line *line, - u64 paddr) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct ppa_addr *ppa_list; - struct nvm_rq rqd; - __le64 *lba_list = emeta_to_lbas(pblk, line->emeta->buf); - __le64 
addr_empty = cpu_to_le64(ADDR_EMPTY); - int i, ret; - - memset(&rqd, 0, sizeof(struct nvm_rq)); - - ret = pblk_alloc_rqd_meta(pblk, &rqd); - if (ret) - return ret; - - rqd.opcode = NVM_OP_PWRITE; - rqd.nr_ppas = lm->smeta_sec; - rqd.is_seq = 1; - ppa_list = nvm_rq_to_ppa_list(&rqd); - - for (i = 0; i < lm->smeta_sec; i++, paddr++) { - struct pblk_sec_meta *meta = pblk_get_meta(pblk, - rqd.meta_list, i); - - ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); - meta->lba = lba_list[paddr] = addr_empty; - } - - ret = pblk_submit_io_sync_sem(pblk, &rqd, line->smeta); - if (ret) { - pblk_err(pblk, "smeta I/O submission failed: %d\n", ret); - goto clear_rqd; - } - - atomic_dec(&pblk->inflight_io); - - if (rqd.error) { - pblk_log_write_err(pblk, &rqd); - ret = -EIO; - } - -clear_rqd: - pblk_free_rqd_meta(pblk, &rqd); - return ret; -} - -int pblk_line_emeta_read(struct pblk *pblk, struct pblk_line *line, - void *emeta_buf) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - void *ppa_list_buf, *meta_list; - struct ppa_addr *ppa_list; - struct nvm_rq rqd; - u64 paddr = line->emeta_ssec; - dma_addr_t dma_ppa_list, dma_meta_list; - int min = pblk->min_write_pgs; - int left_ppas = lm->emeta_sec[0]; - int line_id = line->id; - int rq_ppas, rq_len; - int i, j; - int ret; - - meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, - &dma_meta_list); - if (!meta_list) - return -ENOMEM; - - ppa_list_buf = meta_list + pblk_dma_meta_size(pblk); - dma_ppa_list = dma_meta_list + pblk_dma_meta_size(pblk); - -next_rq: - memset(&rqd, 0, sizeof(struct nvm_rq)); - - rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false); - rq_len = rq_ppas * geo->csecs; - - rqd.meta_list = meta_list; - rqd.ppa_list = ppa_list_buf; - rqd.dma_meta_list = dma_meta_list; - rqd.dma_ppa_list = dma_ppa_list; - rqd.opcode = NVM_OP_PREAD; - rqd.nr_ppas = rq_ppas; - ppa_list = nvm_rq_to_ppa_list(&rqd); - - for (i = 0; i < rqd.nr_ppas; ) { - struct ppa_addr ppa = addr_to_gen_ppa(pblk, paddr, line_id); - int pos = pblk_ppa_to_pos(geo, ppa); - - if (pblk_io_aligned(pblk, rq_ppas)) - rqd.is_seq = 1; - - while (test_bit(pos, line->blk_bitmap)) { - paddr += min; - if (pblk_boundary_paddr_checks(pblk, paddr)) { - ret = -EINTR; - goto free_rqd_dma; - } - - ppa = addr_to_gen_ppa(pblk, paddr, line_id); - pos = pblk_ppa_to_pos(geo, ppa); - } - - if (pblk_boundary_paddr_checks(pblk, paddr + min)) { - ret = -EINTR; - goto free_rqd_dma; - } - - for (j = 0; j < min; j++, i++, paddr++) - ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line_id); - } - - ret = pblk_submit_io_sync(pblk, &rqd, emeta_buf); - if (ret) { - pblk_err(pblk, "emeta I/O submission failed: %d\n", ret); - goto free_rqd_dma; - } - - atomic_dec(&pblk->inflight_io); - - if (rqd.error && rqd.error != NVM_RSP_WARN_HIGHECC) { - pblk_log_read_err(pblk, &rqd); - ret = -EIO; - goto free_rqd_dma; - } - - emeta_buf += rq_len; - left_ppas -= rq_ppas; - if (left_ppas) - goto next_rq; - -free_rqd_dma: - nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); - return ret; -} - -static void pblk_setup_e_rq(struct pblk *pblk, struct nvm_rq *rqd, - struct ppa_addr ppa) -{ - rqd->opcode = NVM_OP_ERASE; - rqd->ppa_addr = ppa; - rqd->nr_ppas = 1; - rqd->is_seq = 1; - rqd->bio = NULL; -} - -static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa) -{ - struct nvm_rq rqd = {NULL}; - int ret; - - trace_pblk_chunk_reset(pblk_disk_name(pblk), &ppa, - PBLK_CHUNK_RESET_START); - - pblk_setup_e_rq(pblk, &rqd, ppa); - - /* The write 
thread schedules erases so that it minimizes disturbances - * with writes. Thus, there is no need to take the LUN semaphore. - */ - ret = pblk_submit_io_sync(pblk, &rqd, NULL); - rqd.private = pblk; - __pblk_end_io_erase(pblk, &rqd); - - return ret; -} - -int pblk_line_erase(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct ppa_addr ppa; - int ret, bit = -1; - - /* Erase only good blocks, one at a time */ - do { - spin_lock(&line->lock); - bit = find_next_zero_bit(line->erase_bitmap, lm->blk_per_line, - bit + 1); - if (bit >= lm->blk_per_line) { - spin_unlock(&line->lock); - break; - } - - ppa = pblk->luns[bit].bppa; /* set ch and lun */ - ppa.a.blk = line->id; - - atomic_dec(&line->left_eblks); - WARN_ON(test_and_set_bit(bit, line->erase_bitmap)); - spin_unlock(&line->lock); - - ret = pblk_blk_erase_sync(pblk, ppa); - if (ret) { - pblk_err(pblk, "failed to erase line %d\n", line->id); - return ret; - } - } while (1); - - return 0; -} - -static void pblk_line_setup_metadata(struct pblk_line *line, - struct pblk_line_mgmt *l_mg, - struct pblk_line_meta *lm) -{ - int meta_line; - - lockdep_assert_held(&l_mg->free_lock); - -retry_meta: - meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES); - if (meta_line == PBLK_DATA_LINES) { - spin_unlock(&l_mg->free_lock); - io_schedule(); - spin_lock(&l_mg->free_lock); - goto retry_meta; - } - - set_bit(meta_line, &l_mg->meta_bitmap); - line->meta_line = meta_line; - - line->smeta = l_mg->sline_meta[meta_line]; - line->emeta = l_mg->eline_meta[meta_line]; - - memset(line->smeta, 0, lm->smeta_len); - memset(line->emeta->buf, 0, lm->emeta_len[0]); - - line->emeta->mem = 0; - atomic_set(&line->emeta->sync, 0); -} - -/* For now lines are always assumed full lines. Thus, smeta former and current - * lun bitmaps are omitted. 
- */ -static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line, - struct pblk_line *cur) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_emeta *emeta = line->emeta; - struct line_emeta *emeta_buf = emeta->buf; - struct line_smeta *smeta_buf = (struct line_smeta *)line->smeta; - int nr_blk_line; - - /* After erasing the line, new bad blocks might appear and we risk - * having an invalid line - */ - nr_blk_line = lm->blk_per_line - - bitmap_weight(line->blk_bitmap, lm->blk_per_line); - if (nr_blk_line < lm->min_blk_line) { - spin_lock(&l_mg->free_lock); - spin_lock(&line->lock); - line->state = PBLK_LINESTATE_BAD; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - spin_unlock(&line->lock); - - list_add_tail(&line->list, &l_mg->bad_list); - spin_unlock(&l_mg->free_lock); - - pblk_debug(pblk, "line %d is bad\n", line->id); - - return 0; - } - - /* Run-time metadata */ - line->lun_bitmap = ((void *)(smeta_buf)) + sizeof(struct line_smeta); - - /* Mark LUNs allocated in this line (all for now) */ - bitmap_set(line->lun_bitmap, 0, lm->lun_bitmap_len); - - smeta_buf->header.identifier = cpu_to_le32(PBLK_MAGIC); - export_guid(smeta_buf->header.uuid, &pblk->instance_uuid); - smeta_buf->header.id = cpu_to_le32(line->id); - smeta_buf->header.type = cpu_to_le16(line->type); - smeta_buf->header.version_major = SMETA_VERSION_MAJOR; - smeta_buf->header.version_minor = SMETA_VERSION_MINOR; - - /* Start metadata */ - smeta_buf->seq_nr = cpu_to_le64(line->seq_nr); - smeta_buf->window_wr_lun = cpu_to_le32(geo->all_luns); - - /* Fill metadata among lines */ - if (cur) { - memcpy(line->lun_bitmap, cur->lun_bitmap, lm->lun_bitmap_len); - smeta_buf->prev_id = cpu_to_le32(cur->id); - cur->emeta->buf->next_id = cpu_to_le32(line->id); - } else { - smeta_buf->prev_id = cpu_to_le32(PBLK_LINE_EMPTY); - } - - /* All smeta must be set at this point */ - smeta_buf->header.crc = cpu_to_le32( - pblk_calc_meta_header_crc(pblk, &smeta_buf->header)); - smeta_buf->crc = cpu_to_le32(pblk_calc_smeta_crc(pblk, smeta_buf)); - - /* End metadata */ - memcpy(&emeta_buf->header, &smeta_buf->header, - sizeof(struct line_header)); - - emeta_buf->header.version_major = EMETA_VERSION_MAJOR; - emeta_buf->header.version_minor = EMETA_VERSION_MINOR; - emeta_buf->header.crc = cpu_to_le32( - pblk_calc_meta_header_crc(pblk, &emeta_buf->header)); - - emeta_buf->seq_nr = cpu_to_le64(line->seq_nr); - emeta_buf->nr_lbas = cpu_to_le64(line->sec_in_line); - emeta_buf->nr_valid_lbas = cpu_to_le64(0); - emeta_buf->next_id = cpu_to_le32(PBLK_LINE_EMPTY); - emeta_buf->crc = cpu_to_le32(0); - emeta_buf->prev_id = smeta_buf->prev_id; - - return 1; -} - -static int pblk_line_alloc_bitmaps(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - - line->map_bitmap = mempool_alloc(l_mg->bitmap_pool, GFP_KERNEL); - if (!line->map_bitmap) - return -ENOMEM; - - memset(line->map_bitmap, 0, lm->sec_bitmap_len); - - /* will be initialized using bb info from map_bitmap */ - line->invalid_bitmap = mempool_alloc(l_mg->bitmap_pool, GFP_KERNEL); - if (!line->invalid_bitmap) { - mempool_free(line->map_bitmap, l_mg->bitmap_pool); - line->map_bitmap = NULL; - return -ENOMEM; - } - - return 0; -} - -/* For now lines are always assumed full lines. Thus, smeta former and current - * lun bitmaps are omitted. 
- */ -static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line, - int init) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - u64 off; - int bit = -1; - int emeta_secs; - - line->sec_in_line = lm->sec_per_line; - - /* Capture bad block information on line mapping bitmaps */ - while ((bit = find_next_bit(line->blk_bitmap, lm->blk_per_line, - bit + 1)) < lm->blk_per_line) { - off = bit * geo->ws_opt; - bitmap_shift_left(l_mg->bb_aux, l_mg->bb_template, off, - lm->sec_per_line); - bitmap_or(line->map_bitmap, line->map_bitmap, l_mg->bb_aux, - lm->sec_per_line); - line->sec_in_line -= geo->clba; - } - - /* Mark smeta metadata sectors as bad sectors */ - bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line); - off = bit * geo->ws_opt; - bitmap_set(line->map_bitmap, off, lm->smeta_sec); - line->sec_in_line -= lm->smeta_sec; - line->cur_sec = off + lm->smeta_sec; - - if (init && pblk_line_smeta_write(pblk, line, off)) { - pblk_debug(pblk, "line smeta I/O failed. Retry\n"); - return 0; - } - - bitmap_copy(line->invalid_bitmap, line->map_bitmap, lm->sec_per_line); - - /* Mark emeta metadata sectors as bad sectors. We need to consider bad - * blocks to make sure that there are enough sectors to store emeta - */ - emeta_secs = lm->emeta_sec[0]; - off = lm->sec_per_line; - while (emeta_secs) { - off -= geo->ws_opt; - if (!test_bit(off, line->invalid_bitmap)) { - bitmap_set(line->invalid_bitmap, off, geo->ws_opt); - emeta_secs -= geo->ws_opt; - } - } - - line->emeta_ssec = off; - line->sec_in_line -= lm->emeta_sec[0]; - line->nr_valid_lbas = 0; - line->left_msecs = line->sec_in_line; - *line->vsc = cpu_to_le32(line->sec_in_line); - - if (lm->sec_per_line - line->sec_in_line != - bitmap_weight(line->invalid_bitmap, lm->sec_per_line)) { - spin_lock(&line->lock); - line->state = PBLK_LINESTATE_BAD; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - spin_unlock(&line->lock); - - list_add_tail(&line->list, &l_mg->bad_list); - pblk_err(pblk, "unexpected line %d is bad\n", line->id); - - return 0; - } - - return 1; -} - -static int pblk_prepare_new_line(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - int blk_to_erase = atomic_read(&line->blk_in_line); - int i; - - for (i = 0; i < lm->blk_per_line; i++) { - struct pblk_lun *rlun = &pblk->luns[i]; - int pos = pblk_ppa_to_pos(geo, rlun->bppa); - int state = line->chks[pos].state; - - /* Free chunks should not be erased */ - if (state & NVM_CHK_ST_FREE) { - set_bit(pblk_ppa_to_pos(geo, rlun->bppa), - line->erase_bitmap); - blk_to_erase--; - } - } - - return blk_to_erase; -} - -static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - int blk_in_line = atomic_read(&line->blk_in_line); - int blk_to_erase; - - /* Bad blocks do not need to be erased */ - bitmap_copy(line->erase_bitmap, line->blk_bitmap, lm->blk_per_line); - - spin_lock(&line->lock); - - /* If we have not written to this line, we need to mark up free chunks - * as already erased - */ - if (line->state == PBLK_LINESTATE_NEW) { - blk_to_erase = pblk_prepare_new_line(pblk, line); - line->state = PBLK_LINESTATE_FREE; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - } else { - blk_to_erase = blk_in_line; - } - - if (blk_in_line < lm->min_blk_line) { - 
spin_unlock(&line->lock); - return -EAGAIN; - } - - if (line->state != PBLK_LINESTATE_FREE) { - WARN(1, "pblk: corrupted line %d, state %d\n", - line->id, line->state); - spin_unlock(&line->lock); - return -EINTR; - } - - line->state = PBLK_LINESTATE_OPEN; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - - atomic_set(&line->left_eblks, blk_to_erase); - atomic_set(&line->left_seblks, blk_to_erase); - - line->meta_distance = lm->meta_distance; - spin_unlock(&line->lock); - - kref_init(&line->ref); - atomic_set(&line->sec_to_update, 0); - - return 0; -} - -/* Line allocations in the recovery path are always single threaded */ -int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - int ret; - - spin_lock(&l_mg->free_lock); - l_mg->data_line = line; - list_del(&line->list); - - ret = pblk_line_prepare(pblk, line); - if (ret) { - list_add(&line->list, &l_mg->free_list); - spin_unlock(&l_mg->free_lock); - return ret; - } - spin_unlock(&l_mg->free_lock); - - ret = pblk_line_alloc_bitmaps(pblk, line); - if (ret) - goto fail; - - if (!pblk_line_init_bb(pblk, line, 0)) { - ret = -EINTR; - goto fail; - } - - pblk_rl_free_lines_dec(&pblk->rl, line, true); - return 0; - -fail: - spin_lock(&l_mg->free_lock); - list_add(&line->list, &l_mg->free_list); - spin_unlock(&l_mg->free_lock); - - return ret; -} - -void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - - mempool_free(line->map_bitmap, l_mg->bitmap_pool); - line->map_bitmap = NULL; - line->smeta = NULL; - line->emeta = NULL; -} - -static void pblk_line_reinit(struct pblk_line *line) -{ - *line->vsc = cpu_to_le32(EMPTY_ENTRY); - - line->map_bitmap = NULL; - line->invalid_bitmap = NULL; - line->smeta = NULL; - line->emeta = NULL; -} - -void pblk_line_free(struct pblk_line *line) -{ - struct pblk *pblk = line->pblk; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - - mempool_free(line->map_bitmap, l_mg->bitmap_pool); - mempool_free(line->invalid_bitmap, l_mg->bitmap_pool); - - pblk_line_reinit(line); -} - -struct pblk_line *pblk_line_get(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line *line; - int ret, bit; - - lockdep_assert_held(&l_mg->free_lock); - -retry: - if (list_empty(&l_mg->free_list)) { - pblk_err(pblk, "no free lines\n"); - return NULL; - } - - line = list_first_entry(&l_mg->free_list, struct pblk_line, list); - list_del(&line->list); - l_mg->nr_free_lines--; - - bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line); - if (unlikely(bit >= lm->blk_per_line)) { - spin_lock(&line->lock); - line->state = PBLK_LINESTATE_BAD; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - spin_unlock(&line->lock); - - list_add_tail(&line->list, &l_mg->bad_list); - - pblk_debug(pblk, "line %d is bad\n", line->id); - goto retry; - } - - ret = pblk_line_prepare(pblk, line); - if (ret) { - switch (ret) { - case -EAGAIN: - list_add(&line->list, &l_mg->bad_list); - goto retry; - case -EINTR: - list_add(&line->list, &l_mg->corrupt_list); - goto retry; - default: - pblk_err(pblk, "failed to prepare line %d\n", line->id); - list_add(&line->list, &l_mg->free_list); - l_mg->nr_free_lines++; - return NULL; - } - } - - return line; -} - -static struct pblk_line *pblk_line_retry(struct pblk *pblk, - struct pblk_line *line) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *retry_line; - -retry: - 
spin_lock(&l_mg->free_lock); - retry_line = pblk_line_get(pblk); - if (!retry_line) { - l_mg->data_line = NULL; - spin_unlock(&l_mg->free_lock); - return NULL; - } - - retry_line->map_bitmap = line->map_bitmap; - retry_line->invalid_bitmap = line->invalid_bitmap; - retry_line->smeta = line->smeta; - retry_line->emeta = line->emeta; - retry_line->meta_line = line->meta_line; - - pblk_line_reinit(line); - - l_mg->data_line = retry_line; - spin_unlock(&l_mg->free_lock); - - pblk_rl_free_lines_dec(&pblk->rl, line, false); - - if (pblk_line_erase(pblk, retry_line)) - goto retry; - - return retry_line; -} - -static void pblk_set_space_limit(struct pblk *pblk) -{ - struct pblk_rl *rl = &pblk->rl; - - atomic_set(&rl->rb_space, 0); -} - -struct pblk_line *pblk_line_get_first_data(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *line; - - spin_lock(&l_mg->free_lock); - line = pblk_line_get(pblk); - if (!line) { - spin_unlock(&l_mg->free_lock); - return NULL; - } - - line->seq_nr = l_mg->d_seq_nr++; - line->type = PBLK_LINETYPE_DATA; - l_mg->data_line = line; - - pblk_line_setup_metadata(line, l_mg, &pblk->lm); - - /* Allocate next line for preparation */ - l_mg->data_next = pblk_line_get(pblk); - if (!l_mg->data_next) { - /* If we cannot get a new line, we need to stop the pipeline. - * Only allow as many writes in as we can store safely and then - * fail gracefully - */ - pblk_set_space_limit(pblk); - - l_mg->data_next = NULL; - } else { - l_mg->data_next->seq_nr = l_mg->d_seq_nr++; - l_mg->data_next->type = PBLK_LINETYPE_DATA; - } - spin_unlock(&l_mg->free_lock); - - if (pblk_line_alloc_bitmaps(pblk, line)) - return NULL; - - if (pblk_line_erase(pblk, line)) { - line = pblk_line_retry(pblk, line); - if (!line) - return NULL; - } - -retry_setup: - if (!pblk_line_init_metadata(pblk, line, NULL)) { - line = pblk_line_retry(pblk, line); - if (!line) - return NULL; - - goto retry_setup; - } - - if (!pblk_line_init_bb(pblk, line, 1)) { - line = pblk_line_retry(pblk, line); - if (!line) - return NULL; - - goto retry_setup; - } - - pblk_rl_free_lines_dec(&pblk->rl, line, true); - - return line; -} - -void pblk_ppa_to_line_put(struct pblk *pblk, struct ppa_addr ppa) -{ - struct pblk_line *line; - - line = pblk_ppa_to_line(pblk, ppa); - kref_put(&line->ref, pblk_line_put_wq); -} - -void pblk_rq_to_line_put(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - int i; - - for (i = 0; i < rqd->nr_ppas; i++) - pblk_ppa_to_line_put(pblk, ppa_list[i]); -} - -static void pblk_stop_writes(struct pblk *pblk, struct pblk_line *line) -{ - lockdep_assert_held(&pblk->l_mg.free_lock); - - pblk_set_space_limit(pblk); - pblk->state = PBLK_STATE_STOPPING; - trace_pblk_state(pblk_disk_name(pblk), pblk->state); -} - -static void pblk_line_close_meta_sync(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line *line, *tline; - LIST_HEAD(list); - - spin_lock(&l_mg->close_lock); - if (list_empty(&l_mg->emeta_list)) { - spin_unlock(&l_mg->close_lock); - return; - } - - list_cut_position(&list, &l_mg->emeta_list, l_mg->emeta_list.prev); - spin_unlock(&l_mg->close_lock); - - list_for_each_entry_safe(line, tline, &list, list) { - struct pblk_emeta *emeta = line->emeta; - - while (emeta->mem < lm->emeta_len[0]) { - int ret; - - ret = pblk_submit_meta_io(pblk, line); - if (ret) { - pblk_err(pblk, "sync meta line %d failed (%d)\n", - line->id, ret); - return; - } - } - } - - 
pblk_wait_for_meta(pblk); - flush_workqueue(pblk->close_wq); -} - -void __pblk_pipeline_flush(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - int ret; - - spin_lock(&l_mg->free_lock); - if (pblk->state == PBLK_STATE_RECOVERING || - pblk->state == PBLK_STATE_STOPPED) { - spin_unlock(&l_mg->free_lock); - return; - } - pblk->state = PBLK_STATE_RECOVERING; - trace_pblk_state(pblk_disk_name(pblk), pblk->state); - spin_unlock(&l_mg->free_lock); - - pblk_flush_writer(pblk); - pblk_wait_for_meta(pblk); - - ret = pblk_recov_pad(pblk); - if (ret) { - pblk_err(pblk, "could not close data on teardown(%d)\n", ret); - return; - } - - flush_workqueue(pblk->bb_wq); - pblk_line_close_meta_sync(pblk); -} - -void __pblk_pipeline_stop(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - - spin_lock(&l_mg->free_lock); - pblk->state = PBLK_STATE_STOPPED; - trace_pblk_state(pblk_disk_name(pblk), pblk->state); - l_mg->data_line = NULL; - l_mg->data_next = NULL; - spin_unlock(&l_mg->free_lock); -} - -void pblk_pipeline_stop(struct pblk *pblk) -{ - __pblk_pipeline_flush(pblk); - __pblk_pipeline_stop(pblk); -} - -struct pblk_line *pblk_line_replace_data(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *cur, *new = NULL; - unsigned int left_seblks; - - new = l_mg->data_next; - if (!new) - goto out; - - spin_lock(&l_mg->free_lock); - cur = l_mg->data_line; - l_mg->data_line = new; - - pblk_line_setup_metadata(new, l_mg, &pblk->lm); - spin_unlock(&l_mg->free_lock); - -retry_erase: - left_seblks = atomic_read(&new->left_seblks); - if (left_seblks) { - /* If line is not fully erased, erase it */ - if (atomic_read(&new->left_eblks)) { - if (pblk_line_erase(pblk, new)) - goto out; - } else { - io_schedule(); - } - goto retry_erase; - } - - if (pblk_line_alloc_bitmaps(pblk, new)) - return NULL; - -retry_setup: - if (!pblk_line_init_metadata(pblk, new, cur)) { - new = pblk_line_retry(pblk, new); - if (!new) - goto out; - - goto retry_setup; - } - - if (!pblk_line_init_bb(pblk, new, 1)) { - new = pblk_line_retry(pblk, new); - if (!new) - goto out; - - goto retry_setup; - } - - pblk_rl_free_lines_dec(&pblk->rl, new, true); - - /* Allocate next line for preparation */ - spin_lock(&l_mg->free_lock); - l_mg->data_next = pblk_line_get(pblk); - if (!l_mg->data_next) { - /* If we cannot get a new line, we need to stop the pipeline. 
- * Only allow as many writes in as we can store safely and then - * fail gracefully - */ - pblk_stop_writes(pblk, new); - l_mg->data_next = NULL; - } else { - l_mg->data_next->seq_nr = l_mg->d_seq_nr++; - l_mg->data_next->type = PBLK_LINETYPE_DATA; - } - spin_unlock(&l_mg->free_lock); - -out: - return new; -} - -static void __pblk_line_put(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_gc *gc = &pblk->gc; - - spin_lock(&line->lock); - WARN_ON(line->state != PBLK_LINESTATE_GC); - if (line->w_err_gc->has_gc_err) { - spin_unlock(&line->lock); - pblk_err(pblk, "line %d had errors during GC\n", line->id); - pblk_put_line_back(pblk, line); - line->w_err_gc->has_gc_err = 0; - return; - } - - line->state = PBLK_LINESTATE_FREE; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - line->gc_group = PBLK_LINEGC_NONE; - pblk_line_free(line); - - if (line->w_err_gc->has_write_err) { - pblk_rl_werr_line_out(&pblk->rl); - line->w_err_gc->has_write_err = 0; - } - - spin_unlock(&line->lock); - atomic_dec(&gc->pipeline_gc); - - spin_lock(&l_mg->free_lock); - list_add_tail(&line->list, &l_mg->free_list); - l_mg->nr_free_lines++; - spin_unlock(&l_mg->free_lock); - - pblk_rl_free_lines_inc(&pblk->rl, line); -} - -static void pblk_line_put_ws(struct work_struct *work) -{ - struct pblk_line_ws *line_put_ws = container_of(work, - struct pblk_line_ws, ws); - struct pblk *pblk = line_put_ws->pblk; - struct pblk_line *line = line_put_ws->line; - - __pblk_line_put(pblk, line); - mempool_free(line_put_ws, &pblk->gen_ws_pool); -} - -void pblk_line_put(struct kref *ref) -{ - struct pblk_line *line = container_of(ref, struct pblk_line, ref); - struct pblk *pblk = line->pblk; - - __pblk_line_put(pblk, line); -} - -void pblk_line_put_wq(struct kref *ref) -{ - struct pblk_line *line = container_of(ref, struct pblk_line, ref); - struct pblk *pblk = line->pblk; - struct pblk_line_ws *line_put_ws; - - line_put_ws = mempool_alloc(&pblk->gen_ws_pool, GFP_ATOMIC); - if (!line_put_ws) - return; - - line_put_ws->pblk = pblk; - line_put_ws->line = line; - line_put_ws->priv = NULL; - - INIT_WORK(&line_put_ws->ws, pblk_line_put_ws); - queue_work(pblk->r_end_wq, &line_put_ws->ws); -} - -int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa) -{ - struct nvm_rq *rqd; - int err; - - rqd = pblk_alloc_rqd(pblk, PBLK_ERASE); - - pblk_setup_e_rq(pblk, rqd, ppa); - - rqd->end_io = pblk_end_io_erase; - rqd->private = pblk; - - trace_pblk_chunk_reset(pblk_disk_name(pblk), - &ppa, PBLK_CHUNK_RESET_START); - - /* The write thread schedules erases so that it minimizes disturbances - * with writes. Thus, there is no need to take the LUN semaphore. 
- */ - err = pblk_submit_io(pblk, rqd, NULL); - if (err) { - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - - pblk_err(pblk, "could not async erase line:%d,blk:%d\n", - pblk_ppa_to_line_id(ppa), - pblk_ppa_to_pos(geo, ppa)); - } - - return err; -} - -struct pblk_line *pblk_line_get_data(struct pblk *pblk) -{ - return pblk->l_mg.data_line; -} - -/* For now, always erase next line */ -struct pblk_line *pblk_line_get_erase(struct pblk *pblk) -{ - return pblk->l_mg.data_next; -} - -int pblk_line_is_full(struct pblk_line *line) -{ - return (line->left_msecs == 0); -} - -static void pblk_line_should_sync_meta(struct pblk *pblk) -{ - if (pblk_rl_is_limit(&pblk->rl)) - pblk_line_close_meta_sync(pblk); -} - -void pblk_line_close(struct pblk *pblk, struct pblk_line *line) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct list_head *move_list; - int i; - -#ifdef CONFIG_NVM_PBLK_DEBUG - WARN(!bitmap_full(line->map_bitmap, lm->sec_per_line), - "pblk: corrupt closed line %d\n", line->id); -#endif - - spin_lock(&l_mg->free_lock); - WARN_ON(!test_and_clear_bit(line->meta_line, &l_mg->meta_bitmap)); - spin_unlock(&l_mg->free_lock); - - spin_lock(&l_mg->gc_lock); - spin_lock(&line->lock); - WARN_ON(line->state != PBLK_LINESTATE_OPEN); - line->state = PBLK_LINESTATE_CLOSED; - move_list = pblk_line_gc_list(pblk, line); - list_add_tail(&line->list, move_list); - - mempool_free(line->map_bitmap, l_mg->bitmap_pool); - line->map_bitmap = NULL; - line->smeta = NULL; - line->emeta = NULL; - - for (i = 0; i < lm->blk_per_line; i++) { - struct pblk_lun *rlun = &pblk->luns[i]; - int pos = pblk_ppa_to_pos(geo, rlun->bppa); - int state = line->chks[pos].state; - - if (!(state & NVM_CHK_ST_OFFLINE)) - state = NVM_CHK_ST_CLOSED; - } - - spin_unlock(&line->lock); - spin_unlock(&l_mg->gc_lock); - - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); -} - -void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_emeta *emeta = line->emeta; - struct line_emeta *emeta_buf = emeta->buf; - struct wa_counters *wa = emeta_to_wa(lm, emeta_buf); - - /* No need for exact vsc value; avoid a big line lock and take aprox. 
*/ - memcpy(emeta_to_vsc(pblk, emeta_buf), l_mg->vsc_list, lm->vsc_list_len); - memcpy(emeta_to_bb(emeta_buf), line->blk_bitmap, lm->blk_bitmap_len); - - wa->user = cpu_to_le64(atomic64_read(&pblk->user_wa)); - wa->pad = cpu_to_le64(atomic64_read(&pblk->pad_wa)); - wa->gc = cpu_to_le64(atomic64_read(&pblk->gc_wa)); - - if (le32_to_cpu(emeta_buf->header.identifier) != PBLK_MAGIC) { - emeta_buf->header.identifier = cpu_to_le32(PBLK_MAGIC); - export_guid(emeta_buf->header.uuid, &pblk->instance_uuid); - emeta_buf->header.id = cpu_to_le32(line->id); - emeta_buf->header.type = cpu_to_le16(line->type); - emeta_buf->header.version_major = EMETA_VERSION_MAJOR; - emeta_buf->header.version_minor = EMETA_VERSION_MINOR; - emeta_buf->header.crc = cpu_to_le32( - pblk_calc_meta_header_crc(pblk, &emeta_buf->header)); - } - - emeta_buf->nr_valid_lbas = cpu_to_le64(line->nr_valid_lbas); - emeta_buf->crc = cpu_to_le32(pblk_calc_emeta_crc(pblk, emeta_buf)); - - spin_lock(&l_mg->close_lock); - spin_lock(&line->lock); - - /* Update the in-memory start address for emeta, in case it has - * shifted due to write errors - */ - if (line->emeta_ssec != line->cur_sec) - line->emeta_ssec = line->cur_sec; - - list_add_tail(&line->list, &l_mg->emeta_list); - spin_unlock(&line->lock); - spin_unlock(&l_mg->close_lock); - - pblk_line_should_sync_meta(pblk); -} - -static void pblk_save_lba_list(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - unsigned int lba_list_size = lm->emeta_len[2]; - struct pblk_w_err_gc *w_err_gc = line->w_err_gc; - struct pblk_emeta *emeta = line->emeta; - - w_err_gc->lba_list = kvmalloc(lba_list_size, GFP_KERNEL); - memcpy(w_err_gc->lba_list, emeta_to_lbas(pblk, emeta->buf), - lba_list_size); -} - -void pblk_line_close_ws(struct work_struct *work) -{ - struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws, - ws); - struct pblk *pblk = line_ws->pblk; - struct pblk_line *line = line_ws->line; - struct pblk_w_err_gc *w_err_gc = line->w_err_gc; - - /* Write errors makes the emeta start address stored in smeta invalid, - * so keep a copy of the lba list until we've gc'd the line - */ - if (w_err_gc->has_write_err) - pblk_save_lba_list(pblk, line); - - pblk_line_close(pblk, line); - mempool_free(line_ws, &pblk->gen_ws_pool); -} - -void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, - void (*work)(struct work_struct *), gfp_t gfp_mask, - struct workqueue_struct *wq) -{ - struct pblk_line_ws *line_ws; - - line_ws = mempool_alloc(&pblk->gen_ws_pool, gfp_mask); - if (!line_ws) { - pblk_err(pblk, "pblk: could not allocate memory\n"); - return; - } - - line_ws->pblk = pblk; - line_ws->line = line; - line_ws->priv = priv; - - INIT_WORK(&line_ws->ws, work); - queue_work(wq, &line_ws->ws); -} - -static void __pblk_down_chunk(struct pblk *pblk, int pos) -{ - struct pblk_lun *rlun = &pblk->luns[pos]; - int ret; - - /* - * Only send one inflight I/O per LUN. 
Since we map at a page - * granurality, all ppas in the I/O will map to the same LUN - */ - - ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(30000)); - if (ret == -ETIME || ret == -EINTR) - pblk_err(pblk, "taking lun semaphore timed out: err %d\n", - -ret); -} - -void pblk_down_chunk(struct pblk *pblk, struct ppa_addr ppa) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - int pos = pblk_ppa_to_pos(geo, ppa); - - __pblk_down_chunk(pblk, pos); -} - -void pblk_down_rq(struct pblk *pblk, struct ppa_addr ppa, - unsigned long *lun_bitmap) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - int pos = pblk_ppa_to_pos(geo, ppa); - - /* If the LUN has been locked for this same request, do no attempt to - * lock it again - */ - if (test_and_set_bit(pos, lun_bitmap)) - return; - - __pblk_down_chunk(pblk, pos); -} - -void pblk_up_chunk(struct pblk *pblk, struct ppa_addr ppa) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_lun *rlun; - int pos = pblk_ppa_to_pos(geo, ppa); - - rlun = &pblk->luns[pos]; - up(&rlun->wr_sem); -} - -void pblk_up_rq(struct pblk *pblk, unsigned long *lun_bitmap) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_lun *rlun; - int num_lun = geo->all_luns; - int bit = -1; - - while ((bit = find_next_bit(lun_bitmap, num_lun, bit + 1)) < num_lun) { - rlun = &pblk->luns[bit]; - up(&rlun->wr_sem); - } -} - -void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) -{ - struct ppa_addr ppa_l2p; - - /* logic error: lba out-of-bounds. Ignore update */ - if (!(lba < pblk->capacity)) { - WARN(1, "pblk: corrupted L2P map request\n"); - return; - } - - spin_lock(&pblk->trans_lock); - ppa_l2p = pblk_trans_map_get(pblk, lba); - - if (!pblk_addr_in_cache(ppa_l2p) && !pblk_ppa_empty(ppa_l2p)) - pblk_map_invalidate(pblk, ppa_l2p); - - pblk_trans_map_set(pblk, lba, ppa); - spin_unlock(&pblk->trans_lock); -} - -void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) -{ - -#ifdef CONFIG_NVM_PBLK_DEBUG - /* Callers must ensure that the ppa points to a cache address */ - BUG_ON(!pblk_addr_in_cache(ppa)); - BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa))); -#endif - - pblk_update_map(pblk, lba, ppa); -} - -int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa_new, - struct pblk_line *gc_line, u64 paddr_gc) -{ - struct ppa_addr ppa_l2p, ppa_gc; - int ret = 1; - -#ifdef CONFIG_NVM_PBLK_DEBUG - /* Callers must ensure that the ppa points to a cache address */ - BUG_ON(!pblk_addr_in_cache(ppa_new)); - BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa_new))); -#endif - - /* logic error: lba out-of-bounds. 
Ignore update */ - if (!(lba < pblk->capacity)) { - WARN(1, "pblk: corrupted L2P map request\n"); - return 0; - } - - spin_lock(&pblk->trans_lock); - ppa_l2p = pblk_trans_map_get(pblk, lba); - ppa_gc = addr_to_gen_ppa(pblk, paddr_gc, gc_line->id); - - if (!pblk_ppa_comp(ppa_l2p, ppa_gc)) { - spin_lock(&gc_line->lock); - WARN(!test_bit(paddr_gc, gc_line->invalid_bitmap), - "pblk: corrupted GC update"); - spin_unlock(&gc_line->lock); - - ret = 0; - goto out; - } - - pblk_trans_map_set(pblk, lba, ppa_new); -out: - spin_unlock(&pblk->trans_lock); - return ret; -} - -void pblk_update_map_dev(struct pblk *pblk, sector_t lba, - struct ppa_addr ppa_mapped, struct ppa_addr ppa_cache) -{ - struct ppa_addr ppa_l2p; - -#ifdef CONFIG_NVM_PBLK_DEBUG - /* Callers must ensure that the ppa points to a device address */ - BUG_ON(pblk_addr_in_cache(ppa_mapped)); -#endif - /* Invalidate and discard padded entries */ - if (lba == ADDR_EMPTY) { - atomic64_inc(&pblk->pad_wa); -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_inc(&pblk->padded_wb); -#endif - if (!pblk_ppa_empty(ppa_mapped)) - pblk_map_invalidate(pblk, ppa_mapped); - return; - } - - /* logic error: lba out-of-bounds. Ignore update */ - if (!(lba < pblk->capacity)) { - WARN(1, "pblk: corrupted L2P map request\n"); - return; - } - - spin_lock(&pblk->trans_lock); - ppa_l2p = pblk_trans_map_get(pblk, lba); - - /* Do not update L2P if the cacheline has been updated. In this case, - * the mapped ppa must be invalidated - */ - if (!pblk_ppa_comp(ppa_l2p, ppa_cache)) { - if (!pblk_ppa_empty(ppa_mapped)) - pblk_map_invalidate(pblk, ppa_mapped); - goto out; - } - -#ifdef CONFIG_NVM_PBLK_DEBUG - WARN_ON(!pblk_addr_in_cache(ppa_l2p) && !pblk_ppa_empty(ppa_l2p)); -#endif - - pblk_trans_map_set(pblk, lba, ppa_mapped); -out: - spin_unlock(&pblk->trans_lock); -} - -int pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas, - sector_t blba, int nr_secs, bool *from_cache) -{ - int i; - - spin_lock(&pblk->trans_lock); - for (i = 0; i < nr_secs; i++) { - struct ppa_addr ppa; - - ppa = ppas[i] = pblk_trans_map_get(pblk, blba + i); - - /* If the L2P entry maps to a line, the reference is valid */ - if (!pblk_ppa_empty(ppa) && !pblk_addr_in_cache(ppa)) { - struct pblk_line *line = pblk_ppa_to_line(pblk, ppa); - - if (i > 0 && *from_cache) - break; - *from_cache = false; - - kref_get(&line->ref); - } else { - if (i > 0 && !*from_cache) - break; - *from_cache = true; - } - } - spin_unlock(&pblk->trans_lock); - return i; -} - -void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas, - u64 *lba_list, int nr_secs) -{ - u64 lba; - int i; - - spin_lock(&pblk->trans_lock); - for (i = 0; i < nr_secs; i++) { - lba = lba_list[i]; - if (lba != ADDR_EMPTY) { - /* logic error: lba out-of-bounds. 
Ignore update */ - if (!(lba < pblk->capacity)) { - WARN(1, "pblk: corrupted L2P map request\n"); - continue; - } - ppas[i] = pblk_trans_map_get(pblk, lba); - } - } - spin_unlock(&pblk->trans_lock); -} - -void *pblk_get_meta_for_writes(struct pblk *pblk, struct nvm_rq *rqd) -{ - void *buffer; - - if (pblk_is_oob_meta_supported(pblk)) { - /* Just use OOB metadata buffer as always */ - buffer = rqd->meta_list; - } else { - /* We need to reuse last page of request (packed metadata) - * in similar way as traditional oob metadata - */ - buffer = page_to_virt( - rqd->bio->bi_io_vec[rqd->bio->bi_vcnt - 1].bv_page); - } - - return buffer; -} - -void pblk_get_packed_meta(struct pblk *pblk, struct nvm_rq *rqd) -{ - void *meta_list = rqd->meta_list; - void *page; - int i = 0; - - if (pblk_is_oob_meta_supported(pblk)) - return; - - page = page_to_virt(rqd->bio->bi_io_vec[rqd->bio->bi_vcnt - 1].bv_page); - /* We need to fill oob meta buffer with data from packed metadata */ - for (; i < rqd->nr_ppas; i++) - memcpy(pblk_get_meta(pblk, meta_list, i), - page + (i * sizeof(struct pblk_sec_meta)), - sizeof(struct pblk_sec_meta)); -} diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c deleted file mode 100644 index b31658be35a7..000000000000 --- a/drivers/lightnvm/pblk-gc.c +++ /dev/null @@ -1,726 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * pblk-gc.c - pblk's garbage collector - */ - -#include "pblk.h" -#include "pblk-trace.h" -#include - - -static void pblk_gc_free_gc_rq(struct pblk_gc_rq *gc_rq) -{ - vfree(gc_rq->data); - kfree(gc_rq); -} - -static int pblk_gc_write(struct pblk *pblk) -{ - struct pblk_gc *gc = &pblk->gc; - struct pblk_gc_rq *gc_rq, *tgc_rq; - LIST_HEAD(w_list); - - spin_lock(&gc->w_lock); - if (list_empty(&gc->w_list)) { - spin_unlock(&gc->w_lock); - return 1; - } - - list_cut_position(&w_list, &gc->w_list, gc->w_list.prev); - gc->w_entries = 0; - spin_unlock(&gc->w_lock); - - list_for_each_entry_safe(gc_rq, tgc_rq, &w_list, list) { - pblk_write_gc_to_cache(pblk, gc_rq); - list_del(&gc_rq->list); - kref_put(&gc_rq->line->ref, pblk_line_put); - pblk_gc_free_gc_rq(gc_rq); - } - - return 0; -} - -static void pblk_gc_writer_kick(struct pblk_gc *gc) -{ - wake_up_process(gc->gc_writer_ts); -} - -void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct list_head *move_list; - - spin_lock(&l_mg->gc_lock); - spin_lock(&line->lock); - WARN_ON(line->state != PBLK_LINESTATE_GC); - line->state = PBLK_LINESTATE_CLOSED; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - - /* We need to reset gc_group in order to ensure that - * pblk_line_gc_list will return proper move_list - * since right now current line is not on any of the - * gc lists. 
- */ - line->gc_group = PBLK_LINEGC_NONE; - move_list = pblk_line_gc_list(pblk, line); - spin_unlock(&line->lock); - list_add_tail(&line->list, move_list); - spin_unlock(&l_mg->gc_lock); -} - -static void pblk_gc_line_ws(struct work_struct *work) -{ - struct pblk_line_ws *gc_rq_ws = container_of(work, - struct pblk_line_ws, ws); - struct pblk *pblk = gc_rq_ws->pblk; - struct pblk_gc *gc = &pblk->gc; - struct pblk_line *line = gc_rq_ws->line; - struct pblk_gc_rq *gc_rq = gc_rq_ws->priv; - int ret; - - up(&gc->gc_sem); - - /* Read from GC victim block */ - ret = pblk_submit_read_gc(pblk, gc_rq); - if (ret) { - line->w_err_gc->has_gc_err = 1; - goto out; - } - - if (!gc_rq->secs_to_gc) - goto out; - -retry: - spin_lock(&gc->w_lock); - if (gc->w_entries >= PBLK_GC_RQ_QD) { - spin_unlock(&gc->w_lock); - pblk_gc_writer_kick(&pblk->gc); - usleep_range(128, 256); - goto retry; - } - gc->w_entries++; - list_add_tail(&gc_rq->list, &gc->w_list); - spin_unlock(&gc->w_lock); - - pblk_gc_writer_kick(&pblk->gc); - - kfree(gc_rq_ws); - return; - -out: - pblk_gc_free_gc_rq(gc_rq); - kref_put(&line->ref, pblk_line_put); - kfree(gc_rq_ws); -} - -static __le64 *get_lba_list_from_emeta(struct pblk *pblk, - struct pblk_line *line) -{ - struct line_emeta *emeta_buf; - struct pblk_line_meta *lm = &pblk->lm; - unsigned int lba_list_size = lm->emeta_len[2]; - __le64 *lba_list; - int ret; - - emeta_buf = kvmalloc(lm->emeta_len[0], GFP_KERNEL); - if (!emeta_buf) - return NULL; - - ret = pblk_line_emeta_read(pblk, line, emeta_buf); - if (ret) { - pblk_err(pblk, "line %d read emeta failed (%d)\n", - line->id, ret); - kvfree(emeta_buf); - return NULL; - } - - /* If this read fails, it means that emeta is corrupted. - * For now, leave the line untouched. - * TODO: Implement a recovery routine that scans and moves - * all sectors on the line. 
- */ - - ret = pblk_recov_check_emeta(pblk, emeta_buf); - if (ret) { - pblk_err(pblk, "inconsistent emeta (line %d)\n", - line->id); - kvfree(emeta_buf); - return NULL; - } - - lba_list = kvmalloc(lba_list_size, GFP_KERNEL); - - if (lba_list) - memcpy(lba_list, emeta_to_lbas(pblk, emeta_buf), lba_list_size); - - kvfree(emeta_buf); - - return lba_list; -} - -static void pblk_gc_line_prepare_ws(struct work_struct *work) -{ - struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws, - ws); - struct pblk *pblk = line_ws->pblk; - struct pblk_line *line = line_ws->line; - struct pblk_line_meta *lm = &pblk->lm; - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_gc *gc = &pblk->gc; - struct pblk_line_ws *gc_rq_ws; - struct pblk_gc_rq *gc_rq; - __le64 *lba_list; - unsigned long *invalid_bitmap; - int sec_left, nr_secs, bit; - - invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_KERNEL); - if (!invalid_bitmap) - goto fail_free_ws; - - if (line->w_err_gc->has_write_err) { - lba_list = line->w_err_gc->lba_list; - line->w_err_gc->lba_list = NULL; - } else { - lba_list = get_lba_list_from_emeta(pblk, line); - if (!lba_list) { - pblk_err(pblk, "could not interpret emeta (line %d)\n", - line->id); - goto fail_free_invalid_bitmap; - } - } - - spin_lock(&line->lock); - bitmap_copy(invalid_bitmap, line->invalid_bitmap, lm->sec_per_line); - sec_left = pblk_line_vsc(line); - spin_unlock(&line->lock); - - if (sec_left < 0) { - pblk_err(pblk, "corrupted GC line (%d)\n", line->id); - goto fail_free_lba_list; - } - - bit = -1; -next_rq: - gc_rq = kmalloc(sizeof(struct pblk_gc_rq), GFP_KERNEL); - if (!gc_rq) - goto fail_free_lba_list; - - nr_secs = 0; - do { - bit = find_next_zero_bit(invalid_bitmap, lm->sec_per_line, - bit + 1); - if (bit > line->emeta_ssec) - break; - - gc_rq->paddr_list[nr_secs] = bit; - gc_rq->lba_list[nr_secs++] = le64_to_cpu(lba_list[bit]); - } while (nr_secs < pblk->max_write_pgs); - - if (unlikely(!nr_secs)) { - kfree(gc_rq); - goto out; - } - - gc_rq->nr_secs = nr_secs; - gc_rq->line = line; - - gc_rq->data = vmalloc(array_size(gc_rq->nr_secs, geo->csecs)); - if (!gc_rq->data) - goto fail_free_gc_rq; - - gc_rq_ws = kmalloc(sizeof(struct pblk_line_ws), GFP_KERNEL); - if (!gc_rq_ws) - goto fail_free_gc_data; - - gc_rq_ws->pblk = pblk; - gc_rq_ws->line = line; - gc_rq_ws->priv = gc_rq; - - /* The write GC path can be much slower than the read GC one due to - * the budget imposed by the rate-limiter. Balance in case that we get - * back pressure from the write GC path. - */ - while (down_timeout(&gc->gc_sem, msecs_to_jiffies(30000))) - io_schedule(); - - kref_get(&line->ref); - - INIT_WORK(&gc_rq_ws->ws, pblk_gc_line_ws); - queue_work(gc->gc_line_reader_wq, &gc_rq_ws->ws); - - sec_left -= nr_secs; - if (sec_left > 0) - goto next_rq; - -out: - kvfree(lba_list); - kfree(line_ws); - kfree(invalid_bitmap); - - kref_put(&line->ref, pblk_line_put); - atomic_dec(&gc->read_inflight_gc); - - return; - -fail_free_gc_data: - vfree(gc_rq->data); -fail_free_gc_rq: - kfree(gc_rq); -fail_free_lba_list: - kvfree(lba_list); -fail_free_invalid_bitmap: - kfree(invalid_bitmap); -fail_free_ws: - kfree(line_ws); - - /* Line goes back to closed state, so we cannot release additional - * reference for line, since we do that only when we want to do - * gc to free line state transition. 
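pblk_gc_line_prepare_ws(), removed above, snapshots the line's invalid bitmap and batches the remaining valid sectors into GC requests of at most pblk->max_write_pgs entries. The batching loop boils down to something like this sketch (naive bitmap helper in place of find_next_zero_bit(), hypothetical names):

#include <stddef.h>
#include <stdint.h>

/* Naive stand-in for find_next_zero_bit(): next index >= from with bit clear. */
static size_t next_valid_sec(const uint8_t *invalid, size_t nbits, size_t from)
{
        size_t i;

        for (i = from; i < nbits; i++)
                if (!(invalid[i / 8] & (1u << (i % 8))))
                        return i;
        return nbits;
}

/* Collect up to max_batch valid sector offsets, resuming from *pos. */
static size_t gc_collect_batch(const uint8_t *invalid, size_t sec_per_line,
                               size_t *pos, size_t *batch, size_t max_batch)
{
        size_t n = 0;

        while (n < max_batch) {
                size_t bit = next_valid_sec(invalid, sec_per_line, *pos);

                if (bit >= sec_per_line)
                        break;
                batch[n++] = bit;
                *pos = bit + 1;
        }
        return n;
}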
- */ - pblk_put_line_back(pblk, line); - atomic_dec(&gc->read_inflight_gc); - - pblk_err(pblk, "failed to GC line %d\n", line->id); -} - -static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_gc *gc = &pblk->gc; - struct pblk_line_ws *line_ws; - - pblk_debug(pblk, "line '%d' being reclaimed for GC\n", line->id); - - line_ws = kmalloc(sizeof(struct pblk_line_ws), GFP_KERNEL); - if (!line_ws) - return -ENOMEM; - - line_ws->pblk = pblk; - line_ws->line = line; - - atomic_inc(&gc->pipeline_gc); - INIT_WORK(&line_ws->ws, pblk_gc_line_prepare_ws); - queue_work(gc->gc_reader_wq, &line_ws->ws); - - return 0; -} - -static void pblk_gc_reader_kick(struct pblk_gc *gc) -{ - wake_up_process(gc->gc_reader_ts); -} - -static void pblk_gc_kick(struct pblk *pblk) -{ - struct pblk_gc *gc = &pblk->gc; - - pblk_gc_writer_kick(gc); - pblk_gc_reader_kick(gc); - - /* If we're shutting down GC, let's not start it up again */ - if (gc->gc_enabled) { - wake_up_process(gc->gc_ts); - mod_timer(&gc->gc_timer, - jiffies + msecs_to_jiffies(GC_TIME_MSECS)); - } -} - -static int pblk_gc_read(struct pblk *pblk) -{ - struct pblk_gc *gc = &pblk->gc; - struct pblk_line *line; - - spin_lock(&gc->r_lock); - if (list_empty(&gc->r_list)) { - spin_unlock(&gc->r_lock); - return 1; - } - - line = list_first_entry(&gc->r_list, struct pblk_line, list); - list_del(&line->list); - spin_unlock(&gc->r_lock); - - pblk_gc_kick(pblk); - - if (pblk_gc_line(pblk, line)) { - pblk_err(pblk, "failed to GC line %d\n", line->id); - /* rollback */ - spin_lock(&gc->r_lock); - list_add_tail(&line->list, &gc->r_list); - spin_unlock(&gc->r_lock); - } - - return 0; -} - -static struct pblk_line *pblk_gc_get_victim_line(struct pblk *pblk, - struct list_head *group_list) -{ - struct pblk_line *line, *victim; - unsigned int line_vsc = ~0x0L, victim_vsc = ~0x0L; - - victim = list_first_entry(group_list, struct pblk_line, list); - - list_for_each_entry(line, group_list, list) { - if (!atomic_read(&line->sec_to_update)) - line_vsc = le32_to_cpu(*line->vsc); - if (line_vsc < victim_vsc) { - victim = line; - victim_vsc = le32_to_cpu(*victim->vsc); - } - } - - if (victim_vsc == ~0x0) - return NULL; - - return victim; -} - -static bool pblk_gc_should_run(struct pblk_gc *gc, struct pblk_rl *rl) -{ - unsigned int nr_blocks_free, nr_blocks_need; - unsigned int werr_lines = atomic_read(&rl->werr_lines); - - nr_blocks_need = pblk_rl_high_thrs(rl); - nr_blocks_free = pblk_rl_nr_free_blks(rl); - - /* This is not critical, no need to take lock here */ - return ((werr_lines > 0) || - ((gc->gc_active) && (nr_blocks_need > nr_blocks_free))); -} - -void pblk_gc_free_full_lines(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_gc *gc = &pblk->gc; - struct pblk_line *line; - - do { - spin_lock(&l_mg->gc_lock); - if (list_empty(&l_mg->gc_full_list)) { - spin_unlock(&l_mg->gc_lock); - return; - } - - line = list_first_entry(&l_mg->gc_full_list, - struct pblk_line, list); - - spin_lock(&line->lock); - WARN_ON(line->state != PBLK_LINESTATE_CLOSED); - line->state = PBLK_LINESTATE_GC; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - spin_unlock(&line->lock); - - list_del(&line->list); - spin_unlock(&l_mg->gc_lock); - - atomic_inc(&gc->pipeline_gc); - kref_put(&line->ref, pblk_line_put); - } while (1); -} - -/* - * Lines with no valid sectors will be returned to the free list immediately. 
If - * GC is activated - either because the free block count is under the determined - * threshold, or because it is being forced from user space - only lines with a - * high count of invalid sectors will be recycled. - */ -static void pblk_gc_run(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_gc *gc = &pblk->gc; - struct pblk_line *line; - struct list_head *group_list; - bool run_gc; - int read_inflight_gc, gc_group = 0, prev_group = 0; - - pblk_gc_free_full_lines(pblk); - - run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl); - if (!run_gc || (atomic_read(&gc->read_inflight_gc) >= PBLK_GC_L_QD)) - return; - -next_gc_group: - group_list = l_mg->gc_lists[gc_group++]; - - do { - spin_lock(&l_mg->gc_lock); - - line = pblk_gc_get_victim_line(pblk, group_list); - if (!line) { - spin_unlock(&l_mg->gc_lock); - break; - } - - spin_lock(&line->lock); - WARN_ON(line->state != PBLK_LINESTATE_CLOSED); - line->state = PBLK_LINESTATE_GC; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - spin_unlock(&line->lock); - - list_del(&line->list); - spin_unlock(&l_mg->gc_lock); - - spin_lock(&gc->r_lock); - list_add_tail(&line->list, &gc->r_list); - spin_unlock(&gc->r_lock); - - read_inflight_gc = atomic_inc_return(&gc->read_inflight_gc); - pblk_gc_reader_kick(gc); - - prev_group = 1; - - /* No need to queue up more GC lines than we can handle */ - run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl); - if (!run_gc || read_inflight_gc >= PBLK_GC_L_QD) - break; - } while (1); - - if (!prev_group && pblk->rl.rb_state > gc_group && - gc_group < PBLK_GC_NR_LISTS) - goto next_gc_group; -} - -static void pblk_gc_timer(struct timer_list *t) -{ - struct pblk *pblk = from_timer(pblk, t, gc.gc_timer); - - pblk_gc_kick(pblk); -} - -static int pblk_gc_ts(void *data) -{ - struct pblk *pblk = data; - - while (!kthread_should_stop()) { - pblk_gc_run(pblk); - set_current_state(TASK_INTERRUPTIBLE); - io_schedule(); - } - - return 0; -} - -static int pblk_gc_writer_ts(void *data) -{ - struct pblk *pblk = data; - - while (!kthread_should_stop()) { - if (!pblk_gc_write(pblk)) - continue; - set_current_state(TASK_INTERRUPTIBLE); - io_schedule(); - } - - return 0; -} - -static int pblk_gc_reader_ts(void *data) -{ - struct pblk *pblk = data; - struct pblk_gc *gc = &pblk->gc; - - while (!kthread_should_stop()) { - if (!pblk_gc_read(pblk)) - continue; - set_current_state(TASK_INTERRUPTIBLE); - io_schedule(); - } - -#ifdef CONFIG_NVM_PBLK_DEBUG - pblk_info(pblk, "flushing gc pipeline, %d lines left\n", - atomic_read(&gc->pipeline_gc)); -#endif - - do { - if (!atomic_read(&gc->pipeline_gc)) - break; - - schedule(); - } while (1); - - return 0; -} - -static void pblk_gc_start(struct pblk *pblk) -{ - pblk->gc.gc_active = 1; - pblk_debug(pblk, "gc start\n"); -} - -void pblk_gc_should_start(struct pblk *pblk) -{ - struct pblk_gc *gc = &pblk->gc; - - if (gc->gc_enabled && !gc->gc_active) { - pblk_gc_start(pblk); - pblk_gc_kick(pblk); - } -} - -void pblk_gc_should_stop(struct pblk *pblk) -{ - struct pblk_gc *gc = &pblk->gc; - - if (gc->gc_active && !gc->gc_forced) - gc->gc_active = 0; -} - -void pblk_gc_should_kick(struct pblk *pblk) -{ - pblk_rl_update_rates(&pblk->rl); -} - -void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, - int *gc_active) -{ - struct pblk_gc *gc = &pblk->gc; - - spin_lock(&gc->lock); - *gc_enabled = gc->gc_enabled; - *gc_active = gc->gc_active; - spin_unlock(&gc->lock); -} - -int pblk_gc_sysfs_force(struct pblk *pblk, int force) -{ - struct pblk_gc *gc = 
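The victim selection used by the GC run loop above (the removed pblk_gc_get_victim_line()) is a linear scan for the line with the fewest valid sectors, skipping lines that still have map updates pending. Roughly (user-space sketch with a simplified line descriptor):

#include <stddef.h>
#include <stdint.h>

struct line_stat {
        uint32_t valid_secs;            /* valid sector count, like *line->vsc */
        int updates_pending;            /* like line->sec_to_update != 0 */
};

/* Return the index of the emptiest eligible line, or -1 if none qualifies. */
static int pick_victim(const struct line_stat *lines, size_t nr_lines)
{
        uint32_t best_vsc = UINT32_MAX;
        int victim = -1;
        size_t i;

        for (i = 0; i < nr_lines; i++) {
                if (lines[i].updates_pending)
                        continue;
                if (lines[i].valid_secs < best_vsc) {
                        best_vsc = lines[i].valid_secs;
                        victim = (int)i;
                }
        }
        return victim;
}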
&pblk->gc; - - if (force < 0 || force > 1) - return -EINVAL; - - spin_lock(&gc->lock); - gc->gc_forced = force; - - if (force) - gc->gc_enabled = 1; - else - gc->gc_enabled = 0; - spin_unlock(&gc->lock); - - pblk_gc_should_start(pblk); - - return 0; -} - -int pblk_gc_init(struct pblk *pblk) -{ - struct pblk_gc *gc = &pblk->gc; - int ret; - - gc->gc_ts = kthread_create(pblk_gc_ts, pblk, "pblk-gc-ts"); - if (IS_ERR(gc->gc_ts)) { - pblk_err(pblk, "could not allocate GC main kthread\n"); - return PTR_ERR(gc->gc_ts); - } - - gc->gc_writer_ts = kthread_create(pblk_gc_writer_ts, pblk, - "pblk-gc-writer-ts"); - if (IS_ERR(gc->gc_writer_ts)) { - pblk_err(pblk, "could not allocate GC writer kthread\n"); - ret = PTR_ERR(gc->gc_writer_ts); - goto fail_free_main_kthread; - } - - gc->gc_reader_ts = kthread_create(pblk_gc_reader_ts, pblk, - "pblk-gc-reader-ts"); - if (IS_ERR(gc->gc_reader_ts)) { - pblk_err(pblk, "could not allocate GC reader kthread\n"); - ret = PTR_ERR(gc->gc_reader_ts); - goto fail_free_writer_kthread; - } - - timer_setup(&gc->gc_timer, pblk_gc_timer, 0); - mod_timer(&gc->gc_timer, jiffies + msecs_to_jiffies(GC_TIME_MSECS)); - - gc->gc_active = 0; - gc->gc_forced = 0; - gc->gc_enabled = 1; - gc->w_entries = 0; - atomic_set(&gc->read_inflight_gc, 0); - atomic_set(&gc->pipeline_gc, 0); - - /* Workqueue that reads valid sectors from a line and submit them to the - * GC writer to be recycled. - */ - gc->gc_line_reader_wq = alloc_workqueue("pblk-gc-line-reader-wq", - WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_GC_MAX_READERS); - if (!gc->gc_line_reader_wq) { - pblk_err(pblk, "could not allocate GC line reader workqueue\n"); - ret = -ENOMEM; - goto fail_free_reader_kthread; - } - - /* Workqueue that prepare lines for GC */ - gc->gc_reader_wq = alloc_workqueue("pblk-gc-line_wq", - WQ_MEM_RECLAIM | WQ_UNBOUND, 1); - if (!gc->gc_reader_wq) { - pblk_err(pblk, "could not allocate GC reader workqueue\n"); - ret = -ENOMEM; - goto fail_free_reader_line_wq; - } - - spin_lock_init(&gc->lock); - spin_lock_init(&gc->w_lock); - spin_lock_init(&gc->r_lock); - - sema_init(&gc->gc_sem, PBLK_GC_RQ_QD); - - INIT_LIST_HEAD(&gc->w_list); - INIT_LIST_HEAD(&gc->r_list); - - return 0; - -fail_free_reader_line_wq: - destroy_workqueue(gc->gc_line_reader_wq); -fail_free_reader_kthread: - kthread_stop(gc->gc_reader_ts); -fail_free_writer_kthread: - kthread_stop(gc->gc_writer_ts); -fail_free_main_kthread: - kthread_stop(gc->gc_ts); - - return ret; -} - -void pblk_gc_exit(struct pblk *pblk, bool graceful) -{ - struct pblk_gc *gc = &pblk->gc; - - gc->gc_enabled = 0; - del_timer_sync(&gc->gc_timer); - gc->gc_active = 0; - - if (gc->gc_ts) - kthread_stop(gc->gc_ts); - - if (gc->gc_reader_ts) - kthread_stop(gc->gc_reader_ts); - - if (graceful) { - flush_workqueue(gc->gc_reader_wq); - flush_workqueue(gc->gc_line_reader_wq); - } - - destroy_workqueue(gc->gc_reader_wq); - destroy_workqueue(gc->gc_line_reader_wq); - - if (gc->gc_writer_ts) - kthread_stop(gc->gc_writer_ts); -} diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c deleted file mode 100644 index 5924f09c217b..000000000000 --- a/drivers/lightnvm/pblk-init.c +++ /dev/null @@ -1,1324 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2015 IT University of Copenhagen (rrpc.c) - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software 
Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * Implementation of a physical block-device target for Open-channel SSDs. - * - * pblk-init.c - pblk's initialization. - */ - -#include "pblk.h" -#include "pblk-trace.h" - -static unsigned int write_buffer_size; - -module_param(write_buffer_size, uint, 0644); -MODULE_PARM_DESC(write_buffer_size, "number of entries in a write buffer"); - -struct pblk_global_caches { - struct kmem_cache *ws; - struct kmem_cache *rec; - struct kmem_cache *g_rq; - struct kmem_cache *w_rq; - - struct kref kref; - - struct mutex mutex; /* Ensures consistency between - * caches and kref - */ -}; - -static struct pblk_global_caches pblk_caches = { - .mutex = __MUTEX_INITIALIZER(pblk_caches.mutex), - .kref = KREF_INIT(0), -}; - -struct bio_set pblk_bio_set; - -static blk_qc_t pblk_submit_bio(struct bio *bio) -{ - struct pblk *pblk = bio->bi_bdev->bd_disk->queue->queuedata; - - if (bio_op(bio) == REQ_OP_DISCARD) { - pblk_discard(pblk, bio); - if (!(bio->bi_opf & REQ_PREFLUSH)) { - bio_endio(bio); - return BLK_QC_T_NONE; - } - } - - /* Read requests must be <= 256kb due to NVMe's 64 bit completion bitmap - * constraint. Writes can be of arbitrary size. - */ - if (bio_data_dir(bio) == READ) { - blk_queue_split(&bio); - pblk_submit_read(pblk, bio); - } else { - /* Prevent deadlock in the case of a modest LUN configuration - * and large user I/Os. Unless stalled, the rate limiter - * leaves at least 256KB available for user I/O. - */ - if (pblk_get_secs(bio) > pblk_rl_max_io(&pblk->rl)) - blk_queue_split(&bio); - - pblk_write_to_cache(pblk, bio, PBLK_IOTYPE_USER); - } - - return BLK_QC_T_NONE; -} - -static const struct block_device_operations pblk_bops = { - .owner = THIS_MODULE, - .submit_bio = pblk_submit_bio, -}; - - -static size_t pblk_trans_map_size(struct pblk *pblk) -{ - int entry_size = 8; - - if (pblk->addrf_len < 32) - entry_size = 4; - - return entry_size * pblk->capacity; -} - -#ifdef CONFIG_NVM_PBLK_DEBUG -static u32 pblk_l2p_crc(struct pblk *pblk) -{ - size_t map_size; - u32 crc = ~(u32)0; - - map_size = pblk_trans_map_size(pblk); - crc = crc32_le(crc, pblk->trans_map, map_size); - return crc; -} -#endif - -static void pblk_l2p_free(struct pblk *pblk) -{ - vfree(pblk->trans_map); -} - -static int pblk_l2p_recover(struct pblk *pblk, bool factory_init) -{ - struct pblk_line *line = NULL; - - if (factory_init) { - guid_gen(&pblk->instance_uuid); - } else { - line = pblk_recov_l2p(pblk); - if (IS_ERR(line)) { - pblk_err(pblk, "could not recover l2p table\n"); - return -EFAULT; - } - } - -#ifdef CONFIG_NVM_PBLK_DEBUG - pblk_info(pblk, "init: L2P CRC: %x\n", pblk_l2p_crc(pblk)); -#endif - - /* Free full lines directly as GC has not been started yet */ - pblk_gc_free_full_lines(pblk); - - if (!line) { - /* Configure next line for user data */ - line = pblk_line_get_first_data(pblk); - if (!line) - return -EFAULT; - } - - return 0; -} - -static int pblk_l2p_init(struct pblk *pblk, bool factory_init) -{ - sector_t i; - struct ppa_addr ppa; - size_t map_size; - int ret = 0; - - map_size = pblk_trans_map_size(pblk); - pblk->trans_map = __vmalloc(map_size, GFP_KERNEL | __GFP_NOWARN | - __GFP_RETRY_MAYFAIL | __GFP_HIGHMEM); - if (!pblk->trans_map) { - pblk_err(pblk, "failed to allocate L2P (need %zu of memory)\n", - map_size); - return -ENOMEM; - } - - 
pblk_ppa_set_empty(&ppa); - - for (i = 0; i < pblk->capacity; i++) - pblk_trans_map_set(pblk, i, ppa); - - ret = pblk_l2p_recover(pblk, factory_init); - if (ret) - vfree(pblk->trans_map); - - return ret; -} - -static void pblk_rwb_free(struct pblk *pblk) -{ - if (pblk_rb_tear_down_check(&pblk->rwb)) - pblk_err(pblk, "write buffer error on tear down\n"); - - pblk_rb_free(&pblk->rwb); -} - -static int pblk_rwb_init(struct pblk *pblk) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - unsigned long buffer_size; - int pgs_in_buffer, threshold; - - threshold = geo->mw_cunits * geo->all_luns; - pgs_in_buffer = (max(geo->mw_cunits, geo->ws_opt) + geo->ws_opt) - * geo->all_luns; - - if (write_buffer_size && (write_buffer_size > pgs_in_buffer)) - buffer_size = write_buffer_size; - else - buffer_size = pgs_in_buffer; - - return pblk_rb_init(&pblk->rwb, buffer_size, threshold, geo->csecs); -} - -static int pblk_set_addrf_12(struct pblk *pblk, struct nvm_geo *geo, - struct nvm_addrf_12 *dst) -{ - struct nvm_addrf_12 *src = (struct nvm_addrf_12 *)&geo->addrf; - int power_len; - - /* Re-calculate channel and lun format to adapt to configuration */ - power_len = get_count_order(geo->num_ch); - if (1 << power_len != geo->num_ch) { - pblk_err(pblk, "supports only power-of-two channel config.\n"); - return -EINVAL; - } - dst->ch_len = power_len; - - power_len = get_count_order(geo->num_lun); - if (1 << power_len != geo->num_lun) { - pblk_err(pblk, "supports only power-of-two LUN config.\n"); - return -EINVAL; - } - dst->lun_len = power_len; - - dst->blk_len = src->blk_len; - dst->pg_len = src->pg_len; - dst->pln_len = src->pln_len; - dst->sec_len = src->sec_len; - - dst->sec_offset = 0; - dst->pln_offset = dst->sec_len; - dst->ch_offset = dst->pln_offset + dst->pln_len; - dst->lun_offset = dst->ch_offset + dst->ch_len; - dst->pg_offset = dst->lun_offset + dst->lun_len; - dst->blk_offset = dst->pg_offset + dst->pg_len; - - dst->sec_mask = ((1ULL << dst->sec_len) - 1) << dst->sec_offset; - dst->pln_mask = ((1ULL << dst->pln_len) - 1) << dst->pln_offset; - dst->ch_mask = ((1ULL << dst->ch_len) - 1) << dst->ch_offset; - dst->lun_mask = ((1ULL << dst->lun_len) - 1) << dst->lun_offset; - dst->pg_mask = ((1ULL << dst->pg_len) - 1) << dst->pg_offset; - dst->blk_mask = ((1ULL << dst->blk_len) - 1) << dst->blk_offset; - - return dst->blk_offset + src->blk_len; -} - -static int pblk_set_addrf_20(struct nvm_geo *geo, struct nvm_addrf *adst, - struct pblk_addrf *udst) -{ - struct nvm_addrf *src = &geo->addrf; - - adst->ch_len = get_count_order(geo->num_ch); - adst->lun_len = get_count_order(geo->num_lun); - adst->chk_len = src->chk_len; - adst->sec_len = src->sec_len; - - adst->sec_offset = 0; - adst->ch_offset = adst->sec_len; - adst->lun_offset = adst->ch_offset + adst->ch_len; - adst->chk_offset = adst->lun_offset + adst->lun_len; - - adst->sec_mask = ((1ULL << adst->sec_len) - 1) << adst->sec_offset; - adst->chk_mask = ((1ULL << adst->chk_len) - 1) << adst->chk_offset; - adst->lun_mask = ((1ULL << adst->lun_len) - 1) << adst->lun_offset; - adst->ch_mask = ((1ULL << adst->ch_len) - 1) << adst->ch_offset; - - udst->sec_stripe = geo->ws_opt; - udst->ch_stripe = geo->num_ch; - udst->lun_stripe = geo->num_lun; - - udst->sec_lun_stripe = udst->sec_stripe * udst->ch_stripe; - udst->sec_ws_stripe = udst->sec_lun_stripe * udst->lun_stripe; - - return adst->chk_offset + adst->chk_len; -} - -static int pblk_set_addrf(struct pblk *pblk) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo 
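pblk_set_addrf_12()/pblk_set_addrf_20(), removed above, only derive per-field bit widths, offsets and masks so that physical addresses can be packed and unpacked with shifts. The idea, shown with just two made-up fields:

#include <stdint.h>

/* Two illustrative fields: sector in the low bits, channel above it. */
#define SEC_LEN   4
#define CH_LEN    3
#define SEC_OFF   0
#define CH_OFF    (SEC_OFF + SEC_LEN)
#define SEC_MASK  (((1ULL << SEC_LEN) - 1) << SEC_OFF)
#define CH_MASK   (((1ULL << CH_LEN) - 1) << CH_OFF)

static uint64_t ppa_pack(uint64_t ch, uint64_t sec)
{
        return ((ch << CH_OFF) & CH_MASK) | ((sec << SEC_OFF) & SEC_MASK);
}

static uint64_t ppa_ch(uint64_t ppa)
{
        return (ppa & CH_MASK) >> CH_OFF;
}

static uint64_t ppa_sec(uint64_t ppa)
{
        return (ppa & SEC_MASK) >> SEC_OFF;
}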
= &dev->geo; - int mod; - - switch (geo->version) { - case NVM_OCSSD_SPEC_12: - div_u64_rem(geo->clba, pblk->min_write_pgs, &mod); - if (mod) { - pblk_err(pblk, "bad configuration of sectors/pages\n"); - return -EINVAL; - } - - pblk->addrf_len = pblk_set_addrf_12(pblk, geo, - (void *)&pblk->addrf); - break; - case NVM_OCSSD_SPEC_20: - pblk->addrf_len = pblk_set_addrf_20(geo, (void *)&pblk->addrf, - &pblk->uaddrf); - break; - default: - pblk_err(pblk, "OCSSD revision not supported (%d)\n", - geo->version); - return -EINVAL; - } - - return 0; -} - -static int pblk_create_global_caches(void) -{ - - pblk_caches.ws = kmem_cache_create("pblk_blk_ws", - sizeof(struct pblk_line_ws), 0, 0, NULL); - if (!pblk_caches.ws) - return -ENOMEM; - - pblk_caches.rec = kmem_cache_create("pblk_rec", - sizeof(struct pblk_rec_ctx), 0, 0, NULL); - if (!pblk_caches.rec) - goto fail_destroy_ws; - - pblk_caches.g_rq = kmem_cache_create("pblk_g_rq", pblk_g_rq_size, - 0, 0, NULL); - if (!pblk_caches.g_rq) - goto fail_destroy_rec; - - pblk_caches.w_rq = kmem_cache_create("pblk_w_rq", pblk_w_rq_size, - 0, 0, NULL); - if (!pblk_caches.w_rq) - goto fail_destroy_g_rq; - - return 0; - -fail_destroy_g_rq: - kmem_cache_destroy(pblk_caches.g_rq); -fail_destroy_rec: - kmem_cache_destroy(pblk_caches.rec); -fail_destroy_ws: - kmem_cache_destroy(pblk_caches.ws); - - return -ENOMEM; -} - -static int pblk_get_global_caches(void) -{ - int ret = 0; - - mutex_lock(&pblk_caches.mutex); - - if (kref_get_unless_zero(&pblk_caches.kref)) - goto out; - - ret = pblk_create_global_caches(); - if (!ret) - kref_init(&pblk_caches.kref); - -out: - mutex_unlock(&pblk_caches.mutex); - return ret; -} - -static void pblk_destroy_global_caches(struct kref *ref) -{ - struct pblk_global_caches *c; - - c = container_of(ref, struct pblk_global_caches, kref); - - kmem_cache_destroy(c->ws); - kmem_cache_destroy(c->rec); - kmem_cache_destroy(c->g_rq); - kmem_cache_destroy(c->w_rq); -} - -static void pblk_put_global_caches(void) -{ - mutex_lock(&pblk_caches.mutex); - kref_put(&pblk_caches.kref, pblk_destroy_global_caches); - mutex_unlock(&pblk_caches.mutex); -} - -static int pblk_core_init(struct pblk *pblk) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - int ret, max_write_ppas; - - atomic64_set(&pblk->user_wa, 0); - atomic64_set(&pblk->pad_wa, 0); - atomic64_set(&pblk->gc_wa, 0); - pblk->user_rst_wa = 0; - pblk->pad_rst_wa = 0; - pblk->gc_rst_wa = 0; - - atomic64_set(&pblk->nr_flush, 0); - pblk->nr_flush_rst = 0; - - pblk->min_write_pgs = geo->ws_opt; - pblk->min_write_pgs_data = pblk->min_write_pgs; - max_write_ppas = pblk->min_write_pgs * geo->all_luns; - pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA); - pblk->max_write_pgs = min_t(int, pblk->max_write_pgs, - queue_max_hw_sectors(dev->q) / (geo->csecs >> SECTOR_SHIFT)); - pblk_set_sec_per_write(pblk, pblk->min_write_pgs); - - pblk->oob_meta_size = geo->sos; - if (!pblk_is_oob_meta_supported(pblk)) { - /* For drives which does not have OOB metadata feature - * in order to support recovery feature we need to use - * so called packed metadata. Packed metada will store - * the same information as OOB metadata (l2p table mapping, - * but in the form of the single page at the end of - * every write request. - */ - if (pblk->min_write_pgs - * sizeof(struct pblk_sec_meta) > PAGE_SIZE) { - /* We want to keep all the packed metadata on single - * page per write requests. So we need to ensure that - * it will fit. 
- * - * This is more like sanity check, since there is - * no device with such a big minimal write size - * (above 1 metabytes). - */ - pblk_err(pblk, "Not supported min write size\n"); - return -EINVAL; - } - /* For packed meta approach we do some simplification. - * On read path we always issue requests which size - * equal to max_write_pgs, with all pages filled with - * user payload except of last one page which will be - * filled with packed metadata. - */ - pblk->max_write_pgs = pblk->min_write_pgs; - pblk->min_write_pgs_data = pblk->min_write_pgs - 1; - } - - pblk->pad_dist = kcalloc(pblk->min_write_pgs - 1, sizeof(atomic64_t), - GFP_KERNEL); - if (!pblk->pad_dist) - return -ENOMEM; - - if (pblk_get_global_caches()) - goto fail_free_pad_dist; - - /* Internal bios can be at most the sectors signaled by the device. */ - ret = mempool_init_page_pool(&pblk->page_bio_pool, NVM_MAX_VLBA, 0); - if (ret) - goto free_global_caches; - - ret = mempool_init_slab_pool(&pblk->gen_ws_pool, PBLK_GEN_WS_POOL_SIZE, - pblk_caches.ws); - if (ret) - goto free_page_bio_pool; - - ret = mempool_init_slab_pool(&pblk->rec_pool, geo->all_luns, - pblk_caches.rec); - if (ret) - goto free_gen_ws_pool; - - ret = mempool_init_slab_pool(&pblk->r_rq_pool, geo->all_luns, - pblk_caches.g_rq); - if (ret) - goto free_rec_pool; - - ret = mempool_init_slab_pool(&pblk->e_rq_pool, geo->all_luns, - pblk_caches.g_rq); - if (ret) - goto free_r_rq_pool; - - ret = mempool_init_slab_pool(&pblk->w_rq_pool, geo->all_luns, - pblk_caches.w_rq); - if (ret) - goto free_e_rq_pool; - - pblk->close_wq = alloc_workqueue("pblk-close-wq", - WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_NR_CLOSE_JOBS); - if (!pblk->close_wq) - goto free_w_rq_pool; - - pblk->bb_wq = alloc_workqueue("pblk-bb-wq", - WQ_MEM_RECLAIM | WQ_UNBOUND, 0); - if (!pblk->bb_wq) - goto free_close_wq; - - pblk->r_end_wq = alloc_workqueue("pblk-read-end-wq", - WQ_MEM_RECLAIM | WQ_UNBOUND, 0); - if (!pblk->r_end_wq) - goto free_bb_wq; - - if (pblk_set_addrf(pblk)) - goto free_r_end_wq; - - INIT_LIST_HEAD(&pblk->compl_list); - INIT_LIST_HEAD(&pblk->resubmit_list); - - return 0; - -free_r_end_wq: - destroy_workqueue(pblk->r_end_wq); -free_bb_wq: - destroy_workqueue(pblk->bb_wq); -free_close_wq: - destroy_workqueue(pblk->close_wq); -free_w_rq_pool: - mempool_exit(&pblk->w_rq_pool); -free_e_rq_pool: - mempool_exit(&pblk->e_rq_pool); -free_r_rq_pool: - mempool_exit(&pblk->r_rq_pool); -free_rec_pool: - mempool_exit(&pblk->rec_pool); -free_gen_ws_pool: - mempool_exit(&pblk->gen_ws_pool); -free_page_bio_pool: - mempool_exit(&pblk->page_bio_pool); -free_global_caches: - pblk_put_global_caches(); -fail_free_pad_dist: - kfree(pblk->pad_dist); - return -ENOMEM; -} - -static void pblk_core_free(struct pblk *pblk) -{ - if (pblk->close_wq) - destroy_workqueue(pblk->close_wq); - - if (pblk->r_end_wq) - destroy_workqueue(pblk->r_end_wq); - - if (pblk->bb_wq) - destroy_workqueue(pblk->bb_wq); - - mempool_exit(&pblk->page_bio_pool); - mempool_exit(&pblk->gen_ws_pool); - mempool_exit(&pblk->rec_pool); - mempool_exit(&pblk->r_rq_pool); - mempool_exit(&pblk->e_rq_pool); - mempool_exit(&pblk->w_rq_pool); - - pblk_put_global_caches(); - kfree(pblk->pad_dist); -} - -static void pblk_line_mg_free(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - int i; - - kfree(l_mg->bb_template); - kfree(l_mg->bb_aux); - kfree(l_mg->vsc_list); - - for (i = 0; i < PBLK_DATA_LINES; i++) { - kfree(l_mg->sline_meta[i]); - kvfree(l_mg->eline_meta[i]->buf); - kfree(l_mg->eline_meta[i]); - } - - 
mempool_destroy(l_mg->bitmap_pool); - kmem_cache_destroy(l_mg->bitmap_cache); -} - -static void pblk_line_meta_free(struct pblk_line_mgmt *l_mg, - struct pblk_line *line) -{ - struct pblk_w_err_gc *w_err_gc = line->w_err_gc; - - kfree(line->blk_bitmap); - kfree(line->erase_bitmap); - kfree(line->chks); - - kvfree(w_err_gc->lba_list); - kfree(w_err_gc); -} - -static void pblk_lines_free(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *line; - int i; - - for (i = 0; i < l_mg->nr_lines; i++) { - line = &pblk->lines[i]; - - pblk_line_free(line); - pblk_line_meta_free(l_mg, line); - } - - pblk_line_mg_free(pblk); - - kfree(pblk->luns); - kfree(pblk->lines); -} - -static int pblk_luns_init(struct pblk *pblk) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_lun *rlun; - int i; - - /* TODO: Implement unbalanced LUN support */ - if (geo->num_lun < 0) { - pblk_err(pblk, "unbalanced LUN config.\n"); - return -EINVAL; - } - - pblk->luns = kcalloc(geo->all_luns, sizeof(struct pblk_lun), - GFP_KERNEL); - if (!pblk->luns) - return -ENOMEM; - - for (i = 0; i < geo->all_luns; i++) { - /* Stripe across channels */ - int ch = i % geo->num_ch; - int lun_raw = i / geo->num_ch; - int lunid = lun_raw + ch * geo->num_lun; - - rlun = &pblk->luns[i]; - rlun->bppa = dev->luns[lunid]; - - sema_init(&rlun->wr_sem, 1); - } - - return 0; -} - -/* See comment over struct line_emeta definition */ -static unsigned int calc_emeta_len(struct pblk *pblk) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - - /* Round to sector size so that lba_list starts on its own sector */ - lm->emeta_sec[1] = DIV_ROUND_UP( - sizeof(struct line_emeta) + lm->blk_bitmap_len + - sizeof(struct wa_counters), geo->csecs); - lm->emeta_len[1] = lm->emeta_sec[1] * geo->csecs; - - /* Round to sector size so that vsc_list starts on its own sector */ - lm->dsec_per_line = lm->sec_per_line - lm->emeta_sec[0]; - lm->emeta_sec[2] = DIV_ROUND_UP(lm->dsec_per_line * sizeof(u64), - geo->csecs); - lm->emeta_len[2] = lm->emeta_sec[2] * geo->csecs; - - lm->emeta_sec[3] = DIV_ROUND_UP(l_mg->nr_lines * sizeof(u32), - geo->csecs); - lm->emeta_len[3] = lm->emeta_sec[3] * geo->csecs; - - lm->vsc_list_len = l_mg->nr_lines * sizeof(u32); - - return (lm->emeta_len[1] + lm->emeta_len[2] + lm->emeta_len[3]); -} - -static int pblk_set_provision(struct pblk *pblk, int nr_free_chks) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; - struct nvm_geo *geo = &dev->geo; - sector_t provisioned; - int sec_meta, blk_meta, clba; - int minimum; - - if (geo->op == NVM_TARGET_DEFAULT_OP) - pblk->op = PBLK_DEFAULT_OP; - else - pblk->op = geo->op; - - minimum = pblk_get_min_chks(pblk); - provisioned = nr_free_chks; - provisioned *= (100 - pblk->op); - sector_div(provisioned, 100); - - if ((nr_free_chks - provisioned) < minimum) { - if (geo->op != NVM_TARGET_DEFAULT_OP) { - pblk_err(pblk, "OP too small to create a sane instance\n"); - return -EINTR; - } - - /* If the user did not specify an OP value, and PBLK_DEFAULT_OP - * is not enough, calculate and set sane value - */ - - provisioned = nr_free_chks - minimum; - pblk->op = (100 * minimum) / nr_free_chks; - pblk_info(pblk, "Default OP insufficient, adjusting OP to %d\n", - pblk->op); - } - - pblk->op_blks = nr_free_chks - provisioned; - - /* Internally pblk manages all 
free blocks, but all calculations based - * on user capacity consider only provisioned blocks - */ - pblk->rl.total_blocks = nr_free_chks; - - /* Consider sectors used for metadata */ - sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines; - blk_meta = DIV_ROUND_UP(sec_meta, geo->clba); - - clba = (geo->clba / pblk->min_write_pgs) * pblk->min_write_pgs_data; - pblk->capacity = (provisioned - blk_meta) * clba; - - atomic_set(&pblk->rl.free_blocks, nr_free_chks); - atomic_set(&pblk->rl.free_user_blocks, nr_free_chks); - - return 0; -} - -static int pblk_setup_line_meta_chk(struct pblk *pblk, struct pblk_line *line, - struct nvm_chk_meta *meta) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - int i, nr_bad_chks = 0; - - for (i = 0; i < lm->blk_per_line; i++) { - struct pblk_lun *rlun = &pblk->luns[i]; - struct nvm_chk_meta *chunk; - struct nvm_chk_meta *chunk_meta; - struct ppa_addr ppa; - int pos; - - ppa = rlun->bppa; - pos = pblk_ppa_to_pos(geo, ppa); - chunk = &line->chks[pos]; - - ppa.m.chk = line->id; - chunk_meta = pblk_chunk_get_off(pblk, meta, ppa); - - chunk->state = chunk_meta->state; - chunk->type = chunk_meta->type; - chunk->wi = chunk_meta->wi; - chunk->slba = chunk_meta->slba; - chunk->cnlb = chunk_meta->cnlb; - chunk->wp = chunk_meta->wp; - - trace_pblk_chunk_state(pblk_disk_name(pblk), &ppa, - chunk->state); - - if (chunk->type & NVM_CHK_TP_SZ_SPEC) { - WARN_ONCE(1, "pblk: custom-sized chunks unsupported\n"); - continue; - } - - if (!(chunk->state & NVM_CHK_ST_OFFLINE)) - continue; - - set_bit(pos, line->blk_bitmap); - nr_bad_chks++; - } - - return nr_bad_chks; -} - -static long pblk_setup_line_meta(struct pblk *pblk, struct pblk_line *line, - void *chunk_meta, int line_id) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; - long nr_bad_chks, chk_in_line; - - line->pblk = pblk; - line->id = line_id; - line->type = PBLK_LINETYPE_FREE; - line->state = PBLK_LINESTATE_NEW; - line->gc_group = PBLK_LINEGC_NONE; - line->vsc = &l_mg->vsc_list[line_id]; - spin_lock_init(&line->lock); - - nr_bad_chks = pblk_setup_line_meta_chk(pblk, line, chunk_meta); - - chk_in_line = lm->blk_per_line - nr_bad_chks; - if (nr_bad_chks < 0 || nr_bad_chks > lm->blk_per_line || - chk_in_line < lm->min_blk_line) { - line->state = PBLK_LINESTATE_BAD; - list_add_tail(&line->list, &l_mg->bad_list); - return 0; - } - - atomic_set(&line->blk_in_line, chk_in_line); - list_add_tail(&line->list, &l_mg->free_list); - l_mg->nr_free_lines++; - - return chk_in_line; -} - -static int pblk_alloc_line_meta(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - - line->blk_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL); - if (!line->blk_bitmap) - return -ENOMEM; - - line->erase_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL); - if (!line->erase_bitmap) - goto free_blk_bitmap; - - - line->chks = kmalloc_array(lm->blk_per_line, - sizeof(struct nvm_chk_meta), GFP_KERNEL); - if (!line->chks) - goto free_erase_bitmap; - - line->w_err_gc = kzalloc(sizeof(struct pblk_w_err_gc), GFP_KERNEL); - if (!line->w_err_gc) - goto free_chks; - - return 0; - -free_chks: - kfree(line->chks); -free_erase_bitmap: - kfree(line->erase_bitmap); -free_blk_bitmap: - kfree(line->blk_bitmap); - return -ENOMEM; -} - -static int pblk_line_mg_init(struct pblk *pblk) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct 
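The removed pblk_set_provision() splits the device's free chunks between exposed capacity and over-provisioning: with an OP of op percent, only (100 - op)% of the chunks back user data, and chunks needed for line metadata are subtracted before converting to sectors. A back-of-the-envelope version (standalone C, illustrative numbers):

#include <stdio.h>

int main(void)
{
        long nr_free_chks = 1000;       /* free chunks reported by the device */
        long op = 11;                   /* over-provisioning percentage */
        long clba = 4096;               /* data sectors per chunk (illustrative) */
        long blk_meta = 8;              /* chunks reserved for line metadata */

        long provisioned = nr_free_chks * (100 - op) / 100;
        long capacity = (provisioned - blk_meta) * clba;

        printf("user chunks %ld, OP chunks %ld, exposed sectors %ld\n",
               provisioned, nr_free_chks - provisioned, capacity);
        return 0;
}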
pblk_line_meta *lm = &pblk->lm; - int i, bb_distance; - - l_mg->nr_lines = geo->num_chk; - l_mg->log_line = l_mg->data_line = NULL; - l_mg->l_seq_nr = l_mg->d_seq_nr = 0; - l_mg->nr_free_lines = 0; - bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES); - - INIT_LIST_HEAD(&l_mg->free_list); - INIT_LIST_HEAD(&l_mg->corrupt_list); - INIT_LIST_HEAD(&l_mg->bad_list); - INIT_LIST_HEAD(&l_mg->gc_full_list); - INIT_LIST_HEAD(&l_mg->gc_high_list); - INIT_LIST_HEAD(&l_mg->gc_mid_list); - INIT_LIST_HEAD(&l_mg->gc_low_list); - INIT_LIST_HEAD(&l_mg->gc_empty_list); - INIT_LIST_HEAD(&l_mg->gc_werr_list); - - INIT_LIST_HEAD(&l_mg->emeta_list); - - l_mg->gc_lists[0] = &l_mg->gc_werr_list; - l_mg->gc_lists[1] = &l_mg->gc_high_list; - l_mg->gc_lists[2] = &l_mg->gc_mid_list; - l_mg->gc_lists[3] = &l_mg->gc_low_list; - - spin_lock_init(&l_mg->free_lock); - spin_lock_init(&l_mg->close_lock); - spin_lock_init(&l_mg->gc_lock); - - l_mg->vsc_list = kcalloc(l_mg->nr_lines, sizeof(__le32), GFP_KERNEL); - if (!l_mg->vsc_list) - goto fail; - - l_mg->bb_template = kzalloc(lm->sec_bitmap_len, GFP_KERNEL); - if (!l_mg->bb_template) - goto fail_free_vsc_list; - - l_mg->bb_aux = kzalloc(lm->sec_bitmap_len, GFP_KERNEL); - if (!l_mg->bb_aux) - goto fail_free_bb_template; - - /* smeta is always small enough to fit on a kmalloc memory allocation, - * emeta depends on the number of LUNs allocated to the pblk instance - */ - for (i = 0; i < PBLK_DATA_LINES; i++) { - l_mg->sline_meta[i] = kmalloc(lm->smeta_len, GFP_KERNEL); - if (!l_mg->sline_meta[i]) - goto fail_free_smeta; - } - - l_mg->bitmap_cache = kmem_cache_create("pblk_lm_bitmap", - lm->sec_bitmap_len, 0, 0, NULL); - if (!l_mg->bitmap_cache) - goto fail_free_smeta; - - /* the bitmap pool is used for both valid and map bitmaps */ - l_mg->bitmap_pool = mempool_create_slab_pool(PBLK_DATA_LINES * 2, - l_mg->bitmap_cache); - if (!l_mg->bitmap_pool) - goto fail_destroy_bitmap_cache; - - /* emeta allocates three different buffers for managing metadata with - * in-memory and in-media layouts - */ - for (i = 0; i < PBLK_DATA_LINES; i++) { - struct pblk_emeta *emeta; - - emeta = kmalloc(sizeof(struct pblk_emeta), GFP_KERNEL); - if (!emeta) - goto fail_free_emeta; - - emeta->buf = kvmalloc(lm->emeta_len[0], GFP_KERNEL); - if (!emeta->buf) { - kfree(emeta); - goto fail_free_emeta; - } - - emeta->nr_entries = lm->emeta_sec[0]; - l_mg->eline_meta[i] = emeta; - } - - for (i = 0; i < l_mg->nr_lines; i++) - l_mg->vsc_list[i] = cpu_to_le32(EMPTY_ENTRY); - - bb_distance = (geo->all_luns) * geo->ws_opt; - for (i = 0; i < lm->sec_per_line; i += bb_distance) - bitmap_set(l_mg->bb_template, i, geo->ws_opt); - - return 0; - -fail_free_emeta: - while (--i >= 0) { - kvfree(l_mg->eline_meta[i]->buf); - kfree(l_mg->eline_meta[i]); - } - - mempool_destroy(l_mg->bitmap_pool); -fail_destroy_bitmap_cache: - kmem_cache_destroy(l_mg->bitmap_cache); -fail_free_smeta: - for (i = 0; i < PBLK_DATA_LINES; i++) - kfree(l_mg->sline_meta[i]); - kfree(l_mg->bb_aux); -fail_free_bb_template: - kfree(l_mg->bb_template); -fail_free_vsc_list: - kfree(l_mg->vsc_list); -fail: - return -ENOMEM; -} - -static int pblk_line_meta_init(struct pblk *pblk) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - unsigned int smeta_len, emeta_len; - int i; - - lm->sec_per_line = geo->clba * geo->all_luns; - lm->blk_per_line = geo->all_luns; - lm->blk_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long); - lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * 
sizeof(long); - lm->lun_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long); - lm->mid_thrs = lm->sec_per_line / 2; - lm->high_thrs = lm->sec_per_line / 4; - lm->meta_distance = (geo->all_luns / 2) * pblk->min_write_pgs; - - /* Calculate necessary pages for smeta. See comment over struct - * line_smeta definition - */ - i = 1; -add_smeta_page: - lm->smeta_sec = i * geo->ws_opt; - lm->smeta_len = lm->smeta_sec * geo->csecs; - - smeta_len = sizeof(struct line_smeta) + lm->lun_bitmap_len; - if (smeta_len > lm->smeta_len) { - i++; - goto add_smeta_page; - } - - /* Calculate necessary pages for emeta. See comment over struct - * line_emeta definition - */ - i = 1; -add_emeta_page: - lm->emeta_sec[0] = i * geo->ws_opt; - lm->emeta_len[0] = lm->emeta_sec[0] * geo->csecs; - - emeta_len = calc_emeta_len(pblk); - if (emeta_len > lm->emeta_len[0]) { - i++; - goto add_emeta_page; - } - - lm->emeta_bb = geo->all_luns > i ? geo->all_luns - i : 0; - - lm->min_blk_line = 1; - if (geo->all_luns > 1) - lm->min_blk_line += DIV_ROUND_UP(lm->smeta_sec + - lm->emeta_sec[0], geo->clba); - - if (lm->min_blk_line > lm->blk_per_line) { - pblk_err(pblk, "config. not supported. Min. LUN in line:%d\n", - lm->blk_per_line); - return -EINVAL; - } - - return 0; -} - -static int pblk_lines_init(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *line; - void *chunk_meta; - int nr_free_chks = 0; - int i, ret; - - ret = pblk_line_meta_init(pblk); - if (ret) - return ret; - - ret = pblk_line_mg_init(pblk); - if (ret) - return ret; - - ret = pblk_luns_init(pblk); - if (ret) - goto fail_free_meta; - - chunk_meta = pblk_get_chunk_meta(pblk); - if (IS_ERR(chunk_meta)) { - ret = PTR_ERR(chunk_meta); - goto fail_free_luns; - } - - pblk->lines = kcalloc(l_mg->nr_lines, sizeof(struct pblk_line), - GFP_KERNEL); - if (!pblk->lines) { - ret = -ENOMEM; - goto fail_free_chunk_meta; - } - - for (i = 0; i < l_mg->nr_lines; i++) { - line = &pblk->lines[i]; - - ret = pblk_alloc_line_meta(pblk, line); - if (ret) - goto fail_free_lines; - - nr_free_chks += pblk_setup_line_meta(pblk, line, chunk_meta, i); - - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - } - - if (!nr_free_chks) { - pblk_err(pblk, "too many bad blocks prevent for sane instance\n"); - ret = -EINTR; - goto fail_free_lines; - } - - ret = pblk_set_provision(pblk, nr_free_chks); - if (ret) - goto fail_free_lines; - - vfree(chunk_meta); - return 0; - -fail_free_lines: - while (--i >= 0) - pblk_line_meta_free(l_mg, &pblk->lines[i]); - kfree(pblk->lines); -fail_free_chunk_meta: - vfree(chunk_meta); -fail_free_luns: - kfree(pblk->luns); -fail_free_meta: - pblk_line_mg_free(pblk); - - return ret; -} - -static int pblk_writer_init(struct pblk *pblk) -{ - pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t"); - if (IS_ERR(pblk->writer_ts)) { - int err = PTR_ERR(pblk->writer_ts); - - if (err != -EINTR) - pblk_err(pblk, "could not allocate writer kthread (%d)\n", - err); - return err; - } - - timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0); - mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100)); - - return 0; -} - -static void pblk_writer_stop(struct pblk *pblk) -{ - /* The pipeline must be stopped and the write buffer emptied before the - * write thread is stopped - */ - WARN(pblk_rb_read_count(&pblk->rwb), - "Stopping not fully persisted write buffer\n"); - - WARN(pblk_rb_sync_count(&pblk->rwb), - "Stopping not fully synced write buffer\n"); - - del_timer_sync(&pblk->wtimer); - if (pblk->writer_ts) - 
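The add_smeta_page/add_emeta_page loops in the removed pblk_line_meta_init() grow each metadata region one optimal-write-size unit at a time until the on-media layout can hold the in-memory structures. The same calculation as a standalone sketch (illustrative sizes):

#include <stdio.h>

int main(void)
{
        int ws_opt = 8;                 /* sectors per optimal write */
        int csecs = 4096;               /* sector size in bytes */
        int smeta_bytes = 40000;        /* in-memory smeta size (made up) */
        int i, smeta_sec, smeta_len;

        for (i = 1; ; i++) {
                smeta_sec = i * ws_opt;
                smeta_len = smeta_sec * csecs;
                if (smeta_bytes <= smeta_len)
                        break;
        }
        printf("smeta needs %d optimal writes (%d sectors, %d bytes)\n",
               i, smeta_sec, smeta_len);
        return 0;
}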
kthread_stop(pblk->writer_ts); -} - -static void pblk_free(struct pblk *pblk) -{ - pblk_lines_free(pblk); - pblk_l2p_free(pblk); - pblk_rwb_free(pblk); - pblk_core_free(pblk); - - kfree(pblk); -} - -static void pblk_tear_down(struct pblk *pblk, bool graceful) -{ - if (graceful) - __pblk_pipeline_flush(pblk); - __pblk_pipeline_stop(pblk); - pblk_writer_stop(pblk); - pblk_rb_sync_l2p(&pblk->rwb); - pblk_rl_free(&pblk->rl); - - pblk_debug(pblk, "consistent tear down (graceful:%d)\n", graceful); -} - -static void pblk_exit(void *private, bool graceful) -{ - struct pblk *pblk = private; - - pblk_gc_exit(pblk, graceful); - pblk_tear_down(pblk, graceful); - -#ifdef CONFIG_NVM_PBLK_DEBUG - pblk_info(pblk, "exit: L2P CRC: %x\n", pblk_l2p_crc(pblk)); -#endif - - pblk_free(pblk); -} - -static sector_t pblk_capacity(void *private) -{ - struct pblk *pblk = private; - - return pblk->capacity * NR_PHY_IN_LOG; -} - -static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, - int flags) -{ - struct nvm_geo *geo = &dev->geo; - struct request_queue *bqueue = dev->q; - struct request_queue *tqueue = tdisk->queue; - struct pblk *pblk; - int ret; - - pblk = kzalloc(sizeof(struct pblk), GFP_KERNEL); - if (!pblk) - return ERR_PTR(-ENOMEM); - - pblk->dev = dev; - pblk->disk = tdisk; - pblk->state = PBLK_STATE_RUNNING; - trace_pblk_state(pblk_disk_name(pblk), pblk->state); - pblk->gc.gc_enabled = 0; - - if (!(geo->version == NVM_OCSSD_SPEC_12 || - geo->version == NVM_OCSSD_SPEC_20)) { - pblk_err(pblk, "OCSSD version not supported (%u)\n", - geo->version); - kfree(pblk); - return ERR_PTR(-EINVAL); - } - - if (geo->ext) { - pblk_err(pblk, "extended metadata not supported\n"); - kfree(pblk); - return ERR_PTR(-EINVAL); - } - - spin_lock_init(&pblk->resubmit_lock); - spin_lock_init(&pblk->trans_lock); - spin_lock_init(&pblk->lock); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_set(&pblk->inflight_writes, 0); - atomic_long_set(&pblk->padded_writes, 0); - atomic_long_set(&pblk->padded_wb, 0); - atomic_long_set(&pblk->req_writes, 0); - atomic_long_set(&pblk->sub_writes, 0); - atomic_long_set(&pblk->sync_writes, 0); - atomic_long_set(&pblk->inflight_reads, 0); - atomic_long_set(&pblk->cache_reads, 0); - atomic_long_set(&pblk->sync_reads, 0); - atomic_long_set(&pblk->recov_writes, 0); - atomic_long_set(&pblk->recov_writes, 0); - atomic_long_set(&pblk->recov_gc_writes, 0); - atomic_long_set(&pblk->recov_gc_reads, 0); -#endif - - atomic_long_set(&pblk->read_failed, 0); - atomic_long_set(&pblk->read_empty, 0); - atomic_long_set(&pblk->read_high_ecc, 0); - atomic_long_set(&pblk->read_failed_gc, 0); - atomic_long_set(&pblk->write_failed, 0); - atomic_long_set(&pblk->erase_failed, 0); - - ret = pblk_core_init(pblk); - if (ret) { - pblk_err(pblk, "could not initialize core\n"); - goto fail; - } - - ret = pblk_lines_init(pblk); - if (ret) { - pblk_err(pblk, "could not initialize lines\n"); - goto fail_free_core; - } - - ret = pblk_rwb_init(pblk); - if (ret) { - pblk_err(pblk, "could not initialize write buffer\n"); - goto fail_free_lines; - } - - ret = pblk_l2p_init(pblk, flags & NVM_TARGET_FACTORY); - if (ret) { - pblk_err(pblk, "could not initialize maps\n"); - goto fail_free_rwb; - } - - ret = pblk_writer_init(pblk); - if (ret) { - if (ret != -EINTR) - pblk_err(pblk, "could not initialize write thread\n"); - goto fail_free_l2p; - } - - ret = pblk_gc_init(pblk); - if (ret) { - pblk_err(pblk, "could not initialize gc\n"); - goto fail_stop_writer; - } - - /* inherit the size from the underlying device */ - 
blk_queue_logical_block_size(tqueue, queue_physical_block_size(bqueue)); - blk_queue_max_hw_sectors(tqueue, queue_max_hw_sectors(bqueue)); - - blk_queue_write_cache(tqueue, true, false); - - tqueue->limits.discard_granularity = geo->clba * geo->csecs; - tqueue->limits.discard_alignment = 0; - blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9); - blk_queue_flag_set(QUEUE_FLAG_DISCARD, tqueue); - - pblk_info(pblk, "luns:%u, lines:%d, secs:%llu, buf entries:%u\n", - geo->all_luns, pblk->l_mg.nr_lines, - (unsigned long long)pblk->capacity, - pblk->rwb.nr_entries); - - wake_up_process(pblk->writer_ts); - - /* Check if we need to start GC */ - pblk_gc_should_kick(pblk); - - return pblk; - -fail_stop_writer: - pblk_writer_stop(pblk); -fail_free_l2p: - pblk_l2p_free(pblk); -fail_free_rwb: - pblk_rwb_free(pblk); -fail_free_lines: - pblk_lines_free(pblk); -fail_free_core: - pblk_core_free(pblk); -fail: - kfree(pblk); - return ERR_PTR(ret); -} - -/* physical block device target */ -static struct nvm_tgt_type tt_pblk = { - .name = "pblk", - .version = {1, 0, 0}, - - .bops = &pblk_bops, - .capacity = pblk_capacity, - - .init = pblk_init, - .exit = pblk_exit, - - .sysfs_init = pblk_sysfs_init, - .sysfs_exit = pblk_sysfs_exit, - .owner = THIS_MODULE, -}; - -static int __init pblk_module_init(void) -{ - int ret; - - ret = bioset_init(&pblk_bio_set, BIO_POOL_SIZE, 0, 0); - if (ret) - return ret; - ret = nvm_register_tgt_type(&tt_pblk); - if (ret) - bioset_exit(&pblk_bio_set); - return ret; -} - -static void pblk_module_exit(void) -{ - bioset_exit(&pblk_bio_set); - nvm_unregister_tgt_type(&tt_pblk); -} - -module_init(pblk_module_init); -module_exit(pblk_module_exit); -MODULE_AUTHOR("Javier Gonzalez "); -MODULE_AUTHOR("Matias Bjorling "); -MODULE_LICENSE("GPL v2"); -MODULE_DESCRIPTION("Physical Block-Device for Open-Channel SSDs"); diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c deleted file mode 100644 index 5408e32b2f13..000000000000 --- a/drivers/lightnvm/pblk-map.c +++ /dev/null @@ -1,210 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- * - * pblk-map.c - pblk's lba-ppa mapping strategy - * - */ - -#include "pblk.h" - -static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry, - struct ppa_addr *ppa_list, - unsigned long *lun_bitmap, - void *meta_list, - unsigned int valid_secs) -{ - struct pblk_line *line = pblk_line_get_data(pblk); - struct pblk_emeta *emeta; - struct pblk_w_ctx *w_ctx; - __le64 *lba_list; - u64 paddr; - int nr_secs = pblk->min_write_pgs; - int i; - - if (!line) - return -ENOSPC; - - if (pblk_line_is_full(line)) { - struct pblk_line *prev_line = line; - - /* If we cannot allocate a new line, make sure to store metadata - * on current line and then fail - */ - line = pblk_line_replace_data(pblk); - pblk_line_close_meta(pblk, prev_line); - - if (!line) { - pblk_pipeline_stop(pblk); - return -ENOSPC; - } - - } - - emeta = line->emeta; - lba_list = emeta_to_lbas(pblk, emeta->buf); - - paddr = pblk_alloc_page(pblk, line, nr_secs); - - for (i = 0; i < nr_secs; i++, paddr++) { - struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i); - __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); - - /* ppa to be sent to the device */ - ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); - - /* Write context for target bio completion on write buffer. Note - * that the write buffer is protected by the sync backpointer, - * and a single writer thread have access to each specific entry - * at a time. Thus, it is safe to modify the context for the - * entry we are setting up for submission without taking any - * lock or memory barrier. - */ - if (i < valid_secs) { - kref_get(&line->ref); - atomic_inc(&line->sec_to_update); - w_ctx = pblk_rb_w_ctx(&pblk->rwb, sentry + i); - w_ctx->ppa = ppa_list[i]; - meta->lba = cpu_to_le64(w_ctx->lba); - lba_list[paddr] = cpu_to_le64(w_ctx->lba); - if (lba_list[paddr] != addr_empty) - line->nr_valid_lbas++; - else - atomic64_inc(&pblk->pad_wa); - } else { - lba_list[paddr] = addr_empty; - meta->lba = addr_empty; - __pblk_map_invalidate(pblk, line, paddr); - } - } - - pblk_down_rq(pblk, ppa_list[0], lun_bitmap); - return 0; -} - -int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry, - unsigned long *lun_bitmap, unsigned int valid_secs, - unsigned int off) -{ - void *meta_list = pblk_get_meta_for_writes(pblk, rqd); - void *meta_buffer; - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - unsigned int map_secs; - int min = pblk->min_write_pgs; - int i; - int ret; - - for (i = off; i < rqd->nr_ppas; i += min) { - map_secs = (i + min > valid_secs) ? (valid_secs % min) : min; - meta_buffer = pblk_get_meta(pblk, meta_list, i); - - ret = pblk_map_page_data(pblk, sentry + i, &ppa_list[i], - lun_bitmap, meta_buffer, map_secs); - if (ret) - return ret; - } - - return 0; -} - -/* only if erase_ppa is set, acquire erase semaphore */ -int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, - unsigned int sentry, unsigned long *lun_bitmap, - unsigned int valid_secs, struct ppa_addr *erase_ppa) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - void *meta_list = pblk_get_meta_for_writes(pblk, rqd); - void *meta_buffer; - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - struct pblk_line *e_line, *d_line; - unsigned int map_secs; - int min = pblk->min_write_pgs; - int i, erase_lun; - int ret; - - - for (i = 0; i < rqd->nr_ppas; i += min) { - map_secs = (i + min > valid_secs) ? 
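pblk_map_rq(), removed above, maps a request in min_write_pgs-sized steps; only the last step may carry fewer valid sectors, and the remainder of that step is padded. The step-size expression is easy to see in isolation (standalone sketch, example values):

#include <stdio.h>

int main(void)
{
        int min = 8;                    /* pblk->min_write_pgs */
        int nr_ppas = 32;               /* sectors in the request */
        int valid_secs = 27;            /* sectors carrying user data */
        int i;

        for (i = 0; i < nr_ppas; i += min) {
                int map_secs = (i + min > valid_secs) ?
                                        (valid_secs % min) : min;

                printf("batch at %2d: %d valid, %d padded\n",
                       i, map_secs, min - map_secs);
        }
        return 0;
}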
(valid_secs % min) : min; - meta_buffer = pblk_get_meta(pblk, meta_list, i); - - ret = pblk_map_page_data(pblk, sentry + i, &ppa_list[i], - lun_bitmap, meta_buffer, map_secs); - if (ret) - return ret; - - erase_lun = pblk_ppa_to_pos(geo, ppa_list[i]); - - /* line can change after page map. We might also be writing the - * last line. - */ - e_line = pblk_line_get_erase(pblk); - if (!e_line) - return pblk_map_rq(pblk, rqd, sentry, lun_bitmap, - valid_secs, i + min); - - spin_lock(&e_line->lock); - if (!test_bit(erase_lun, e_line->erase_bitmap)) { - set_bit(erase_lun, e_line->erase_bitmap); - atomic_dec(&e_line->left_eblks); - - *erase_ppa = ppa_list[i]; - erase_ppa->a.blk = e_line->id; - erase_ppa->a.reserved = 0; - - spin_unlock(&e_line->lock); - - /* Avoid evaluating e_line->left_eblks */ - return pblk_map_rq(pblk, rqd, sentry, lun_bitmap, - valid_secs, i + min); - } - spin_unlock(&e_line->lock); - } - - d_line = pblk_line_get_data(pblk); - - /* line can change after page map. We might also be writing the - * last line. - */ - e_line = pblk_line_get_erase(pblk); - if (!e_line) - return -ENOSPC; - - /* Erase blocks that are bad in this line but might not be in next */ - if (unlikely(pblk_ppa_empty(*erase_ppa)) && - bitmap_weight(d_line->blk_bitmap, lm->blk_per_line)) { - int bit = -1; - -retry: - bit = find_next_bit(d_line->blk_bitmap, - lm->blk_per_line, bit + 1); - if (bit >= lm->blk_per_line) - return 0; - - spin_lock(&e_line->lock); - if (test_bit(bit, e_line->erase_bitmap)) { - spin_unlock(&e_line->lock); - goto retry; - } - spin_unlock(&e_line->lock); - - set_bit(bit, e_line->erase_bitmap); - atomic_dec(&e_line->left_eblks); - *erase_ppa = pblk->luns[bit].bppa; /* set ch and lun */ - erase_ppa->a.blk = e_line->id; - } - - return 0; -} diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c deleted file mode 100644 index 5abb1705b039..000000000000 --- a/drivers/lightnvm/pblk-rb.c +++ /dev/null @@ -1,858 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * - * Based upon the circular ringbuffer. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- * - * pblk-rb.c - pblk's write buffer - */ - -#include - -#include "pblk.h" - -static DECLARE_RWSEM(pblk_rb_lock); - -static void pblk_rb_data_free(struct pblk_rb *rb) -{ - struct pblk_rb_pages *p, *t; - - down_write(&pblk_rb_lock); - list_for_each_entry_safe(p, t, &rb->pages, list) { - free_pages((unsigned long)page_address(p->pages), p->order); - list_del(&p->list); - kfree(p); - } - up_write(&pblk_rb_lock); -} - -void pblk_rb_free(struct pblk_rb *rb) -{ - pblk_rb_data_free(rb); - vfree(rb->entries); -} - -/* - * pblk_rb_calculate_size -- calculate the size of the write buffer - */ -static unsigned int pblk_rb_calculate_size(unsigned int nr_entries, - unsigned int threshold) -{ - unsigned int thr_sz = 1 << (get_count_order(threshold + NVM_MAX_VLBA)); - unsigned int max_sz = max(thr_sz, nr_entries); - unsigned int max_io; - - /* Alloc a write buffer that can (i) fit at least two split bios - * (considering max I/O size NVM_MAX_VLBA, and (ii) guarantee that the - * threshold will be respected - */ - max_io = (1 << max((int)(get_count_order(max_sz)), - (int)(get_count_order(NVM_MAX_VLBA << 1)))); - if ((threshold + NVM_MAX_VLBA) >= max_io) - max_io <<= 1; - - return max_io; -} - -/* - * Initialize ring buffer. The data and metadata buffers must be previously - * allocated and their size must be a power of two - * (Documentation/core-api/circular-buffers.rst) - */ -int pblk_rb_init(struct pblk_rb *rb, unsigned int size, unsigned int threshold, - unsigned int seg_size) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - struct pblk_rb_entry *entries; - unsigned int init_entry = 0; - unsigned int max_order = MAX_ORDER - 1; - unsigned int power_size, power_seg_sz; - unsigned int alloc_order, order, iter; - unsigned int nr_entries; - - nr_entries = pblk_rb_calculate_size(size, threshold); - entries = vzalloc(array_size(nr_entries, sizeof(struct pblk_rb_entry))); - if (!entries) - return -ENOMEM; - - power_size = get_count_order(nr_entries); - power_seg_sz = get_count_order(seg_size); - - down_write(&pblk_rb_lock); - rb->entries = entries; - rb->seg_size = (1 << power_seg_sz); - rb->nr_entries = (1 << power_size); - rb->mem = rb->subm = rb->sync = rb->l2p_update = 0; - rb->back_thres = threshold; - rb->flush_point = EMPTY_ENTRY; - - spin_lock_init(&rb->w_lock); - spin_lock_init(&rb->s_lock); - - INIT_LIST_HEAD(&rb->pages); - - alloc_order = power_size; - if (alloc_order >= max_order) { - order = max_order; - iter = (1 << (alloc_order - max_order)); - } else { - order = alloc_order; - iter = 1; - } - - do { - struct pblk_rb_entry *entry; - struct pblk_rb_pages *page_set; - void *kaddr; - unsigned long set_size; - int i; - - page_set = kmalloc(sizeof(struct pblk_rb_pages), GFP_KERNEL); - if (!page_set) { - up_write(&pblk_rb_lock); - vfree(entries); - return -ENOMEM; - } - - page_set->order = order; - page_set->pages = alloc_pages(GFP_KERNEL, order); - if (!page_set->pages) { - kfree(page_set); - pblk_rb_data_free(rb); - up_write(&pblk_rb_lock); - vfree(entries); - return -ENOMEM; - } - kaddr = page_address(page_set->pages); - - entry = &rb->entries[init_entry]; - entry->data = kaddr; - entry->cacheline = pblk_cacheline_to_addr(init_entry++); - entry->w_ctx.flags = PBLK_WRITABLE_ENTRY; - - set_size = (1 << order); - for (i = 1; i < set_size; i++) { - entry = &rb->entries[init_entry]; - entry->cacheline = pblk_cacheline_to_addr(init_entry++); - entry->data = kaddr + (i * rb->seg_size); - entry->w_ctx.flags = PBLK_WRITABLE_ENTRY; - bio_list_init(&entry->w_ctx.bios); - } - - 
list_add_tail(&page_set->list, &rb->pages); - iter--; - } while (iter > 0); - up_write(&pblk_rb_lock); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_set(&rb->inflight_flush_point, 0); -#endif - - /* - * Initialize rate-limiter, which controls access to the write buffer - * by user and GC I/O - */ - pblk_rl_init(&pblk->rl, rb->nr_entries, threshold); - - return 0; -} - -static void clean_wctx(struct pblk_w_ctx *w_ctx) -{ - int flags; - - flags = READ_ONCE(w_ctx->flags); - WARN_ONCE(!(flags & PBLK_SUBMITTED_ENTRY), - "pblk: overwriting unsubmitted data\n"); - - /* Release flags on context. Protect from writes and reads */ - smp_store_release(&w_ctx->flags, PBLK_WRITABLE_ENTRY); - pblk_ppa_set_empty(&w_ctx->ppa); - w_ctx->lba = ADDR_EMPTY; -} - -#define pblk_rb_ring_count(head, tail, size) CIRC_CNT(head, tail, size) -#define pblk_rb_ring_space(rb, head, tail, size) \ - (CIRC_SPACE(head, tail, size)) - -/* - * Buffer space is calculated with respect to the back pointer signaling - * synchronized entries to the media. - */ -static unsigned int pblk_rb_space(struct pblk_rb *rb) -{ - unsigned int mem = READ_ONCE(rb->mem); - unsigned int sync = READ_ONCE(rb->sync); - - return pblk_rb_ring_space(rb, mem, sync, rb->nr_entries); -} - -unsigned int pblk_rb_ptr_wrap(struct pblk_rb *rb, unsigned int p, - unsigned int nr_entries) -{ - return (p + nr_entries) & (rb->nr_entries - 1); -} - -/* - * Buffer count is calculated with respect to the submission entry signaling the - * entries that are available to send to the media - */ -unsigned int pblk_rb_read_count(struct pblk_rb *rb) -{ - unsigned int mem = READ_ONCE(rb->mem); - unsigned int subm = READ_ONCE(rb->subm); - - return pblk_rb_ring_count(mem, subm, rb->nr_entries); -} - -unsigned int pblk_rb_sync_count(struct pblk_rb *rb) -{ - unsigned int mem = READ_ONCE(rb->mem); - unsigned int sync = READ_ONCE(rb->sync); - - return pblk_rb_ring_count(mem, sync, rb->nr_entries); -} - -unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int nr_entries) -{ - unsigned int subm; - - subm = READ_ONCE(rb->subm); - /* Commit read means updating submission pointer */ - smp_store_release(&rb->subm, pblk_rb_ptr_wrap(rb, subm, nr_entries)); - - return subm; -} - -static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int to_update) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - struct pblk_line *line; - struct pblk_rb_entry *entry; - struct pblk_w_ctx *w_ctx; - unsigned int user_io = 0, gc_io = 0; - unsigned int i; - int flags; - - for (i = 0; i < to_update; i++) { - entry = &rb->entries[rb->l2p_update]; - w_ctx = &entry->w_ctx; - - flags = READ_ONCE(entry->w_ctx.flags); - if (flags & PBLK_IOTYPE_USER) - user_io++; - else if (flags & PBLK_IOTYPE_GC) - gc_io++; - else - WARN(1, "pblk: unknown IO type\n"); - - pblk_update_map_dev(pblk, w_ctx->lba, w_ctx->ppa, - entry->cacheline); - - line = pblk_ppa_to_line(pblk, w_ctx->ppa); - atomic_dec(&line->sec_to_update); - kref_put(&line->ref, pblk_line_put); - clean_wctx(w_ctx); - rb->l2p_update = pblk_rb_ptr_wrap(rb, rb->l2p_update, 1); - } - - pblk_rl_out(&pblk->rl, user_io, gc_io); - - return 0; -} - -/* - * When we move the l2p_update pointer, we update the l2p table - lookups will - * point to the physical address instead of to the cacheline in the write buffer - * from this moment on. 
- */ -static int pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int nr_entries, - unsigned int mem, unsigned int sync) -{ - unsigned int space, count; - int ret = 0; - - lockdep_assert_held(&rb->w_lock); - - /* Update l2p only as buffer entries are being overwritten */ - space = pblk_rb_ring_space(rb, mem, rb->l2p_update, rb->nr_entries); - if (space > nr_entries) - goto out; - - count = nr_entries - space; - /* l2p_update used exclusively under rb->w_lock */ - ret = __pblk_rb_update_l2p(rb, count); - -out: - return ret; -} - -/* - * Update the l2p entry for all sectors stored on the write buffer. This means - * that all future lookups to the l2p table will point to a device address, not - * to the cacheline in the write buffer. - */ -void pblk_rb_sync_l2p(struct pblk_rb *rb) -{ - unsigned int sync; - unsigned int to_update; - - spin_lock(&rb->w_lock); - - /* Protect from reads and writes */ - sync = smp_load_acquire(&rb->sync); - - to_update = pblk_rb_ring_count(sync, rb->l2p_update, rb->nr_entries); - __pblk_rb_update_l2p(rb, to_update); - - spin_unlock(&rb->w_lock); -} - -/* - * Write @nr_entries to ring buffer from @data buffer if there is enough space. - * Typically, 4KB data chunks coming from a bio will be copied to the ring - * buffer, thus the write will fail if not all incoming data can be copied. - * - */ -static void __pblk_rb_write_entry(struct pblk_rb *rb, void *data, - struct pblk_w_ctx w_ctx, - struct pblk_rb_entry *entry) -{ - memcpy(entry->data, data, rb->seg_size); - - entry->w_ctx.lba = w_ctx.lba; - entry->w_ctx.ppa = w_ctx.ppa; -} - -void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data, - struct pblk_w_ctx w_ctx, unsigned int ring_pos) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - struct pblk_rb_entry *entry; - int flags; - - entry = &rb->entries[ring_pos]; - flags = READ_ONCE(entry->w_ctx.flags); -#ifdef CONFIG_NVM_PBLK_DEBUG - /* Caller must guarantee that the entry is free */ - BUG_ON(!(flags & PBLK_WRITABLE_ENTRY)); -#endif - - __pblk_rb_write_entry(rb, data, w_ctx, entry); - - pblk_update_map_cache(pblk, w_ctx.lba, entry->cacheline); - flags = w_ctx.flags | PBLK_WRITTEN_DATA; - - /* Release flags on write context. Protect from writes */ - smp_store_release(&entry->w_ctx.flags, flags); -} - -void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, - struct pblk_w_ctx w_ctx, struct pblk_line *line, - u64 paddr, unsigned int ring_pos) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - struct pblk_rb_entry *entry; - int flags; - - entry = &rb->entries[ring_pos]; - flags = READ_ONCE(entry->w_ctx.flags); -#ifdef CONFIG_NVM_PBLK_DEBUG - /* Caller must guarantee that the entry is free */ - BUG_ON(!(flags & PBLK_WRITABLE_ENTRY)); -#endif - - __pblk_rb_write_entry(rb, data, w_ctx, entry); - - if (!pblk_update_map_gc(pblk, w_ctx.lba, entry->cacheline, line, paddr)) - entry->w_ctx.lba = ADDR_EMPTY; - - flags = w_ctx.flags | PBLK_WRITTEN_DATA; - - /* Release flags on write context. Protect from writes */ - smp_store_release(&entry->w_ctx.flags, flags); -} - -static int pblk_rb_flush_point_set(struct pblk_rb *rb, struct bio *bio, - unsigned int pos) -{ - struct pblk_rb_entry *entry; - unsigned int sync, flush_point; - - pblk_rb_sync_init(rb, NULL); - sync = READ_ONCE(rb->sync); - - if (pos == sync) { - pblk_rb_sync_end(rb, NULL); - return 0; - } - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_inc(&rb->inflight_flush_point); -#endif - - flush_point = (pos == 0) ? 
(rb->nr_entries - 1) : (pos - 1); - entry = &rb->entries[flush_point]; - - /* Protect flush points */ - smp_store_release(&rb->flush_point, flush_point); - - if (bio) - bio_list_add(&entry->w_ctx.bios, bio); - - pblk_rb_sync_end(rb, NULL); - - return bio ? 1 : 0; -} - -static int __pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries, - unsigned int *pos) -{ - unsigned int mem; - unsigned int sync; - unsigned int threshold; - - sync = READ_ONCE(rb->sync); - mem = READ_ONCE(rb->mem); - - threshold = nr_entries + rb->back_thres; - - if (pblk_rb_ring_space(rb, mem, sync, rb->nr_entries) < threshold) - return 0; - - if (pblk_rb_update_l2p(rb, nr_entries, mem, sync)) - return 0; - - *pos = mem; - - return 1; -} - -static int pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries, - unsigned int *pos) -{ - if (!__pblk_rb_may_write(rb, nr_entries, pos)) - return 0; - - /* Protect from read count */ - smp_store_release(&rb->mem, pblk_rb_ptr_wrap(rb, *pos, nr_entries)); - return 1; -} - -void pblk_rb_flush(struct pblk_rb *rb) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - unsigned int mem = READ_ONCE(rb->mem); - - if (pblk_rb_flush_point_set(rb, NULL, mem)) - return; - - pblk_write_kick(pblk); -} - -static int pblk_rb_may_write_flush(struct pblk_rb *rb, unsigned int nr_entries, - unsigned int *pos, struct bio *bio, - int *io_ret) -{ - unsigned int mem; - - if (!__pblk_rb_may_write(rb, nr_entries, pos)) - return 0; - - mem = pblk_rb_ptr_wrap(rb, *pos, nr_entries); - *io_ret = NVM_IO_DONE; - - if (bio->bi_opf & REQ_PREFLUSH) { - struct pblk *pblk = container_of(rb, struct pblk, rwb); - - atomic64_inc(&pblk->nr_flush); - if (pblk_rb_flush_point_set(&pblk->rwb, bio, mem)) - *io_ret = NVM_IO_OK; - } - - /* Protect from read count */ - smp_store_release(&rb->mem, mem); - - return 1; -} - -/* - * Atomically check that (i) there is space on the write buffer for the - * incoming I/O, and (ii) the current I/O type has enough budget in the write - * buffer (rate-limiter). - */ -int pblk_rb_may_write_user(struct pblk_rb *rb, struct bio *bio, - unsigned int nr_entries, unsigned int *pos) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - int io_ret; - - spin_lock(&rb->w_lock); - io_ret = pblk_rl_user_may_insert(&pblk->rl, nr_entries); - if (io_ret) { - spin_unlock(&rb->w_lock); - return io_ret; - } - - if (!pblk_rb_may_write_flush(rb, nr_entries, pos, bio, &io_ret)) { - spin_unlock(&rb->w_lock); - return NVM_IO_REQUEUE; - } - - pblk_rl_user_in(&pblk->rl, nr_entries); - spin_unlock(&rb->w_lock); - - return io_ret; -} - -/* - * Look at pblk_rb_may_write_user comment - */ -int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries, - unsigned int *pos) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - - spin_lock(&rb->w_lock); - if (!pblk_rl_gc_may_insert(&pblk->rl, nr_entries)) { - spin_unlock(&rb->w_lock); - return 0; - } - - if (!pblk_rb_may_write(rb, nr_entries, pos)) { - spin_unlock(&rb->w_lock); - return 0; - } - - pblk_rl_gc_in(&pblk->rl, nr_entries); - spin_unlock(&rb->w_lock); - - return 1; -} - -/* - * Read available entries on rb and add them to the given bio. To avoid a memory - * copy, a page reference to the write buffer is used to be added to the bio. - * - * This function is used by the write thread to form the write bio that will - * persist data on the write buffer to the media. 
- */ -unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd, - unsigned int pos, unsigned int nr_entries, - unsigned int count) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - struct request_queue *q = pblk->dev->q; - struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); - struct bio *bio = rqd->bio; - struct pblk_rb_entry *entry; - struct page *page; - unsigned int pad = 0, to_read = nr_entries; - unsigned int i; - int flags; - - if (count < nr_entries) { - pad = nr_entries - count; - to_read = count; - } - - /* Add space for packed metadata if in use*/ - pad += (pblk->min_write_pgs - pblk->min_write_pgs_data); - - c_ctx->sentry = pos; - c_ctx->nr_valid = to_read; - c_ctx->nr_padded = pad; - - for (i = 0; i < to_read; i++) { - entry = &rb->entries[pos]; - - /* A write has been allowed into the buffer, but data is still - * being copied to it. It is ok to busy wait. - */ -try: - flags = READ_ONCE(entry->w_ctx.flags); - if (!(flags & PBLK_WRITTEN_DATA)) { - io_schedule(); - goto try; - } - - page = virt_to_page(entry->data); - if (!page) { - pblk_err(pblk, "could not allocate write bio page\n"); - flags &= ~PBLK_WRITTEN_DATA; - flags |= PBLK_SUBMITTED_ENTRY; - /* Release flags on context. Protect from writes */ - smp_store_release(&entry->w_ctx.flags, flags); - return NVM_IO_ERR; - } - - if (bio_add_pc_page(q, bio, page, rb->seg_size, 0) != - rb->seg_size) { - pblk_err(pblk, "could not add page to write bio\n"); - flags &= ~PBLK_WRITTEN_DATA; - flags |= PBLK_SUBMITTED_ENTRY; - /* Release flags on context. Protect from writes */ - smp_store_release(&entry->w_ctx.flags, flags); - return NVM_IO_ERR; - } - - flags &= ~PBLK_WRITTEN_DATA; - flags |= PBLK_SUBMITTED_ENTRY; - - /* Release flags on context. Protect from writes */ - smp_store_release(&entry->w_ctx.flags, flags); - - pos = pblk_rb_ptr_wrap(rb, pos, 1); - } - - if (pad) { - if (pblk_bio_add_pages(pblk, bio, GFP_KERNEL, pad)) { - pblk_err(pblk, "could not pad page in write bio\n"); - return NVM_IO_ERR; - } - - if (pad < pblk->min_write_pgs) - atomic64_inc(&pblk->pad_dist[pad - 1]); - else - pblk_warn(pblk, "padding more than min. sectors\n"); - - atomic64_add(pad, &pblk->pad_wa); - } - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(pad, &pblk->padded_writes); -#endif - - return NVM_IO_OK; -} - -/* - * Copy to bio only if the lba matches the one on the given cache entry. - * Otherwise, it means that the entry has been overwritten, and the bio should - * be directed to disk. 
- */ -int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba, - struct ppa_addr ppa) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - struct pblk_rb_entry *entry; - struct pblk_w_ctx *w_ctx; - struct ppa_addr l2p_ppa; - u64 pos = pblk_addr_to_cacheline(ppa); - void *data; - int flags; - int ret = 1; - - -#ifdef CONFIG_NVM_PBLK_DEBUG - /* Caller must ensure that the access will not cause an overflow */ - BUG_ON(pos >= rb->nr_entries); -#endif - entry = &rb->entries[pos]; - w_ctx = &entry->w_ctx; - flags = READ_ONCE(w_ctx->flags); - - spin_lock(&rb->w_lock); - spin_lock(&pblk->trans_lock); - l2p_ppa = pblk_trans_map_get(pblk, lba); - spin_unlock(&pblk->trans_lock); - - /* Check if the entry has been overwritten or is scheduled to be */ - if (!pblk_ppa_comp(l2p_ppa, ppa) || w_ctx->lba != lba || - flags & PBLK_WRITABLE_ENTRY) { - ret = 0; - goto out; - } - data = bio_data(bio); - memcpy(data, entry->data, rb->seg_size); - -out: - spin_unlock(&rb->w_lock); - return ret; -} - -struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos) -{ - unsigned int entry = pblk_rb_ptr_wrap(rb, pos, 0); - - return &rb->entries[entry].w_ctx; -} - -unsigned int pblk_rb_sync_init(struct pblk_rb *rb, unsigned long *flags) - __acquires(&rb->s_lock) -{ - if (flags) - spin_lock_irqsave(&rb->s_lock, *flags); - else - spin_lock_irq(&rb->s_lock); - - return rb->sync; -} - -void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags) - __releases(&rb->s_lock) -{ - lockdep_assert_held(&rb->s_lock); - - if (flags) - spin_unlock_irqrestore(&rb->s_lock, *flags); - else - spin_unlock_irq(&rb->s_lock); -} - -unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries) -{ - unsigned int sync, flush_point; - lockdep_assert_held(&rb->s_lock); - - sync = READ_ONCE(rb->sync); - flush_point = READ_ONCE(rb->flush_point); - - if (flush_point != EMPTY_ENTRY) { - unsigned int secs_to_flush; - - secs_to_flush = pblk_rb_ring_count(flush_point, sync, - rb->nr_entries); - if (secs_to_flush < nr_entries) { - /* Protect flush points */ - smp_store_release(&rb->flush_point, EMPTY_ENTRY); - } - } - - sync = pblk_rb_ptr_wrap(rb, sync, nr_entries); - - /* Protect from counts */ - smp_store_release(&rb->sync, sync); - - return sync; -} - -/* Calculate how many sectors to submit up to the current flush point. */ -unsigned int pblk_rb_flush_point_count(struct pblk_rb *rb) -{ - unsigned int subm, sync, flush_point; - unsigned int submitted, to_flush; - - /* Protect flush points */ - flush_point = smp_load_acquire(&rb->flush_point); - if (flush_point == EMPTY_ENTRY) - return 0; - - /* Protect syncs */ - sync = smp_load_acquire(&rb->sync); - - subm = READ_ONCE(rb->subm); - submitted = pblk_rb_ring_count(subm, sync, rb->nr_entries); - - /* The sync point itself counts as a sector to sync */ - to_flush = pblk_rb_ring_count(flush_point, sync, rb->nr_entries) + 1; - - return (submitted < to_flush) ? 
(to_flush - submitted) : 0; -} - -int pblk_rb_tear_down_check(struct pblk_rb *rb) -{ - struct pblk_rb_entry *entry; - int i; - int ret = 0; - - spin_lock(&rb->w_lock); - spin_lock_irq(&rb->s_lock); - - if ((rb->mem == rb->subm) && (rb->subm == rb->sync) && - (rb->sync == rb->l2p_update) && - (rb->flush_point == EMPTY_ENTRY)) { - goto out; - } - - if (!rb->entries) { - ret = 1; - goto out; - } - - for (i = 0; i < rb->nr_entries; i++) { - entry = &rb->entries[i]; - - if (!entry->data) { - ret = 1; - goto out; - } - } - -out: - spin_unlock_irq(&rb->s_lock); - spin_unlock(&rb->w_lock); - - return ret; -} - -unsigned int pblk_rb_wrap_pos(struct pblk_rb *rb, unsigned int pos) -{ - return (pos & (rb->nr_entries - 1)); -} - -int pblk_rb_pos_oob(struct pblk_rb *rb, u64 pos) -{ - return (pos >= rb->nr_entries); -} - -ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - struct pblk_c_ctx *c; - ssize_t offset; - int queued_entries = 0; - - spin_lock_irq(&rb->s_lock); - list_for_each_entry(c, &pblk->compl_list, list) - queued_entries++; - spin_unlock_irq(&rb->s_lock); - - if (rb->flush_point != EMPTY_ENTRY) - offset = scnprintf(buf, PAGE_SIZE, - "%u\t%u\t%u\t%u\t%u\t%u\t%u - %u/%u/%u - %d\n", - rb->nr_entries, - rb->mem, - rb->subm, - rb->sync, - rb->l2p_update, -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_read(&rb->inflight_flush_point), -#else - 0, -#endif - rb->flush_point, - pblk_rb_read_count(rb), - pblk_rb_space(rb), - pblk_rb_flush_point_count(rb), - queued_entries); - else - offset = scnprintf(buf, PAGE_SIZE, - "%u\t%u\t%u\t%u\t%u\t%u\tNULL - %u/%u/%u - %d\n", - rb->nr_entries, - rb->mem, - rb->subm, - rb->sync, - rb->l2p_update, -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_read(&rb->inflight_flush_point), -#else - 0, -#endif - pblk_rb_read_count(rb), - pblk_rb_space(rb), - pblk_rb_flush_point_count(rb), - queued_entries); - - return offset; -} diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c deleted file mode 100644 index c28537a489bc..000000000000 --- a/drivers/lightnvm/pblk-read.c +++ /dev/null @@ -1,474 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * pblk-read.c - pblk's read path - */ - -#include "pblk.h" - -/* - * There is no guarantee that the value read from cache has not been updated and - * resides at another location in the cache. We guarantee though that if the - * value is read from the cache, it belongs to the mapped lba. In order to - * guarantee and order between writes and reads are ordered, a flush must be - * issued. 
- */ -static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio, - sector_t lba, struct ppa_addr ppa) -{ -#ifdef CONFIG_NVM_PBLK_DEBUG - /* Callers must ensure that the ppa points to a cache address */ - BUG_ON(pblk_ppa_empty(ppa)); - BUG_ON(!pblk_addr_in_cache(ppa)); -#endif - - return pblk_rb_copy_to_bio(&pblk->rwb, bio, lba, ppa); -} - -static int pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd, - struct bio *bio, sector_t blba, - bool *from_cache) -{ - void *meta_list = rqd->meta_list; - int nr_secs, i; - -retry: - nr_secs = pblk_lookup_l2p_seq(pblk, rqd->ppa_list, blba, rqd->nr_ppas, - from_cache); - - if (!*from_cache) - goto end; - - for (i = 0; i < nr_secs; i++) { - struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i); - sector_t lba = blba + i; - - if (pblk_ppa_empty(rqd->ppa_list[i])) { - __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); - - meta->lba = addr_empty; - } else if (pblk_addr_in_cache(rqd->ppa_list[i])) { - /* - * Try to read from write buffer. The address is later - * checked on the write buffer to prevent retrieving - * overwritten data. - */ - if (!pblk_read_from_cache(pblk, bio, lba, - rqd->ppa_list[i])) { - if (i == 0) { - /* - * We didn't call with bio_advance() - * yet, so we can just retry. - */ - goto retry; - } else { - /* - * We already call bio_advance() - * so we cannot retry and we need - * to quit that function in order - * to allow caller to handle the bio - * splitting in the current sector - * position. - */ - nr_secs = i; - goto end; - } - } - meta->lba = cpu_to_le64(lba); -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_inc(&pblk->cache_reads); -#endif - } - bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE); - } - -end: - if (pblk_io_aligned(pblk, nr_secs)) - rqd->is_seq = 1; - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(nr_secs, &pblk->inflight_reads); -#endif - - return nr_secs; -} - - -static void pblk_read_check_seq(struct pblk *pblk, struct nvm_rq *rqd, - sector_t blba) -{ - void *meta_list = rqd->meta_list; - int nr_lbas = rqd->nr_ppas; - int i; - - if (!pblk_is_oob_meta_supported(pblk)) - return; - - for (i = 0; i < nr_lbas; i++) { - struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i); - u64 lba = le64_to_cpu(meta->lba); - - if (lba == ADDR_EMPTY) - continue; - - if (lba != blba + i) { -#ifdef CONFIG_NVM_PBLK_DEBUG - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - - print_ppa(pblk, &ppa_list[i], "seq", i); -#endif - pblk_err(pblk, "corrupted read LBA (%llu/%llu)\n", - lba, (u64)blba + i); - WARN_ON(1); - } - } -} - -/* - * There can be holes in the lba list. 
- */ -static void pblk_read_check_rand(struct pblk *pblk, struct nvm_rq *rqd, - u64 *lba_list, int nr_lbas) -{ - void *meta_lba_list = rqd->meta_list; - int i, j; - - if (!pblk_is_oob_meta_supported(pblk)) - return; - - for (i = 0, j = 0; i < nr_lbas; i++) { - struct pblk_sec_meta *meta = pblk_get_meta(pblk, - meta_lba_list, j); - u64 lba = lba_list[i]; - u64 meta_lba; - - if (lba == ADDR_EMPTY) - continue; - - meta_lba = le64_to_cpu(meta->lba); - - if (lba != meta_lba) { -#ifdef CONFIG_NVM_PBLK_DEBUG - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - - print_ppa(pblk, &ppa_list[j], "rnd", j); -#endif - pblk_err(pblk, "corrupted read LBA (%llu/%llu)\n", - meta_lba, lba); - WARN_ON(1); - } - - j++; - } - - WARN_ONCE(j != rqd->nr_ppas, "pblk: corrupted random request\n"); -} - -static void pblk_end_user_read(struct bio *bio, int error) -{ - if (error && error != NVM_RSP_WARN_HIGHECC) - bio_io_error(bio); - else - bio_endio(bio); -} - -static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd, - bool put_line) -{ - struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); - struct bio *int_bio = rqd->bio; - unsigned long start_time = r_ctx->start_time; - - bio_end_io_acct(int_bio, start_time); - - if (rqd->error) - pblk_log_read_err(pblk, rqd); - - pblk_read_check_seq(pblk, rqd, r_ctx->lba); - bio_put(int_bio); - - if (put_line) - pblk_rq_to_line_put(pblk, rqd); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(rqd->nr_ppas, &pblk->sync_reads); - atomic_long_sub(rqd->nr_ppas, &pblk->inflight_reads); -#endif - - pblk_free_rqd(pblk, rqd, PBLK_READ); - atomic_dec(&pblk->inflight_io); -} - -static void pblk_end_io_read(struct nvm_rq *rqd) -{ - struct pblk *pblk = rqd->private; - struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); - struct bio *bio = (struct bio *)r_ctx->private; - - pblk_end_user_read(bio, rqd->error); - __pblk_end_io_read(pblk, rqd, true); -} - -static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio, - sector_t lba, bool *from_cache) -{ - struct pblk_sec_meta *meta = pblk_get_meta(pblk, rqd->meta_list, 0); - struct ppa_addr ppa; - - pblk_lookup_l2p_seq(pblk, &ppa, lba, 1, from_cache); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_inc(&pblk->inflight_reads); -#endif - -retry: - if (pblk_ppa_empty(ppa)) { - __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); - - meta->lba = addr_empty; - return; - } - - /* Try to read from write buffer. The address is later checked on the - * write buffer to prevent retrieving overwritten data. 
- */ - if (pblk_addr_in_cache(ppa)) { - if (!pblk_read_from_cache(pblk, bio, lba, ppa)) { - pblk_lookup_l2p_seq(pblk, &ppa, lba, 1, from_cache); - goto retry; - } - - meta->lba = cpu_to_le64(lba); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_inc(&pblk->cache_reads); -#endif - } else { - rqd->ppa_addr = ppa; - } -} - -void pblk_submit_read(struct pblk *pblk, struct bio *bio) -{ - sector_t blba = pblk_get_lba(bio); - unsigned int nr_secs = pblk_get_secs(bio); - bool from_cache; - struct pblk_g_ctx *r_ctx; - struct nvm_rq *rqd; - struct bio *int_bio, *split_bio; - unsigned long start_time; - - start_time = bio_start_io_acct(bio); - - rqd = pblk_alloc_rqd(pblk, PBLK_READ); - - rqd->opcode = NVM_OP_PREAD; - rqd->nr_ppas = nr_secs; - rqd->private = pblk; - rqd->end_io = pblk_end_io_read; - - r_ctx = nvm_rq_to_pdu(rqd); - r_ctx->start_time = start_time; - r_ctx->lba = blba; - - if (pblk_alloc_rqd_meta(pblk, rqd)) { - bio_io_error(bio); - pblk_free_rqd(pblk, rqd, PBLK_READ); - return; - } - - /* Clone read bio to deal internally with: - * -read errors when reading from drive - * -bio_advance() calls during cache reads - */ - int_bio = bio_clone_fast(bio, GFP_KERNEL, &pblk_bio_set); - - if (nr_secs > 1) - nr_secs = pblk_read_ppalist_rq(pblk, rqd, int_bio, blba, - &from_cache); - else - pblk_read_rq(pblk, rqd, int_bio, blba, &from_cache); - -split_retry: - r_ctx->private = bio; /* original bio */ - rqd->bio = int_bio; /* internal bio */ - - if (from_cache && nr_secs == rqd->nr_ppas) { - /* All data was read from cache, we can complete the IO. */ - pblk_end_user_read(bio, 0); - atomic_inc(&pblk->inflight_io); - __pblk_end_io_read(pblk, rqd, false); - } else if (nr_secs != rqd->nr_ppas) { - /* The read bio request could be partially filled by the write - * buffer, but there are some holes that need to be read from - * the drive. In order to handle this, we will use block layer - * mechanism to split this request in to smaller ones and make - * a chain of it. - */ - split_bio = bio_split(bio, nr_secs * NR_PHY_IN_LOG, GFP_KERNEL, - &pblk_bio_set); - bio_chain(split_bio, bio); - submit_bio_noacct(bio); - - /* New bio contains first N sectors of the previous one, so - * we can continue to use existing rqd, but we need to shrink - * the number of PPAs in it. New bio is also guaranteed that - * it contains only either data from cache or from drive, newer - * mix of them. - */ - bio = split_bio; - rqd->nr_ppas = nr_secs; - if (rqd->nr_ppas == 1) - rqd->ppa_addr = rqd->ppa_list[0]; - - /* Recreate int_bio - existing might have some needed internal - * fields modified already. 
- */ - bio_put(int_bio); - int_bio = bio_clone_fast(bio, GFP_KERNEL, &pblk_bio_set); - goto split_retry; - } else if (pblk_submit_io(pblk, rqd, NULL)) { - /* Submitting IO to drive failed, let's report an error */ - rqd->error = -ENODEV; - pblk_end_io_read(rqd); - } -} - -static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, - struct pblk_line *line, u64 *lba_list, - u64 *paddr_list_gc, unsigned int nr_secs) -{ - struct ppa_addr ppa_list_l2p[NVM_MAX_VLBA]; - struct ppa_addr ppa_gc; - int valid_secs = 0; - int i; - - pblk_lookup_l2p_rand(pblk, ppa_list_l2p, lba_list, nr_secs); - - for (i = 0; i < nr_secs; i++) { - if (lba_list[i] == ADDR_EMPTY) - continue; - - ppa_gc = addr_to_gen_ppa(pblk, paddr_list_gc[i], line->id); - if (!pblk_ppa_comp(ppa_list_l2p[i], ppa_gc)) { - paddr_list_gc[i] = lba_list[i] = ADDR_EMPTY; - continue; - } - - rqd->ppa_list[valid_secs++] = ppa_list_l2p[i]; - } - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(valid_secs, &pblk->inflight_reads); -#endif - - return valid_secs; -} - -static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, - struct pblk_line *line, sector_t lba, - u64 paddr_gc) -{ - struct ppa_addr ppa_l2p, ppa_gc; - int valid_secs = 0; - - if (lba == ADDR_EMPTY) - goto out; - - /* logic error: lba out-of-bounds */ - if (lba >= pblk->capacity) { - WARN(1, "pblk: read lba out of bounds\n"); - goto out; - } - - spin_lock(&pblk->trans_lock); - ppa_l2p = pblk_trans_map_get(pblk, lba); - spin_unlock(&pblk->trans_lock); - - ppa_gc = addr_to_gen_ppa(pblk, paddr_gc, line->id); - if (!pblk_ppa_comp(ppa_l2p, ppa_gc)) - goto out; - - rqd->ppa_addr = ppa_l2p; - valid_secs = 1; - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_inc(&pblk->inflight_reads); -#endif - -out: - return valid_secs; -} - -int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq) -{ - struct nvm_rq rqd; - int ret = NVM_IO_OK; - - memset(&rqd, 0, sizeof(struct nvm_rq)); - - ret = pblk_alloc_rqd_meta(pblk, &rqd); - if (ret) - return ret; - - if (gc_rq->nr_secs > 1) { - gc_rq->secs_to_gc = read_ppalist_rq_gc(pblk, &rqd, gc_rq->line, - gc_rq->lba_list, - gc_rq->paddr_list, - gc_rq->nr_secs); - if (gc_rq->secs_to_gc == 1) - rqd.ppa_addr = rqd.ppa_list[0]; - } else { - gc_rq->secs_to_gc = read_rq_gc(pblk, &rqd, gc_rq->line, - gc_rq->lba_list[0], - gc_rq->paddr_list[0]); - } - - if (!(gc_rq->secs_to_gc)) - goto out; - - rqd.opcode = NVM_OP_PREAD; - rqd.nr_ppas = gc_rq->secs_to_gc; - - if (pblk_submit_io_sync(pblk, &rqd, gc_rq->data)) { - ret = -EIO; - goto err_free_dma; - } - - pblk_read_check_rand(pblk, &rqd, gc_rq->lba_list, gc_rq->nr_secs); - - atomic_dec(&pblk->inflight_io); - - if (rqd.error) { - atomic_long_inc(&pblk->read_failed_gc); -#ifdef CONFIG_NVM_PBLK_DEBUG - pblk_print_failed_rqd(pblk, &rqd, rqd.error); -#endif - } - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(gc_rq->secs_to_gc, &pblk->sync_reads); - atomic_long_add(gc_rq->secs_to_gc, &pblk->recov_gc_reads); - atomic_long_sub(gc_rq->secs_to_gc, &pblk->inflight_reads); -#endif - -out: - pblk_free_rqd_meta(pblk, &rqd); - return ret; - -err_free_dma: - pblk_free_rqd_meta(pblk, &rqd); - return ret; -} diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c deleted file mode 100644 index 0e6f0c76e930..000000000000 --- a/drivers/lightnvm/pblk-recovery.c +++ /dev/null @@ -1,874 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial: Javier Gonzalez - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of 
the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * pblk-recovery.c - pblk's recovery path - * - * The L2P recovery path is single threaded as the L2P table is updated in order - * following the line sequence ID. - */ - -#include "pblk.h" -#include "pblk-trace.h" - -int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta_buf) -{ - u32 crc; - - crc = pblk_calc_emeta_crc(pblk, emeta_buf); - if (le32_to_cpu(emeta_buf->crc) != crc) - return 1; - - if (le32_to_cpu(emeta_buf->header.identifier) != PBLK_MAGIC) - return 1; - - return 0; -} - -static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_emeta *emeta = line->emeta; - struct line_emeta *emeta_buf = emeta->buf; - __le64 *lba_list; - u64 data_start, data_end; - u64 nr_valid_lbas, nr_lbas = 0; - u64 i; - - lba_list = emeta_to_lbas(pblk, emeta_buf); - if (!lba_list) - return 1; - - data_start = pblk_line_smeta_start(pblk, line) + lm->smeta_sec; - data_end = line->emeta_ssec; - nr_valid_lbas = le64_to_cpu(emeta_buf->nr_valid_lbas); - - for (i = data_start; i < data_end; i++) { - struct ppa_addr ppa; - int pos; - - ppa = addr_to_gen_ppa(pblk, i, line->id); - pos = pblk_ppa_to_pos(geo, ppa); - - /* Do not update bad blocks */ - if (test_bit(pos, line->blk_bitmap)) - continue; - - if (le64_to_cpu(lba_list[i]) == ADDR_EMPTY) { - spin_lock(&line->lock); - if (test_and_set_bit(i, line->invalid_bitmap)) - WARN_ONCE(1, "pblk: rec. double invalidate:\n"); - else - le32_add_cpu(line->vsc, -1); - spin_unlock(&line->lock); - - continue; - } - - pblk_update_map(pblk, le64_to_cpu(lba_list[i]), ppa); - nr_lbas++; - } - - if (nr_valid_lbas != nr_lbas) - pblk_err(pblk, "line %d - inconsistent lba list(%llu/%llu)\n", - line->id, nr_valid_lbas, nr_lbas); - - line->left_msecs = 0; - - return 0; -} - -static void pblk_update_line_wp(struct pblk *pblk, struct pblk_line *line, - u64 written_secs) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - int i; - - for (i = 0; i < written_secs; i += pblk->min_write_pgs) - __pblk_alloc_page(pblk, line, pblk->min_write_pgs); - - spin_lock(&l_mg->free_lock); - if (written_secs > line->left_msecs) { - /* - * We have all data sectors written - * and some emeta sectors written too. - */ - line->left_msecs = 0; - } else { - /* We have only some data sectors written. 
*/ - line->left_msecs -= written_secs; - } - spin_unlock(&l_mg->free_lock); -} - -static u64 pblk_sec_in_open_line(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - int nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line); - u64 written_secs = 0; - int valid_chunks = 0; - int i; - - for (i = 0; i < lm->blk_per_line; i++) { - struct nvm_chk_meta *chunk = &line->chks[i]; - - if (chunk->state & NVM_CHK_ST_OFFLINE) - continue; - - written_secs += chunk->wp; - valid_chunks++; - } - - if (lm->blk_per_line - nr_bb != valid_chunks) - pblk_err(pblk, "recovery line %d is bad\n", line->id); - - pblk_update_line_wp(pblk, line, written_secs - lm->smeta_sec); - - return written_secs; -} - -struct pblk_recov_alloc { - struct ppa_addr *ppa_list; - void *meta_list; - struct nvm_rq *rqd; - void *data; - dma_addr_t dma_ppa_list; - dma_addr_t dma_meta_list; -}; - -static void pblk_recov_complete(struct kref *ref) -{ - struct pblk_pad_rq *pad_rq = container_of(ref, struct pblk_pad_rq, ref); - - complete(&pad_rq->wait); -} - -static void pblk_end_io_recov(struct nvm_rq *rqd) -{ - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - struct pblk_pad_rq *pad_rq = rqd->private; - struct pblk *pblk = pad_rq->pblk; - - pblk_up_chunk(pblk, ppa_list[0]); - - pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); - - atomic_dec(&pblk->inflight_io); - kref_put(&pad_rq->ref, pblk_recov_complete); -} - -/* pad line using line bitmap. */ -static int pblk_recov_pad_line(struct pblk *pblk, struct pblk_line *line, - int left_ppas) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - void *meta_list; - struct pblk_pad_rq *pad_rq; - struct nvm_rq *rqd; - struct ppa_addr *ppa_list; - void *data; - __le64 *lba_list = emeta_to_lbas(pblk, line->emeta->buf); - u64 w_ptr = line->cur_sec; - int left_line_ppas, rq_ppas; - int i, j; - int ret = 0; - - spin_lock(&line->lock); - left_line_ppas = line->left_msecs; - spin_unlock(&line->lock); - - pad_rq = kmalloc(sizeof(struct pblk_pad_rq), GFP_KERNEL); - if (!pad_rq) - return -ENOMEM; - - data = vzalloc(array_size(pblk->max_write_pgs, geo->csecs)); - if (!data) { - ret = -ENOMEM; - goto free_rq; - } - - pad_rq->pblk = pblk; - init_completion(&pad_rq->wait); - kref_init(&pad_rq->ref); - -next_pad_rq: - rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false); - if (rq_ppas < pblk->min_write_pgs) { - pblk_err(pblk, "corrupted pad line %d\n", line->id); - goto fail_complete; - } - - rqd = pblk_alloc_rqd(pblk, PBLK_WRITE_INT); - - ret = pblk_alloc_rqd_meta(pblk, rqd); - if (ret) { - pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); - goto fail_complete; - } - - rqd->bio = NULL; - rqd->opcode = NVM_OP_PWRITE; - rqd->is_seq = 1; - rqd->nr_ppas = rq_ppas; - rqd->end_io = pblk_end_io_recov; - rqd->private = pad_rq; - - ppa_list = nvm_rq_to_ppa_list(rqd); - meta_list = rqd->meta_list; - - for (i = 0; i < rqd->nr_ppas; ) { - struct ppa_addr ppa; - int pos; - - w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs); - ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); - pos = pblk_ppa_to_pos(geo, ppa); - - while (test_bit(pos, line->blk_bitmap)) { - w_ptr += pblk->min_write_pgs; - ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); - pos = pblk_ppa_to_pos(geo, ppa); - } - - for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++) { - struct ppa_addr dev_ppa; - struct pblk_sec_meta *meta; - __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); - - dev_ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); - - pblk_map_invalidate(pblk, dev_ppa); - lba_list[w_ptr] = addr_empty; - meta = 
pblk_get_meta(pblk, meta_list, i); - meta->lba = addr_empty; - ppa_list[i] = dev_ppa; - } - } - - kref_get(&pad_rq->ref); - pblk_down_chunk(pblk, ppa_list[0]); - - ret = pblk_submit_io(pblk, rqd, data); - if (ret) { - pblk_err(pblk, "I/O submission failed: %d\n", ret); - pblk_up_chunk(pblk, ppa_list[0]); - kref_put(&pad_rq->ref, pblk_recov_complete); - pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); - goto fail_complete; - } - - left_line_ppas -= rq_ppas; - left_ppas -= rq_ppas; - if (left_ppas && left_line_ppas) - goto next_pad_rq; - -fail_complete: - kref_put(&pad_rq->ref, pblk_recov_complete); - wait_for_completion(&pad_rq->wait); - - if (!pblk_line_is_full(line)) - pblk_err(pblk, "corrupted padded line: %d\n", line->id); - - vfree(data); -free_rq: - kfree(pad_rq); - return ret; -} - -static int pblk_pad_distance(struct pblk *pblk, struct pblk_line *line) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - int distance = geo->mw_cunits * geo->all_luns * geo->ws_opt; - - return (distance > line->left_msecs) ? line->left_msecs : distance; -} - -/* Return a chunk belonging to a line by stripe(write order) index */ -static struct nvm_chk_meta *pblk_get_stripe_chunk(struct pblk *pblk, - struct pblk_line *line, - int index) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_lun *rlun; - struct ppa_addr ppa; - int pos; - - rlun = &pblk->luns[index]; - ppa = rlun->bppa; - pos = pblk_ppa_to_pos(geo, ppa); - - return &line->chks[pos]; -} - -static int pblk_line_wps_are_unbalanced(struct pblk *pblk, - struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - int blk_in_line = lm->blk_per_line; - struct nvm_chk_meta *chunk; - u64 max_wp, min_wp; - int i; - - i = find_first_zero_bit(line->blk_bitmap, blk_in_line); - - /* If there is one or zero good chunks in the line, - * the write pointers can't be unbalanced. 
- */ - if (i >= (blk_in_line - 1)) - return 0; - - chunk = pblk_get_stripe_chunk(pblk, line, i); - max_wp = chunk->wp; - if (max_wp > pblk->max_write_pgs) - min_wp = max_wp - pblk->max_write_pgs; - else - min_wp = 0; - - i = find_next_zero_bit(line->blk_bitmap, blk_in_line, i + 1); - while (i < blk_in_line) { - chunk = pblk_get_stripe_chunk(pblk, line, i); - if (chunk->wp > max_wp || chunk->wp < min_wp) - return 1; - - i = find_next_zero_bit(line->blk_bitmap, blk_in_line, i + 1); - } - - return 0; -} - -static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line, - struct pblk_recov_alloc p) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct pblk_line_meta *lm = &pblk->lm; - struct nvm_geo *geo = &dev->geo; - struct ppa_addr *ppa_list; - void *meta_list; - struct nvm_rq *rqd; - void *data; - dma_addr_t dma_ppa_list, dma_meta_list; - __le64 *lba_list; - u64 paddr = pblk_line_smeta_start(pblk, line) + lm->smeta_sec; - bool padded = false; - int rq_ppas; - int i, j; - int ret; - u64 left_ppas = pblk_sec_in_open_line(pblk, line) - lm->smeta_sec; - - if (pblk_line_wps_are_unbalanced(pblk, line)) - pblk_warn(pblk, "recovering unbalanced line (%d)\n", line->id); - - ppa_list = p.ppa_list; - meta_list = p.meta_list; - rqd = p.rqd; - data = p.data; - dma_ppa_list = p.dma_ppa_list; - dma_meta_list = p.dma_meta_list; - - lba_list = emeta_to_lbas(pblk, line->emeta->buf); - -next_rq: - memset(rqd, 0, pblk_g_rq_size); - - rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false); - if (!rq_ppas) - rq_ppas = pblk->min_write_pgs; - -retry_rq: - rqd->bio = NULL; - rqd->opcode = NVM_OP_PREAD; - rqd->meta_list = meta_list; - rqd->nr_ppas = rq_ppas; - rqd->ppa_list = ppa_list; - rqd->dma_ppa_list = dma_ppa_list; - rqd->dma_meta_list = dma_meta_list; - ppa_list = nvm_rq_to_ppa_list(rqd); - - if (pblk_io_aligned(pblk, rq_ppas)) - rqd->is_seq = 1; - - for (i = 0; i < rqd->nr_ppas; ) { - struct ppa_addr ppa; - int pos; - - ppa = addr_to_gen_ppa(pblk, paddr, line->id); - pos = pblk_ppa_to_pos(geo, ppa); - - while (test_bit(pos, line->blk_bitmap)) { - paddr += pblk->min_write_pgs; - ppa = addr_to_gen_ppa(pblk, paddr, line->id); - pos = pblk_ppa_to_pos(geo, ppa); - } - - for (j = 0; j < pblk->min_write_pgs; j++, i++) - ppa_list[i] = - addr_to_gen_ppa(pblk, paddr + j, line->id); - } - - ret = pblk_submit_io_sync(pblk, rqd, data); - if (ret) { - pblk_err(pblk, "I/O submission failed: %d\n", ret); - return ret; - } - - atomic_dec(&pblk->inflight_io); - - /* If a read fails, do a best effort by padding the line and retrying */ - if (rqd->error && rqd->error != NVM_RSP_WARN_HIGHECC) { - int pad_distance, ret; - - if (padded) { - pblk_log_read_err(pblk, rqd); - return -EINTR; - } - - pad_distance = pblk_pad_distance(pblk, line); - ret = pblk_recov_pad_line(pblk, line, pad_distance); - if (ret) { - return ret; - } - - padded = true; - goto retry_rq; - } - - pblk_get_packed_meta(pblk, rqd); - - for (i = 0; i < rqd->nr_ppas; i++) { - struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i); - u64 lba = le64_to_cpu(meta->lba); - - lba_list[paddr++] = cpu_to_le64(lba); - - if (lba == ADDR_EMPTY || lba >= pblk->capacity) - continue; - - line->nr_valid_lbas++; - pblk_update_map(pblk, lba, ppa_list[i]); - } - - left_ppas -= rq_ppas; - if (left_ppas > 0) - goto next_rq; - -#ifdef CONFIG_NVM_PBLK_DEBUG - WARN_ON(padded && !pblk_line_is_full(line)); -#endif - - return 0; -} - -/* Scan line for lbas on out of bound area */ -static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line) -{ - struct 
nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct nvm_rq *rqd; - struct ppa_addr *ppa_list; - void *meta_list; - struct pblk_recov_alloc p; - void *data; - dma_addr_t dma_ppa_list, dma_meta_list; - int ret = 0; - - meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_meta_list); - if (!meta_list) - return -ENOMEM; - - ppa_list = (void *)(meta_list) + pblk_dma_meta_size(pblk); - dma_ppa_list = dma_meta_list + pblk_dma_meta_size(pblk); - - data = kcalloc(pblk->max_write_pgs, geo->csecs, GFP_KERNEL); - if (!data) { - ret = -ENOMEM; - goto free_meta_list; - } - - rqd = mempool_alloc(&pblk->r_rq_pool, GFP_KERNEL); - memset(rqd, 0, pblk_g_rq_size); - - p.ppa_list = ppa_list; - p.meta_list = meta_list; - p.rqd = rqd; - p.data = data; - p.dma_ppa_list = dma_ppa_list; - p.dma_meta_list = dma_meta_list; - - ret = pblk_recov_scan_oob(pblk, line, p); - if (ret) { - pblk_err(pblk, "could not recover L2P form OOB\n"); - goto out; - } - - if (pblk_line_is_full(line)) - pblk_line_recov_close(pblk, line); - -out: - mempool_free(rqd, &pblk->r_rq_pool); - kfree(data); -free_meta_list: - nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list); - - return ret; -} - -/* Insert lines ordered by sequence number (seq_num) on list */ -static void pblk_recov_line_add_ordered(struct list_head *head, - struct pblk_line *line) -{ - struct pblk_line *t = NULL; - - list_for_each_entry(t, head, list) - if (t->seq_nr > line->seq_nr) - break; - - __list_add(&line->list, t->list.prev, &t->list); -} - -static u64 pblk_line_emeta_start(struct pblk *pblk, struct pblk_line *line) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - unsigned int emeta_secs; - u64 emeta_start; - struct ppa_addr ppa; - int pos; - - emeta_secs = lm->emeta_sec[0]; - emeta_start = lm->sec_per_line; - - while (emeta_secs) { - emeta_start--; - ppa = addr_to_gen_ppa(pblk, emeta_start, line->id); - pos = pblk_ppa_to_pos(geo, ppa); - if (!test_bit(pos, line->blk_bitmap)) - emeta_secs--; - } - - return emeta_start; -} - -static int pblk_recov_check_line_version(struct pblk *pblk, - struct line_emeta *emeta) -{ - struct line_header *header = &emeta->header; - - if (header->version_major != EMETA_VERSION_MAJOR) { - pblk_err(pblk, "line major version mismatch: %d, expected: %d\n", - header->version_major, EMETA_VERSION_MAJOR); - return 1; - } - -#ifdef CONFIG_NVM_PBLK_DEBUG - if (header->version_minor > EMETA_VERSION_MINOR) - pblk_info(pblk, "newer line minor version found: %d\n", - header->version_minor); -#endif - - return 0; -} - -static void pblk_recov_wa_counters(struct pblk *pblk, - struct line_emeta *emeta) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct line_header *header = &emeta->header; - struct wa_counters *wa = emeta_to_wa(lm, emeta); - - /* WA counters were introduced in emeta version 0.2 */ - if (header->version_major > 0 || header->version_minor >= 2) { - u64 user = le64_to_cpu(wa->user); - u64 pad = le64_to_cpu(wa->pad); - u64 gc = le64_to_cpu(wa->gc); - - atomic64_set(&pblk->user_wa, user); - atomic64_set(&pblk->pad_wa, pad); - atomic64_set(&pblk->gc_wa, gc); - - pblk->user_rst_wa = user; - pblk->pad_rst_wa = pad; - pblk->gc_rst_wa = gc; - } -} - -static int pblk_line_was_written(struct pblk_line *line, - struct pblk *pblk) -{ - - struct pblk_line_meta *lm = &pblk->lm; - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct nvm_chk_meta *chunk; - struct ppa_addr bppa; - int smeta_blk; - - if (line->state == 
PBLK_LINESTATE_BAD) - return 0; - - smeta_blk = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line); - if (smeta_blk >= lm->blk_per_line) - return 0; - - bppa = pblk->luns[smeta_blk].bppa; - chunk = &line->chks[pblk_ppa_to_pos(geo, bppa)]; - - if (chunk->state & NVM_CHK_ST_CLOSED || - (chunk->state & NVM_CHK_ST_OPEN - && chunk->wp >= lm->smeta_sec)) - return 1; - - return 0; -} - -static bool pblk_line_is_open(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - int i; - - for (i = 0; i < lm->blk_per_line; i++) - if (line->chks[i].state & NVM_CHK_ST_OPEN) - return true; - - return false; -} - -struct pblk_line *pblk_recov_l2p(struct pblk *pblk) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *line, *tline, *data_line = NULL; - struct pblk_smeta *smeta; - struct pblk_emeta *emeta; - struct line_smeta *smeta_buf; - int found_lines = 0, recovered_lines = 0, open_lines = 0; - int is_next = 0; - int meta_line; - int i, valid_uuid = 0; - LIST_HEAD(recov_list); - - /* TODO: Implement FTL snapshot */ - - /* Scan recovery - takes place when FTL snapshot fails */ - spin_lock(&l_mg->free_lock); - meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES); - set_bit(meta_line, &l_mg->meta_bitmap); - smeta = l_mg->sline_meta[meta_line]; - emeta = l_mg->eline_meta[meta_line]; - smeta_buf = (struct line_smeta *)smeta; - spin_unlock(&l_mg->free_lock); - - /* Order data lines using their sequence number */ - for (i = 0; i < l_mg->nr_lines; i++) { - u32 crc; - - line = &pblk->lines[i]; - - memset(smeta, 0, lm->smeta_len); - line->smeta = smeta; - line->lun_bitmap = ((void *)(smeta_buf)) + - sizeof(struct line_smeta); - - if (!pblk_line_was_written(line, pblk)) - continue; - - /* Lines that cannot be read are assumed as not written here */ - if (pblk_line_smeta_read(pblk, line)) - continue; - - crc = pblk_calc_smeta_crc(pblk, smeta_buf); - if (le32_to_cpu(smeta_buf->crc) != crc) - continue; - - if (le32_to_cpu(smeta_buf->header.identifier) != PBLK_MAGIC) - continue; - - if (smeta_buf->header.version_major != SMETA_VERSION_MAJOR) { - pblk_err(pblk, "found incompatible line version %u\n", - smeta_buf->header.version_major); - return ERR_PTR(-EINVAL); - } - - /* The first valid instance uuid is used for initialization */ - if (!valid_uuid) { - import_guid(&pblk->instance_uuid, smeta_buf->header.uuid); - valid_uuid = 1; - } - - if (!guid_equal(&pblk->instance_uuid, - (guid_t *)&smeta_buf->header.uuid)) { - pblk_debug(pblk, "ignore line %u due to uuid mismatch\n", - i); - continue; - } - - /* Update line metadata */ - spin_lock(&line->lock); - line->id = le32_to_cpu(smeta_buf->header.id); - line->type = le16_to_cpu(smeta_buf->header.type); - line->seq_nr = le64_to_cpu(smeta_buf->seq_nr); - spin_unlock(&line->lock); - - /* Update general metadata */ - spin_lock(&l_mg->free_lock); - if (line->seq_nr >= l_mg->d_seq_nr) - l_mg->d_seq_nr = line->seq_nr + 1; - l_mg->nr_free_lines--; - spin_unlock(&l_mg->free_lock); - - if (pblk_line_recov_alloc(pblk, line)) - goto out; - - pblk_recov_line_add_ordered(&recov_list, line); - found_lines++; - pblk_debug(pblk, "recovering data line %d, seq:%llu\n", - line->id, smeta_buf->seq_nr); - } - - if (!found_lines) { - guid_gen(&pblk->instance_uuid); - - spin_lock(&l_mg->free_lock); - WARN_ON_ONCE(!test_and_clear_bit(meta_line, - &l_mg->meta_bitmap)); - spin_unlock(&l_mg->free_lock); - - goto out; - } - - /* Verify closed blocks and recover this portion of L2P table*/ - 
list_for_each_entry_safe(line, tline, &recov_list, list) { - recovered_lines++; - - line->emeta_ssec = pblk_line_emeta_start(pblk, line); - line->emeta = emeta; - memset(line->emeta->buf, 0, lm->emeta_len[0]); - - if (pblk_line_is_open(pblk, line)) { - pblk_recov_l2p_from_oob(pblk, line); - goto next; - } - - if (pblk_line_emeta_read(pblk, line, line->emeta->buf)) { - pblk_recov_l2p_from_oob(pblk, line); - goto next; - } - - if (pblk_recov_check_emeta(pblk, line->emeta->buf)) { - pblk_recov_l2p_from_oob(pblk, line); - goto next; - } - - if (pblk_recov_check_line_version(pblk, line->emeta->buf)) - return ERR_PTR(-EINVAL); - - pblk_recov_wa_counters(pblk, line->emeta->buf); - - if (pblk_recov_l2p_from_emeta(pblk, line)) - pblk_recov_l2p_from_oob(pblk, line); - -next: - if (pblk_line_is_full(line)) { - struct list_head *move_list; - - spin_lock(&line->lock); - line->state = PBLK_LINESTATE_CLOSED; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - move_list = pblk_line_gc_list(pblk, line); - spin_unlock(&line->lock); - - spin_lock(&l_mg->gc_lock); - list_move_tail(&line->list, move_list); - spin_unlock(&l_mg->gc_lock); - - mempool_free(line->map_bitmap, l_mg->bitmap_pool); - line->map_bitmap = NULL; - line->smeta = NULL; - line->emeta = NULL; - } else { - spin_lock(&line->lock); - line->state = PBLK_LINESTATE_OPEN; - spin_unlock(&line->lock); - - line->emeta->mem = 0; - atomic_set(&line->emeta->sync, 0); - - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - - data_line = line; - line->meta_line = meta_line; - - open_lines++; - } - } - - if (!open_lines) { - spin_lock(&l_mg->free_lock); - WARN_ON_ONCE(!test_and_clear_bit(meta_line, - &l_mg->meta_bitmap)); - spin_unlock(&l_mg->free_lock); - } else { - spin_lock(&l_mg->free_lock); - l_mg->data_line = data_line; - /* Allocate next line for preparation */ - l_mg->data_next = pblk_line_get(pblk); - if (l_mg->data_next) { - l_mg->data_next->seq_nr = l_mg->d_seq_nr++; - l_mg->data_next->type = PBLK_LINETYPE_DATA; - is_next = 1; - } - spin_unlock(&l_mg->free_lock); - } - - if (is_next) - pblk_line_erase(pblk, l_mg->data_next); - -out: - if (found_lines != recovered_lines) - pblk_err(pblk, "failed to recover all found lines %d/%d\n", - found_lines, recovered_lines); - - return data_line; -} - -/* - * Pad current line - */ -int pblk_recov_pad(struct pblk *pblk) -{ - struct pblk_line *line; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - int left_msecs; - int ret = 0; - - spin_lock(&l_mg->free_lock); - line = l_mg->data_line; - left_msecs = line->left_msecs; - spin_unlock(&l_mg->free_lock); - - ret = pblk_recov_pad_line(pblk, line, left_msecs); - if (ret) { - pblk_err(pblk, "tear down padding failed (%d)\n", ret); - return ret; - } - - pblk_line_close_meta(pblk, line); - return ret; -} diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c deleted file mode 100644 index a5f8bc2defbc..000000000000 --- a/drivers/lightnvm/pblk-rl.c +++ /dev/null @@ -1,254 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License for more details. - * - * pblk-rl.c - pblk's rate limiter for user I/O - * - */ - -#include "pblk.h" - -static void pblk_rl_kick_u_timer(struct pblk_rl *rl) -{ - mod_timer(&rl->u_timer, jiffies + msecs_to_jiffies(5000)); -} - -int pblk_rl_is_limit(struct pblk_rl *rl) -{ - int rb_space; - - rb_space = atomic_read(&rl->rb_space); - - return (rb_space == 0); -} - -int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries) -{ - int rb_user_cnt = atomic_read(&rl->rb_user_cnt); - int rb_space = atomic_read(&rl->rb_space); - - if (unlikely(rb_space >= 0) && (rb_space - nr_entries < 0)) - return NVM_IO_ERR; - - if (rb_user_cnt >= rl->rb_user_max) - return NVM_IO_REQUEUE; - - return NVM_IO_OK; -} - -void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries) -{ - int rb_space = atomic_read(&rl->rb_space); - - if (unlikely(rb_space >= 0)) - atomic_sub(nr_entries, &rl->rb_space); -} - -int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries) -{ - int rb_gc_cnt = atomic_read(&rl->rb_gc_cnt); - int rb_user_active; - - /* If there is no user I/O let GC take over space on the write buffer */ - rb_user_active = READ_ONCE(rl->rb_user_active); - return (!(rb_gc_cnt >= rl->rb_gc_max && rb_user_active)); -} - -void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries) -{ - atomic_add(nr_entries, &rl->rb_user_cnt); - - /* Release user I/O state. Protect from GC */ - smp_store_release(&rl->rb_user_active, 1); - pblk_rl_kick_u_timer(rl); -} - -void pblk_rl_werr_line_in(struct pblk_rl *rl) -{ - atomic_inc(&rl->werr_lines); -} - -void pblk_rl_werr_line_out(struct pblk_rl *rl) -{ - atomic_dec(&rl->werr_lines); -} - -void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries) -{ - atomic_add(nr_entries, &rl->rb_gc_cnt); -} - -void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc) -{ - atomic_sub(nr_user, &rl->rb_user_cnt); - atomic_sub(nr_gc, &rl->rb_gc_cnt); -} - -unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl) -{ - return atomic_read(&rl->free_blocks); -} - -unsigned long pblk_rl_nr_user_free_blks(struct pblk_rl *rl) -{ - return atomic_read(&rl->free_user_blocks); -} - -static void __pblk_rl_update_rates(struct pblk_rl *rl, - unsigned long free_blocks) -{ - struct pblk *pblk = container_of(rl, struct pblk, rl); - int max = rl->rb_budget; - int werr_gc_needed = atomic_read(&rl->werr_lines); - - if (free_blocks >= rl->high) { - if (werr_gc_needed) { - /* Allocate a small budget for recovering - * lines with write errors - */ - rl->rb_gc_max = 1 << rl->rb_windows_pw; - rl->rb_user_max = max - rl->rb_gc_max; - rl->rb_state = PBLK_RL_WERR; - } else { - rl->rb_user_max = max; - rl->rb_gc_max = 0; - rl->rb_state = PBLK_RL_OFF; - } - } else if (free_blocks < rl->high) { - int shift = rl->high_pw - rl->rb_windows_pw; - int user_windows = free_blocks >> shift; - int user_max = user_windows << ilog2(NVM_MAX_VLBA); - - rl->rb_user_max = user_max; - rl->rb_gc_max = max - user_max; - - if (free_blocks <= rl->rsv_blocks) { - rl->rb_user_max = 0; - rl->rb_gc_max = max; - } - - /* In the worst case, we will need to GC lines in the low list - * (high valid sector count). 
If there are lines to GC on high - * or mid lists, these will be prioritized - */ - rl->rb_state = PBLK_RL_LOW; - } - - if (rl->rb_state != PBLK_RL_OFF) - pblk_gc_should_start(pblk); - else - pblk_gc_should_stop(pblk); -} - -void pblk_rl_update_rates(struct pblk_rl *rl) -{ - __pblk_rl_update_rates(rl, pblk_rl_nr_user_free_blks(rl)); -} - -void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line) -{ - int blk_in_line = atomic_read(&line->blk_in_line); - int free_blocks; - - atomic_add(blk_in_line, &rl->free_blocks); - free_blocks = atomic_add_return(blk_in_line, &rl->free_user_blocks); - - __pblk_rl_update_rates(rl, free_blocks); -} - -void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line, - bool used) -{ - int blk_in_line = atomic_read(&line->blk_in_line); - int free_blocks; - - atomic_sub(blk_in_line, &rl->free_blocks); - - if (used) - free_blocks = atomic_sub_return(blk_in_line, - &rl->free_user_blocks); - else - free_blocks = atomic_read(&rl->free_user_blocks); - - __pblk_rl_update_rates(rl, free_blocks); -} - -int pblk_rl_high_thrs(struct pblk_rl *rl) -{ - return rl->high; -} - -int pblk_rl_max_io(struct pblk_rl *rl) -{ - return rl->rb_max_io; -} - -static void pblk_rl_u_timer(struct timer_list *t) -{ - struct pblk_rl *rl = from_timer(rl, t, u_timer); - - /* Release user I/O state. Protect from GC */ - smp_store_release(&rl->rb_user_active, 0); -} - -void pblk_rl_free(struct pblk_rl *rl) -{ - del_timer(&rl->u_timer); -} - -void pblk_rl_init(struct pblk_rl *rl, int budget, int threshold) -{ - struct pblk *pblk = container_of(rl, struct pblk, rl); - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; - int sec_meta, blk_meta; - unsigned int rb_windows; - - /* Consider sectors used for metadata */ - sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines; - blk_meta = DIV_ROUND_UP(sec_meta, geo->clba); - - rl->high = pblk->op_blks - blk_meta - lm->blk_per_line; - rl->high_pw = get_count_order(rl->high); - - rl->rsv_blocks = pblk_get_min_chks(pblk); - - /* This will always be a power-of-2 */ - rb_windows = budget / NVM_MAX_VLBA; - rl->rb_windows_pw = get_count_order(rb_windows); - - /* To start with, all buffer is available to user I/O writers */ - rl->rb_budget = budget; - rl->rb_user_max = budget; - rl->rb_gc_max = 0; - rl->rb_state = PBLK_RL_HIGH; - - /* Maximize I/O size and ansure that back threshold is respected */ - if (threshold) - rl->rb_max_io = budget - pblk->min_write_pgs_data - threshold; - else - rl->rb_max_io = budget - pblk->min_write_pgs_data - 1; - - atomic_set(&rl->rb_user_cnt, 0); - atomic_set(&rl->rb_gc_cnt, 0); - atomic_set(&rl->rb_space, -1); - atomic_set(&rl->werr_lines, 0); - - timer_setup(&rl->u_timer, pblk_rl_u_timer, 0); - - rl->rb_user_active = 0; - rl->rb_gc_active = 0; -} diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c deleted file mode 100644 index 6387302b03f2..000000000000 --- a/drivers/lightnvm/pblk-sysfs.c +++ /dev/null @@ -1,728 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * Implementation of a physical block-device target for Open-channel SSDs. - * - * pblk-sysfs.c - pblk's sysfs - * - */ - -#include "pblk.h" - -static ssize_t pblk_sysfs_luns_show(struct pblk *pblk, char *page) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_lun *rlun; - ssize_t sz = 0; - int i; - - for (i = 0; i < geo->all_luns; i++) { - int active = 1; - - rlun = &pblk->luns[i]; - if (!down_trylock(&rlun->wr_sem)) { - active = 0; - up(&rlun->wr_sem); - } - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "pblk: pos:%d, ch:%d, lun:%d - %d\n", - i, - rlun->bppa.a.ch, - rlun->bppa.a.lun, - active); - } - - return sz; -} - -static ssize_t pblk_sysfs_rate_limiter(struct pblk *pblk, char *page) -{ - int free_blocks, free_user_blocks, total_blocks; - int rb_user_max, rb_user_cnt; - int rb_gc_max, rb_gc_cnt, rb_budget, rb_state; - - free_blocks = pblk_rl_nr_free_blks(&pblk->rl); - free_user_blocks = pblk_rl_nr_user_free_blks(&pblk->rl); - rb_user_max = pblk->rl.rb_user_max; - rb_user_cnt = atomic_read(&pblk->rl.rb_user_cnt); - rb_gc_max = pblk->rl.rb_gc_max; - rb_gc_cnt = atomic_read(&pblk->rl.rb_gc_cnt); - rb_budget = pblk->rl.rb_budget; - rb_state = pblk->rl.rb_state; - - total_blocks = pblk->rl.total_blocks; - - return snprintf(page, PAGE_SIZE, - "u:%u/%u,gc:%u/%u(%u)(stop:<%u,full:>%u,free:%d/%d/%d)-%d\n", - rb_user_cnt, - rb_user_max, - rb_gc_cnt, - rb_gc_max, - rb_state, - rb_budget, - pblk->rl.high, - free_blocks, - free_user_blocks, - total_blocks, - READ_ONCE(pblk->rl.rb_user_active)); -} - -static ssize_t pblk_sysfs_gc_state_show(struct pblk *pblk, char *page) -{ - int gc_enabled, gc_active; - - pblk_gc_sysfs_state_show(pblk, &gc_enabled, &gc_active); - return snprintf(page, PAGE_SIZE, "gc_enabled=%d, gc_active=%d\n", - gc_enabled, gc_active); -} - -static ssize_t pblk_sysfs_stats(struct pblk *pblk, char *page) -{ - ssize_t sz; - - sz = snprintf(page, PAGE_SIZE, - "read_failed=%lu, read_high_ecc=%lu, read_empty=%lu, read_failed_gc=%lu, write_failed=%lu, erase_failed=%lu\n", - atomic_long_read(&pblk->read_failed), - atomic_long_read(&pblk->read_high_ecc), - atomic_long_read(&pblk->read_empty), - atomic_long_read(&pblk->read_failed_gc), - atomic_long_read(&pblk->write_failed), - atomic_long_read(&pblk->erase_failed)); - - return sz; -} - -static ssize_t pblk_sysfs_write_buffer(struct pblk *pblk, char *page) -{ - return pblk_rb_sysfs(&pblk->rwb, page); -} - -static ssize_t pblk_sysfs_ppaf(struct pblk *pblk, char *page) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - ssize_t sz = 0; - - if (geo->version == NVM_OCSSD_SPEC_12) { - struct nvm_addrf_12 *ppaf = (struct nvm_addrf_12 *)&pblk->addrf; - struct nvm_addrf_12 *gppaf = (struct nvm_addrf_12 *)&geo->addrf; - - sz = scnprintf(page, PAGE_SIZE, - "g:(b:%d)blk:%d/%d,pg:%d/%d,lun:%d/%d,ch:%d/%d,pl:%d/%d,sec:%d/%d\n", - pblk->addrf_len, - ppaf->blk_offset, ppaf->blk_len, - ppaf->pg_offset, ppaf->pg_len, - ppaf->lun_offset, ppaf->lun_len, - ppaf->ch_offset, ppaf->ch_len, - ppaf->pln_offset, ppaf->pln_len, - ppaf->sec_offset, ppaf->sec_len); - - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "d:blk:%d/%d,pg:%d/%d,lun:%d/%d,ch:%d/%d,pl:%d/%d,sec:%d/%d\n", - gppaf->blk_offset, gppaf->blk_len, - gppaf->pg_offset, gppaf->pg_len, 
- gppaf->lun_offset, gppaf->lun_len, - gppaf->ch_offset, gppaf->ch_len, - gppaf->pln_offset, gppaf->pln_len, - gppaf->sec_offset, gppaf->sec_len); - } else { - struct nvm_addrf *ppaf = &pblk->addrf; - struct nvm_addrf *gppaf = &geo->addrf; - - sz = scnprintf(page, PAGE_SIZE, - "pblk:(s:%d)ch:%d/%d,lun:%d/%d,chk:%d/%d/sec:%d/%d\n", - pblk->addrf_len, - ppaf->ch_offset, ppaf->ch_len, - ppaf->lun_offset, ppaf->lun_len, - ppaf->chk_offset, ppaf->chk_len, - ppaf->sec_offset, ppaf->sec_len); - - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "device:ch:%d/%d,lun:%d/%d,chk:%d/%d,sec:%d/%d\n", - gppaf->ch_offset, gppaf->ch_len, - gppaf->lun_offset, gppaf->lun_len, - gppaf->chk_offset, gppaf->chk_len, - gppaf->sec_offset, gppaf->sec_len); - } - - return sz; -} - -static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *line; - ssize_t sz = 0; - int nr_free_lines; - int cur_data, cur_log; - int free_line_cnt = 0, closed_line_cnt = 0, emeta_line_cnt = 0; - int d_line_cnt = 0, l_line_cnt = 0; - int gc_full = 0, gc_high = 0, gc_mid = 0, gc_low = 0, gc_empty = 0; - int gc_werr = 0; - - int bad = 0, cor = 0; - int msecs = 0, cur_sec = 0, vsc = 0, sec_in_line = 0; - int map_weight = 0, meta_weight = 0; - - spin_lock(&l_mg->free_lock); - cur_data = (l_mg->data_line) ? l_mg->data_line->id : -1; - cur_log = (l_mg->log_line) ? l_mg->log_line->id : -1; - nr_free_lines = l_mg->nr_free_lines; - - list_for_each_entry(line, &l_mg->free_list, list) - free_line_cnt++; - spin_unlock(&l_mg->free_lock); - - spin_lock(&l_mg->close_lock); - list_for_each_entry(line, &l_mg->emeta_list, list) - emeta_line_cnt++; - spin_unlock(&l_mg->close_lock); - - spin_lock(&l_mg->gc_lock); - list_for_each_entry(line, &l_mg->gc_full_list, list) { - if (line->type == PBLK_LINETYPE_DATA) - d_line_cnt++; - else if (line->type == PBLK_LINETYPE_LOG) - l_line_cnt++; - closed_line_cnt++; - gc_full++; - } - - list_for_each_entry(line, &l_mg->gc_high_list, list) { - if (line->type == PBLK_LINETYPE_DATA) - d_line_cnt++; - else if (line->type == PBLK_LINETYPE_LOG) - l_line_cnt++; - closed_line_cnt++; - gc_high++; - } - - list_for_each_entry(line, &l_mg->gc_mid_list, list) { - if (line->type == PBLK_LINETYPE_DATA) - d_line_cnt++; - else if (line->type == PBLK_LINETYPE_LOG) - l_line_cnt++; - closed_line_cnt++; - gc_mid++; - } - - list_for_each_entry(line, &l_mg->gc_low_list, list) { - if (line->type == PBLK_LINETYPE_DATA) - d_line_cnt++; - else if (line->type == PBLK_LINETYPE_LOG) - l_line_cnt++; - closed_line_cnt++; - gc_low++; - } - - list_for_each_entry(line, &l_mg->gc_empty_list, list) { - if (line->type == PBLK_LINETYPE_DATA) - d_line_cnt++; - else if (line->type == PBLK_LINETYPE_LOG) - l_line_cnt++; - closed_line_cnt++; - gc_empty++; - } - - list_for_each_entry(line, &l_mg->gc_werr_list, list) { - if (line->type == PBLK_LINETYPE_DATA) - d_line_cnt++; - else if (line->type == PBLK_LINETYPE_LOG) - l_line_cnt++; - closed_line_cnt++; - gc_werr++; - } - - list_for_each_entry(line, &l_mg->bad_list, list) - bad++; - list_for_each_entry(line, &l_mg->corrupt_list, list) - cor++; - spin_unlock(&l_mg->gc_lock); - - spin_lock(&l_mg->free_lock); - if (l_mg->data_line) { - cur_sec = l_mg->data_line->cur_sec; - msecs = l_mg->data_line->left_msecs; - vsc = le32_to_cpu(*l_mg->data_line->vsc); - sec_in_line = l_mg->data_line->sec_in_line; - meta_weight = 
bitmap_weight(&l_mg->meta_bitmap, - PBLK_DATA_LINES); - - spin_lock(&l_mg->data_line->lock); - if (l_mg->data_line->map_bitmap) - map_weight = bitmap_weight(l_mg->data_line->map_bitmap, - lm->sec_per_line); - else - map_weight = 0; - spin_unlock(&l_mg->data_line->lock); - } - spin_unlock(&l_mg->free_lock); - - if (nr_free_lines != free_line_cnt) - pblk_err(pblk, "corrupted free line list:%d/%d\n", - nr_free_lines, free_line_cnt); - - sz = scnprintf(page, PAGE_SIZE - sz, - "line: nluns:%d, nblks:%d, nsecs:%d\n", - geo->all_luns, lm->blk_per_line, lm->sec_per_line); - - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "lines:d:%d,l:%d-f:%d,m:%d/%d,c:%d,b:%d,co:%d(d:%d,l:%d)t:%d\n", - cur_data, cur_log, - nr_free_lines, - emeta_line_cnt, meta_weight, - closed_line_cnt, - bad, cor, - d_line_cnt, l_line_cnt, - l_mg->nr_lines); - - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "GC: full:%d, high:%d, mid:%d, low:%d, empty:%d, werr: %d, queue:%d\n", - gc_full, gc_high, gc_mid, gc_low, gc_empty, gc_werr, - atomic_read(&pblk->gc.read_inflight_gc)); - - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "data (%d) cur:%d, left:%d, vsc:%d, s:%d, map:%d/%d (%d)\n", - cur_data, cur_sec, msecs, vsc, sec_in_line, - map_weight, lm->sec_per_line, - atomic_read(&pblk->inflight_io)); - - return sz; -} - -static ssize_t pblk_sysfs_lines_info(struct pblk *pblk, char *page) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - ssize_t sz = 0; - - sz = scnprintf(page, PAGE_SIZE - sz, - "smeta - len:%d, secs:%d\n", - lm->smeta_len, lm->smeta_sec); - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "emeta - len:%d, sec:%d, bb_start:%d\n", - lm->emeta_len[0], lm->emeta_sec[0], - lm->emeta_bb); - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "bitmap lengths: sec:%d, blk:%d, lun:%d\n", - lm->sec_bitmap_len, - lm->blk_bitmap_len, - lm->lun_bitmap_len); - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "blk_line:%d, sec_line:%d, sec_blk:%d\n", - lm->blk_per_line, - lm->sec_per_line, - geo->clba); - - return sz; -} - -static ssize_t pblk_sysfs_get_sec_per_write(struct pblk *pblk, char *page) -{ - return snprintf(page, PAGE_SIZE, "%d\n", pblk->sec_per_write); -} - -static ssize_t pblk_get_write_amp(u64 user, u64 gc, u64 pad, - char *page) -{ - int sz; - - sz = scnprintf(page, PAGE_SIZE, - "user:%lld gc:%lld pad:%lld WA:", - user, gc, pad); - - if (!user) { - sz += scnprintf(page + sz, PAGE_SIZE - sz, "NaN\n"); - } else { - u64 wa_int; - u32 wa_frac; - - wa_int = (user + gc + pad) * 100000; - wa_int = div64_u64(wa_int, user); - wa_int = div_u64_rem(wa_int, 100000, &wa_frac); - - sz += scnprintf(page + sz, PAGE_SIZE - sz, "%llu.%05u\n", - wa_int, wa_frac); - } - - return sz; -} - -static ssize_t pblk_sysfs_get_write_amp_mileage(struct pblk *pblk, char *page) -{ - return pblk_get_write_amp(atomic64_read(&pblk->user_wa), - atomic64_read(&pblk->gc_wa), atomic64_read(&pblk->pad_wa), - page); -} - -static ssize_t pblk_sysfs_get_write_amp_trip(struct pblk *pblk, char *page) -{ - return pblk_get_write_amp( - atomic64_read(&pblk->user_wa) - pblk->user_rst_wa, - atomic64_read(&pblk->gc_wa) - pblk->gc_rst_wa, - atomic64_read(&pblk->pad_wa) - pblk->pad_rst_wa, page); -} - -static long long bucket_percentage(unsigned long long bucket, - unsigned long long total) -{ - int p = bucket * 100; - - p = div_u64(p, total); - - return p; -} - -static ssize_t pblk_sysfs_get_padding_dist(struct pblk *pblk, char *page) -{ - int sz = 0; - unsigned long long total; - unsigned long long total_buckets = 
0; - int buckets = pblk->min_write_pgs - 1; - int i; - - total = atomic64_read(&pblk->nr_flush) - pblk->nr_flush_rst; - if (!total) { - for (i = 0; i < (buckets + 1); i++) - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "%d:0 ", i); - sz += scnprintf(page + sz, PAGE_SIZE - sz, "\n"); - - return sz; - } - - for (i = 0; i < buckets; i++) - total_buckets += atomic64_read(&pblk->pad_dist[i]); - - sz += scnprintf(page + sz, PAGE_SIZE - sz, "0:%lld%% ", - bucket_percentage(total - total_buckets, total)); - - for (i = 0; i < buckets; i++) { - unsigned long long p; - - p = bucket_percentage(atomic64_read(&pblk->pad_dist[i]), - total); - sz += scnprintf(page + sz, PAGE_SIZE - sz, "%d:%lld%% ", - i + 1, p); - } - sz += scnprintf(page + sz, PAGE_SIZE - sz, "\n"); - - return sz; -} - -#ifdef CONFIG_NVM_PBLK_DEBUG -static ssize_t pblk_sysfs_stats_debug(struct pblk *pblk, char *page) -{ - return snprintf(page, PAGE_SIZE, - "%lu\t%lu\t%ld\t%llu\t%ld\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\n", - atomic_long_read(&pblk->inflight_writes), - atomic_long_read(&pblk->inflight_reads), - atomic_long_read(&pblk->req_writes), - (u64)atomic64_read(&pblk->nr_flush), - atomic_long_read(&pblk->padded_writes), - atomic_long_read(&pblk->padded_wb), - atomic_long_read(&pblk->sub_writes), - atomic_long_read(&pblk->sync_writes), - atomic_long_read(&pblk->recov_writes), - atomic_long_read(&pblk->recov_gc_writes), - atomic_long_read(&pblk->recov_gc_reads), - atomic_long_read(&pblk->cache_reads), - atomic_long_read(&pblk->sync_reads)); -} -#endif - -static ssize_t pblk_sysfs_gc_force(struct pblk *pblk, const char *page, - size_t len) -{ - size_t c_len; - int force; - - c_len = strcspn(page, "\n"); - if (c_len >= len) - return -EINVAL; - - if (kstrtouint(page, 0, &force)) - return -EINVAL; - - pblk_gc_sysfs_force(pblk, force); - - return len; -} - -static ssize_t pblk_sysfs_set_sec_per_write(struct pblk *pblk, - const char *page, size_t len) -{ - size_t c_len; - int sec_per_write; - - c_len = strcspn(page, "\n"); - if (c_len >= len) - return -EINVAL; - - if (kstrtouint(page, 0, &sec_per_write)) - return -EINVAL; - - if (!pblk_is_oob_meta_supported(pblk)) { - /* For packed metadata case it is - * not allowed to change sec_per_write. 
- */ - return -EINVAL; - } - - if (sec_per_write < pblk->min_write_pgs - || sec_per_write > pblk->max_write_pgs - || sec_per_write % pblk->min_write_pgs != 0) - return -EINVAL; - - pblk_set_sec_per_write(pblk, sec_per_write); - - return len; -} - -static ssize_t pblk_sysfs_set_write_amp_trip(struct pblk *pblk, - const char *page, size_t len) -{ - size_t c_len; - int reset_value; - - c_len = strcspn(page, "\n"); - if (c_len >= len) - return -EINVAL; - - if (kstrtouint(page, 0, &reset_value)) - return -EINVAL; - - if (reset_value != 0) - return -EINVAL; - - pblk->user_rst_wa = atomic64_read(&pblk->user_wa); - pblk->pad_rst_wa = atomic64_read(&pblk->pad_wa); - pblk->gc_rst_wa = atomic64_read(&pblk->gc_wa); - - return len; -} - - -static ssize_t pblk_sysfs_set_padding_dist(struct pblk *pblk, - const char *page, size_t len) -{ - size_t c_len; - int reset_value; - int buckets = pblk->min_write_pgs - 1; - int i; - - c_len = strcspn(page, "\n"); - if (c_len >= len) - return -EINVAL; - - if (kstrtouint(page, 0, &reset_value)) - return -EINVAL; - - if (reset_value != 0) - return -EINVAL; - - for (i = 0; i < buckets; i++) - atomic64_set(&pblk->pad_dist[i], 0); - - pblk->nr_flush_rst = atomic64_read(&pblk->nr_flush); - - return len; -} - -static struct attribute sys_write_luns = { - .name = "write_luns", - .mode = 0444, -}; - -static struct attribute sys_rate_limiter_attr = { - .name = "rate_limiter", - .mode = 0444, -}; - -static struct attribute sys_gc_state = { - .name = "gc_state", - .mode = 0444, -}; - -static struct attribute sys_errors_attr = { - .name = "errors", - .mode = 0444, -}; - -static struct attribute sys_rb_attr = { - .name = "write_buffer", - .mode = 0444, -}; - -static struct attribute sys_stats_ppaf_attr = { - .name = "ppa_format", - .mode = 0444, -}; - -static struct attribute sys_lines_attr = { - .name = "lines", - .mode = 0444, -}; - -static struct attribute sys_lines_info_attr = { - .name = "lines_info", - .mode = 0444, -}; - -static struct attribute sys_gc_force = { - .name = "gc_force", - .mode = 0200, -}; - -static struct attribute sys_max_sec_per_write = { - .name = "max_sec_per_write", - .mode = 0644, -}; - -static struct attribute sys_write_amp_mileage = { - .name = "write_amp_mileage", - .mode = 0444, -}; - -static struct attribute sys_write_amp_trip = { - .name = "write_amp_trip", - .mode = 0644, -}; - -static struct attribute sys_padding_dist = { - .name = "padding_dist", - .mode = 0644, -}; - -#ifdef CONFIG_NVM_PBLK_DEBUG -static struct attribute sys_stats_debug_attr = { - .name = "stats", - .mode = 0444, -}; -#endif - -static struct attribute *pblk_attrs[] = { - &sys_write_luns, - &sys_rate_limiter_attr, - &sys_errors_attr, - &sys_gc_state, - &sys_gc_force, - &sys_max_sec_per_write, - &sys_rb_attr, - &sys_stats_ppaf_attr, - &sys_lines_attr, - &sys_lines_info_attr, - &sys_write_amp_mileage, - &sys_write_amp_trip, - &sys_padding_dist, -#ifdef CONFIG_NVM_PBLK_DEBUG - &sys_stats_debug_attr, -#endif - NULL, -}; - -static ssize_t pblk_sysfs_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct pblk *pblk = container_of(kobj, struct pblk, kobj); - - if (strcmp(attr->name, "rate_limiter") == 0) - return pblk_sysfs_rate_limiter(pblk, buf); - else if (strcmp(attr->name, "write_luns") == 0) - return pblk_sysfs_luns_show(pblk, buf); - else if (strcmp(attr->name, "gc_state") == 0) - return pblk_sysfs_gc_state_show(pblk, buf); - else if (strcmp(attr->name, "errors") == 0) - return pblk_sysfs_stats(pblk, buf); - else if (strcmp(attr->name, "write_buffer") == 
0) - return pblk_sysfs_write_buffer(pblk, buf); - else if (strcmp(attr->name, "ppa_format") == 0) - return pblk_sysfs_ppaf(pblk, buf); - else if (strcmp(attr->name, "lines") == 0) - return pblk_sysfs_lines(pblk, buf); - else if (strcmp(attr->name, "lines_info") == 0) - return pblk_sysfs_lines_info(pblk, buf); - else if (strcmp(attr->name, "max_sec_per_write") == 0) - return pblk_sysfs_get_sec_per_write(pblk, buf); - else if (strcmp(attr->name, "write_amp_mileage") == 0) - return pblk_sysfs_get_write_amp_mileage(pblk, buf); - else if (strcmp(attr->name, "write_amp_trip") == 0) - return pblk_sysfs_get_write_amp_trip(pblk, buf); - else if (strcmp(attr->name, "padding_dist") == 0) - return pblk_sysfs_get_padding_dist(pblk, buf); -#ifdef CONFIG_NVM_PBLK_DEBUG - else if (strcmp(attr->name, "stats") == 0) - return pblk_sysfs_stats_debug(pblk, buf); -#endif - return 0; -} - -static ssize_t pblk_sysfs_store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t len) -{ - struct pblk *pblk = container_of(kobj, struct pblk, kobj); - - if (strcmp(attr->name, "gc_force") == 0) - return pblk_sysfs_gc_force(pblk, buf, len); - else if (strcmp(attr->name, "max_sec_per_write") == 0) - return pblk_sysfs_set_sec_per_write(pblk, buf, len); - else if (strcmp(attr->name, "write_amp_trip") == 0) - return pblk_sysfs_set_write_amp_trip(pblk, buf, len); - else if (strcmp(attr->name, "padding_dist") == 0) - return pblk_sysfs_set_padding_dist(pblk, buf, len); - return 0; -} - -static const struct sysfs_ops pblk_sysfs_ops = { - .show = pblk_sysfs_show, - .store = pblk_sysfs_store, -}; - -static struct kobj_type pblk_ktype = { - .sysfs_ops = &pblk_sysfs_ops, - .default_attrs = pblk_attrs, -}; - -int pblk_sysfs_init(struct gendisk *tdisk) -{ - struct pblk *pblk = tdisk->private_data; - struct device *parent_dev = disk_to_dev(pblk->disk); - int ret; - - ret = kobject_init_and_add(&pblk->kobj, &pblk_ktype, - kobject_get(&parent_dev->kobj), - "%s", "pblk"); - if (ret) { - pblk_err(pblk, "could not register\n"); - return ret; - } - - kobject_uevent(&pblk->kobj, KOBJ_ADD); - return 0; -} - -void pblk_sysfs_exit(struct gendisk *tdisk) -{ - struct pblk *pblk = tdisk->private_data; - - kobject_uevent(&pblk->kobj, KOBJ_REMOVE); - kobject_del(&pblk->kobj); - kobject_put(&pblk->kobj); -} diff --git a/drivers/lightnvm/pblk-trace.h b/drivers/lightnvm/pblk-trace.h deleted file mode 100644 index 47b67c6bff7a..000000000000 --- a/drivers/lightnvm/pblk-trace.h +++ /dev/null @@ -1,145 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM pblk - -#if !defined(_TRACE_PBLK_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_PBLK_H - -#include - -struct ppa_addr; - -#define show_chunk_flags(state) __print_flags(state, "", \ - { NVM_CHK_ST_FREE, "FREE", }, \ - { NVM_CHK_ST_CLOSED, "CLOSED", }, \ - { NVM_CHK_ST_OPEN, "OPEN", }, \ - { NVM_CHK_ST_OFFLINE, "OFFLINE", }) - -#define show_line_state(state) __print_symbolic(state, \ - { PBLK_LINESTATE_NEW, "NEW", }, \ - { PBLK_LINESTATE_FREE, "FREE", }, \ - { PBLK_LINESTATE_OPEN, "OPEN", }, \ - { PBLK_LINESTATE_CLOSED, "CLOSED", }, \ - { PBLK_LINESTATE_GC, "GC", }, \ - { PBLK_LINESTATE_BAD, "BAD", }, \ - { PBLK_LINESTATE_CORRUPT, "CORRUPT" }) - - -#define show_pblk_state(state) __print_symbolic(state, \ - { PBLK_STATE_RUNNING, "RUNNING", }, \ - { PBLK_STATE_STOPPING, "STOPPING", }, \ - { PBLK_STATE_RECOVERING, "RECOVERING", }, \ - { PBLK_STATE_STOPPED, "STOPPED" }) - -#define show_chunk_erase_state(state) __print_symbolic(state, \ - { 
PBLK_CHUNK_RESET_START, "START", }, \ - { PBLK_CHUNK_RESET_DONE, "OK", }, \ - { PBLK_CHUNK_RESET_FAILED, "FAILED" }) - - -TRACE_EVENT(pblk_chunk_reset, - - TP_PROTO(const char *name, struct ppa_addr *ppa, int state), - - TP_ARGS(name, ppa, state), - - TP_STRUCT__entry( - __string(name, name) - __field(u64, ppa) - __field(int, state) - ), - - TP_fast_assign( - __assign_str(name, name); - __entry->ppa = ppa->ppa; - __entry->state = state; - ), - - TP_printk("dev=%s grp=%llu pu=%llu chk=%llu state=%s", __get_str(name), - (u64)(((struct ppa_addr *)(&__entry->ppa))->m.grp), - (u64)(((struct ppa_addr *)(&__entry->ppa))->m.pu), - (u64)(((struct ppa_addr *)(&__entry->ppa))->m.chk), - show_chunk_erase_state((int)__entry->state)) - -); - -TRACE_EVENT(pblk_chunk_state, - - TP_PROTO(const char *name, struct ppa_addr *ppa, int state), - - TP_ARGS(name, ppa, state), - - TP_STRUCT__entry( - __string(name, name) - __field(u64, ppa) - __field(int, state) - ), - - TP_fast_assign( - __assign_str(name, name); - __entry->ppa = ppa->ppa; - __entry->state = state; - ), - - TP_printk("dev=%s grp=%llu pu=%llu chk=%llu state=%s", __get_str(name), - (u64)(((struct ppa_addr *)(&__entry->ppa))->m.grp), - (u64)(((struct ppa_addr *)(&__entry->ppa))->m.pu), - (u64)(((struct ppa_addr *)(&__entry->ppa))->m.chk), - show_chunk_flags((int)__entry->state)) - -); - -TRACE_EVENT(pblk_line_state, - - TP_PROTO(const char *name, int line, int state), - - TP_ARGS(name, line, state), - - TP_STRUCT__entry( - __string(name, name) - __field(int, line) - __field(int, state) - ), - - TP_fast_assign( - __assign_str(name, name); - __entry->line = line; - __entry->state = state; - ), - - TP_printk("dev=%s line=%d state=%s", __get_str(name), - (int)__entry->line, - show_line_state((int)__entry->state)) - -); - -TRACE_EVENT(pblk_state, - - TP_PROTO(const char *name, int state), - - TP_ARGS(name, state), - - TP_STRUCT__entry( - __string(name, name) - __field(int, state) - ), - - TP_fast_assign( - __assign_str(name, name); - __entry->state = state; - ), - - TP_printk("dev=%s state=%s", __get_str(name), - show_pblk_state((int)__entry->state)) - -); - -#endif /* !defined(_TRACE_PBLK_H) || defined(TRACE_HEADER_MULTI_READ) */ - -/* This part must be outside protection */ - -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH ../../drivers/lightnvm -#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE pblk-trace -#include diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c deleted file mode 100644 index b9a2aeba95ab..000000000000 --- a/drivers/lightnvm/pblk-write.c +++ /dev/null @@ -1,665 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- * - * pblk-write.c - pblk's write path from write buffer to media - */ - -#include "pblk.h" -#include "pblk-trace.h" - -static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, - struct pblk_c_ctx *c_ctx) -{ - struct bio *original_bio; - struct pblk_rb *rwb = &pblk->rwb; - unsigned long ret; - int i; - - for (i = 0; i < c_ctx->nr_valid; i++) { - struct pblk_w_ctx *w_ctx; - int pos = c_ctx->sentry + i; - int flags; - - w_ctx = pblk_rb_w_ctx(rwb, pos); - flags = READ_ONCE(w_ctx->flags); - - if (flags & PBLK_FLUSH_ENTRY) { - flags &= ~PBLK_FLUSH_ENTRY; - /* Release flags on context. Protect from writes */ - smp_store_release(&w_ctx->flags, flags); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_dec(&rwb->inflight_flush_point); -#endif - } - - while ((original_bio = bio_list_pop(&w_ctx->bios))) - bio_endio(original_bio); - } - - if (c_ctx->nr_padded) - pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid, - c_ctx->nr_padded); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(rqd->nr_ppas, &pblk->sync_writes); -#endif - - ret = pblk_rb_sync_advance(&pblk->rwb, c_ctx->nr_valid); - - bio_put(rqd->bio); - pblk_free_rqd(pblk, rqd, PBLK_WRITE); - - return ret; -} - -static unsigned long pblk_end_queued_w_bio(struct pblk *pblk, - struct nvm_rq *rqd, - struct pblk_c_ctx *c_ctx) -{ - list_del(&c_ctx->list); - return pblk_end_w_bio(pblk, rqd, c_ctx); -} - -static void pblk_complete_write(struct pblk *pblk, struct nvm_rq *rqd, - struct pblk_c_ctx *c_ctx) -{ - struct pblk_c_ctx *c, *r; - unsigned long flags; - unsigned long pos; - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_sub(c_ctx->nr_valid, &pblk->inflight_writes); -#endif - pblk_up_rq(pblk, c_ctx->lun_bitmap); - - pos = pblk_rb_sync_init(&pblk->rwb, &flags); - if (pos == c_ctx->sentry) { - pos = pblk_end_w_bio(pblk, rqd, c_ctx); - -retry: - list_for_each_entry_safe(c, r, &pblk->compl_list, list) { - rqd = nvm_rq_from_c_ctx(c); - if (c->sentry == pos) { - pos = pblk_end_queued_w_bio(pblk, rqd, c); - goto retry; - } - } - } else { - WARN_ON(nvm_rq_from_c_ctx(c_ctx) != rqd); - list_add_tail(&c_ctx->list, &pblk->compl_list); - } - pblk_rb_sync_end(&pblk->rwb, &flags); -} - -/* Map remaining sectors in chunk, starting from ppa */ -static void pblk_map_remaining(struct pblk *pblk, struct ppa_addr *ppa, - int rqd_ppas) -{ - struct pblk_line *line; - struct ppa_addr map_ppa = *ppa; - __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); - __le64 *lba_list; - u64 paddr; - int done = 0; - int n = 0; - - line = pblk_ppa_to_line(pblk, *ppa); - lba_list = emeta_to_lbas(pblk, line->emeta->buf); - - spin_lock(&line->lock); - - while (!done) { - paddr = pblk_dev_ppa_to_line_addr(pblk, map_ppa); - - if (!test_and_set_bit(paddr, line->map_bitmap)) - line->left_msecs--; - - if (n < rqd_ppas && lba_list[paddr] != addr_empty) - line->nr_valid_lbas--; - - lba_list[paddr] = addr_empty; - - if (!test_and_set_bit(paddr, line->invalid_bitmap)) - le32_add_cpu(line->vsc, -1); - - done = nvm_next_ppa_in_chk(pblk->dev, &map_ppa); - - n++; - } - - line->w_err_gc->has_write_err = 1; - spin_unlock(&line->lock); -} - -static void pblk_prepare_resubmit(struct pblk *pblk, unsigned int sentry, - unsigned int nr_entries) -{ - struct pblk_rb *rb = &pblk->rwb; - struct pblk_rb_entry *entry; - struct pblk_line *line; - struct pblk_w_ctx *w_ctx; - struct ppa_addr ppa_l2p; - int flags; - unsigned int i; - - spin_lock(&pblk->trans_lock); - for (i = 0; i < nr_entries; i++) { - entry = &rb->entries[pblk_rb_ptr_wrap(rb, sentry, i)]; - w_ctx = &entry->w_ctx; - - /* Check if the lba has been 
overwritten */ - if (w_ctx->lba != ADDR_EMPTY) { - ppa_l2p = pblk_trans_map_get(pblk, w_ctx->lba); - if (!pblk_ppa_comp(ppa_l2p, entry->cacheline)) - w_ctx->lba = ADDR_EMPTY; - } - - /* Mark up the entry as submittable again */ - flags = READ_ONCE(w_ctx->flags); - flags |= PBLK_WRITTEN_DATA; - /* Release flags on write context. Protect from writes */ - smp_store_release(&w_ctx->flags, flags); - - /* Decrease the reference count to the line as we will - * re-map these entries - */ - line = pblk_ppa_to_line(pblk, w_ctx->ppa); - atomic_dec(&line->sec_to_update); - kref_put(&line->ref, pblk_line_put); - } - spin_unlock(&pblk->trans_lock); -} - -static void pblk_queue_resubmit(struct pblk *pblk, struct pblk_c_ctx *c_ctx) -{ - struct pblk_c_ctx *r_ctx; - - r_ctx = kzalloc(sizeof(struct pblk_c_ctx), GFP_KERNEL); - if (!r_ctx) - return; - - r_ctx->lun_bitmap = NULL; - r_ctx->sentry = c_ctx->sentry; - r_ctx->nr_valid = c_ctx->nr_valid; - r_ctx->nr_padded = c_ctx->nr_padded; - - spin_lock(&pblk->resubmit_lock); - list_add_tail(&r_ctx->list, &pblk->resubmit_list); - spin_unlock(&pblk->resubmit_lock); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(c_ctx->nr_valid, &pblk->recov_writes); -#endif -} - -static void pblk_submit_rec(struct work_struct *work) -{ - struct pblk_rec_ctx *recovery = - container_of(work, struct pblk_rec_ctx, ws_rec); - struct pblk *pblk = recovery->pblk; - struct nvm_rq *rqd = recovery->rqd; - struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - - pblk_log_write_err(pblk, rqd); - - pblk_map_remaining(pblk, ppa_list, rqd->nr_ppas); - pblk_queue_resubmit(pblk, c_ctx); - - pblk_up_rq(pblk, c_ctx->lun_bitmap); - if (c_ctx->nr_padded) - pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid, - c_ctx->nr_padded); - bio_put(rqd->bio); - pblk_free_rqd(pblk, rqd, PBLK_WRITE); - mempool_free(recovery, &pblk->rec_pool); - - atomic_dec(&pblk->inflight_io); - pblk_write_kick(pblk); -} - - -static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct pblk_rec_ctx *recovery; - - recovery = mempool_alloc(&pblk->rec_pool, GFP_ATOMIC); - if (!recovery) { - pblk_err(pblk, "could not allocate recovery work\n"); - return; - } - - recovery->pblk = pblk; - recovery->rqd = rqd; - - INIT_WORK(&recovery->ws_rec, pblk_submit_rec); - queue_work(pblk->close_wq, &recovery->ws_rec); -} - -static void pblk_end_io_write(struct nvm_rq *rqd) -{ - struct pblk *pblk = rqd->private; - struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); - - if (rqd->error) { - pblk_end_w_fail(pblk, rqd); - return; - } else { - if (trace_pblk_chunk_state_enabled()) - pblk_check_chunk_state_update(pblk, rqd); -#ifdef CONFIG_NVM_PBLK_DEBUG - WARN_ONCE(rqd->bio->bi_status, "pblk: corrupted write error\n"); -#endif - } - - pblk_complete_write(pblk, rqd, c_ctx); - atomic_dec(&pblk->inflight_io); -} - -static void pblk_end_io_write_meta(struct nvm_rq *rqd) -{ - struct pblk *pblk = rqd->private; - struct pblk_g_ctx *m_ctx = nvm_rq_to_pdu(rqd); - struct pblk_line *line = m_ctx->private; - struct pblk_emeta *emeta = line->emeta; - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - int sync; - - pblk_up_chunk(pblk, ppa_list[0]); - - if (rqd->error) { - pblk_log_write_err(pblk, rqd); - pblk_err(pblk, "metadata I/O failed. 
Line %d\n", line->id); - line->w_err_gc->has_write_err = 1; - } else { - if (trace_pblk_chunk_state_enabled()) - pblk_check_chunk_state_update(pblk, rqd); - } - - sync = atomic_add_return(rqd->nr_ppas, &emeta->sync); - if (sync == emeta->nr_entries) - pblk_gen_run_ws(pblk, line, NULL, pblk_line_close_ws, - GFP_ATOMIC, pblk->close_wq); - - pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); - - atomic_dec(&pblk->inflight_io); -} - -static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd, - unsigned int nr_secs, nvm_end_io_fn(*end_io)) -{ - /* Setup write request */ - rqd->opcode = NVM_OP_PWRITE; - rqd->nr_ppas = nr_secs; - rqd->is_seq = 1; - rqd->private = pblk; - rqd->end_io = end_io; - - return pblk_alloc_rqd_meta(pblk, rqd); -} - -static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd, - struct ppa_addr *erase_ppa) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line *e_line = pblk_line_get_erase(pblk); - struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); - unsigned int valid = c_ctx->nr_valid; - unsigned int padded = c_ctx->nr_padded; - unsigned int nr_secs = valid + padded; - unsigned long *lun_bitmap; - int ret; - - lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL); - if (!lun_bitmap) - return -ENOMEM; - c_ctx->lun_bitmap = lun_bitmap; - - ret = pblk_alloc_w_rq(pblk, rqd, nr_secs, pblk_end_io_write); - if (ret) { - kfree(lun_bitmap); - return ret; - } - - if (likely(!e_line || !atomic_read(&e_line->left_eblks))) - ret = pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, - valid, 0); - else - ret = pblk_map_erase_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, - valid, erase_ppa); - - return ret; -} - -static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail, - unsigned int secs_to_flush) -{ - int secs_to_sync; - - secs_to_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush, true); - -#ifdef CONFIG_NVM_PBLK_DEBUG - if ((!secs_to_sync && secs_to_flush) - || (secs_to_sync < 0) - || (secs_to_sync > secs_avail && !secs_to_flush)) { - pblk_err(pblk, "bad sector calculation (a:%d,s:%d,f:%d)\n", - secs_avail, secs_to_sync, secs_to_flush); - } -#endif - - return secs_to_sync; -} - -int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_emeta *emeta = meta_line->emeta; - struct ppa_addr *ppa_list; - struct pblk_g_ctx *m_ctx; - struct nvm_rq *rqd; - void *data; - u64 paddr; - int rq_ppas = pblk->min_write_pgs; - int id = meta_line->id; - int rq_len; - int i, j; - int ret; - - rqd = pblk_alloc_rqd(pblk, PBLK_WRITE_INT); - - m_ctx = nvm_rq_to_pdu(rqd); - m_ctx->private = meta_line; - - rq_len = rq_ppas * geo->csecs; - data = ((void *)emeta->buf) + emeta->mem; - - ret = pblk_alloc_w_rq(pblk, rqd, rq_ppas, pblk_end_io_write_meta); - if (ret) - goto fail_free_rqd; - - ppa_list = nvm_rq_to_ppa_list(rqd); - for (i = 0; i < rqd->nr_ppas; ) { - spin_lock(&meta_line->lock); - paddr = __pblk_alloc_page(pblk, meta_line, rq_ppas); - spin_unlock(&meta_line->lock); - for (j = 0; j < rq_ppas; j++, i++, paddr++) - ppa_list[i] = addr_to_gen_ppa(pblk, paddr, id); - } - - spin_lock(&l_mg->close_lock); - emeta->mem += rq_len; - if (emeta->mem >= lm->emeta_len[0]) - list_del(&meta_line->list); - spin_unlock(&l_mg->close_lock); - - pblk_down_chunk(pblk, ppa_list[0]); - - ret = pblk_submit_io(pblk, rqd, data); - if (ret) { - pblk_err(pblk, "emeta I/O submission failed: %d\n", ret); - goto 
fail_rollback; - } - - return NVM_IO_OK; - -fail_rollback: - pblk_up_chunk(pblk, ppa_list[0]); - spin_lock(&l_mg->close_lock); - pblk_dealloc_page(pblk, meta_line, rq_ppas); - list_add(&meta_line->list, &meta_line->list); - spin_unlock(&l_mg->close_lock); -fail_free_rqd: - pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); - return ret; -} - -static inline bool pblk_valid_meta_ppa(struct pblk *pblk, - struct pblk_line *meta_line, - struct nvm_rq *data_rqd) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_c_ctx *data_c_ctx = nvm_rq_to_pdu(data_rqd); - struct pblk_line *data_line = pblk_line_get_data(pblk); - struct ppa_addr ppa, ppa_opt; - u64 paddr; - int pos_opt; - - /* Schedule a metadata I/O that is half the distance from the data I/O - * with regards to the number of LUNs forming the pblk instance. This - * balances LUN conflicts across every I/O. - * - * When the LUN configuration changes (e.g., due to GC), this distance - * can align, which would result on metadata and data I/Os colliding. In - * this case, modify the distance to not be optimal, but move the - * optimal in the right direction. - */ - paddr = pblk_lookup_page(pblk, meta_line); - ppa = addr_to_gen_ppa(pblk, paddr, 0); - ppa_opt = addr_to_gen_ppa(pblk, paddr + data_line->meta_distance, 0); - pos_opt = pblk_ppa_to_pos(geo, ppa_opt); - - if (test_bit(pos_opt, data_c_ctx->lun_bitmap) || - test_bit(pos_opt, data_line->blk_bitmap)) - return true; - - if (unlikely(pblk_ppa_comp(ppa_opt, ppa))) - data_line->meta_distance--; - - return false; -} - -static struct pblk_line *pblk_should_submit_meta_io(struct pblk *pblk, - struct nvm_rq *data_rqd) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *meta_line; - - spin_lock(&l_mg->close_lock); - if (list_empty(&l_mg->emeta_list)) { - spin_unlock(&l_mg->close_lock); - return NULL; - } - meta_line = list_first_entry(&l_mg->emeta_list, struct pblk_line, list); - if (meta_line->emeta->mem >= lm->emeta_len[0]) { - spin_unlock(&l_mg->close_lock); - return NULL; - } - spin_unlock(&l_mg->close_lock); - - if (!pblk_valid_meta_ppa(pblk, meta_line, data_rqd)) - return NULL; - - return meta_line; -} - -static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct ppa_addr erase_ppa; - struct pblk_line *meta_line; - int err; - - pblk_ppa_set_empty(&erase_ppa); - - /* Assign lbas to ppas and populate request structure */ - err = pblk_setup_w_rq(pblk, rqd, &erase_ppa); - if (err) { - pblk_err(pblk, "could not setup write request: %d\n", err); - return NVM_IO_ERR; - } - - meta_line = pblk_should_submit_meta_io(pblk, rqd); - - /* Submit data write for current data line */ - err = pblk_submit_io(pblk, rqd, NULL); - if (err) { - pblk_err(pblk, "data I/O submission failed: %d\n", err); - return NVM_IO_ERR; - } - - if (!pblk_ppa_empty(erase_ppa)) { - /* Submit erase for next data line */ - if (pblk_blk_erase_async(pblk, erase_ppa)) { - struct pblk_line *e_line = pblk_line_get_erase(pblk); - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - int bit; - - atomic_inc(&e_line->left_eblks); - bit = pblk_ppa_to_pos(geo, erase_ppa); - WARN_ON(!test_and_clear_bit(bit, e_line->erase_bitmap)); - } - } - - if (meta_line) { - /* Submit metadata write for previous data line */ - err = pblk_submit_meta_io(pblk, meta_line); - if (err) { - pblk_err(pblk, "metadata I/O submission failed: %d", - err); - return NVM_IO_ERR; - } - } - - return NVM_IO_OK; -} - -static void 
pblk_free_write_rqd(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); - struct bio *bio = rqd->bio; - - if (c_ctx->nr_padded) - pblk_bio_free_pages(pblk, bio, c_ctx->nr_valid, - c_ctx->nr_padded); -} - -static int pblk_submit_write(struct pblk *pblk, int *secs_left) -{ - struct bio *bio; - struct nvm_rq *rqd; - unsigned int secs_avail, secs_to_sync, secs_to_com; - unsigned int secs_to_flush, packed_meta_pgs; - unsigned long pos; - unsigned int resubmit; - - *secs_left = 0; - - spin_lock(&pblk->resubmit_lock); - resubmit = !list_empty(&pblk->resubmit_list); - spin_unlock(&pblk->resubmit_lock); - - /* Resubmit failed writes first */ - if (resubmit) { - struct pblk_c_ctx *r_ctx; - - spin_lock(&pblk->resubmit_lock); - r_ctx = list_first_entry(&pblk->resubmit_list, - struct pblk_c_ctx, list); - list_del(&r_ctx->list); - spin_unlock(&pblk->resubmit_lock); - - secs_avail = r_ctx->nr_valid; - pos = r_ctx->sentry; - - pblk_prepare_resubmit(pblk, pos, secs_avail); - secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, - secs_avail); - - kfree(r_ctx); - } else { - /* If there are no sectors in the cache, - * flushes (bios without data) will be cleared on - * the cache threads - */ - secs_avail = pblk_rb_read_count(&pblk->rwb); - if (!secs_avail) - return 0; - - secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb); - if (!secs_to_flush && secs_avail < pblk->min_write_pgs_data) - return 0; - - secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, - secs_to_flush); - if (secs_to_sync > pblk->max_write_pgs) { - pblk_err(pblk, "bad buffer sync calculation\n"); - return 0; - } - - secs_to_com = (secs_to_sync > secs_avail) ? - secs_avail : secs_to_sync; - pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com); - } - - packed_meta_pgs = (pblk->min_write_pgs - pblk->min_write_pgs_data); - bio = bio_alloc(GFP_KERNEL, secs_to_sync + packed_meta_pgs); - - bio->bi_iter.bi_sector = 0; /* internal bio */ - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - - rqd = pblk_alloc_rqd(pblk, PBLK_WRITE); - rqd->bio = bio; - - if (pblk_rb_read_to_bio(&pblk->rwb, rqd, pos, secs_to_sync, - secs_avail)) { - pblk_err(pblk, "corrupted write bio\n"); - goto fail_put_bio; - } - - if (pblk_submit_io_set(pblk, rqd)) - goto fail_free_bio; - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(secs_to_sync, &pblk->sub_writes); -#endif - - *secs_left = 1; - return 0; - -fail_free_bio: - pblk_free_write_rqd(pblk, rqd); -fail_put_bio: - bio_put(bio); - pblk_free_rqd(pblk, rqd, PBLK_WRITE); - - return -EINTR; -} - -int pblk_write_ts(void *data) -{ - struct pblk *pblk = data; - int secs_left; - int write_failure = 0; - - while (!kthread_should_stop()) { - if (!write_failure) { - write_failure = pblk_submit_write(pblk, &secs_left); - - if (secs_left) - continue; - } - set_current_state(TASK_INTERRUPTIBLE); - io_schedule(); - } - - return 0; -} diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h deleted file mode 100644 index 86ffa875bfe1..000000000000 --- a/drivers/lightnvm/pblk.h +++ /dev/null @@ -1,1358 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2015 IT University of Copenhagen (rrpc.h) - * Copyright (C) 2016 CNEX Labs - * Initial release: Matias Bjorling - * Write buffering: Javier Gonzalez - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * Implementation of a Physical Block-device target for Open-channel SSDs. - * - */ - -#ifndef PBLK_H_ -#define PBLK_H_ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -/* Run only GC if less than 1/X blocks are free */ -#define GC_LIMIT_INVERSE 5 -#define GC_TIME_MSECS 1000 - -#define PBLK_SECTOR (512) -#define PBLK_EXPOSED_PAGE_SIZE (4096) - -#define PBLK_NR_CLOSE_JOBS (4) - -#define PBLK_CACHE_NAME_LEN (DISK_NAME_LEN + 16) - -/* Max 512 LUNs per device */ -#define PBLK_MAX_LUNS_BITMAP (4) - -#define NR_PHY_IN_LOG (PBLK_EXPOSED_PAGE_SIZE / PBLK_SECTOR) - -/* Static pool sizes */ -#define PBLK_GEN_WS_POOL_SIZE (2) - -#define PBLK_DEFAULT_OP (11) - -enum { - PBLK_READ = READ, - PBLK_WRITE = WRITE,/* Write from write buffer */ - PBLK_WRITE_INT, /* Internal write - no write buffer */ - PBLK_READ_RECOV, /* Recovery read - errors allowed */ - PBLK_ERASE, -}; - -enum { - /* IO Types */ - PBLK_IOTYPE_USER = 1 << 0, - PBLK_IOTYPE_GC = 1 << 1, - - /* Write buffer flags */ - PBLK_FLUSH_ENTRY = 1 << 2, - PBLK_WRITTEN_DATA = 1 << 3, - PBLK_SUBMITTED_ENTRY = 1 << 4, - PBLK_WRITABLE_ENTRY = 1 << 5, -}; - -enum { - PBLK_BLK_ST_OPEN = 0x1, - PBLK_BLK_ST_CLOSED = 0x2, -}; - -enum { - PBLK_CHUNK_RESET_START, - PBLK_CHUNK_RESET_DONE, - PBLK_CHUNK_RESET_FAILED, -}; - -struct pblk_sec_meta { - u64 reserved; - __le64 lba; -}; - -/* The number of GC lists and the rate-limiter states go together. This way the - * rate-limiter can dictate how much GC is needed based on resource utilization. - */ -#define PBLK_GC_NR_LISTS 4 - -enum { - PBLK_RL_OFF = 0, - PBLK_RL_WERR = 1, - PBLK_RL_HIGH = 2, - PBLK_RL_MID = 3, - PBLK_RL_LOW = 4 -}; - -#define pblk_dma_ppa_size (sizeof(u64) * NVM_MAX_VLBA) - -/* write buffer completion context */ -struct pblk_c_ctx { - struct list_head list; /* Head for out-of-order completion */ - - unsigned long *lun_bitmap; /* Luns used on current request */ - unsigned int sentry; - unsigned int nr_valid; - unsigned int nr_padded; -}; - -/* read context */ -struct pblk_g_ctx { - void *private; - unsigned long start_time; - u64 lba; -}; - -/* Pad context */ -struct pblk_pad_rq { - struct pblk *pblk; - struct completion wait; - struct kref ref; -}; - -/* Recovery context */ -struct pblk_rec_ctx { - struct pblk *pblk; - struct nvm_rq *rqd; - struct work_struct ws_rec; -}; - -/* Write context */ -struct pblk_w_ctx { - struct bio_list bios; /* Original bios - used for completion - * in REQ_FUA, REQ_FLUSH case - */ - u64 lba; /* Logic addr. associated with entry */ - struct ppa_addr ppa; /* Physic addr. 
associated with entry */ - int flags; /* Write context flags */ -}; - -struct pblk_rb_entry { - struct ppa_addr cacheline; /* Cacheline for this entry */ - void *data; /* Pointer to data on this entry */ - struct pblk_w_ctx w_ctx; /* Context for this entry */ - struct list_head index; /* List head to enable indexes */ -}; - -#define EMPTY_ENTRY (~0U) - -struct pblk_rb_pages { - struct page *pages; - int order; - struct list_head list; -}; - -struct pblk_rb { - struct pblk_rb_entry *entries; /* Ring buffer entries */ - unsigned int mem; /* Write offset - points to next - * writable entry in memory - */ - unsigned int subm; /* Read offset - points to last entry - * that has been submitted to the media - * to be persisted - */ - unsigned int sync; /* Synced - backpointer that signals - * the last submitted entry that has - * been successfully persisted to media - */ - unsigned int flush_point; /* Sync point - last entry that must be - * flushed to the media. Used with - * REQ_FLUSH and REQ_FUA - */ - unsigned int l2p_update; /* l2p update point - next entry for - * which l2p mapping will be updated to - * contain a device ppa address (instead - * of a cacheline - */ - unsigned int nr_entries; /* Number of entries in write buffer - - * must be a power of two - */ - unsigned int seg_size; /* Size of the data segments being - * stored on each entry. Typically this - * will be 4KB - */ - - unsigned int back_thres; /* Threshold that shall be maintained by - * the backpointer in order to respect - * geo->mw_cunits on a per chunk basis - */ - - struct list_head pages; /* List of data pages */ - - spinlock_t w_lock; /* Write lock */ - spinlock_t s_lock; /* Sync lock */ - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_t inflight_flush_point; /* Not served REQ_FLUSH | REQ_FUA */ -#endif -}; - -#define PBLK_RECOVERY_SECTORS 16 - -struct pblk_lun { - struct ppa_addr bppa; - struct semaphore wr_sem; -}; - -struct pblk_gc_rq { - struct pblk_line *line; - void *data; - u64 paddr_list[NVM_MAX_VLBA]; - u64 lba_list[NVM_MAX_VLBA]; - int nr_secs; - int secs_to_gc; - struct list_head list; -}; - -struct pblk_gc { - /* These states are not protected by a lock since (i) they are in the - * fast path, and (ii) they are not critical. - */ - int gc_active; - int gc_enabled; - int gc_forced; - - struct task_struct *gc_ts; - struct task_struct *gc_writer_ts; - struct task_struct *gc_reader_ts; - - struct workqueue_struct *gc_line_reader_wq; - struct workqueue_struct *gc_reader_wq; - - struct timer_list gc_timer; - - struct semaphore gc_sem; - atomic_t read_inflight_gc; /* Number of lines with inflight GC reads */ - atomic_t pipeline_gc; /* Number of lines in the GC pipeline - - * started reads to finished writes - */ - int w_entries; - - struct list_head w_list; - struct list_head r_list; - - spinlock_t lock; - spinlock_t w_lock; - spinlock_t r_lock; -}; - -struct pblk_rl { - unsigned int high; /* Upper threshold for rate limiter (free run - - * user I/O rate limiter - */ - unsigned int high_pw; /* High rounded up as a power of 2 */ - -#define PBLK_USER_HIGH_THRS 8 /* Begin write limit at 12% available blks */ -#define PBLK_USER_LOW_THRS 10 /* Aggressive GC at 10% available blocks */ - - int rb_windows_pw; /* Number of rate windows in the write buffer - * given as a power-of-2. This guarantees that - * when user I/O is being rate limited, there - * will be reserved enough space for the GC to - * place its payload. A window is of - * pblk->max_write_pgs size, which in NVMe is - * 64, i.e., 256kb. 
- */ - int rb_budget; /* Total number of entries available for I/O */ - int rb_user_max; /* Max buffer entries available for user I/O */ - int rb_gc_max; /* Max buffer entries available for GC I/O */ - int rb_gc_rsv; /* Reserved buffer entries for GC I/O */ - int rb_state; /* Rate-limiter current state */ - int rb_max_io; /* Maximum size for an I/O giving the config */ - - atomic_t rb_user_cnt; /* User I/O buffer counter */ - atomic_t rb_gc_cnt; /* GC I/O buffer counter */ - atomic_t rb_space; /* Space limit in case of reaching capacity */ - - int rsv_blocks; /* Reserved blocks for GC */ - - int rb_user_active; - int rb_gc_active; - - atomic_t werr_lines; /* Number of write error lines that needs gc */ - - struct timer_list u_timer; - - unsigned long total_blocks; - - atomic_t free_blocks; /* Total number of free blocks (+ OP) */ - atomic_t free_user_blocks; /* Number of user free blocks (no OP) */ -}; - -#define PBLK_LINE_EMPTY (~0U) - -enum { - /* Line Types */ - PBLK_LINETYPE_FREE = 0, - PBLK_LINETYPE_LOG = 1, - PBLK_LINETYPE_DATA = 2, - - /* Line state */ - PBLK_LINESTATE_NEW = 9, - PBLK_LINESTATE_FREE = 10, - PBLK_LINESTATE_OPEN = 11, - PBLK_LINESTATE_CLOSED = 12, - PBLK_LINESTATE_GC = 13, - PBLK_LINESTATE_BAD = 14, - PBLK_LINESTATE_CORRUPT = 15, - - /* GC group */ - PBLK_LINEGC_NONE = 20, - PBLK_LINEGC_EMPTY = 21, - PBLK_LINEGC_LOW = 22, - PBLK_LINEGC_MID = 23, - PBLK_LINEGC_HIGH = 24, - PBLK_LINEGC_FULL = 25, - PBLK_LINEGC_WERR = 26 -}; - -#define PBLK_MAGIC 0x70626c6b /*pblk*/ - -/* emeta/smeta persistent storage format versions: - * Changes in major version requires offline migration. - * Changes in minor version are handled automatically during - * recovery. - */ - -#define SMETA_VERSION_MAJOR (0) -#define SMETA_VERSION_MINOR (1) - -#define EMETA_VERSION_MAJOR (0) -#define EMETA_VERSION_MINOR (2) - -struct line_header { - __le32 crc; - __le32 identifier; /* pblk identifier */ - __u8 uuid[16]; /* instance uuid */ - __le16 type; /* line type */ - __u8 version_major; /* version major */ - __u8 version_minor; /* version minor */ - __le32 id; /* line id for current line */ -}; - -struct line_smeta { - struct line_header header; - - __le32 crc; /* Full structure including struct crc */ - /* Previous line metadata */ - __le32 prev_id; /* Line id for previous line */ - - /* Current line metadata */ - __le64 seq_nr; /* Sequence number for current line */ - - /* Active writers */ - __le32 window_wr_lun; /* Number of parallel LUNs to write */ - - __le32 rsvd[2]; - - __le64 lun_bitmap[]; -}; - - -/* - * Metadata layout in media: - * First sector: - * 1. struct line_emeta - * 2. bad block bitmap (u64 * window_wr_lun) - * 3. write amplification counters - * Mid sectors (start at lbas_sector): - * 3. nr_lbas (u64) forming lba list - * Last sectors (start at vsc_sector): - * 4. 
u32 valid sector count (vsc) for all lines (~0U: free line) - */ -struct line_emeta { - struct line_header header; - - __le32 crc; /* Full structure including struct crc */ - - /* Previous line metadata */ - __le32 prev_id; /* Line id for prev line */ - - /* Current line metadata */ - __le64 seq_nr; /* Sequence number for current line */ - - /* Active writers */ - __le32 window_wr_lun; /* Number of parallel LUNs to write */ - - /* Bookkeeping for recovery */ - __le32 next_id; /* Line id for next line */ - __le64 nr_lbas; /* Number of lbas mapped in line */ - __le64 nr_valid_lbas; /* Number of valid lbas mapped in line */ - __le64 bb_bitmap[]; /* Updated bad block bitmap for line */ -}; - - -/* Write amplification counters stored on media */ -struct wa_counters { - __le64 user; /* Number of user written sectors */ - __le64 gc; /* Number of sectors written by GC*/ - __le64 pad; /* Number of padded sectors */ -}; - -struct pblk_emeta { - struct line_emeta *buf; /* emeta buffer in media format */ - int mem; /* Write offset - points to next - * writable entry in memory - */ - atomic_t sync; /* Synced - backpointer that signals the - * last entry that has been successfully - * persisted to media - */ - unsigned int nr_entries; /* Number of emeta entries */ -}; - -struct pblk_smeta { - struct line_smeta *buf; /* smeta buffer in persistent format */ -}; - -struct pblk_w_err_gc { - int has_write_err; - int has_gc_err; - __le64 *lba_list; -}; - -struct pblk_line { - struct pblk *pblk; - unsigned int id; /* Line number corresponds to the - * block line - */ - unsigned int seq_nr; /* Unique line sequence number */ - - int state; /* PBLK_LINESTATE_X */ - int type; /* PBLK_LINETYPE_X */ - int gc_group; /* PBLK_LINEGC_X */ - struct list_head list; /* Free, GC lists */ - - unsigned long *lun_bitmap; /* Bitmap for LUNs mapped in line */ - - struct nvm_chk_meta *chks; /* Chunks forming line */ - - struct pblk_smeta *smeta; /* Start metadata */ - struct pblk_emeta *emeta; /* End medatada */ - - int meta_line; /* Metadata line id */ - int meta_distance; /* Distance between data and metadata */ - - u64 emeta_ssec; /* Sector where emeta starts */ - - unsigned int sec_in_line; /* Number of usable secs in line */ - - atomic_t blk_in_line; /* Number of good blocks in line */ - unsigned long *blk_bitmap; /* Bitmap for valid/invalid blocks */ - unsigned long *erase_bitmap; /* Bitmap for erased blocks */ - - unsigned long *map_bitmap; /* Bitmap for mapped sectors in line */ - unsigned long *invalid_bitmap; /* Bitmap for invalid sectors in line */ - - atomic_t left_eblks; /* Blocks left for erasing */ - atomic_t left_seblks; /* Blocks left for sync erasing */ - - int left_msecs; /* Sectors left for mapping */ - unsigned int cur_sec; /* Sector map pointer */ - unsigned int nr_valid_lbas; /* Number of valid lbas in line */ - - __le32 *vsc; /* Valid sector count in line */ - - struct kref ref; /* Write buffer L2P references */ - atomic_t sec_to_update; /* Outstanding L2P updates to ppa */ - - struct pblk_w_err_gc *w_err_gc; /* Write error gc recovery metadata */ - - spinlock_t lock; /* Necessary for invalid_bitmap only */ -}; - -#define PBLK_DATA_LINES 4 - -enum { - PBLK_EMETA_TYPE_HEADER = 1, /* struct line_emeta first sector */ - PBLK_EMETA_TYPE_LLBA = 2, /* lba list - type: __le64 */ - PBLK_EMETA_TYPE_VSC = 3, /* vsc list - type: __le32 */ -}; - -struct pblk_line_mgmt { - int nr_lines; /* Total number of full lines */ - int nr_free_lines; /* Number of full lines in free list */ - - /* Free lists - use free_lock */ - 
struct list_head free_list; /* Full lines ready to use */ - struct list_head corrupt_list; /* Full lines corrupted */ - struct list_head bad_list; /* Full lines bad */ - - /* GC lists - use gc_lock */ - struct list_head *gc_lists[PBLK_GC_NR_LISTS]; - struct list_head gc_high_list; /* Full lines ready to GC, high isc */ - struct list_head gc_mid_list; /* Full lines ready to GC, mid isc */ - struct list_head gc_low_list; /* Full lines ready to GC, low isc */ - - struct list_head gc_werr_list; /* Write err recovery list */ - - struct list_head gc_full_list; /* Full lines ready to GC, no valid */ - struct list_head gc_empty_list; /* Full lines close, all valid */ - - struct pblk_line *log_line; /* Current FTL log line */ - struct pblk_line *data_line; /* Current data line */ - struct pblk_line *log_next; /* Next FTL log line */ - struct pblk_line *data_next; /* Next data line */ - - struct list_head emeta_list; /* Lines queued to schedule emeta */ - - __le32 *vsc_list; /* Valid sector counts for all lines */ - - /* Pre-allocated metadata for data lines */ - struct pblk_smeta *sline_meta[PBLK_DATA_LINES]; - struct pblk_emeta *eline_meta[PBLK_DATA_LINES]; - unsigned long meta_bitmap; - - /* Cache and mempool for map/invalid bitmaps */ - struct kmem_cache *bitmap_cache; - mempool_t *bitmap_pool; - - /* Helpers for fast bitmap calculations */ - unsigned long *bb_template; - unsigned long *bb_aux; - - unsigned long d_seq_nr; /* Data line unique sequence number */ - unsigned long l_seq_nr; /* Log line unique sequence number */ - - spinlock_t free_lock; - spinlock_t close_lock; - spinlock_t gc_lock; -}; - -struct pblk_line_meta { - unsigned int smeta_len; /* Total length for smeta */ - unsigned int smeta_sec; /* Sectors needed for smeta */ - - unsigned int emeta_len[4]; /* Lengths for emeta: - * [0]: Total - * [1]: struct line_emeta + - * bb_bitmap + struct wa_counters - * [2]: L2P portion - * [3]: vsc - */ - unsigned int emeta_sec[4]; /* Sectors needed for emeta. Same layout - * as emeta_len - */ - - unsigned int emeta_bb; /* Boundary for bb that affects emeta */ - - unsigned int vsc_list_len; /* Length for vsc list */ - unsigned int sec_bitmap_len; /* Length for sector bitmap in line */ - unsigned int blk_bitmap_len; /* Length for block bitmap in line */ - unsigned int lun_bitmap_len; /* Length for lun bitmap in line */ - - unsigned int blk_per_line; /* Number of blocks in a full line */ - unsigned int sec_per_line; /* Number of sectors in a line */ - unsigned int dsec_per_line; /* Number of data sectors in a line */ - unsigned int min_blk_line; /* Min. 
number of good blocks in line */ - - unsigned int mid_thrs; /* Threshold for GC mid list */ - unsigned int high_thrs; /* Threshold for GC high list */ - - unsigned int meta_distance; /* Distance between data and metadata */ -}; - -enum { - PBLK_STATE_RUNNING = 0, - PBLK_STATE_STOPPING = 1, - PBLK_STATE_RECOVERING = 2, - PBLK_STATE_STOPPED = 3, -}; - -/* Internal format to support not power-of-2 device formats */ -struct pblk_addrf { - /* gen to dev */ - int sec_stripe; - int ch_stripe; - int lun_stripe; - - /* dev to gen */ - int sec_lun_stripe; - int sec_ws_stripe; -}; - -struct pblk { - struct nvm_tgt_dev *dev; - struct gendisk *disk; - - struct kobject kobj; - - struct pblk_lun *luns; - - struct pblk_line *lines; /* Line array */ - struct pblk_line_mgmt l_mg; /* Line management */ - struct pblk_line_meta lm; /* Line metadata */ - - struct nvm_addrf addrf; /* Aligned address format */ - struct pblk_addrf uaddrf; /* Unaligned address format */ - int addrf_len; - - struct pblk_rb rwb; - - int state; /* pblk line state */ - - int min_write_pgs; /* Minimum amount of pages required by controller */ - int min_write_pgs_data; /* Minimum amount of payload pages */ - int max_write_pgs; /* Maximum amount of pages supported by controller */ - int oob_meta_size; /* Size of OOB sector metadata */ - - sector_t capacity; /* Device capacity when bad blocks are subtracted */ - - int op; /* Percentage of device used for over-provisioning */ - int op_blks; /* Number of blocks used for over-provisioning */ - - /* pblk provisioning values. Used by rate limiter */ - struct pblk_rl rl; - - int sec_per_write; - - guid_t instance_uuid; - - /* Persistent write amplification counters, 4kb sector I/Os */ - atomic64_t user_wa; /* Sectors written by user */ - atomic64_t gc_wa; /* Sectors written by GC */ - atomic64_t pad_wa; /* Padded sectors written */ - - /* Reset values for delta write amplification measurements */ - u64 user_rst_wa; - u64 gc_rst_wa; - u64 pad_rst_wa; - - /* Counters used for calculating padding distribution */ - atomic64_t *pad_dist; /* Padding distribution buckets */ - u64 nr_flush_rst; /* Flushes reset value for pad dist.*/ - atomic64_t nr_flush; /* Number of flush/fua I/O */ - -#ifdef CONFIG_NVM_PBLK_DEBUG - /* Non-persistent debug counters, 4kb sector I/Os */ - atomic_long_t inflight_writes; /* Inflight writes (user and gc) */ - atomic_long_t padded_writes; /* Sectors padded due to flush/fua */ - atomic_long_t padded_wb; /* Sectors padded in write buffer */ - atomic_long_t req_writes; /* Sectors stored on write buffer */ - atomic_long_t sub_writes; /* Sectors submitted from buffer */ - atomic_long_t sync_writes; /* Sectors synced to media */ - atomic_long_t inflight_reads; /* Inflight sector read requests */ - atomic_long_t cache_reads; /* Read requests that hit the cache */ - atomic_long_t sync_reads; /* Completed sector read requests */ - atomic_long_t recov_writes; /* Sectors submitted from recovery */ - atomic_long_t recov_gc_writes; /* Sectors submitted from write GC */ - atomic_long_t recov_gc_reads; /* Sectors submitted from read GC */ -#endif - - spinlock_t lock; - - atomic_long_t read_failed; - atomic_long_t read_empty; - atomic_long_t read_high_ecc; - atomic_long_t read_failed_gc; - atomic_long_t write_failed; - atomic_long_t erase_failed; - - atomic_t inflight_io; /* General inflight I/O counter */ - - struct task_struct *writer_ts; - - /* Simple translation map of logical addresses to physical addresses. 
- * The logical addresses is known by the host system, while the physical - * addresses are used when writing to the disk block device. - */ - unsigned char *trans_map; - spinlock_t trans_lock; - - struct list_head compl_list; - - spinlock_t resubmit_lock; /* Resubmit list lock */ - struct list_head resubmit_list; /* Resubmit list for failed writes*/ - - mempool_t page_bio_pool; - mempool_t gen_ws_pool; - mempool_t rec_pool; - mempool_t r_rq_pool; - mempool_t w_rq_pool; - mempool_t e_rq_pool; - - struct workqueue_struct *close_wq; - struct workqueue_struct *bb_wq; - struct workqueue_struct *r_end_wq; - - struct timer_list wtimer; - - struct pblk_gc gc; -}; - -struct pblk_line_ws { - struct pblk *pblk; - struct pblk_line *line; - void *priv; - struct work_struct ws; -}; - -#define pblk_g_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_g_ctx)) -#define pblk_w_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_c_ctx)) - -#define pblk_err(pblk, fmt, ...) \ - pr_err("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__) -#define pblk_info(pblk, fmt, ...) \ - pr_info("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__) -#define pblk_warn(pblk, fmt, ...) \ - pr_warn("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__) -#define pblk_debug(pblk, fmt, ...) \ - pr_debug("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__) - -/* - * pblk ring buffer operations - */ -int pblk_rb_init(struct pblk_rb *rb, unsigned int size, unsigned int threshold, - unsigned int seg_sz); -int pblk_rb_may_write_user(struct pblk_rb *rb, struct bio *bio, - unsigned int nr_entries, unsigned int *pos); -int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries, - unsigned int *pos); -void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data, - struct pblk_w_ctx w_ctx, unsigned int pos); -void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, - struct pblk_w_ctx w_ctx, struct pblk_line *line, - u64 paddr, unsigned int pos); -struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos); -void pblk_rb_flush(struct pblk_rb *rb); - -void pblk_rb_sync_l2p(struct pblk_rb *rb); -unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd, - unsigned int pos, unsigned int nr_entries, - unsigned int count); -int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba, - struct ppa_addr ppa); -unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int entries); - -unsigned int pblk_rb_sync_init(struct pblk_rb *rb, unsigned long *flags); -unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries); -unsigned int pblk_rb_ptr_wrap(struct pblk_rb *rb, unsigned int p, - unsigned int nr_entries); -void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags); -unsigned int pblk_rb_flush_point_count(struct pblk_rb *rb); - -unsigned int pblk_rb_read_count(struct pblk_rb *rb); -unsigned int pblk_rb_sync_count(struct pblk_rb *rb); -unsigned int pblk_rb_wrap_pos(struct pblk_rb *rb, unsigned int pos); - -int pblk_rb_tear_down_check(struct pblk_rb *rb); -int pblk_rb_pos_oob(struct pblk_rb *rb, u64 pos); -void pblk_rb_free(struct pblk_rb *rb); -ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf); - -/* - * pblk core - */ -struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int type); -void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type); -int pblk_alloc_rqd_meta(struct pblk *pblk, struct nvm_rq *rqd); -void pblk_free_rqd_meta(struct pblk *pblk, struct nvm_rq *rqd); -void pblk_set_sec_per_write(struct pblk *pblk, int sec_per_write); -int 
pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd, - struct pblk_c_ctx *c_ctx); -void pblk_discard(struct pblk *pblk, struct bio *bio); -struct nvm_chk_meta *pblk_get_chunk_meta(struct pblk *pblk); -struct nvm_chk_meta *pblk_chunk_get_off(struct pblk *pblk, - struct nvm_chk_meta *lp, - struct ppa_addr ppa); -void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd); -void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd); -int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd, void *buf); -int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd, void *buf); -int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line); -void pblk_check_chunk_state_update(struct pblk *pblk, struct nvm_rq *rqd); -struct pblk_line *pblk_line_get(struct pblk *pblk); -struct pblk_line *pblk_line_get_first_data(struct pblk *pblk); -struct pblk_line *pblk_line_replace_data(struct pblk *pblk); -void pblk_ppa_to_line_put(struct pblk *pblk, struct ppa_addr ppa); -void pblk_rq_to_line_put(struct pblk *pblk, struct nvm_rq *rqd); -int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line); -void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line); -struct pblk_line *pblk_line_get_data(struct pblk *pblk); -struct pblk_line *pblk_line_get_erase(struct pblk *pblk); -int pblk_line_erase(struct pblk *pblk, struct pblk_line *line); -int pblk_line_is_full(struct pblk_line *line); -void pblk_line_free(struct pblk_line *line); -void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line); -void pblk_line_close(struct pblk *pblk, struct pblk_line *line); -void pblk_line_close_ws(struct work_struct *work); -void pblk_pipeline_stop(struct pblk *pblk); -void __pblk_pipeline_stop(struct pblk *pblk); -void __pblk_pipeline_flush(struct pblk *pblk); -void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, - void (*work)(struct work_struct *), gfp_t gfp_mask, - struct workqueue_struct *wq); -u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line); -int pblk_line_smeta_read(struct pblk *pblk, struct pblk_line *line); -int pblk_line_emeta_read(struct pblk *pblk, struct pblk_line *line, - void *emeta_buf); -int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr erase_ppa); -void pblk_line_put(struct kref *ref); -void pblk_line_put_wq(struct kref *ref); -struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line); -u64 pblk_lookup_page(struct pblk *pblk, struct pblk_line *line); -void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); -u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); -u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); -int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail, - unsigned long secs_to_flush, bool skip_meta); -void pblk_down_rq(struct pblk *pblk, struct ppa_addr ppa, - unsigned long *lun_bitmap); -void pblk_down_chunk(struct pblk *pblk, struct ppa_addr ppa); -void pblk_up_chunk(struct pblk *pblk, struct ppa_addr ppa); -void pblk_up_rq(struct pblk *pblk, unsigned long *lun_bitmap); -int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags, - int nr_pages); -void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off, - int nr_pages); -void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa); -void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line, - u64 paddr); -void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa); -void 
pblk_update_map_cache(struct pblk *pblk, sector_t lba, - struct ppa_addr ppa); -void pblk_update_map_dev(struct pblk *pblk, sector_t lba, - struct ppa_addr ppa, struct ppa_addr entry_line); -int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa, - struct pblk_line *gc_line, u64 paddr); -void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas, - u64 *lba_list, int nr_secs); -int pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas, - sector_t blba, int nr_secs, bool *from_cache); -void *pblk_get_meta_for_writes(struct pblk *pblk, struct nvm_rq *rqd); -void pblk_get_packed_meta(struct pblk *pblk, struct nvm_rq *rqd); - -/* - * pblk user I/O write path - */ -void pblk_write_to_cache(struct pblk *pblk, struct bio *bio, - unsigned long flags); -int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq); - -/* - * pblk map - */ -int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, - unsigned int sentry, unsigned long *lun_bitmap, - unsigned int valid_secs, struct ppa_addr *erase_ppa); -int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry, - unsigned long *lun_bitmap, unsigned int valid_secs, - unsigned int off); - -/* - * pblk write thread - */ -int pblk_write_ts(void *data); -void pblk_write_timer_fn(struct timer_list *t); -void pblk_write_should_kick(struct pblk *pblk); -void pblk_write_kick(struct pblk *pblk); - -/* - * pblk read path - */ -extern struct bio_set pblk_bio_set; -void pblk_submit_read(struct pblk *pblk, struct bio *bio); -int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq); -/* - * pblk recovery - */ -struct pblk_line *pblk_recov_l2p(struct pblk *pblk); -int pblk_recov_pad(struct pblk *pblk); -int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta); - -/* - * pblk gc - */ -#define PBLK_GC_MAX_READERS 8 /* Max number of outstanding GC reader jobs */ -#define PBLK_GC_RQ_QD 128 /* Queue depth for inflight GC requests */ -#define PBLK_GC_L_QD 4 /* Queue depth for inflight GC lines */ - -int pblk_gc_init(struct pblk *pblk); -void pblk_gc_exit(struct pblk *pblk, bool graceful); -void pblk_gc_should_start(struct pblk *pblk); -void pblk_gc_should_stop(struct pblk *pblk); -void pblk_gc_should_kick(struct pblk *pblk); -void pblk_gc_free_full_lines(struct pblk *pblk); -void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, - int *gc_active); -int pblk_gc_sysfs_force(struct pblk *pblk, int force); -void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line); - -/* - * pblk rate limiter - */ -void pblk_rl_init(struct pblk_rl *rl, int budget, int threshold); -void pblk_rl_free(struct pblk_rl *rl); -void pblk_rl_update_rates(struct pblk_rl *rl); -int pblk_rl_high_thrs(struct pblk_rl *rl); -unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl); -unsigned long pblk_rl_nr_user_free_blks(struct pblk_rl *rl); -int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries); -void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries); -void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries); -int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries); -void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries); -void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc); -int pblk_rl_max_io(struct pblk_rl *rl); -void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line); -void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line, - bool used); -int pblk_rl_is_limit(struct pblk_rl *rl); - -void pblk_rl_werr_line_in(struct pblk_rl *rl); 
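/*
 * Illustrative sketch, not taken from pblk: how the rate-limiter hooks
 * declared above are typically paired around a buffered user write.  The
 * boolean-style return of pblk_rl_user_may_insert() and the error code
 * chosen here are assumptions for the example.
 */
static int example_buffer_user_write(struct pblk_rl *rl, int nr_entries)
{
	/* Ask the limiter whether nr_entries may enter the write buffer. */
	if (!pblk_rl_user_may_insert(rl, nr_entries))
		return -EAGAIN;		/* caller backs off and retries */

	/* Account the entries as in-flight user data ... */
	pblk_rl_user_in(rl, nr_entries);

	/*
	 * ... and release the same amount once the write thread has
	 * persisted them to the media.
	 */
	pblk_rl_out(rl, nr_entries, 0);

	return 0;
}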
-void pblk_rl_werr_line_out(struct pblk_rl *rl); - -/* - * pblk sysfs - */ -int pblk_sysfs_init(struct gendisk *tdisk); -void pblk_sysfs_exit(struct gendisk *tdisk); - -static inline struct nvm_rq *nvm_rq_from_c_ctx(void *c_ctx) -{ - return c_ctx - sizeof(struct nvm_rq); -} - -static inline void *emeta_to_bb(struct line_emeta *emeta) -{ - return emeta->bb_bitmap; -} - -static inline void *emeta_to_wa(struct pblk_line_meta *lm, - struct line_emeta *emeta) -{ - return emeta->bb_bitmap + lm->blk_bitmap_len; -} - -static inline void *emeta_to_lbas(struct pblk *pblk, struct line_emeta *emeta) -{ - return ((void *)emeta + pblk->lm.emeta_len[1]); -} - -static inline void *emeta_to_vsc(struct pblk *pblk, struct line_emeta *emeta) -{ - return (emeta_to_lbas(pblk, emeta) + pblk->lm.emeta_len[2]); -} - -static inline int pblk_line_vsc(struct pblk_line *line) -{ - return le32_to_cpu(*line->vsc); -} - -static inline int pblk_ppa_to_line_id(struct ppa_addr p) -{ - return p.a.blk; -} - -static inline struct pblk_line *pblk_ppa_to_line(struct pblk *pblk, - struct ppa_addr p) -{ - return &pblk->lines[pblk_ppa_to_line_id(p)]; -} - -static inline int pblk_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p) -{ - return p.a.lun * geo->num_ch + p.a.ch; -} - -static inline struct ppa_addr addr_to_gen_ppa(struct pblk *pblk, u64 paddr, - u64 line_id) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct ppa_addr ppa; - - if (geo->version == NVM_OCSSD_SPEC_12) { - struct nvm_addrf_12 *ppaf = (struct nvm_addrf_12 *)&pblk->addrf; - - ppa.ppa = 0; - ppa.g.blk = line_id; - ppa.g.pg = (paddr & ppaf->pg_mask) >> ppaf->pg_offset; - ppa.g.lun = (paddr & ppaf->lun_mask) >> ppaf->lun_offset; - ppa.g.ch = (paddr & ppaf->ch_mask) >> ppaf->ch_offset; - ppa.g.pl = (paddr & ppaf->pln_mask) >> ppaf->pln_offset; - ppa.g.sec = (paddr & ppaf->sec_mask) >> ppaf->sec_offset; - } else { - struct pblk_addrf *uaddrf = &pblk->uaddrf; - int secs, chnls, luns; - - ppa.ppa = 0; - - ppa.m.chk = line_id; - - paddr = div_u64_rem(paddr, uaddrf->sec_stripe, &secs); - ppa.m.sec = secs; - - paddr = div_u64_rem(paddr, uaddrf->ch_stripe, &chnls); - ppa.m.grp = chnls; - - paddr = div_u64_rem(paddr, uaddrf->lun_stripe, &luns); - ppa.m.pu = luns; - - ppa.m.sec += uaddrf->sec_stripe * paddr; - } - - return ppa; -} - -static inline struct nvm_chk_meta *pblk_dev_ppa_to_chunk(struct pblk *pblk, - struct ppa_addr p) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line *line = pblk_ppa_to_line(pblk, p); - int pos = pblk_ppa_to_pos(geo, p); - - return &line->chks[pos]; -} - -static inline u64 pblk_dev_ppa_to_chunk_addr(struct pblk *pblk, - struct ppa_addr p) -{ - struct nvm_tgt_dev *dev = pblk->dev; - - return dev_to_chunk_addr(dev->parent, &pblk->addrf, p); -} - -static inline u64 pblk_dev_ppa_to_line_addr(struct pblk *pblk, - struct ppa_addr p) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - u64 paddr; - - if (geo->version == NVM_OCSSD_SPEC_12) { - struct nvm_addrf_12 *ppaf = (struct nvm_addrf_12 *)&pblk->addrf; - - paddr = (u64)p.g.ch << ppaf->ch_offset; - paddr |= (u64)p.g.lun << ppaf->lun_offset; - paddr |= (u64)p.g.pg << ppaf->pg_offset; - paddr |= (u64)p.g.pl << ppaf->pln_offset; - paddr |= (u64)p.g.sec << ppaf->sec_offset; - } else { - struct pblk_addrf *uaddrf = &pblk->uaddrf; - u64 secs = p.m.sec; - int sec_stripe; - - paddr = (u64)p.m.grp * uaddrf->sec_stripe; - paddr += (u64)p.m.pu * uaddrf->sec_lun_stripe; - - secs = div_u64_rem(secs, 
uaddrf->sec_stripe, &sec_stripe); - paddr += secs * uaddrf->sec_ws_stripe; - paddr += sec_stripe; - } - - return paddr; -} - -static inline struct ppa_addr pblk_ppa32_to_ppa64(struct pblk *pblk, u32 ppa32) -{ - struct nvm_tgt_dev *dev = pblk->dev; - - return nvm_ppa32_to_ppa64(dev->parent, &pblk->addrf, ppa32); -} - -static inline u32 pblk_ppa64_to_ppa32(struct pblk *pblk, struct ppa_addr ppa64) -{ - struct nvm_tgt_dev *dev = pblk->dev; - - return nvm_ppa64_to_ppa32(dev->parent, &pblk->addrf, ppa64); -} - -static inline struct ppa_addr pblk_trans_map_get(struct pblk *pblk, - sector_t lba) -{ - struct ppa_addr ppa; - - if (pblk->addrf_len < 32) { - u32 *map = (u32 *)pblk->trans_map; - - ppa = pblk_ppa32_to_ppa64(pblk, map[lba]); - } else { - struct ppa_addr *map = (struct ppa_addr *)pblk->trans_map; - - ppa = map[lba]; - } - - return ppa; -} - -static inline void pblk_trans_map_set(struct pblk *pblk, sector_t lba, - struct ppa_addr ppa) -{ - if (pblk->addrf_len < 32) { - u32 *map = (u32 *)pblk->trans_map; - - map[lba] = pblk_ppa64_to_ppa32(pblk, ppa); - } else { - u64 *map = (u64 *)pblk->trans_map; - - map[lba] = ppa.ppa; - } -} - -static inline int pblk_ppa_empty(struct ppa_addr ppa_addr) -{ - return (ppa_addr.ppa == ADDR_EMPTY); -} - -static inline void pblk_ppa_set_empty(struct ppa_addr *ppa_addr) -{ - ppa_addr->ppa = ADDR_EMPTY; -} - -static inline bool pblk_ppa_comp(struct ppa_addr lppa, struct ppa_addr rppa) -{ - return (lppa.ppa == rppa.ppa); -} - -static inline int pblk_addr_in_cache(struct ppa_addr ppa) -{ - return (ppa.ppa != ADDR_EMPTY && ppa.c.is_cached); -} - -static inline int pblk_addr_to_cacheline(struct ppa_addr ppa) -{ - return ppa.c.line; -} - -static inline struct ppa_addr pblk_cacheline_to_addr(int addr) -{ - struct ppa_addr p; - - p.c.line = addr; - p.c.is_cached = 1; - - return p; -} - -static inline u32 pblk_calc_meta_header_crc(struct pblk *pblk, - struct line_header *header) -{ - u32 crc = ~(u32)0; - - crc = crc32_le(crc, (unsigned char *)header + sizeof(crc), - sizeof(struct line_header) - sizeof(crc)); - - return crc; -} - -static inline u32 pblk_calc_smeta_crc(struct pblk *pblk, - struct line_smeta *smeta) -{ - struct pblk_line_meta *lm = &pblk->lm; - u32 crc = ~(u32)0; - - crc = crc32_le(crc, (unsigned char *)smeta + - sizeof(struct line_header) + sizeof(crc), - lm->smeta_len - - sizeof(struct line_header) - sizeof(crc)); - - return crc; -} - -static inline u32 pblk_calc_emeta_crc(struct pblk *pblk, - struct line_emeta *emeta) -{ - struct pblk_line_meta *lm = &pblk->lm; - u32 crc = ~(u32)0; - - crc = crc32_le(crc, (unsigned char *)emeta + - sizeof(struct line_header) + sizeof(crc), - lm->emeta_len[0] - - sizeof(struct line_header) - sizeof(crc)); - - return crc; -} - -static inline int pblk_io_aligned(struct pblk *pblk, int nr_secs) -{ - return !(nr_secs % pblk->min_write_pgs); -} - -#ifdef CONFIG_NVM_PBLK_DEBUG -static inline void print_ppa(struct pblk *pblk, struct ppa_addr *p, - char *msg, int error) -{ - struct nvm_geo *geo = &pblk->dev->geo; - - if (p->c.is_cached) { - pblk_err(pblk, "ppa: (%s: %x) cache line: %llu\n", - msg, error, (u64)p->c.line); - } else if (geo->version == NVM_OCSSD_SPEC_12) { - pblk_err(pblk, "ppa: (%s: %x):ch:%d,lun:%d,blk:%d,pg:%d,pl:%d,sec:%d\n", - msg, error, - p->g.ch, p->g.lun, p->g.blk, - p->g.pg, p->g.pl, p->g.sec); - } else { - pblk_err(pblk, "ppa: (%s: %x):ch:%d,lun:%d,chk:%d,sec:%d\n", - msg, error, - p->m.grp, p->m.pu, p->m.chk, p->m.sec); - } -} - -static inline void pblk_print_failed_rqd(struct pblk *pblk, struct nvm_rq 
*rqd, - int error) -{ - int bit = -1; - - if (rqd->nr_ppas == 1) { - print_ppa(pblk, &rqd->ppa_addr, "rqd", error); - return; - } - - while ((bit = find_next_bit((void *)&rqd->ppa_status, rqd->nr_ppas, - bit + 1)) < rqd->nr_ppas) { - print_ppa(pblk, &rqd->ppa_list[bit], "rqd", error); - } - - pblk_err(pblk, "error:%d, ppa_status:%llx\n", error, rqd->ppa_status); -} - -static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev, - struct ppa_addr *ppas, int nr_ppas) -{ - struct nvm_geo *geo = &tgt_dev->geo; - struct ppa_addr *ppa; - int i; - - for (i = 0; i < nr_ppas; i++) { - ppa = &ppas[i]; - - if (geo->version == NVM_OCSSD_SPEC_12) { - if (!ppa->c.is_cached && - ppa->g.ch < geo->num_ch && - ppa->g.lun < geo->num_lun && - ppa->g.pl < geo->num_pln && - ppa->g.blk < geo->num_chk && - ppa->g.pg < geo->num_pg && - ppa->g.sec < geo->ws_min) - continue; - } else { - if (!ppa->c.is_cached && - ppa->m.grp < geo->num_ch && - ppa->m.pu < geo->num_lun && - ppa->m.chk < geo->num_chk && - ppa->m.sec < geo->clba) - continue; - } - - print_ppa(tgt_dev->q->queuedata, ppa, "boundary", i); - - return 1; - } - return 0; -} - -static inline int pblk_check_io(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - - if (pblk_boundary_ppa_checks(dev, ppa_list, rqd->nr_ppas)) { - WARN_ON(1); - return -EINVAL; - } - - if (rqd->opcode == NVM_OP_PWRITE) { - struct pblk_line *line; - int i; - - for (i = 0; i < rqd->nr_ppas; i++) { - line = pblk_ppa_to_line(pblk, ppa_list[i]); - - spin_lock(&line->lock); - if (line->state != PBLK_LINESTATE_OPEN) { - pblk_err(pblk, "bad ppa: line:%d,state:%d\n", - line->id, line->state); - WARN_ON(1); - spin_unlock(&line->lock); - return -EINVAL; - } - spin_unlock(&line->lock); - } - } - - return 0; -} -#endif - -static inline int pblk_boundary_paddr_checks(struct pblk *pblk, u64 paddr) -{ - struct pblk_line_meta *lm = &pblk->lm; - - if (paddr > lm->sec_per_line) - return 1; - - return 0; -} - -static inline unsigned int pblk_get_bi_idx(struct bio *bio) -{ - return bio->bi_iter.bi_idx; -} - -static inline sector_t pblk_get_lba(struct bio *bio) -{ - return bio->bi_iter.bi_sector / NR_PHY_IN_LOG; -} - -static inline unsigned int pblk_get_secs(struct bio *bio) -{ - return bio->bi_iter.bi_size / PBLK_EXPOSED_PAGE_SIZE; -} - -static inline char *pblk_disk_name(struct pblk *pblk) -{ - struct gendisk *disk = pblk->disk; - - return disk->disk_name; -} - -static inline unsigned int pblk_get_min_chks(struct pblk *pblk) -{ - struct pblk_line_meta *lm = &pblk->lm; - /* In a worst-case scenario every line will have OP invalid sectors. 
- * We will then need a minimum of 1/OP lines to free up a single line - */ - - return DIV_ROUND_UP(100, pblk->op) * lm->blk_per_line; -} - -static inline struct pblk_sec_meta *pblk_get_meta(struct pblk *pblk, - void *meta, int index) -{ - return meta + - max_t(int, sizeof(struct pblk_sec_meta), pblk->oob_meta_size) - * index; -} - -static inline int pblk_dma_meta_size(struct pblk *pblk) -{ - return max_t(int, sizeof(struct pblk_sec_meta), pblk->oob_meta_size) - * NVM_MAX_VLBA; -} - -static inline int pblk_is_oob_meta_supported(struct pblk *pblk) -{ - return pblk->oob_meta_size >= sizeof(struct pblk_sec_meta); -} -#endif /* PBLK_H_ */ diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index cbc509784b2e..dfaacd472e5d 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -12,7 +12,6 @@ obj-$(CONFIG_NVME_TCP) += nvme-tcp.o nvme-core-y := core.o ioctl.o nvme-core-$(CONFIG_TRACING) += trace.o nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o -nvme-core-$(CONFIG_NVM) += lightnvm.o nvme-core-$(CONFIG_BLK_DEV_ZONED) += zns.o nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS) += fault_inject.o nvme-core-$(CONFIG_NVME_HWMON) += hwmon.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index dfd9dec0c1f6..ce33014e3eb0 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -587,9 +587,6 @@ static void nvme_free_ns(struct kref *kref) { struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref); - if (ns->ndev) - nvme_nvm_unregister(ns); - put_disk(ns->disk); nvme_put_ns_head(ns->head); nvme_put_ctrl(ns->ctrl); @@ -3218,9 +3215,6 @@ static const struct attribute_group nvme_ns_id_attr_group = { const struct attribute_group *nvme_ns_id_attr_groups[] = { &nvme_ns_id_attr_group, -#ifdef CONFIG_NVM - &nvme_nvm_attr_group, -#endif NULL, }; @@ -3767,13 +3761,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, if (nvme_update_ns_info(ns, id)) goto out_put_disk; - if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { - if (nvme_nvm_register(ns, disk->disk_name, node)) { - dev_warn(ctrl->device, "LightNVM init failure\n"); - goto out_put_disk; - } - } - down_write(&ctrl->namespaces_rwsem); list_add_tail(&ns->list, &ctrl->namespaces); up_write(&ctrl->namespaces_rwsem); diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 305ddd415e45..22314962842d 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -342,9 +342,7 @@ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, case NVME_IOCTL_IO64_CMD: return nvme_user_cmd64(ns->ctrl, ns, argp); default: - if (!ns->ndev) - return -ENOTTY; - return nvme_nvm_ioctl(ns, cmd, argp); + return -ENOTTY; } } diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c deleted file mode 100644 index e9d9ad47f70f..000000000000 --- a/drivers/nvme/host/lightnvm.c +++ /dev/null @@ -1,1274 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * nvme-lightnvm.c - LightNVM NVMe device - * - * Copyright (C) 2014-2015 IT University of Copenhagen - * Initial release: Matias Bjorling - */ - -#include "nvme.h" - -#include -#include -#include -#include -#include -#include - -enum nvme_nvm_admin_opcode { - nvme_nvm_admin_identity = 0xe2, - nvme_nvm_admin_get_bb_tbl = 0xf2, - nvme_nvm_admin_set_bb_tbl = 0xf1, -}; - -enum nvme_nvm_log_page { - NVME_NVM_LOG_REPORT_CHUNK = 0xca, -}; - -struct nvme_nvm_ph_rw { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __u64 rsvd2; - __le64 metadata; - __le64 prp1; - __le64 prp2; - __le64 
spba; - __le16 length; - __le16 control; - __le32 dsmgmt; - __le64 resv; -}; - -struct nvme_nvm_erase_blk { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __u64 rsvd[2]; - __le64 prp1; - __le64 prp2; - __le64 spba; - __le16 length; - __le16 control; - __le32 dsmgmt; - __le64 resv; -}; - -struct nvme_nvm_identity { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __u64 rsvd[2]; - __le64 prp1; - __le64 prp2; - __u32 rsvd11[6]; -}; - -struct nvme_nvm_getbbtbl { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __u64 rsvd[2]; - __le64 prp1; - __le64 prp2; - __le64 spba; - __u32 rsvd4[4]; -}; - -struct nvme_nvm_setbbtbl { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __le64 rsvd[2]; - __le64 prp1; - __le64 prp2; - __le64 spba; - __le16 nlb; - __u8 value; - __u8 rsvd3; - __u32 rsvd4[3]; -}; - -struct nvme_nvm_command { - union { - struct nvme_common_command common; - struct nvme_nvm_ph_rw ph_rw; - struct nvme_nvm_erase_blk erase; - struct nvme_nvm_identity identity; - struct nvme_nvm_getbbtbl get_bb; - struct nvme_nvm_setbbtbl set_bb; - }; -}; - -struct nvme_nvm_id12_grp { - __u8 mtype; - __u8 fmtype; - __le16 res16; - __u8 num_ch; - __u8 num_lun; - __u8 num_pln; - __u8 rsvd1; - __le16 num_chk; - __le16 num_pg; - __le16 fpg_sz; - __le16 csecs; - __le16 sos; - __le16 rsvd2; - __le32 trdt; - __le32 trdm; - __le32 tprt; - __le32 tprm; - __le32 tbet; - __le32 tbem; - __le32 mpos; - __le32 mccap; - __le16 cpar; - __u8 reserved[906]; -} __packed; - -struct nvme_nvm_id12_addrf { - __u8 ch_offset; - __u8 ch_len; - __u8 lun_offset; - __u8 lun_len; - __u8 pln_offset; - __u8 pln_len; - __u8 blk_offset; - __u8 blk_len; - __u8 pg_offset; - __u8 pg_len; - __u8 sec_offset; - __u8 sec_len; - __u8 res[4]; -} __packed; - -struct nvme_nvm_id12 { - __u8 ver_id; - __u8 vmnt; - __u8 cgrps; - __u8 res; - __le32 cap; - __le32 dom; - struct nvme_nvm_id12_addrf ppaf; - __u8 resv[228]; - struct nvme_nvm_id12_grp grp; - __u8 resv2[2880]; -} __packed; - -struct nvme_nvm_bb_tbl { - __u8 tblid[4]; - __le16 verid; - __le16 revid; - __le32 rvsd1; - __le32 tblks; - __le32 tfact; - __le32 tgrown; - __le32 tdresv; - __le32 thresv; - __le32 rsvd2[8]; - __u8 blk[]; -}; - -struct nvme_nvm_id20_addrf { - __u8 grp_len; - __u8 pu_len; - __u8 chk_len; - __u8 lba_len; - __u8 resv[4]; -}; - -struct nvme_nvm_id20 { - __u8 mjr; - __u8 mnr; - __u8 resv[6]; - - struct nvme_nvm_id20_addrf lbaf; - - __le32 mccap; - __u8 resv2[12]; - - __u8 wit; - __u8 resv3[31]; - - /* Geometry */ - __le16 num_grp; - __le16 num_pu; - __le32 num_chk; - __le32 clba; - __u8 resv4[52]; - - /* Write data requirements */ - __le32 ws_min; - __le32 ws_opt; - __le32 mw_cunits; - __le32 maxoc; - __le32 maxocpu; - __u8 resv5[44]; - - /* Performance related metrics */ - __le32 trdt; - __le32 trdm; - __le32 twrt; - __le32 twrm; - __le32 tcrst; - __le32 tcrsm; - __u8 resv6[40]; - - /* Reserved area */ - __u8 resv7[2816]; - - /* Vendor specific */ - __u8 vs[1024]; -}; - -struct nvme_nvm_chk_meta { - __u8 state; - __u8 type; - __u8 wi; - __u8 rsvd[5]; - __le64 slba; - __le64 cnlb; - __le64 wp; -}; - -/* - * Check we didn't inadvertently grow the command struct - */ -static inline void _nvme_nvm_check_size(void) -{ - BUILD_BUG_ON(sizeof(struct nvme_nvm_identity) != 64); - BUILD_BUG_ON(sizeof(struct nvme_nvm_ph_rw) != 64); - BUILD_BUG_ON(sizeof(struct nvme_nvm_erase_blk) != 64); - BUILD_BUG_ON(sizeof(struct nvme_nvm_getbbtbl) != 64); - BUILD_BUG_ON(sizeof(struct nvme_nvm_setbbtbl) != 64); - 
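/*
 * Illustrative sketch, not from the original source: filling the vector
 * read/write command defined above.  spba carries either a single PPA
 * (for a one-sector request) or the DMA address of a PPA list, and
 * length is the zero-based sector count.  The helper and its parameters
 * are hypothetical.
 */
static void example_fill_ph_rw(struct nvme_nvm_ph_rw *rw, u8 opcode, u32 nsid,
			       u64 ppa_or_list_dma, u16 nr_ppas, u16 flags)
{
	rw->opcode  = opcode;
	rw->nsid    = cpu_to_le32(nsid);
	rw->spba    = cpu_to_le64(ppa_or_list_dma);
	rw->length  = cpu_to_le16(nr_ppas - 1);	/* 0's based count */
	rw->control = cpu_to_le16(flags);
}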
BUILD_BUG_ON(sizeof(struct nvme_nvm_id12_grp) != 960); - BUILD_BUG_ON(sizeof(struct nvme_nvm_id12_addrf) != 16); - BUILD_BUG_ON(sizeof(struct nvme_nvm_id12) != NVME_IDENTIFY_DATA_SIZE); - BUILD_BUG_ON(sizeof(struct nvme_nvm_bb_tbl) != 64); - BUILD_BUG_ON(sizeof(struct nvme_nvm_id20_addrf) != 8); - BUILD_BUG_ON(sizeof(struct nvme_nvm_id20) != NVME_IDENTIFY_DATA_SIZE); - BUILD_BUG_ON(sizeof(struct nvme_nvm_chk_meta) != 32); - BUILD_BUG_ON(sizeof(struct nvme_nvm_chk_meta) != - sizeof(struct nvm_chk_meta)); -} - -static void nvme_nvm_set_addr_12(struct nvm_addrf_12 *dst, - struct nvme_nvm_id12_addrf *src) -{ - dst->ch_len = src->ch_len; - dst->lun_len = src->lun_len; - dst->blk_len = src->blk_len; - dst->pg_len = src->pg_len; - dst->pln_len = src->pln_len; - dst->sec_len = src->sec_len; - - dst->ch_offset = src->ch_offset; - dst->lun_offset = src->lun_offset; - dst->blk_offset = src->blk_offset; - dst->pg_offset = src->pg_offset; - dst->pln_offset = src->pln_offset; - dst->sec_offset = src->sec_offset; - - dst->ch_mask = ((1ULL << dst->ch_len) - 1) << dst->ch_offset; - dst->lun_mask = ((1ULL << dst->lun_len) - 1) << dst->lun_offset; - dst->blk_mask = ((1ULL << dst->blk_len) - 1) << dst->blk_offset; - dst->pg_mask = ((1ULL << dst->pg_len) - 1) << dst->pg_offset; - dst->pln_mask = ((1ULL << dst->pln_len) - 1) << dst->pln_offset; - dst->sec_mask = ((1ULL << dst->sec_len) - 1) << dst->sec_offset; -} - -static int nvme_nvm_setup_12(struct nvme_nvm_id12 *id, - struct nvm_geo *geo) -{ - struct nvme_nvm_id12_grp *src; - int sec_per_pg, sec_per_pl, pg_per_blk; - - if (id->cgrps != 1) - return -EINVAL; - - src = &id->grp; - - if (src->mtype != 0) { - pr_err("nvm: memory type not supported\n"); - return -EINVAL; - } - - /* 1.2 spec. only reports a single version id - unfold */ - geo->major_ver_id = id->ver_id; - geo->minor_ver_id = 2; - - /* Set compacted version for upper layers */ - geo->version = NVM_OCSSD_SPEC_12; - - geo->num_ch = src->num_ch; - geo->num_lun = src->num_lun; - geo->all_luns = geo->num_ch * geo->num_lun; - - geo->num_chk = le16_to_cpu(src->num_chk); - - geo->csecs = le16_to_cpu(src->csecs); - geo->sos = le16_to_cpu(src->sos); - - pg_per_blk = le16_to_cpu(src->num_pg); - sec_per_pg = le16_to_cpu(src->fpg_sz) / geo->csecs; - sec_per_pl = sec_per_pg * src->num_pln; - geo->clba = sec_per_pl * pg_per_blk; - - geo->all_chunks = geo->all_luns * geo->num_chk; - geo->total_secs = geo->clba * geo->all_chunks; - - geo->ws_min = sec_per_pg; - geo->ws_opt = sec_per_pg; - geo->mw_cunits = geo->ws_opt << 3; /* default to MLC safe values */ - - /* Do not impose values for maximum number of open blocks as it is - * unspecified in 1.2. Users of 1.2 must be aware of this and eventually - * specify these values through a quirk if restrictions apply. 
- */ - geo->maxoc = geo->all_luns * geo->num_chk; - geo->maxocpu = geo->num_chk; - - geo->mccap = le32_to_cpu(src->mccap); - - geo->trdt = le32_to_cpu(src->trdt); - geo->trdm = le32_to_cpu(src->trdm); - geo->tprt = le32_to_cpu(src->tprt); - geo->tprm = le32_to_cpu(src->tprm); - geo->tbet = le32_to_cpu(src->tbet); - geo->tbem = le32_to_cpu(src->tbem); - - /* 1.2 compatibility */ - geo->vmnt = id->vmnt; - geo->cap = le32_to_cpu(id->cap); - geo->dom = le32_to_cpu(id->dom); - - geo->mtype = src->mtype; - geo->fmtype = src->fmtype; - - geo->cpar = le16_to_cpu(src->cpar); - geo->mpos = le32_to_cpu(src->mpos); - - geo->pln_mode = NVM_PLANE_SINGLE; - - if (geo->mpos & 0x020202) { - geo->pln_mode = NVM_PLANE_DOUBLE; - geo->ws_opt <<= 1; - } else if (geo->mpos & 0x040404) { - geo->pln_mode = NVM_PLANE_QUAD; - geo->ws_opt <<= 2; - } - - geo->num_pln = src->num_pln; - geo->num_pg = le16_to_cpu(src->num_pg); - geo->fpg_sz = le16_to_cpu(src->fpg_sz); - - nvme_nvm_set_addr_12((struct nvm_addrf_12 *)&geo->addrf, &id->ppaf); - - return 0; -} - -static void nvme_nvm_set_addr_20(struct nvm_addrf *dst, - struct nvme_nvm_id20_addrf *src) -{ - dst->ch_len = src->grp_len; - dst->lun_len = src->pu_len; - dst->chk_len = src->chk_len; - dst->sec_len = src->lba_len; - - dst->sec_offset = 0; - dst->chk_offset = dst->sec_len; - dst->lun_offset = dst->chk_offset + dst->chk_len; - dst->ch_offset = dst->lun_offset + dst->lun_len; - - dst->ch_mask = ((1ULL << dst->ch_len) - 1) << dst->ch_offset; - dst->lun_mask = ((1ULL << dst->lun_len) - 1) << dst->lun_offset; - dst->chk_mask = ((1ULL << dst->chk_len) - 1) << dst->chk_offset; - dst->sec_mask = ((1ULL << dst->sec_len) - 1) << dst->sec_offset; -} - -static int nvme_nvm_setup_20(struct nvme_nvm_id20 *id, - struct nvm_geo *geo) -{ - geo->major_ver_id = id->mjr; - geo->minor_ver_id = id->mnr; - - /* Set compacted version for upper layers */ - geo->version = NVM_OCSSD_SPEC_20; - - geo->num_ch = le16_to_cpu(id->num_grp); - geo->num_lun = le16_to_cpu(id->num_pu); - geo->all_luns = geo->num_ch * geo->num_lun; - - geo->num_chk = le32_to_cpu(id->num_chk); - geo->clba = le32_to_cpu(id->clba); - - geo->all_chunks = geo->all_luns * geo->num_chk; - geo->total_secs = geo->clba * geo->all_chunks; - - geo->ws_min = le32_to_cpu(id->ws_min); - geo->ws_opt = le32_to_cpu(id->ws_opt); - geo->mw_cunits = le32_to_cpu(id->mw_cunits); - geo->maxoc = le32_to_cpu(id->maxoc); - geo->maxocpu = le32_to_cpu(id->maxocpu); - - geo->trdt = le32_to_cpu(id->trdt); - geo->trdm = le32_to_cpu(id->trdm); - geo->tprt = le32_to_cpu(id->twrt); - geo->tprm = le32_to_cpu(id->twrm); - geo->tbet = le32_to_cpu(id->tcrst); - geo->tbem = le32_to_cpu(id->tcrsm); - - nvme_nvm_set_addr_20(&geo->addrf, &id->lbaf); - - return 0; -} - -static int nvme_nvm_identity(struct nvm_dev *nvmdev) -{ - struct nvme_ns *ns = nvmdev->q->queuedata; - struct nvme_nvm_id12 *id; - struct nvme_nvm_command c = {}; - int ret; - - c.identity.opcode = nvme_nvm_admin_identity; - c.identity.nsid = cpu_to_le32(ns->head->ns_id); - - id = kmalloc(sizeof(struct nvme_nvm_id12), GFP_KERNEL); - if (!id) - return -ENOMEM; - - ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c, - id, sizeof(struct nvme_nvm_id12)); - if (ret) { - ret = -EIO; - goto out; - } - - /* - * The 1.2 and 2.0 specifications share the first byte in their geometry - * command to make it possible to know what version a device implements. 
- */ - switch (id->ver_id) { - case 1: - ret = nvme_nvm_setup_12(id, &nvmdev->geo); - break; - case 2: - ret = nvme_nvm_setup_20((struct nvme_nvm_id20 *)id, - &nvmdev->geo); - break; - default: - dev_err(ns->ctrl->device, "OCSSD revision not supported (%d)\n", - id->ver_id); - ret = -EINVAL; - } - -out: - kfree(id); - return ret; -} - -static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, - u8 *blks) -{ - struct request_queue *q = nvmdev->q; - struct nvm_geo *geo = &nvmdev->geo; - struct nvme_ns *ns = q->queuedata; - struct nvme_ctrl *ctrl = ns->ctrl; - struct nvme_nvm_command c = {}; - struct nvme_nvm_bb_tbl *bb_tbl; - int nr_blks = geo->num_chk * geo->num_pln; - int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blks; - int ret = 0; - - c.get_bb.opcode = nvme_nvm_admin_get_bb_tbl; - c.get_bb.nsid = cpu_to_le32(ns->head->ns_id); - c.get_bb.spba = cpu_to_le64(ppa.ppa); - - bb_tbl = kzalloc(tblsz, GFP_KERNEL); - if (!bb_tbl) - return -ENOMEM; - - ret = nvme_submit_sync_cmd(ctrl->admin_q, (struct nvme_command *)&c, - bb_tbl, tblsz); - if (ret) { - dev_err(ctrl->device, "get bad block table failed (%d)\n", ret); - ret = -EIO; - goto out; - } - - if (bb_tbl->tblid[0] != 'B' || bb_tbl->tblid[1] != 'B' || - bb_tbl->tblid[2] != 'L' || bb_tbl->tblid[3] != 'T') { - dev_err(ctrl->device, "bbt format mismatch\n"); - ret = -EINVAL; - goto out; - } - - if (le16_to_cpu(bb_tbl->verid) != 1) { - ret = -EINVAL; - dev_err(ctrl->device, "bbt version not supported\n"); - goto out; - } - - if (le32_to_cpu(bb_tbl->tblks) != nr_blks) { - ret = -EINVAL; - dev_err(ctrl->device, - "bbt unsuspected blocks returned (%u!=%u)", - le32_to_cpu(bb_tbl->tblks), nr_blks); - goto out; - } - - memcpy(blks, bb_tbl->blk, geo->num_chk * geo->num_pln); -out: - kfree(bb_tbl); - return ret; -} - -static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr *ppas, - int nr_ppas, int type) -{ - struct nvme_ns *ns = nvmdev->q->queuedata; - struct nvme_nvm_command c = {}; - int ret = 0; - - c.set_bb.opcode = nvme_nvm_admin_set_bb_tbl; - c.set_bb.nsid = cpu_to_le32(ns->head->ns_id); - c.set_bb.spba = cpu_to_le64(ppas->ppa); - c.set_bb.nlb = cpu_to_le16(nr_ppas - 1); - c.set_bb.value = type; - - ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c, - NULL, 0); - if (ret) - dev_err(ns->ctrl->device, "set bad block table failed (%d)\n", - ret); - return ret; -} - -/* - * Expect the lba in device format - */ -static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev, - sector_t slba, int nchks, - struct nvm_chk_meta *meta) -{ - struct nvm_geo *geo = &ndev->geo; - struct nvme_ns *ns = ndev->q->queuedata; - struct nvme_ctrl *ctrl = ns->ctrl; - struct nvme_nvm_chk_meta *dev_meta, *dev_meta_off; - struct ppa_addr ppa; - size_t left = nchks * sizeof(struct nvme_nvm_chk_meta); - size_t log_pos, offset, len; - int i, max_len; - int ret = 0; - - /* - * limit requests to maximum 256K to avoid issuing arbitrary large - * requests when the device does not specific a maximum transfer size. 
- */ - max_len = min_t(unsigned int, ctrl->max_hw_sectors << 9, 256 * 1024); - - dev_meta = kmalloc(max_len, GFP_KERNEL); - if (!dev_meta) - return -ENOMEM; - - /* Normalize lba address space to obtain log offset */ - ppa.ppa = slba; - ppa = dev_to_generic_addr(ndev, ppa); - - log_pos = ppa.m.chk; - log_pos += ppa.m.pu * geo->num_chk; - log_pos += ppa.m.grp * geo->num_lun * geo->num_chk; - - offset = log_pos * sizeof(struct nvme_nvm_chk_meta); - - while (left) { - len = min_t(unsigned int, left, max_len); - - memset(dev_meta, 0, max_len); - dev_meta_off = dev_meta; - - ret = nvme_get_log(ctrl, ns->head->ns_id, - NVME_NVM_LOG_REPORT_CHUNK, 0, NVME_CSI_NVM, - dev_meta, len, offset); - if (ret) { - dev_err(ctrl->device, "Get REPORT CHUNK log error\n"); - break; - } - - for (i = 0; i < len; i += sizeof(struct nvme_nvm_chk_meta)) { - meta->state = dev_meta_off->state; - meta->type = dev_meta_off->type; - meta->wi = dev_meta_off->wi; - meta->slba = le64_to_cpu(dev_meta_off->slba); - meta->cnlb = le64_to_cpu(dev_meta_off->cnlb); - meta->wp = le64_to_cpu(dev_meta_off->wp); - - meta++; - dev_meta_off++; - } - - offset += len; - left -= len; - } - - kfree(dev_meta); - - return ret; -} - -static inline void nvme_nvm_rqtocmd(struct nvm_rq *rqd, struct nvme_ns *ns, - struct nvme_nvm_command *c) -{ - c->ph_rw.opcode = rqd->opcode; - c->ph_rw.nsid = cpu_to_le32(ns->head->ns_id); - c->ph_rw.spba = cpu_to_le64(rqd->ppa_addr.ppa); - c->ph_rw.metadata = cpu_to_le64(rqd->dma_meta_list); - c->ph_rw.control = cpu_to_le16(rqd->flags); - c->ph_rw.length = cpu_to_le16(rqd->nr_ppas - 1); -} - -static void nvme_nvm_end_io(struct request *rq, blk_status_t status) -{ - struct nvm_rq *rqd = rq->end_io_data; - - rqd->ppa_status = le64_to_cpu(nvme_req(rq)->result.u64); - rqd->error = nvme_req(rq)->status; - nvm_end_io(rqd); - - kfree(nvme_req(rq)->cmd); - blk_mq_free_request(rq); -} - -static struct request *nvme_nvm_alloc_request(struct request_queue *q, - struct nvm_rq *rqd, - struct nvme_nvm_command *cmd) -{ - struct nvme_ns *ns = q->queuedata; - struct request *rq; - - nvme_nvm_rqtocmd(rqd, ns, cmd); - - rq = nvme_alloc_request(q, (struct nvme_command *)cmd, 0); - if (IS_ERR(rq)) - return rq; - - rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; - - if (rqd->bio) - blk_rq_append_bio(rq, rqd->bio); - else - rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); - - return rq; -} - -static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd, - void *buf) -{ - struct nvm_geo *geo = &dev->geo; - struct request_queue *q = dev->q; - struct nvme_nvm_command *cmd; - struct request *rq; - int ret; - - cmd = kzalloc(sizeof(struct nvme_nvm_command), GFP_KERNEL); - if (!cmd) - return -ENOMEM; - - rq = nvme_nvm_alloc_request(q, rqd, cmd); - if (IS_ERR(rq)) { - ret = PTR_ERR(rq); - goto err_free_cmd; - } - - if (buf) { - ret = blk_rq_map_kern(q, rq, buf, geo->csecs * rqd->nr_ppas, - GFP_KERNEL); - if (ret) - goto err_free_cmd; - } - - rq->end_io_data = rqd; - - blk_execute_rq_nowait(NULL, rq, 0, nvme_nvm_end_io); - - return 0; - -err_free_cmd: - kfree(cmd); - return ret; -} - -static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name, - int size) -{ - struct nvme_ns *ns = nvmdev->q->queuedata; - - return dma_pool_create(name, ns->ctrl->dev, size, PAGE_SIZE, 0); -} - -static void nvme_nvm_destroy_dma_pool(void *pool) -{ - struct dma_pool *dma_pool = pool; - - dma_pool_destroy(dma_pool); -} - -static void *nvme_nvm_dev_dma_alloc(struct nvm_dev *dev, void *pool, - gfp_t mem_flags, dma_addr_t *dma_handler) -{ - 
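/*
 * Illustrative sketch restating the indexing used by
 * nvme_nvm_get_chk_meta() above: the report-chunk log is laid out
 * group-major, so with a hypothetical geometry of 4 parallel units per
 * group and 1000 chunks per parallel unit, the descriptor for
 * (grp = 1, pu = 2, chk = 5) sits at entry 5 + 2*1000 + 1*4*1000 = 6005,
 * i.e. byte offset 6005 * sizeof(struct nvme_nvm_chk_meta) = 6005 * 32.
 */
static size_t example_chunk_log_offset(const struct nvm_geo *geo,
				       struct ppa_addr p)
{
	size_t log_pos = p.m.chk;

	log_pos += p.m.pu * geo->num_chk;
	log_pos += p.m.grp * geo->num_lun * geo->num_chk;

	return log_pos * sizeof(struct nvme_nvm_chk_meta);
}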
return dma_pool_alloc(pool, mem_flags, dma_handler); -} - -static void nvme_nvm_dev_dma_free(void *pool, void *addr, - dma_addr_t dma_handler) -{ - dma_pool_free(pool, addr, dma_handler); -} - -static struct nvm_dev_ops nvme_nvm_dev_ops = { - .identity = nvme_nvm_identity, - - .get_bb_tbl = nvme_nvm_get_bb_tbl, - .set_bb_tbl = nvme_nvm_set_bb_tbl, - - .get_chk_meta = nvme_nvm_get_chk_meta, - - .submit_io = nvme_nvm_submit_io, - - .create_dma_pool = nvme_nvm_create_dma_pool, - .destroy_dma_pool = nvme_nvm_destroy_dma_pool, - .dev_dma_alloc = nvme_nvm_dev_dma_alloc, - .dev_dma_free = nvme_nvm_dev_dma_free, -}; - -static int nvme_nvm_submit_user_cmd(struct request_queue *q, - struct nvme_ns *ns, - struct nvme_nvm_command *vcmd, - void __user *ubuf, unsigned int bufflen, - void __user *meta_buf, unsigned int meta_len, - void __user *ppa_buf, unsigned int ppa_len, - u32 *result, u64 *status, unsigned int timeout) -{ - bool write = nvme_is_write((struct nvme_command *)vcmd); - struct nvm_dev *dev = ns->ndev; - struct request *rq; - struct bio *bio = NULL; - __le64 *ppa_list = NULL; - dma_addr_t ppa_dma; - __le64 *metadata = NULL; - dma_addr_t metadata_dma; - DECLARE_COMPLETION_ONSTACK(wait); - int ret = 0; - - rq = nvme_alloc_request(q, (struct nvme_command *)vcmd, 0); - if (IS_ERR(rq)) { - ret = -ENOMEM; - goto err_cmd; - } - - if (timeout) - rq->timeout = timeout; - - if (ppa_buf && ppa_len) { - ppa_list = dma_pool_alloc(dev->dma_pool, GFP_KERNEL, &ppa_dma); - if (!ppa_list) { - ret = -ENOMEM; - goto err_rq; - } - if (copy_from_user(ppa_list, (void __user *)ppa_buf, - sizeof(u64) * (ppa_len + 1))) { - ret = -EFAULT; - goto err_ppa; - } - vcmd->ph_rw.spba = cpu_to_le64(ppa_dma); - } else { - vcmd->ph_rw.spba = cpu_to_le64((uintptr_t)ppa_buf); - } - - if (ubuf && bufflen) { - ret = blk_rq_map_user(q, rq, NULL, ubuf, bufflen, GFP_KERNEL); - if (ret) - goto err_ppa; - bio = rq->bio; - - if (meta_buf && meta_len) { - metadata = dma_pool_alloc(dev->dma_pool, GFP_KERNEL, - &metadata_dma); - if (!metadata) { - ret = -ENOMEM; - goto err_map; - } - - if (write) { - if (copy_from_user(metadata, - (void __user *)meta_buf, - meta_len)) { - ret = -EFAULT; - goto err_meta; - } - } - vcmd->ph_rw.metadata = cpu_to_le64(metadata_dma); - } - - bio_set_dev(bio, ns->disk->part0); - } - - blk_execute_rq(NULL, rq, 0); - - if (nvme_req(rq)->flags & NVME_REQ_CANCELLED) - ret = -EINTR; - else if (nvme_req(rq)->status & 0x7ff) - ret = -EIO; - if (result) - *result = nvme_req(rq)->status & 0x7ff; - if (status) - *status = le64_to_cpu(nvme_req(rq)->result.u64); - - if (metadata && !ret && !write) { - if (copy_to_user(meta_buf, (void *)metadata, meta_len)) - ret = -EFAULT; - } -err_meta: - if (meta_buf && meta_len) - dma_pool_free(dev->dma_pool, metadata, metadata_dma); -err_map: - if (bio) - blk_rq_unmap_user(bio); -err_ppa: - if (ppa_buf && ppa_len) - dma_pool_free(dev->dma_pool, ppa_list, ppa_dma); -err_rq: - blk_mq_free_request(rq); -err_cmd: - return ret; -} - -static int nvme_nvm_submit_vio(struct nvme_ns *ns, - struct nvm_user_vio __user *uvio) -{ - struct nvm_user_vio vio; - struct nvme_nvm_command c; - unsigned int length; - int ret; - - if (copy_from_user(&vio, uvio, sizeof(vio))) - return -EFAULT; - if (vio.flags) - return -EINVAL; - - memset(&c, 0, sizeof(c)); - c.ph_rw.opcode = vio.opcode; - c.ph_rw.nsid = cpu_to_le32(ns->head->ns_id); - c.ph_rw.control = cpu_to_le16(vio.control); - c.ph_rw.length = cpu_to_le16(vio.nppas); - - length = (vio.nppas + 1) << ns->lba_shift; - - ret = 
nvme_nvm_submit_user_cmd(ns->queue, ns, &c, - (void __user *)(uintptr_t)vio.addr, length, - (void __user *)(uintptr_t)vio.metadata, - vio.metadata_len, - (void __user *)(uintptr_t)vio.ppa_list, vio.nppas, - &vio.result, &vio.status, 0); - - if (ret && copy_to_user(uvio, &vio, sizeof(vio))) - return -EFAULT; - - return ret; -} - -static int nvme_nvm_user_vcmd(struct nvme_ns *ns, int admin, - struct nvm_passthru_vio __user *uvcmd) -{ - struct nvm_passthru_vio vcmd; - struct nvme_nvm_command c; - struct request_queue *q; - unsigned int timeout = 0; - int ret; - - if (copy_from_user(&vcmd, uvcmd, sizeof(vcmd))) - return -EFAULT; - if ((vcmd.opcode != 0xF2) && (!capable(CAP_SYS_ADMIN))) - return -EACCES; - if (vcmd.flags) - return -EINVAL; - - memset(&c, 0, sizeof(c)); - c.common.opcode = vcmd.opcode; - c.common.nsid = cpu_to_le32(ns->head->ns_id); - c.common.cdw2[0] = cpu_to_le32(vcmd.cdw2); - c.common.cdw2[1] = cpu_to_le32(vcmd.cdw3); - /* cdw11-12 */ - c.ph_rw.length = cpu_to_le16(vcmd.nppas); - c.ph_rw.control = cpu_to_le16(vcmd.control); - c.common.cdw13 = cpu_to_le32(vcmd.cdw13); - c.common.cdw14 = cpu_to_le32(vcmd.cdw14); - c.common.cdw15 = cpu_to_le32(vcmd.cdw15); - - if (vcmd.timeout_ms) - timeout = msecs_to_jiffies(vcmd.timeout_ms); - - q = admin ? ns->ctrl->admin_q : ns->queue; - - ret = nvme_nvm_submit_user_cmd(q, ns, - (struct nvme_nvm_command *)&c, - (void __user *)(uintptr_t)vcmd.addr, vcmd.data_len, - (void __user *)(uintptr_t)vcmd.metadata, - vcmd.metadata_len, - (void __user *)(uintptr_t)vcmd.ppa_list, vcmd.nppas, - &vcmd.result, &vcmd.status, timeout); - - if (ret && copy_to_user(uvcmd, &vcmd, sizeof(vcmd))) - return -EFAULT; - - return ret; -} - -int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *argp) -{ - switch (cmd) { - case NVME_NVM_IOCTL_ADMIN_VIO: - return nvme_nvm_user_vcmd(ns, 1, argp); - case NVME_NVM_IOCTL_IO_VIO: - return nvme_nvm_user_vcmd(ns, 0, argp); - case NVME_NVM_IOCTL_SUBMIT_VIO: - return nvme_nvm_submit_vio(ns, argp); - default: - return -ENOTTY; - } -} - -int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) -{ - struct request_queue *q = ns->queue; - struct nvm_dev *dev; - struct nvm_geo *geo; - - _nvme_nvm_check_size(); - - dev = nvm_alloc_dev(node); - if (!dev) - return -ENOMEM; - - /* Note that csecs and sos will be overridden if it is a 1.2 drive. 
*/ - geo = &dev->geo; - geo->csecs = 1 << ns->lba_shift; - geo->sos = ns->ms; - if (ns->features & NVME_NS_EXT_LBAS) - geo->ext = true; - else - geo->ext = false; - geo->mdts = ns->ctrl->max_hw_sectors; - - dev->q = q; - memcpy(dev->name, disk_name, DISK_NAME_LEN); - dev->ops = &nvme_nvm_dev_ops; - dev->private_data = ns; - ns->ndev = dev; - - return nvm_register(dev); -} - -void nvme_nvm_unregister(struct nvme_ns *ns) -{ - nvm_unregister(ns->ndev); -} - -static ssize_t nvm_dev_attr_show(struct device *dev, - struct device_attribute *dattr, char *page) -{ - struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - struct nvm_dev *ndev = ns->ndev; - struct nvm_geo *geo = &ndev->geo; - struct attribute *attr; - - if (!ndev) - return 0; - - attr = &dattr->attr; - - if (strcmp(attr->name, "version") == 0) { - if (geo->major_ver_id == 1) - return scnprintf(page, PAGE_SIZE, "%u\n", - geo->major_ver_id); - else - return scnprintf(page, PAGE_SIZE, "%u.%u\n", - geo->major_ver_id, - geo->minor_ver_id); - } else if (strcmp(attr->name, "capabilities") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->cap); - } else if (strcmp(attr->name, "read_typ") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->trdt); - } else if (strcmp(attr->name, "read_max") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->trdm); - } else { - return scnprintf(page, - PAGE_SIZE, - "Unhandled attr(%s) in `%s`\n", - attr->name, __func__); - } -} - -static ssize_t nvm_dev_attr_show_ppaf(struct nvm_addrf_12 *ppaf, char *page) -{ - return scnprintf(page, PAGE_SIZE, - "0x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n", - ppaf->ch_offset, ppaf->ch_len, - ppaf->lun_offset, ppaf->lun_len, - ppaf->pln_offset, ppaf->pln_len, - ppaf->blk_offset, ppaf->blk_len, - ppaf->pg_offset, ppaf->pg_len, - ppaf->sec_offset, ppaf->sec_len); -} - -static ssize_t nvm_dev_attr_show_12(struct device *dev, - struct device_attribute *dattr, char *page) -{ - struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - struct nvm_dev *ndev = ns->ndev; - struct nvm_geo *geo = &ndev->geo; - struct attribute *attr; - - if (!ndev) - return 0; - - attr = &dattr->attr; - - if (strcmp(attr->name, "vendor_opcode") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->vmnt); - } else if (strcmp(attr->name, "device_mode") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->dom); - /* kept for compatibility */ - } else if (strcmp(attr->name, "media_manager") == 0) { - return scnprintf(page, PAGE_SIZE, "%s\n", "gennvm"); - } else if (strcmp(attr->name, "ppa_format") == 0) { - return nvm_dev_attr_show_ppaf((void *)&geo->addrf, page); - } else if (strcmp(attr->name, "media_type") == 0) { /* u8 */ - return scnprintf(page, PAGE_SIZE, "%u\n", geo->mtype); - } else if (strcmp(attr->name, "flash_media_type") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->fmtype); - } else if (strcmp(attr->name, "num_channels") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_ch); - } else if (strcmp(attr->name, "num_luns") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_lun); - } else if (strcmp(attr->name, "num_planes") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_pln); - } else if (strcmp(attr->name, "num_blocks") == 0) { /* u16 */ - return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_chk); - } else if (strcmp(attr->name, "num_pages") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_pg); - } else if (strcmp(attr->name, "page_size") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->fpg_sz); - } else if 
(strcmp(attr->name, "hw_sector_size") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->csecs); - } else if (strcmp(attr->name, "oob_sector_size") == 0) {/* u32 */ - return scnprintf(page, PAGE_SIZE, "%u\n", geo->sos); - } else if (strcmp(attr->name, "prog_typ") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->tprt); - } else if (strcmp(attr->name, "prog_max") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->tprm); - } else if (strcmp(attr->name, "erase_typ") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->tbet); - } else if (strcmp(attr->name, "erase_max") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->tbem); - } else if (strcmp(attr->name, "multiplane_modes") == 0) { - return scnprintf(page, PAGE_SIZE, "0x%08x\n", geo->mpos); - } else if (strcmp(attr->name, "media_capabilities") == 0) { - return scnprintf(page, PAGE_SIZE, "0x%08x\n", geo->mccap); - } else if (strcmp(attr->name, "max_phys_secs") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", NVM_MAX_VLBA); - } else { - return scnprintf(page, PAGE_SIZE, - "Unhandled attr(%s) in `%s`\n", - attr->name, __func__); - } -} - -static ssize_t nvm_dev_attr_show_20(struct device *dev, - struct device_attribute *dattr, char *page) -{ - struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - struct nvm_dev *ndev = ns->ndev; - struct nvm_geo *geo = &ndev->geo; - struct attribute *attr; - - if (!ndev) - return 0; - - attr = &dattr->attr; - - if (strcmp(attr->name, "groups") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_ch); - } else if (strcmp(attr->name, "punits") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_lun); - } else if (strcmp(attr->name, "chunks") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_chk); - } else if (strcmp(attr->name, "clba") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->clba); - } else if (strcmp(attr->name, "ws_min") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->ws_min); - } else if (strcmp(attr->name, "ws_opt") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->ws_opt); - } else if (strcmp(attr->name, "maxoc") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->maxoc); - } else if (strcmp(attr->name, "maxocpu") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->maxocpu); - } else if (strcmp(attr->name, "mw_cunits") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->mw_cunits); - } else if (strcmp(attr->name, "write_typ") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->tprt); - } else if (strcmp(attr->name, "write_max") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->tprm); - } else if (strcmp(attr->name, "reset_typ") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->tbet); - } else if (strcmp(attr->name, "reset_max") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->tbem); - } else { - return scnprintf(page, PAGE_SIZE, - "Unhandled attr(%s) in `%s`\n", - attr->name, __func__); - } -} - -#define NVM_DEV_ATTR_RO(_name) \ - DEVICE_ATTR(_name, S_IRUGO, nvm_dev_attr_show, NULL) -#define NVM_DEV_ATTR_12_RO(_name) \ - DEVICE_ATTR(_name, S_IRUGO, nvm_dev_attr_show_12, NULL) -#define NVM_DEV_ATTR_20_RO(_name) \ - DEVICE_ATTR(_name, S_IRUGO, nvm_dev_attr_show_20, NULL) - -/* general attributes */ -static NVM_DEV_ATTR_RO(version); -static NVM_DEV_ATTR_RO(capabilities); - -static NVM_DEV_ATTR_RO(read_typ); -static NVM_DEV_ATTR_RO(read_max); - -/* 1.2 values */ -static NVM_DEV_ATTR_12_RO(vendor_opcode); -static NVM_DEV_ATTR_12_RO(device_mode); -static 
NVM_DEV_ATTR_12_RO(ppa_format); -static NVM_DEV_ATTR_12_RO(media_manager); -static NVM_DEV_ATTR_12_RO(media_type); -static NVM_DEV_ATTR_12_RO(flash_media_type); -static NVM_DEV_ATTR_12_RO(num_channels); -static NVM_DEV_ATTR_12_RO(num_luns); -static NVM_DEV_ATTR_12_RO(num_planes); -static NVM_DEV_ATTR_12_RO(num_blocks); -static NVM_DEV_ATTR_12_RO(num_pages); -static NVM_DEV_ATTR_12_RO(page_size); -static NVM_DEV_ATTR_12_RO(hw_sector_size); -static NVM_DEV_ATTR_12_RO(oob_sector_size); -static NVM_DEV_ATTR_12_RO(prog_typ); -static NVM_DEV_ATTR_12_RO(prog_max); -static NVM_DEV_ATTR_12_RO(erase_typ); -static NVM_DEV_ATTR_12_RO(erase_max); -static NVM_DEV_ATTR_12_RO(multiplane_modes); -static NVM_DEV_ATTR_12_RO(media_capabilities); -static NVM_DEV_ATTR_12_RO(max_phys_secs); - -/* 2.0 values */ -static NVM_DEV_ATTR_20_RO(groups); -static NVM_DEV_ATTR_20_RO(punits); -static NVM_DEV_ATTR_20_RO(chunks); -static NVM_DEV_ATTR_20_RO(clba); -static NVM_DEV_ATTR_20_RO(ws_min); -static NVM_DEV_ATTR_20_RO(ws_opt); -static NVM_DEV_ATTR_20_RO(maxoc); -static NVM_DEV_ATTR_20_RO(maxocpu); -static NVM_DEV_ATTR_20_RO(mw_cunits); -static NVM_DEV_ATTR_20_RO(write_typ); -static NVM_DEV_ATTR_20_RO(write_max); -static NVM_DEV_ATTR_20_RO(reset_typ); -static NVM_DEV_ATTR_20_RO(reset_max); - -static struct attribute *nvm_dev_attrs[] = { - /* version agnostic attrs */ - &dev_attr_version.attr, - &dev_attr_capabilities.attr, - &dev_attr_read_typ.attr, - &dev_attr_read_max.attr, - - /* 1.2 attrs */ - &dev_attr_vendor_opcode.attr, - &dev_attr_device_mode.attr, - &dev_attr_media_manager.attr, - &dev_attr_ppa_format.attr, - &dev_attr_media_type.attr, - &dev_attr_flash_media_type.attr, - &dev_attr_num_channels.attr, - &dev_attr_num_luns.attr, - &dev_attr_num_planes.attr, - &dev_attr_num_blocks.attr, - &dev_attr_num_pages.attr, - &dev_attr_page_size.attr, - &dev_attr_hw_sector_size.attr, - &dev_attr_oob_sector_size.attr, - &dev_attr_prog_typ.attr, - &dev_attr_prog_max.attr, - &dev_attr_erase_typ.attr, - &dev_attr_erase_max.attr, - &dev_attr_multiplane_modes.attr, - &dev_attr_media_capabilities.attr, - &dev_attr_max_phys_secs.attr, - - /* 2.0 attrs */ - &dev_attr_groups.attr, - &dev_attr_punits.attr, - &dev_attr_chunks.attr, - &dev_attr_clba.attr, - &dev_attr_ws_min.attr, - &dev_attr_ws_opt.attr, - &dev_attr_maxoc.attr, - &dev_attr_maxocpu.attr, - &dev_attr_mw_cunits.attr, - - &dev_attr_write_typ.attr, - &dev_attr_write_max.attr, - &dev_attr_reset_typ.attr, - &dev_attr_reset_max.attr, - - NULL, -}; - -static umode_t nvm_dev_attrs_visible(struct kobject *kobj, - struct attribute *attr, int index) -{ - struct device *dev = kobj_to_dev(kobj); - struct gendisk *disk = dev_to_disk(dev); - struct nvme_ns *ns = disk->private_data; - struct nvm_dev *ndev = ns->ndev; - struct device_attribute *dev_attr = - container_of(attr, typeof(*dev_attr), attr); - - if (!ndev) - return 0; - - if (dev_attr->show == nvm_dev_attr_show) - return attr->mode; - - switch (ndev->geo.major_ver_id) { - case 1: - if (dev_attr->show == nvm_dev_attr_show_12) - return attr->mode; - break; - case 2: - if (dev_attr->show == nvm_dev_attr_show_20) - return attr->mode; - break; - } - - return 0; -} - -const struct attribute_group nvme_nvm_attr_group = { - .name = "lightnvm", - .attrs = nvm_dev_attrs, - .is_visible = nvm_dev_attrs_visible, -}; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 5cd1fa3b8464..ab803f91ace1 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -11,7 +11,6 @@ #include #include #include -#include 
#include #include #include @@ -92,11 +91,6 @@ enum nvme_quirks { */ NVME_QUIRK_NO_DEEPEST_PS = (1 << 5), - /* - * Supports the LighNVM command set if indicated in vs[1]. - */ - NVME_QUIRK_LIGHTNVM = (1 << 6), - /* * Set MEDIUM priority on SQ creation */ @@ -823,26 +817,6 @@ static inline int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) } #endif -#ifdef CONFIG_NVM -int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); -void nvme_nvm_unregister(struct nvme_ns *ns); -extern const struct attribute_group nvme_nvm_attr_group; -int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *argp); -#else -static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, - int node) -{ - return 0; -} - -static inline void nvme_nvm_unregister(struct nvme_ns *ns) {}; -static inline int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, - void __user *argp) -{ - return -ENOTTY; -} -#endif /* CONFIG_NVM */ - static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev) { return dev_to_disk(dev)->private_data; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 51852085239e..db7a9bee2014 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3243,12 +3243,6 @@ static const struct pci_device_id nvme_id_table[] = { { PCI_DEVICE(0x1b4b, 0x1092), /* Lexar 256 GB SSD */ .driver_data = NVME_QUIRK_NO_NS_DESC_LIST | NVME_QUIRK_IGNORE_DEV_SUBNQN, }, - { PCI_DEVICE(0x1d1d, 0x1f1f), /* LighNVM qemu device */ - .driver_data = NVME_QUIRK_LIGHTNVM, }, - { PCI_DEVICE(0x1d1d, 0x2807), /* CNEX WL */ - .driver_data = NVME_QUIRK_LIGHTNVM, }, - { PCI_DEVICE(0x1d1d, 0x2601), /* CNEX Granby */ - .driver_data = NVME_QUIRK_LIGHTNVM, }, { PCI_DEVICE(0x10ec, 0x5762), /* ADATA SX6000LNP */ .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, }, { PCI_DEVICE(0x1cc1, 0x8201), /* ADATA SX8200PNP 512GB */ diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h deleted file mode 100644 index 0908abda9c1b..000000000000 --- a/include/linux/lightnvm.h +++ /dev/null @@ -1,697 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef NVM_H -#define NVM_H - -#include -#include -#include - -enum { - NVM_IO_OK = 0, - NVM_IO_REQUEUE = 1, - NVM_IO_DONE = 2, - NVM_IO_ERR = 3, - - NVM_IOTYPE_NONE = 0, - NVM_IOTYPE_GC = 1, -}; - -/* common format */ -#define NVM_GEN_CH_BITS (8) -#define NVM_GEN_LUN_BITS (8) -#define NVM_GEN_BLK_BITS (16) -#define NVM_GEN_RESERVED (32) - -/* 1.2 format */ -#define NVM_12_PG_BITS (16) -#define NVM_12_PL_BITS (4) -#define NVM_12_SEC_BITS (4) -#define NVM_12_RESERVED (8) - -/* 2.0 format */ -#define NVM_20_SEC_BITS (24) -#define NVM_20_RESERVED (8) - -enum { - NVM_OCSSD_SPEC_12 = 12, - NVM_OCSSD_SPEC_20 = 20, -}; - -struct ppa_addr { - /* Generic structure for all addresses */ - union { - /* generic device format */ - struct { - u64 ch : NVM_GEN_CH_BITS; - u64 lun : NVM_GEN_LUN_BITS; - u64 blk : NVM_GEN_BLK_BITS; - u64 reserved : NVM_GEN_RESERVED; - } a; - - /* 1.2 device format */ - struct { - u64 ch : NVM_GEN_CH_BITS; - u64 lun : NVM_GEN_LUN_BITS; - u64 blk : NVM_GEN_BLK_BITS; - u64 pg : NVM_12_PG_BITS; - u64 pl : NVM_12_PL_BITS; - u64 sec : NVM_12_SEC_BITS; - u64 reserved : NVM_12_RESERVED; - } g; - - /* 2.0 device format */ - struct { - u64 grp : NVM_GEN_CH_BITS; - u64 pu : NVM_GEN_LUN_BITS; - u64 chk : NVM_GEN_BLK_BITS; - u64 sec : NVM_20_SEC_BITS; - u64 reserved : NVM_20_RESERVED; - } m; - - struct { - u64 line : 63; - u64 is_cached : 1; - } c; - - u64 ppa; - }; -}; - -struct nvm_rq; -struct nvm_id; -struct nvm_dev; -struct 
nvm_tgt_dev; -struct nvm_chk_meta; - -typedef int (nvm_id_fn)(struct nvm_dev *); -typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *); -typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int); -typedef int (nvm_get_chk_meta_fn)(struct nvm_dev *, sector_t, int, - struct nvm_chk_meta *); -typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *, void *); -typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *, int); -typedef void (nvm_destroy_dma_pool_fn)(void *); -typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t, - dma_addr_t *); -typedef void (nvm_dev_dma_free_fn)(void *, void*, dma_addr_t); - -struct nvm_dev_ops { - nvm_id_fn *identity; - nvm_op_bb_tbl_fn *get_bb_tbl; - nvm_op_set_bb_fn *set_bb_tbl; - - nvm_get_chk_meta_fn *get_chk_meta; - - nvm_submit_io_fn *submit_io; - - nvm_create_dma_pool_fn *create_dma_pool; - nvm_destroy_dma_pool_fn *destroy_dma_pool; - nvm_dev_dma_alloc_fn *dev_dma_alloc; - nvm_dev_dma_free_fn *dev_dma_free; -}; - -#ifdef CONFIG_NVM - -#include -#include - -enum { - /* HW Responsibilities */ - NVM_RSP_L2P = 1 << 0, - NVM_RSP_ECC = 1 << 1, - - /* Physical Adressing Mode */ - NVM_ADDRMODE_LINEAR = 0, - NVM_ADDRMODE_CHANNEL = 1, - - /* Plane programming mode for LUN */ - NVM_PLANE_SINGLE = 1, - NVM_PLANE_DOUBLE = 2, - NVM_PLANE_QUAD = 4, - - /* Status codes */ - NVM_RSP_SUCCESS = 0x0, - NVM_RSP_NOT_CHANGEABLE = 0x1, - NVM_RSP_ERR_FAILWRITE = 0x40ff, - NVM_RSP_ERR_EMPTYPAGE = 0x42ff, - NVM_RSP_ERR_FAILECC = 0x4281, - NVM_RSP_ERR_FAILCRC = 0x4004, - NVM_RSP_WARN_HIGHECC = 0x4700, - - /* Device opcodes */ - NVM_OP_PWRITE = 0x91, - NVM_OP_PREAD = 0x92, - NVM_OP_ERASE = 0x90, - - /* PPA Command Flags */ - NVM_IO_SNGL_ACCESS = 0x0, - NVM_IO_DUAL_ACCESS = 0x1, - NVM_IO_QUAD_ACCESS = 0x2, - - /* NAND Access Modes */ - NVM_IO_SUSPEND = 0x80, - NVM_IO_SLC_MODE = 0x100, - NVM_IO_SCRAMBLE_ENABLE = 0x200, - - /* Block Types */ - NVM_BLK_T_FREE = 0x0, - NVM_BLK_T_BAD = 0x1, - NVM_BLK_T_GRWN_BAD = 0x2, - NVM_BLK_T_DEV = 0x4, - NVM_BLK_T_HOST = 0x8, - - /* Memory capabilities */ - NVM_ID_CAP_SLC = 0x1, - NVM_ID_CAP_CMD_SUSPEND = 0x2, - NVM_ID_CAP_SCRAMBLE = 0x4, - NVM_ID_CAP_ENCRYPT = 0x8, - - /* Memory types */ - NVM_ID_FMTYPE_SLC = 0, - NVM_ID_FMTYPE_MLC = 1, - - /* Device capabilities */ - NVM_ID_DCAP_BBLKMGMT = 0x1, - NVM_UD_DCAP_ECC = 0x2, -}; - -struct nvm_id_lp_mlc { - u16 num_pairs; - u8 pairs[886]; -}; - -struct nvm_id_lp_tbl { - __u8 id[8]; - struct nvm_id_lp_mlc mlc; -}; - -struct nvm_addrf_12 { - u8 ch_len; - u8 lun_len; - u8 blk_len; - u8 pg_len; - u8 pln_len; - u8 sec_len; - - u8 ch_offset; - u8 lun_offset; - u8 blk_offset; - u8 pg_offset; - u8 pln_offset; - u8 sec_offset; - - u64 ch_mask; - u64 lun_mask; - u64 blk_mask; - u64 pg_mask; - u64 pln_mask; - u64 sec_mask; -}; - -struct nvm_addrf { - u8 ch_len; - u8 lun_len; - u8 chk_len; - u8 sec_len; - u8 rsv_len[2]; - - u8 ch_offset; - u8 lun_offset; - u8 chk_offset; - u8 sec_offset; - u8 rsv_off[2]; - - u64 ch_mask; - u64 lun_mask; - u64 chk_mask; - u64 sec_mask; - u64 rsv_mask[2]; -}; - -enum { - /* Chunk states */ - NVM_CHK_ST_FREE = 1 << 0, - NVM_CHK_ST_CLOSED = 1 << 1, - NVM_CHK_ST_OPEN = 1 << 2, - NVM_CHK_ST_OFFLINE = 1 << 3, - - /* Chunk types */ - NVM_CHK_TP_W_SEQ = 1 << 0, - NVM_CHK_TP_W_RAN = 1 << 1, - NVM_CHK_TP_SZ_SPEC = 1 << 4, -}; - -/* - * Note: The structure size is linked to nvme_nvm_chk_meta such that the same - * buffer can be used when converting from little endian to cpu addressing. 
- */ -struct nvm_chk_meta { - u8 state; - u8 type; - u8 wi; - u8 rsvd[5]; - u64 slba; - u64 cnlb; - u64 wp; -}; - -struct nvm_target { - struct list_head list; - struct nvm_tgt_dev *dev; - struct nvm_tgt_type *type; - struct gendisk *disk; -}; - -#define ADDR_EMPTY (~0ULL) - -#define NVM_TARGET_DEFAULT_OP (101) -#define NVM_TARGET_MIN_OP (3) -#define NVM_TARGET_MAX_OP (80) - -#define NVM_VERSION_MAJOR 1 -#define NVM_VERSION_MINOR 0 -#define NVM_VERSION_PATCH 0 - -#define NVM_MAX_VLBA (64) /* max logical blocks in a vector command */ - -struct nvm_rq; -typedef void (nvm_end_io_fn)(struct nvm_rq *); - -struct nvm_rq { - struct nvm_tgt_dev *dev; - - struct bio *bio; - - union { - struct ppa_addr ppa_addr; - dma_addr_t dma_ppa_list; - }; - - struct ppa_addr *ppa_list; - - void *meta_list; - dma_addr_t dma_meta_list; - - nvm_end_io_fn *end_io; - - uint8_t opcode; - uint16_t nr_ppas; - uint16_t flags; - - u64 ppa_status; /* ppa media status */ - int error; - - int is_seq; /* Sequential hint flag. 1.2 only */ - - void *private; -}; - -static inline struct nvm_rq *nvm_rq_from_pdu(void *pdu) -{ - return pdu - sizeof(struct nvm_rq); -} - -static inline void *nvm_rq_to_pdu(struct nvm_rq *rqdata) -{ - return rqdata + 1; -} - -static inline struct ppa_addr *nvm_rq_to_ppa_list(struct nvm_rq *rqd) -{ - return (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr; -} - -enum { - NVM_BLK_ST_FREE = 0x1, /* Free block */ - NVM_BLK_ST_TGT = 0x2, /* Block in use by target */ - NVM_BLK_ST_BAD = 0x8, /* Bad block */ -}; - -/* Instance geometry */ -struct nvm_geo { - /* device reported version */ - u8 major_ver_id; - u8 minor_ver_id; - - /* kernel short version */ - u8 version; - - /* instance specific geometry */ - int num_ch; - int num_lun; /* per channel */ - - /* calculated values */ - int all_luns; /* across channels */ - int all_chunks; /* across channels */ - - int op; /* over-provision in instance */ - - sector_t total_secs; /* across channels */ - - /* chunk geometry */ - u32 num_chk; /* chunks per lun */ - u32 clba; /* sectors per chunk */ - u16 csecs; /* sector size */ - u16 sos; /* out-of-band area size */ - bool ext; /* metadata in extended data buffer */ - u32 mdts; /* Max data transfer size*/ - - /* device write constrains */ - u32 ws_min; /* minimum write size */ - u32 ws_opt; /* optimal write size */ - u32 mw_cunits; /* distance required for successful read */ - u32 maxoc; /* maximum open chunks */ - u32 maxocpu; /* maximum open chunks per parallel unit */ - - /* device capabilities */ - u32 mccap; - - /* device timings */ - u32 trdt; /* Avg. Tread (ns) */ - u32 trdm; /* Max Tread (ns) */ - u32 tprt; /* Avg. Tprog (ns) */ - u32 tprm; /* Max Tprog (ns) */ - u32 tbet; /* Avg. 
Terase (ns) */ - u32 tbem; /* Max Terase (ns) */ - - /* generic address format */ - struct nvm_addrf addrf; - - /* 1.2 compatibility */ - u8 vmnt; - u32 cap; - u32 dom; - - u8 mtype; - u8 fmtype; - - u16 cpar; - u32 mpos; - - u8 num_pln; - u8 pln_mode; - u16 num_pg; - u16 fpg_sz; -}; - -/* sub-device structure */ -struct nvm_tgt_dev { - /* Device information */ - struct nvm_geo geo; - - /* Base ppas for target LUNs */ - struct ppa_addr *luns; - - struct request_queue *q; - - struct nvm_dev *parent; - void *map; -}; - -struct nvm_dev { - struct nvm_dev_ops *ops; - - struct list_head devices; - - /* Device information */ - struct nvm_geo geo; - - unsigned long *lun_map; - void *dma_pool; - - /* Backend device */ - struct request_queue *q; - char name[DISK_NAME_LEN]; - void *private_data; - - struct kref ref; - void *rmap; - - struct mutex mlock; - spinlock_t lock; - - /* target management */ - struct list_head area_list; - struct list_head targets; -}; - -static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev, - struct ppa_addr r) -{ - struct nvm_geo *geo = &dev->geo; - struct ppa_addr l; - - if (geo->version == NVM_OCSSD_SPEC_12) { - struct nvm_addrf_12 *ppaf = (struct nvm_addrf_12 *)&geo->addrf; - - l.ppa = ((u64)r.g.ch) << ppaf->ch_offset; - l.ppa |= ((u64)r.g.lun) << ppaf->lun_offset; - l.ppa |= ((u64)r.g.blk) << ppaf->blk_offset; - l.ppa |= ((u64)r.g.pg) << ppaf->pg_offset; - l.ppa |= ((u64)r.g.pl) << ppaf->pln_offset; - l.ppa |= ((u64)r.g.sec) << ppaf->sec_offset; - } else { - struct nvm_addrf *lbaf = &geo->addrf; - - l.ppa = ((u64)r.m.grp) << lbaf->ch_offset; - l.ppa |= ((u64)r.m.pu) << lbaf->lun_offset; - l.ppa |= ((u64)r.m.chk) << lbaf->chk_offset; - l.ppa |= ((u64)r.m.sec) << lbaf->sec_offset; - } - - return l; -} - -static inline struct ppa_addr dev_to_generic_addr(struct nvm_dev *dev, - struct ppa_addr r) -{ - struct nvm_geo *geo = &dev->geo; - struct ppa_addr l; - - l.ppa = 0; - - if (geo->version == NVM_OCSSD_SPEC_12) { - struct nvm_addrf_12 *ppaf = (struct nvm_addrf_12 *)&geo->addrf; - - l.g.ch = (r.ppa & ppaf->ch_mask) >> ppaf->ch_offset; - l.g.lun = (r.ppa & ppaf->lun_mask) >> ppaf->lun_offset; - l.g.blk = (r.ppa & ppaf->blk_mask) >> ppaf->blk_offset; - l.g.pg = (r.ppa & ppaf->pg_mask) >> ppaf->pg_offset; - l.g.pl = (r.ppa & ppaf->pln_mask) >> ppaf->pln_offset; - l.g.sec = (r.ppa & ppaf->sec_mask) >> ppaf->sec_offset; - } else { - struct nvm_addrf *lbaf = &geo->addrf; - - l.m.grp = (r.ppa & lbaf->ch_mask) >> lbaf->ch_offset; - l.m.pu = (r.ppa & lbaf->lun_mask) >> lbaf->lun_offset; - l.m.chk = (r.ppa & lbaf->chk_mask) >> lbaf->chk_offset; - l.m.sec = (r.ppa & lbaf->sec_mask) >> lbaf->sec_offset; - } - - return l; -} - -static inline u64 dev_to_chunk_addr(struct nvm_dev *dev, void *addrf, - struct ppa_addr p) -{ - struct nvm_geo *geo = &dev->geo; - u64 caddr; - - if (geo->version == NVM_OCSSD_SPEC_12) { - struct nvm_addrf_12 *ppaf = (struct nvm_addrf_12 *)addrf; - - caddr = (u64)p.g.pg << ppaf->pg_offset; - caddr |= (u64)p.g.pl << ppaf->pln_offset; - caddr |= (u64)p.g.sec << ppaf->sec_offset; - } else { - caddr = p.m.sec; - } - - return caddr; -} - -static inline struct ppa_addr nvm_ppa32_to_ppa64(struct nvm_dev *dev, - void *addrf, u32 ppa32) -{ - struct ppa_addr ppa64; - - ppa64.ppa = 0; - - if (ppa32 == -1) { - ppa64.ppa = ADDR_EMPTY; - } else if (ppa32 & (1U << 31)) { - ppa64.c.line = ppa32 & ((~0U) >> 1); - ppa64.c.is_cached = 1; - } else { - struct nvm_geo *geo = &dev->geo; - - if (geo->version == NVM_OCSSD_SPEC_12) { - struct nvm_addrf_12 *ppaf = addrf; - 
- ppa64.g.ch = (ppa32 & ppaf->ch_mask) >> - ppaf->ch_offset; - ppa64.g.lun = (ppa32 & ppaf->lun_mask) >> - ppaf->lun_offset; - ppa64.g.blk = (ppa32 & ppaf->blk_mask) >> - ppaf->blk_offset; - ppa64.g.pg = (ppa32 & ppaf->pg_mask) >> - ppaf->pg_offset; - ppa64.g.pl = (ppa32 & ppaf->pln_mask) >> - ppaf->pln_offset; - ppa64.g.sec = (ppa32 & ppaf->sec_mask) >> - ppaf->sec_offset; - } else { - struct nvm_addrf *lbaf = addrf; - - ppa64.m.grp = (ppa32 & lbaf->ch_mask) >> - lbaf->ch_offset; - ppa64.m.pu = (ppa32 & lbaf->lun_mask) >> - lbaf->lun_offset; - ppa64.m.chk = (ppa32 & lbaf->chk_mask) >> - lbaf->chk_offset; - ppa64.m.sec = (ppa32 & lbaf->sec_mask) >> - lbaf->sec_offset; - } - } - - return ppa64; -} - -static inline u32 nvm_ppa64_to_ppa32(struct nvm_dev *dev, - void *addrf, struct ppa_addr ppa64) -{ - u32 ppa32 = 0; - - if (ppa64.ppa == ADDR_EMPTY) { - ppa32 = ~0U; - } else if (ppa64.c.is_cached) { - ppa32 |= ppa64.c.line; - ppa32 |= 1U << 31; - } else { - struct nvm_geo *geo = &dev->geo; - - if (geo->version == NVM_OCSSD_SPEC_12) { - struct nvm_addrf_12 *ppaf = addrf; - - ppa32 |= ppa64.g.ch << ppaf->ch_offset; - ppa32 |= ppa64.g.lun << ppaf->lun_offset; - ppa32 |= ppa64.g.blk << ppaf->blk_offset; - ppa32 |= ppa64.g.pg << ppaf->pg_offset; - ppa32 |= ppa64.g.pl << ppaf->pln_offset; - ppa32 |= ppa64.g.sec << ppaf->sec_offset; - } else { - struct nvm_addrf *lbaf = addrf; - - ppa32 |= ppa64.m.grp << lbaf->ch_offset; - ppa32 |= ppa64.m.pu << lbaf->lun_offset; - ppa32 |= ppa64.m.chk << lbaf->chk_offset; - ppa32 |= ppa64.m.sec << lbaf->sec_offset; - } - } - - return ppa32; -} - -static inline int nvm_next_ppa_in_chk(struct nvm_tgt_dev *dev, - struct ppa_addr *ppa) -{ - struct nvm_geo *geo = &dev->geo; - int last = 0; - - if (geo->version == NVM_OCSSD_SPEC_12) { - int sec = ppa->g.sec; - - sec++; - if (sec == geo->ws_min) { - int pg = ppa->g.pg; - - sec = 0; - pg++; - if (pg == geo->num_pg) { - int pl = ppa->g.pl; - - pg = 0; - pl++; - if (pl == geo->num_pln) - last = 1; - - ppa->g.pl = pl; - } - ppa->g.pg = pg; - } - ppa->g.sec = sec; - } else { - ppa->m.sec++; - if (ppa->m.sec == geo->clba) - last = 1; - } - - return last; -} - -typedef sector_t (nvm_tgt_capacity_fn)(void *); -typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *, - int flags); -typedef void (nvm_tgt_exit_fn)(void *, bool); -typedef int (nvm_tgt_sysfs_init_fn)(struct gendisk *); -typedef void (nvm_tgt_sysfs_exit_fn)(struct gendisk *); - -enum { - NVM_TGT_F_DEV_L2P = 0, - NVM_TGT_F_HOST_L2P = 1 << 0, -}; - -struct nvm_tgt_type { - const char *name; - unsigned int version[3]; - int flags; - - /* target entry points */ - const struct block_device_operations *bops; - nvm_tgt_capacity_fn *capacity; - - /* module-specific init/teardown */ - nvm_tgt_init_fn *init; - nvm_tgt_exit_fn *exit; - - /* sysfs */ - nvm_tgt_sysfs_init_fn *sysfs_init; - nvm_tgt_sysfs_exit_fn *sysfs_exit; - - /* For internal use */ - struct list_head list; - struct module *owner; -}; - -extern int nvm_register_tgt_type(struct nvm_tgt_type *); -extern void nvm_unregister_tgt_type(struct nvm_tgt_type *); - -extern void *nvm_dev_dma_alloc(struct nvm_dev *, gfp_t, dma_addr_t *); -extern void nvm_dev_dma_free(struct nvm_dev *, void *, dma_addr_t); - -extern struct nvm_dev *nvm_alloc_dev(int); -extern int nvm_register(struct nvm_dev *); -extern void nvm_unregister(struct nvm_dev *); - -extern int nvm_get_chunk_meta(struct nvm_tgt_dev *, struct ppa_addr, - int, struct nvm_chk_meta *); -extern int nvm_set_chunk_meta(struct nvm_tgt_dev *, struct ppa_addr *, 
- int, int); -extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *, void *); -extern int nvm_submit_io_sync(struct nvm_tgt_dev *, struct nvm_rq *, void *); -extern void nvm_end_io(struct nvm_rq *); - -#else /* CONFIG_NVM */ -struct nvm_dev_ops; - -static inline struct nvm_dev *nvm_alloc_dev(int node) -{ - return ERR_PTR(-EINVAL); -} -static inline int nvm_register(struct nvm_dev *dev) -{ - return -EINVAL; -} -static inline void nvm_unregister(struct nvm_dev *dev) {} -#endif /* CONFIG_NVM */ -#endif /* LIGHTNVM.H */ diff --git a/include/uapi/linux/lightnvm.h b/include/uapi/linux/lightnvm.h deleted file mode 100644 index 2745afd9b8fa..000000000000 --- a/include/uapi/linux/lightnvm.h +++ /dev/null @@ -1,224 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * Copyright (C) 2015 CNEX Labs. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, - * USA. - */ - -#ifndef _UAPI_LINUX_LIGHTNVM_H -#define _UAPI_LINUX_LIGHTNVM_H - -#ifdef __KERNEL__ -#include -#else /* __KERNEL__ */ -#include -#include -#define DISK_NAME_LEN 32 -#endif /* __KERNEL__ */ - -#include -#include - -#define NVM_TTYPE_NAME_MAX 48 -#define NVM_TTYPE_MAX 63 -#define NVM_MMTYPE_LEN 8 - -#define NVM_CTRL_FILE "/dev/lightnvm/control" - -struct nvm_ioctl_info_tgt { - __u32 version[3]; - __u32 reserved; - char tgtname[NVM_TTYPE_NAME_MAX]; -}; - -struct nvm_ioctl_info { - __u32 version[3]; /* in/out - major, minor, patch */ - __u16 tgtsize; /* number of targets */ - __u16 reserved16; /* pad to 4K page */ - __u32 reserved[12]; - struct nvm_ioctl_info_tgt tgts[NVM_TTYPE_MAX]; -}; - -enum { - NVM_DEVICE_ACTIVE = 1 << 0, -}; - -struct nvm_ioctl_device_info { - char devname[DISK_NAME_LEN]; - char bmname[NVM_TTYPE_NAME_MAX]; - __u32 bmversion[3]; - __u32 flags; - __u32 reserved[8]; -}; - -struct nvm_ioctl_get_devices { - __u32 nr_devices; - __u32 reserved[31]; - struct nvm_ioctl_device_info info[31]; -}; - -struct nvm_ioctl_create_simple { - __u32 lun_begin; - __u32 lun_end; -}; - -struct nvm_ioctl_create_extended { - __u16 lun_begin; - __u16 lun_end; - __u16 op; - __u16 rsv; -}; - -enum { - NVM_CONFIG_TYPE_SIMPLE = 0, - NVM_CONFIG_TYPE_EXTENDED = 1, -}; - -struct nvm_ioctl_create_conf { - __u32 type; - union { - struct nvm_ioctl_create_simple s; - struct nvm_ioctl_create_extended e; - }; -}; - -enum { - NVM_TARGET_FACTORY = 1 << 0, /* Init target in factory mode */ -}; - -struct nvm_ioctl_create { - char dev[DISK_NAME_LEN]; /* open-channel SSD device */ - char tgttype[NVM_TTYPE_NAME_MAX]; /* target type name */ - char tgtname[DISK_NAME_LEN]; /* dev to expose target as */ - - __u32 flags; - - struct nvm_ioctl_create_conf conf; -}; - -struct nvm_ioctl_remove { - char tgtname[DISK_NAME_LEN]; - - __u32 flags; -}; - -struct nvm_ioctl_dev_init { - char dev[DISK_NAME_LEN]; /* open-channel SSD device */ - char mmtype[NVM_MMTYPE_LEN]; /* register to media manager */ - - __u32 flags; -}; - -enum { 
- NVM_FACTORY_ERASE_ONLY_USER = 1 << 0, /* erase only blocks used as - * host blks or grown blks */ - NVM_FACTORY_RESET_HOST_BLKS = 1 << 1, /* remove host blk marks */ - NVM_FACTORY_RESET_GRWN_BBLKS = 1 << 2, /* remove grown blk marks */ - NVM_FACTORY_NR_BITS = 1 << 3, /* stops here */ -}; - -struct nvm_ioctl_dev_factory { - char dev[DISK_NAME_LEN]; - - __u32 flags; -}; - -struct nvm_user_vio { - __u8 opcode; - __u8 flags; - __u16 control; - __u16 nppas; - __u16 rsvd; - __u64 metadata; - __u64 addr; - __u64 ppa_list; - __u32 metadata_len; - __u32 data_len; - __u64 status; - __u32 result; - __u32 rsvd3[3]; -}; - -struct nvm_passthru_vio { - __u8 opcode; - __u8 flags; - __u8 rsvd[2]; - __u32 nsid; - __u32 cdw2; - __u32 cdw3; - __u64 metadata; - __u64 addr; - __u32 metadata_len; - __u32 data_len; - __u64 ppa_list; - __u16 nppas; - __u16 control; - __u32 cdw13; - __u32 cdw14; - __u32 cdw15; - __u64 status; - __u32 result; - __u32 timeout_ms; -}; - -/* The ioctl type, 'L', 0x20 - 0x2F documented in ioctl-number.txt */ -enum { - /* top level cmds */ - NVM_INFO_CMD = 0x20, - NVM_GET_DEVICES_CMD, - - /* device level cmds */ - NVM_DEV_CREATE_CMD, - NVM_DEV_REMOVE_CMD, - - /* Init a device to support LightNVM media managers */ - NVM_DEV_INIT_CMD, - - /* Factory reset device */ - NVM_DEV_FACTORY_CMD, - - /* Vector user I/O */ - NVM_DEV_VIO_ADMIN_CMD = 0x41, - NVM_DEV_VIO_CMD = 0x42, - NVM_DEV_VIO_USER_CMD = 0x43, -}; - -#define NVM_IOCTL 'L' /* 0x4c */ - -#define NVM_INFO _IOWR(NVM_IOCTL, NVM_INFO_CMD, \ - struct nvm_ioctl_info) -#define NVM_GET_DEVICES _IOR(NVM_IOCTL, NVM_GET_DEVICES_CMD, \ - struct nvm_ioctl_get_devices) -#define NVM_DEV_CREATE _IOW(NVM_IOCTL, NVM_DEV_CREATE_CMD, \ - struct nvm_ioctl_create) -#define NVM_DEV_REMOVE _IOW(NVM_IOCTL, NVM_DEV_REMOVE_CMD, \ - struct nvm_ioctl_remove) -#define NVM_DEV_INIT _IOW(NVM_IOCTL, NVM_DEV_INIT_CMD, \ - struct nvm_ioctl_dev_init) -#define NVM_DEV_FACTORY _IOW(NVM_IOCTL, NVM_DEV_FACTORY_CMD, \ - struct nvm_ioctl_dev_factory) - -#define NVME_NVM_IOCTL_IO_VIO _IOWR(NVM_IOCTL, NVM_DEV_VIO_USER_CMD, \ - struct nvm_passthru_vio) -#define NVME_NVM_IOCTL_ADMIN_VIO _IOWR(NVM_IOCTL, NVM_DEV_VIO_ADMIN_CMD,\ - struct nvm_passthru_vio) -#define NVME_NVM_IOCTL_SUBMIT_VIO _IOWR(NVM_IOCTL, NVM_DEV_VIO_CMD,\ - struct nvm_user_vio) - -#define NVM_VERSION_MAJOR 1 -#define NVM_VERSION_MINOR 0 -#define NVM_VERSION_PATCHLEVEL 0 - -#endif -- cgit v1.2.3-71-gd317 From 5b4ecc3d4c4aab8d002fe6358885c10e7b57e432 Mon Sep 17 00:00:00 2001 From: Guangbin Huang Date: Mon, 16 Aug 2021 10:15:27 +0800 Subject: ethtool: add two link extended substates of bad signal integrity ETHTOOL_LINK_EXT_SUBSTATE_BSI_SERDES_REFERENCE_CLOCK_LOST means the input external clock signal for SerDes is too weak or lost. ETHTOOL_LINK_EXT_SUBSTATE_BSI_SERDES_ALOS means the received signal for SerDes is too weak because analog loss of signal. 
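[Illustration only, not part of this patch: a NIC driver that can detect these two SerDes conditions would typically report them from its ethtool_ops->get_link_ext_state() callback, which the ethtool core already provides. The foo_* identifiers below are hypothetical placeholders for driver-private state and SerDes status queries; only the two substate values come from this patch.]

#include <linux/ethtool.h>
#include <linux/netdevice.h>

/* foo_priv and the foo_serdes_*() queries are hypothetical driver internals. */
static int foo_get_link_ext_state(struct net_device *dev,
                                  struct ethtool_link_ext_state_info *info)
{
        struct foo_priv *priv = netdev_priv(dev);

        if (foo_serdes_ref_clock_lost(priv)) {
                info->link_ext_state = ETHTOOL_LINK_EXT_STATE_BAD_SIGNAL_INTEGRITY;
                info->bad_signal_integrity =
                        ETHTOOL_LINK_EXT_SUBSTATE_BSI_SERDES_REFERENCE_CLOCK_LOST;
                return 0;
        }

        if (foo_serdes_analog_signal_lost(priv)) {
                info->link_ext_state = ETHTOOL_LINK_EXT_STATE_BAD_SIGNAL_INTEGRITY;
                info->bad_signal_integrity =
                        ETHTOOL_LINK_EXT_SUBSTATE_BSI_SERDES_ALOS;
                return 0;
        }

        return -ENODATA;        /* no extended state to report */
}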
Signed-off-by: Guangbin Huang Signed-off-by: Jakub Kicinski --- include/uapi/linux/ethtool.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 67aa7134b301..b6db6590baf0 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -639,6 +639,8 @@ enum ethtool_link_ext_substate_link_logical_mismatch { enum ethtool_link_ext_substate_bad_signal_integrity { ETHTOOL_LINK_EXT_SUBSTATE_BSI_LARGE_NUMBER_OF_PHYSICAL_ERRORS = 1, ETHTOOL_LINK_EXT_SUBSTATE_BSI_UNSUPPORTED_RATE, + ETHTOOL_LINK_EXT_SUBSTATE_BSI_SERDES_REFERENCE_CLOCK_LOST, + ETHTOOL_LINK_EXT_SUBSTATE_BSI_SERDES_ALOS, }; /* More information in addition to ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE. */ -- cgit v1.2.3-71-gd317 From b89fbfbb854c9afc3047e8273cc3a694650b802e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:05:57 -0700 Subject: bpf: Implement minimal BPF perf link Introduce a new type of BPF link - BPF perf link. This brings perf_event-based BPF program attachments (perf_event, tracepoints, kprobes, and uprobes) into the common BPF link infrastructure, allowing to list all active perf_event based attachments, auto-detaching BPF program from perf_event when link's FD is closed, get generic BPF link fdinfo/get_info functionality. BPF_LINK_CREATE command expects perf_event's FD as target_fd. No extra flags are currently supported. Force-detaching and atomic BPF program updates are not yet implemented, but with perf_event-based BPF links we now have common framework for this without the need to extend ioctl()-based perf_event interface. One interesting consideration is a new value for bpf_attach_type, which BPF_LINK_CREATE command expects. Generally, it's either 1-to-1 mapping from bpf_attach_type to bpf_prog_type, or many-to-1 mapping from a subset of bpf_attach_types to one bpf_prog_type (e.g., see BPF_PROG_TYPE_SK_SKB or BPF_PROG_TYPE_CGROUP_SOCK). In this case, though, we have three different program types (KPROBE, TRACEPOINT, PERF_EVENT) using the same perf_event-based mechanism, so it's many bpf_prog_types to one bpf_attach_type. I chose to define a single BPF_PERF_EVENT attach type for all of them and adjust link_create()'s logic for checking correspondence between attach type and program type. The alternative would be to define three new attach types (e.g., BPF_KPROBE, BPF_TRACEPOINT, and BPF_PERF_EVENT), but that seemed like unnecessary overkill and BPF_KPROBE will cause naming conflicts with BPF_KPROBE() macro, defined by libbpf. I chose to not do this to avoid unnecessary proliferation of bpf_attach_type enum values and not have to deal with naming conflicts. 
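[Usage sketch, not part of this patch: the new attachment can be driven straight through the bpf(2) syscall. Load a kprobe/tracepoint/perf_event program as usual, open a perf_event, then issue BPF_LINK_CREATE with the perf_event FD as target_fd and BPF_PERF_EVENT as the attach type. prog_fd and perf_fd are assumed to come from bpf(BPF_PROG_LOAD, ...) and perf_event_open() respectively.]

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* Returns a BPF link FD on success; closing it auto-detaches the program. */
static int attach_perf_link(int prog_fd, int perf_fd)
{
        union bpf_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.link_create.prog_fd = prog_fd;
        attr.link_create.target_fd = perf_fd;   /* perf_event FD, per this patch */
        attr.link_create.attach_type = BPF_PERF_EVENT;
        attr.link_create.flags = 0;             /* no flags are supported yet */

        return syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));
}

[Presumably libbpf's bpf_link_create() wrapper ends up issuing the same command once it knows about BPF_PERF_EVENT.]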
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/bpf/20210815070609.987780-5-andrii@kernel.org --- include/linux/bpf_types.h | 3 ++ include/linux/trace_events.h | 3 ++ include/uapi/linux/bpf.h | 2 + kernel/bpf/syscall.c | 105 ++++++++++++++++++++++++++++++++++++++--- kernel/events/core.c | 10 ++-- tools/include/uapi/linux/bpf.h | 2 + 6 files changed, 112 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index ae3ac3a2018c..9c81724e4b98 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -136,3 +136,6 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter) BPF_LINK_TYPE(BPF_LINK_TYPE_NETNS, netns) BPF_LINK_TYPE(BPF_LINK_TYPE_XDP, xdp) #endif +#ifdef CONFIG_PERF_EVENTS +BPF_LINK_TYPE(BPF_LINK_TYPE_PERF_EVENT, perf) +#endif diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index ad413b382a3c..8ac92560d3a3 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -803,6 +803,9 @@ extern void ftrace_profile_free_filter(struct perf_event *event); void perf_trace_buf_update(void *record, u16 type); void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp); +int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog); +void perf_event_free_bpf_prog(struct perf_event *event); + void bpf_trace_run1(struct bpf_prog *prog, u64 arg1); void bpf_trace_run2(struct bpf_prog *prog, u64 arg1, u64 arg2); void bpf_trace_run3(struct bpf_prog *prog, u64 arg1, u64 arg2, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2db6925e04f4..94fe8329b28f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -993,6 +993,7 @@ enum bpf_attach_type { BPF_SK_SKB_VERDICT, BPF_SK_REUSEPORT_SELECT, BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, + BPF_PERF_EVENT, __MAX_BPF_ATTACH_TYPE }; @@ -1006,6 +1007,7 @@ enum bpf_link_type { BPF_LINK_TYPE_ITER = 4, BPF_LINK_TYPE_NETNS = 5, BPF_LINK_TYPE_XDP = 6, + BPF_LINK_TYPE_PERF_EVENT = 7, MAX_BPF_LINK_TYPE, }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9a2068e39d23..80c03bedd6e6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2906,6 +2906,79 @@ static const struct bpf_link_ops bpf_raw_tp_link_lops = { .fill_link_info = bpf_raw_tp_link_fill_link_info, }; +#ifdef CONFIG_PERF_EVENTS +struct bpf_perf_link { + struct bpf_link link; + struct file *perf_file; +}; + +static void bpf_perf_link_release(struct bpf_link *link) +{ + struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link); + struct perf_event *event = perf_link->perf_file->private_data; + + perf_event_free_bpf_prog(event); + fput(perf_link->perf_file); +} + +static void bpf_perf_link_dealloc(struct bpf_link *link) +{ + struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link); + + kfree(perf_link); +} + +static const struct bpf_link_ops bpf_perf_link_lops = { + .release = bpf_perf_link_release, + .dealloc = bpf_perf_link_dealloc, +}; + +static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_link_primer link_primer; + struct bpf_perf_link *link; + struct perf_event *event; + struct file *perf_file; + int err; + + if (attr->link_create.flags) + return -EINVAL; + + perf_file = perf_event_get(attr->link_create.target_fd); + if (IS_ERR(perf_file)) + return PTR_ERR(perf_file); + + link = kzalloc(sizeof(*link), GFP_USER); + if 
(!link) { + err = -ENOMEM; + goto out_put_file; + } + bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog); + link->perf_file = perf_file; + + err = bpf_link_prime(&link->link, &link_primer); + if (err) { + kfree(link); + goto out_put_file; + } + + event = perf_file->private_data; + err = perf_event_set_bpf_prog(event, prog); + if (err) { + bpf_link_cleanup(&link_primer); + goto out_put_file; + } + /* perf_event_set_bpf_prog() doesn't take its own refcnt on prog */ + bpf_prog_inc(prog); + + return bpf_link_settle(&link_primer); + +out_put_file: + fput(perf_file); + return err; +} +#endif /* CONFIG_PERF_EVENTS */ + #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd static int bpf_raw_tracepoint_open(const union bpf_attr *attr) @@ -4147,15 +4220,26 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) if (ret) goto out; - if (prog->type == BPF_PROG_TYPE_EXT) { + switch (prog->type) { + case BPF_PROG_TYPE_EXT: ret = tracing_bpf_link_attach(attr, uattr, prog); goto out; - } - - ptype = attach_type_to_prog_type(attr->link_create.attach_type); - if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) { - ret = -EINVAL; - goto out; + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_KPROBE: + case BPF_PROG_TYPE_TRACEPOINT: + if (attr->link_create.attach_type != BPF_PERF_EVENT) { + ret = -EINVAL; + goto out; + } + ptype = prog->type; + break; + default: + ptype = attach_type_to_prog_type(attr->link_create.attach_type); + if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) { + ret = -EINVAL; + goto out; + } + break; } switch (ptype) { @@ -4179,6 +4263,13 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_PROG_TYPE_XDP: ret = bpf_xdp_link_attach(attr, prog); break; +#endif +#ifdef CONFIG_PERF_EVENTS + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_KPROBE: + ret = bpf_perf_link_attach(attr, prog); + break; #endif default: ret = -EINVAL; diff --git a/kernel/events/core.c b/kernel/events/core.c index 2f07718bd41c..9fd65667bcb2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4697,7 +4697,6 @@ errout: } static void perf_event_free_filter(struct perf_event *event); -static void perf_event_free_bpf_prog(struct perf_event *event); static void free_event_rcu(struct rcu_head *head) { @@ -5574,7 +5573,6 @@ static inline int perf_fget_light(int fd, struct fd *p) static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); -static int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog); static int perf_copy_attr(struct perf_event_attr __user *uattr, struct perf_event_attr *attr); @@ -10013,7 +10011,7 @@ static inline bool perf_event_is_tracing(struct perf_event *event) return false; } -static int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) +int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) { bool is_kprobe, is_tracepoint, is_syscall_tp; @@ -10047,7 +10045,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *pr return perf_event_attach_bpf_prog(event, prog); } -static void perf_event_free_bpf_prog(struct perf_event *event) +void perf_event_free_bpf_prog(struct perf_event *event) { if (!perf_event_is_tracing(event)) { perf_event_free_bpf_handler(event); @@ -10066,12 +10064,12 @@ static void perf_event_free_filter(struct perf_event *event) { } -static int 
perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) +int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) { return -ENOENT; } -static void perf_event_free_bpf_prog(struct perf_event *event) +void perf_event_free_bpf_prog(struct perf_event *event) { } #endif /* CONFIG_EVENT_TRACING */ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2db6925e04f4..94fe8329b28f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -993,6 +993,7 @@ enum bpf_attach_type { BPF_SK_SKB_VERDICT, BPF_SK_REUSEPORT_SELECT, BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, + BPF_PERF_EVENT, __MAX_BPF_ATTACH_TYPE }; @@ -1006,6 +1007,7 @@ enum bpf_link_type { BPF_LINK_TYPE_ITER = 4, BPF_LINK_TYPE_NETNS = 5, BPF_LINK_TYPE_XDP = 6, + BPF_LINK_TYPE_PERF_EVENT = 7, MAX_BPF_LINK_TYPE, }; -- cgit v1.2.3-71-gd317 From 82e6b1eee6a8875ef4eacfd60711cce6965c6b04 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:05:58 -0700 Subject: bpf: Allow to specify user-provided bpf_cookie for BPF perf links Add ability for users to specify custom u64 value (bpf_cookie) when creating BPF link for perf_event-backed BPF programs (kprobe/uprobe, perf_event, tracepoints). This is useful for cases when the same BPF program is used for attaching and processing invocation of different tracepoints/kprobes/uprobes in a generic fashion, but such that each invocation is distinguished from each other (e.g., BPF program can look up additional information associated with a specific kernel function without having to rely on function IP lookups). This enables new use cases to be implemented simply and efficiently that previously were possible only through code generation (and thus multiple instances of almost identical BPF program) or compilation at runtime (BCC-style) on target hosts (even more expensive resource-wise). For uprobes it is not even possible in some cases to know function IP before hand (e.g., when attaching to shared library without PID filtering, in which case base load address is not known for a library). This is done by storing u64 bpf_cookie in struct bpf_prog_array_item, corresponding to each attached and run BPF program. Given cgroup BPF programs already use two 8-byte pointers for their needs and cgroup BPF programs don't have (yet?) support for bpf_cookie, reuse that space through union of cgroup_storage and new bpf_cookie field. Make it available to kprobe/tracepoint BPF programs through bpf_trace_run_ctx. This is set by BPF_PROG_RUN_ARRAY, used by kprobe/uprobe/tracepoint BPF program execution code, which luckily is now also split from BPF_PROG_RUN_ARRAY_CG. This run context will be utilized by a new BPF helper giving access to this user-provided cookie value from inside a BPF program. Generic perf_event BPF programs will access this value from perf_event itself through passed in BPF program context. 
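[Illustration only: with this change, the BPF_LINK_CREATE sketch from the previous patch gains one assignment, using the perf_event.bpf_cookie field added to union bpf_attr in the uapi hunk below. prog_fd and perf_fd are again assumed to come from BPF_PROG_LOAD and perf_event_open().]

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int attach_perf_link_with_cookie(int prog_fd, int perf_fd, __u64 cookie)
{
        union bpf_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.link_create.prog_fd = prog_fd;
        attr.link_create.target_fd = perf_fd;
        attr.link_create.attach_type = BPF_PERF_EVENT;
        attr.link_create.perf_event.bpf_cookie = cookie;  /* caller-chosen 64-bit value */

        return syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));
}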
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/bpf/20210815070609.987780-6-andrii@kernel.org --- drivers/media/rc/bpf-lirc.c | 4 ++-- include/linux/bpf.h | 16 +++++++++++++++- include/linux/perf_event.h | 1 + include/linux/trace_events.h | 6 +++--- include/uapi/linux/bpf.h | 7 +++++++ kernel/bpf/core.c | 29 ++++++++++++++++++----------- kernel/bpf/syscall.c | 2 +- kernel/events/core.c | 21 ++++++++++++++------- kernel/trace/bpf_trace.c | 8 +++++--- tools/include/uapi/linux/bpf.h | 7 +++++++ 10 files changed, 73 insertions(+), 28 deletions(-) (limited to 'include/uapi') diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index bb5a9dc78f1b..3eff08d7b8e5 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -160,7 +160,7 @@ static int lirc_bpf_attach(struct rc_dev *rcdev, struct bpf_prog *prog) goto unlock; } - ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array); + ret = bpf_prog_array_copy(old_array, NULL, prog, 0, &new_array); if (ret < 0) goto unlock; @@ -193,7 +193,7 @@ static int lirc_bpf_detach(struct rc_dev *rcdev, struct bpf_prog *prog) } old_array = lirc_rcu_dereference(raw->progs); - ret = bpf_prog_array_copy(old_array, prog, NULL, &new_array); + ret = bpf_prog_array_copy(old_array, prog, NULL, 0, &new_array); /* * Do not use bpf_prog_array_delete_safe() as we would end up * with a dummy entry in the array, and the we would free the diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 344e0d4d8ef6..83c3cc5e90df 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1114,7 +1114,10 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, */ struct bpf_prog_array_item { struct bpf_prog *prog; - struct bpf_cgroup_storage *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; + union { + struct bpf_cgroup_storage *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; + u64 bpf_cookie; + }; }; struct bpf_prog_array { @@ -1140,6 +1143,7 @@ int bpf_prog_array_copy_info(struct bpf_prog_array *array, int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, + u64 bpf_cookie, struct bpf_prog_array **new_array); struct bpf_run_ctx {}; @@ -1149,6 +1153,11 @@ struct bpf_cg_run_ctx { const struct bpf_prog_array_item *prog_item; }; +struct bpf_trace_run_ctx { + struct bpf_run_ctx run_ctx; + u64 bpf_cookie; +}; + static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx) { struct bpf_run_ctx *old_ctx = NULL; @@ -1239,6 +1248,8 @@ BPF_PROG_RUN_ARRAY(const struct bpf_prog_array __rcu *array_rcu, const struct bpf_prog_array_item *item; const struct bpf_prog *prog; const struct bpf_prog_array *array; + struct bpf_run_ctx *old_run_ctx; + struct bpf_trace_run_ctx run_ctx; u32 ret = 1; migrate_disable(); @@ -1246,11 +1257,14 @@ BPF_PROG_RUN_ARRAY(const struct bpf_prog_array __rcu *array_rcu, array = rcu_dereference(array_rcu); if (unlikely(!array)) goto out; + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); item = &array->items[0]; while ((prog = READ_ONCE(item->prog))) { + run_ctx.bpf_cookie = item->bpf_cookie; ret &= run_prog(prog, ctx); item++; } + bpf_reset_run_ctx(old_run_ctx); out: rcu_read_unlock(); migrate_enable(); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2d510ad750ed..fe156a8170aa 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -762,6 +762,7 @@ struct perf_event { #ifdef 
CONFIG_BPF_SYSCALL perf_overflow_handler_t orig_overflow_handler; struct bpf_prog *prog; + u64 bpf_cookie; #endif #ifdef CONFIG_EVENT_TRACING diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 8ac92560d3a3..8e0631a4b046 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -675,7 +675,7 @@ trace_trigger_soft_disabled(struct trace_event_file *file) #ifdef CONFIG_BPF_EVENTS unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx); -int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog); +int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie); void perf_event_detach_bpf_prog(struct perf_event *event); int perf_event_query_prog_array(struct perf_event *event, void __user *info); int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog); @@ -692,7 +692,7 @@ static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *c } static inline int -perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog) +perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { return -EOPNOTSUPP; } @@ -803,7 +803,7 @@ extern void ftrace_profile_free_filter(struct perf_event *event); void perf_trace_buf_update(void *record, u16 type); void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp); -int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog); +int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie); void perf_event_free_bpf_prog(struct perf_event *event); void bpf_trace_run1(struct bpf_prog *prog, u64 arg1); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 94fe8329b28f..63ee482d50e1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1448,6 +1448,13 @@ union bpf_attr { __aligned_u64 iter_info; /* extra bpf_iter_link_info */ __u32 iter_info_len; /* iter_info length */ }; + struct { + /* black box user-provided value passed through + * to BPF program at the execution time and + * accessible through bpf_get_attach_cookie() BPF helper + */ + __u64 bpf_cookie; + } perf_event; }; } link_create; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5ee2ec27c3d4..91f24c7b38a1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2119,13 +2119,13 @@ int bpf_prog_array_update_at(struct bpf_prog_array *array, int index, int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, + u64 bpf_cookie, struct bpf_prog_array **new_array) { int new_prog_cnt, carry_prog_cnt = 0; - struct bpf_prog_array_item *existing; + struct bpf_prog_array_item *existing, *new; struct bpf_prog_array *array; bool found_exclude = false; - int new_prog_idx = 0; /* Figure out how many existing progs we need to carry over to * the new array. 
@@ -2162,20 +2162,27 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL); if (!array) return -ENOMEM; + new = array->items; /* Fill in the new prog array */ if (carry_prog_cnt) { existing = old_array->items; - for (; existing->prog; existing++) - if (existing->prog != exclude_prog && - existing->prog != &dummy_bpf_prog.prog) { - array->items[new_prog_idx++].prog = - existing->prog; - } + for (; existing->prog; existing++) { + if (existing->prog == exclude_prog || + existing->prog == &dummy_bpf_prog.prog) + continue; + + new->prog = existing->prog; + new->bpf_cookie = existing->bpf_cookie; + new++; + } } - if (include_prog) - array->items[new_prog_idx++].prog = include_prog; - array->items[new_prog_idx].prog = NULL; + if (include_prog) { + new->prog = include_prog; + new->bpf_cookie = bpf_cookie; + new++; + } + new->prog = NULL; *new_array = array; return 0; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 80c03bedd6e6..7420e1334ab2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2963,7 +2963,7 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro } event = perf_file->private_data; - err = perf_event_set_bpf_prog(event, prog); + err = perf_event_set_bpf_prog(event, prog, attr->link_create.perf_event.bpf_cookie); if (err) { bpf_link_cleanup(&link_primer); goto out_put_file; diff --git a/kernel/events/core.c b/kernel/events/core.c index 9fd65667bcb2..2d1e63dd97f2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5643,7 +5643,7 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon if (IS_ERR(prog)) return PTR_ERR(prog); - err = perf_event_set_bpf_prog(event, prog); + err = perf_event_set_bpf_prog(event, prog, 0); if (err) { bpf_prog_put(prog); return err; @@ -9936,7 +9936,9 @@ out: event->orig_overflow_handler(event, data, regs); } -static int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog *prog) +static int perf_event_set_bpf_handler(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) { if (event->overflow_handler_context) /* hw breakpoint or kernel counter */ @@ -9966,6 +9968,7 @@ static int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog } event->prog = prog; + event->bpf_cookie = bpf_cookie; event->orig_overflow_handler = READ_ONCE(event->overflow_handler); WRITE_ONCE(event->overflow_handler, bpf_overflow_handler); return 0; @@ -9983,7 +9986,9 @@ static void perf_event_free_bpf_handler(struct perf_event *event) bpf_prog_put(prog); } #else -static int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog *prog) +static int perf_event_set_bpf_handler(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) { return -EOPNOTSUPP; } @@ -10011,12 +10016,13 @@ static inline bool perf_event_is_tracing(struct perf_event *event) return false; } -int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) +int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, + u64 bpf_cookie) { bool is_kprobe, is_tracepoint, is_syscall_tp; if (!perf_event_is_tracing(event)) - return perf_event_set_bpf_handler(event, prog); + return perf_event_set_bpf_handler(event, prog, bpf_cookie); is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; @@ -10042,7 +10048,7 @@ int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) return 
-EACCES; } - return perf_event_attach_bpf_prog(event, prog); + return perf_event_attach_bpf_prog(event, prog, bpf_cookie); } void perf_event_free_bpf_prog(struct perf_event *event) @@ -10064,7 +10070,8 @@ static void perf_event_free_filter(struct perf_event *event) { } -int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) +int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, + u64 bpf_cookie) { return -ENOENT; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 91867b14b222..57879d28f824 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1675,7 +1675,8 @@ static DEFINE_MUTEX(bpf_event_mutex); #define BPF_TRACE_MAX_PROGS 64 int perf_event_attach_bpf_prog(struct perf_event *event, - struct bpf_prog *prog) + struct bpf_prog *prog, + u64 bpf_cookie) { struct bpf_prog_array *old_array; struct bpf_prog_array *new_array; @@ -1702,12 +1703,13 @@ int perf_event_attach_bpf_prog(struct perf_event *event, goto unlock; } - ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array); + ret = bpf_prog_array_copy(old_array, NULL, prog, bpf_cookie, &new_array); if (ret < 0) goto unlock; /* set the new array to event->tp_event and set event->prog */ event->prog = prog; + event->bpf_cookie = bpf_cookie; rcu_assign_pointer(event->tp_event->prog_array, new_array); bpf_prog_array_free(old_array); @@ -1728,7 +1730,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event) goto unlock; old_array = bpf_event_rcu_dereference(event->tp_event->prog_array); - ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array); + ret = bpf_prog_array_copy(old_array, event->prog, NULL, 0, &new_array); if (ret == -ENOENT) goto unlock; if (ret < 0) { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 94fe8329b28f..63ee482d50e1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1448,6 +1448,13 @@ union bpf_attr { __aligned_u64 iter_info; /* extra bpf_iter_link_info */ __u32 iter_info_len; /* iter_info length */ }; + struct { + /* black box user-provided value passed through + * to BPF program at the execution time and + * accessible through bpf_get_attach_cookie() BPF helper + */ + __u64 bpf_cookie; + } perf_event; }; } link_create; -- cgit v1.2.3-71-gd317 From 7adfc6c9b315e174cf8743b21b7b691c8766791b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:05:59 -0700 Subject: bpf: Add bpf_get_attach_cookie() BPF helper to access bpf_cookie value Add new BPF helper, bpf_get_attach_cookie(), which can be used by BPF programs to get access to a user-provided bpf_cookie value, specified during BPF program attachment (BPF link creation) time. Naming is hard, though. With the concept being named "BPF cookie", I've considered calling the helper: - bpf_get_cookie() -- seems too unspecific and easily mistaken with socket cookie; - bpf_get_bpf_cookie() -- too much tautology; - bpf_get_link_cookie() -- would be ok, but while we create a BPF link to attach BPF program to BPF hook, it's still an "attachment" and the bpf_cookie is associated with BPF program attachment to a hook, not a BPF link itself. Technically, we could support bpf_cookie with old-style cgroup programs.So I ultimately rejected it in favor of bpf_get_attach_cookie(). Currently all perf_event-backed BPF program types support bpf_get_attach_cookie() helper. Follow-up patches will add support for fentry/fexit programs as well. 
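[A minimal BPF-side sketch, not part of this patch, assuming a libbpf-style build where bpf_get_attach_cookie() is available from the generated helper definitions; the kprobe target and the trace message are arbitrary. The same program can be attached many times, each link carrying a different link_create.perf_event.bpf_cookie, and the program tells the attachments apart at run time:]

#include <linux/bpf.h>
#include <linux/ptrace.h>
#include <bpf/bpf_helpers.h>

char LICENSE[] SEC("license") = "GPL";

SEC("kprobe/do_unlinkat")               /* arbitrary example attach point */
int handle_kprobe(struct pt_regs *ctx)
{
        __u64 cookie = bpf_get_attach_cookie(ctx);

        /* one program, many attachments: branch on the caller-supplied id */
        bpf_printk("kprobe hit, attachment cookie %llu", cookie);
        return 0;
}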
While at it, mark bpf_tracing_func_proto() as static to make it obvious that it's only used from within the kernel/trace/bpf_trace.c. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210815070609.987780-7-andrii@kernel.org --- include/linux/bpf.h | 3 --- include/uapi/linux/bpf.h | 16 ++++++++++++++++ kernel/trace/bpf_trace.c | 35 ++++++++++++++++++++++++++++++++++- tools/include/uapi/linux/bpf.h | 16 ++++++++++++++++ 4 files changed, 66 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 83c3cc5e90df..f4c16f19f83e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2102,9 +2102,6 @@ extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto; extern const struct bpf_func_proto bpf_sk_setsockopt_proto; extern const struct bpf_func_proto bpf_sk_getsockopt_proto; -const struct bpf_func_proto *bpf_tracing_func_proto( - enum bpf_func_id func_id, const struct bpf_prog *prog); - const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 63ee482d50e1..c4f7892edb2b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4856,6 +4856,21 @@ union bpf_attr { * Get address of the traced function (for tracing and kprobe programs). * Return * Address of the traced function. + * + * u64 bpf_get_attach_cookie(void *ctx) + * Description + * Get bpf_cookie value provided (optionally) during the program + * attachment. It might be different for each individual + * attachment, even if BPF program itself is the same. + * Expects BPF program context *ctx* as a first argument. + * + * Supported for the following program types: + * - kprobe/uprobe; + * - tracepoint; + * - perf_event. + * Return + * Value specified by user at BPF link creation/attachment time + * or 0, if it was not specified. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5032,6 +5047,7 @@ union bpf_attr { FN(timer_start), \ FN(timer_cancel), \ FN(get_func_ip), \ + FN(get_attach_cookie), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 57879d28f824..cbc73c08c4a4 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -975,7 +975,34 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = { .arg1_type = ARG_PTR_TO_CTX, }; -const struct bpf_func_proto * +BPF_CALL_1(bpf_get_attach_cookie_trace, void *, ctx) +{ + struct bpf_trace_run_ctx *run_ctx; + + run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx); + return run_ctx->bpf_cookie; +} + +static const struct bpf_func_proto bpf_get_attach_cookie_proto_trace = { + .func = bpf_get_attach_cookie_trace, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_get_attach_cookie_pe, struct bpf_perf_event_data_kern *, ctx) +{ + return ctx->event->bpf_cookie; +} + +static const struct bpf_func_proto bpf_get_attach_cookie_proto_pe = { + .func = bpf_get_attach_cookie_pe, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +static const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { @@ -1109,6 +1136,8 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #endif case BPF_FUNC_get_func_ip: return &bpf_get_func_ip_proto_kprobe; + case BPF_FUNC_get_attach_cookie: + return &bpf_get_attach_cookie_proto_trace; default: return bpf_tracing_func_proto(func_id, prog); } @@ -1219,6 +1248,8 @@ tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_stackid_proto_tp; case BPF_FUNC_get_stack: return &bpf_get_stack_proto_tp; + case BPF_FUNC_get_attach_cookie: + return &bpf_get_attach_cookie_proto_trace; default: return bpf_tracing_func_proto(func_id, prog); } @@ -1326,6 +1357,8 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_perf_prog_read_value_proto; case BPF_FUNC_read_branch_records: return &bpf_read_branch_records_proto; + case BPF_FUNC_get_attach_cookie: + return &bpf_get_attach_cookie_proto_pe; default: return bpf_tracing_func_proto(func_id, prog); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 63ee482d50e1..c4f7892edb2b 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4856,6 +4856,21 @@ union bpf_attr { * Get address of the traced function (for tracing and kprobe programs). * Return * Address of the traced function. + * + * u64 bpf_get_attach_cookie(void *ctx) + * Description + * Get bpf_cookie value provided (optionally) during the program + * attachment. It might be different for each individual + * attachment, even if BPF program itself is the same. + * Expects BPF program context *ctx* as a first argument. + * + * Supported for the following program types: + * - kprobe/uprobe; + * - tracepoint; + * - perf_event. + * Return + * Value specified by user at BPF link creation/attachment time + * or 0, if it was not specified. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5032,6 +5047,7 @@ union bpf_attr { FN(timer_start), \ FN(timer_cancel), \ FN(get_func_ip), \ + FN(get_attach_cookie), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3-71-gd317 From 0d2ab3aea50bb02ff0c9c3d53c7b2b4b21cdd59d Mon Sep 17 00:00:00 2001 From: John Crispin Date: Fri, 2 Jul 2021 19:44:07 +0200 Subject: nl80211: add support for BSS coloring This patch adds support for BSS color collisions to the wireless subsystem. Add the required functionality to nl80211 that will notify about color collisions, triggering the color change and notifying when it is completed. Co-developed-by: Lorenzo Bianconi Signed-off-by: Lorenzo Bianconi Signed-off-by: John Crispin Link: https://lore.kernel.org/r/500b3582aec8fe2c42ef46f3117b148cb7cbceb5.1625247619.git.lorenzo@kernel.org [remove unnecessary NULL initialisation] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 92 +++++++++++++++++++++++++ include/uapi/linux/nl80211.h | 43 ++++++++++++ net/wireless/nl80211.c | 157 +++++++++++++++++++++++++++++++++++++++++++ net/wireless/rdev-ops.h | 13 ++++ net/wireless/trace.h | 46 +++++++++++++ 5 files changed, 351 insertions(+) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 161cdf7df1a0..62dd8422e0dc 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1252,6 +1252,27 @@ struct cfg80211_csa_settings { u8 count; }; +/** + * struct cfg80211_color_change_settings - color change settings + * + * Used for bss color change + * + * @beacon_color_change: beacon data while performing the color countdown + * @counter_offsets_beacon: offsets of the counters within the beacon (tail) + * @counter_offsets_presp: offsets of the counters within the probe response + * @beacon_next: beacon data to be used after the color change + * @count: number of beacons until the color change + * @color: the color used after the change + */ +struct cfg80211_color_change_settings { + struct cfg80211_beacon_data beacon_color_change; + u16 counter_offset_beacon; + u16 counter_offset_presp; + struct cfg80211_beacon_data beacon_next; + u8 count; + u8 color; +}; + /** * struct iface_combination_params - input parameters for interface combinations * @@ -3995,6 +4016,8 @@ struct mgmt_frame_regs { * given TIDs. This callback may sleep. * * @set_sar_specs: Update the SAR (TX power) settings. + * + * @color_change: Initiate a color change. 
*/ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -4322,6 +4345,9 @@ struct cfg80211_ops { const u8 *peer, u8 tids); int (*set_sar_specs)(struct wiphy *wiphy, struct cfg80211_sar_specs *sar); + int (*color_change)(struct wiphy *wiphy, + struct net_device *dev, + struct cfg80211_color_change_settings *params); }; /* @@ -8218,4 +8244,70 @@ void cfg80211_update_owe_info_event(struct net_device *netdev, */ void cfg80211_bss_flush(struct wiphy *wiphy); +/** + * cfg80211_bss_color_notify - notify about bss color event + * @dev: network device + * @gfp: allocation flags + * @cmd: the actual event we want to notify + * @count: the number of TBTTs until the color change happens + * @color_bitmap: representations of the colors that the local BSS is aware of + */ +int cfg80211_bss_color_notify(struct net_device *dev, gfp_t gfp, + enum nl80211_commands cmd, u8 count, + u64 color_bitmap); + +/** + * cfg80211_obss_color_collision_notify - notify about bss color collision + * @dev: network device + * @color_bitmap: representations of the colors that the local BSS is aware of + */ +static inline int cfg80211_obss_color_collision_notify(struct net_device *dev, + u64 color_bitmap) +{ + return cfg80211_bss_color_notify(dev, GFP_KERNEL, + NL80211_CMD_OBSS_COLOR_COLLISION, + 0, color_bitmap); +} + +/** + * cfg80211_color_change_started_notify - notify color change start + * @dev: the device on which the color is switched + * @count: the number of TBTTs until the color change happens + * + * Inform the userspace about the color change that has started. + */ +static inline int cfg80211_color_change_started_notify(struct net_device *dev, + u8 count) +{ + return cfg80211_bss_color_notify(dev, GFP_KERNEL, + NL80211_CMD_COLOR_CHANGE_STARTED, + count, 0); +} + +/** + * cfg80211_color_change_aborted_notify - notify color change abort + * @dev: the device on which the color is switched + * + * Inform the userspace about the color change that has aborted. + */ +static inline int cfg80211_color_change_aborted_notify(struct net_device *dev) +{ + return cfg80211_bss_color_notify(dev, GFP_KERNEL, + NL80211_CMD_COLOR_CHANGE_ABORTED, + 0, 0); +} + +/** + * cfg80211_color_change_notify - notify color change completion + * @dev: the device on which the color was switched + * + * Inform the userspace about the color change that has completed. + */ +static inline int cfg80211_color_change_notify(struct net_device *dev) +{ + return cfg80211_bss_color_notify(dev, GFP_KERNEL, + NL80211_CMD_COLOR_CHANGE_COMPLETED, + 0, 0); +} + #endif /* __NET_CFG80211_H */ diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index db474994fa73..c2efea98e060 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1185,6 +1185,21 @@ * passed using %NL80211_ATTR_SAR_SPEC. %NL80211_ATTR_WIPHY is used to * specify the wiphy index to be applied to. * + * @NL80211_CMD_OBSS_COLOR_COLLISION: This notification is sent out whenever + * mac80211/drv detects a bss color collision. + * + * @NL80211_CMD_COLOR_CHANGE_REQUEST: This command is used to indicate that + * userspace wants to change the BSS color. 
+ * + * @NL80211_CMD_COLOR_CHANGE_STARTED: Notify userland, that a color change has + * started + * + * @NL80211_CMD_COLOR_CHANGE_ABORTED: Notify userland, that the color change has + * been aborted + * + * @NL80211_CMD_COLOR_CHANGE_COMPLETED: Notify userland that the color change + * has completed + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1417,6 +1432,14 @@ enum nl80211_commands { NL80211_CMD_SET_SAR_SPECS, + NL80211_CMD_OBSS_COLOR_COLLISION, + + NL80211_CMD_COLOR_CHANGE_REQUEST, + + NL80211_CMD_COLOR_CHANGE_STARTED, + NL80211_CMD_COLOR_CHANGE_ABORTED, + NL80211_CMD_COLOR_CHANGE_COMPLETED, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -2560,6 +2583,16 @@ enum nl80211_commands { * disassoc events to indicate that an immediate reconnect to the AP * is desired. * + * @NL80211_ATTR_OBSS_COLOR_BITMAP: bitmap of the u64 BSS colors for the + * %NL80211_CMD_OBSS_COLOR_COLLISION event. + * + * @NL80211_ATTR_COLOR_CHANGE_COUNT: u8 attribute specifying the number of TBTT's + * until the color switch event. + * @NL80211_ATTR_COLOR_CHANGE_COLOR: u8 attribute specifying the color that we are + * switching to + * @NL80211_ATTR_COLOR_CHANGE_ELEMS: Nested set of attributes containing the IE + * information for the time while performing a color switch. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3057,6 +3090,12 @@ enum nl80211_attrs { NL80211_ATTR_DISABLE_HE, + NL80211_ATTR_OBSS_COLOR_BITMAP, + + NL80211_ATTR_COLOR_CHANGE_COUNT, + NL80211_ATTR_COLOR_CHANGE_COLOR, + NL80211_ATTR_COLOR_CHANGE_ELEMS, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -5953,6 +5992,9 @@ enum nl80211_feature_flags { * frame protection for all management frames exchanged during the * negotiation and range measurement procedure. * + * @NL80211_EXT_FEATURE_BSS_COLOR: The driver supports BSS color collision + * detection and change announcemnts. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. 
*/ @@ -6017,6 +6059,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_SECURE_LTF, NL80211_EXT_FEATURE_SECURE_RTT, NL80211_EXT_FEATURE_PROT_RANGE_NEGO_AND_MEASURE, + NL80211_EXT_FEATURE_BSS_COLOR, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index dceed5b5b226..bf7cd4752547 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -759,6 +759,10 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_RECONNECT_REQUESTED] = { .type = NLA_REJECT }, [NL80211_ATTR_SAR_SPEC] = NLA_POLICY_NESTED(sar_policy), [NL80211_ATTR_DISABLE_HE] = { .type = NLA_FLAG }, + [NL80211_ATTR_OBSS_COLOR_BITMAP] = { .type = NLA_U64 }, + [NL80211_ATTR_COLOR_CHANGE_COUNT] = { .type = NLA_U8 }, + [NL80211_ATTR_COLOR_CHANGE_COLOR] = { .type = NLA_U8 }, + [NL80211_ATTR_COLOR_CHANGE_ELEMS] = NLA_POLICY_NESTED(nl80211_policy), }; /* policy for the key attributes */ @@ -14800,6 +14804,106 @@ bad_tid_conf: return ret; } +static int nl80211_color_change(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct cfg80211_color_change_settings params = {}; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct nlattr **tb; + u16 offset; + int err; + + if (!rdev->ops->color_change) + return -EOPNOTSUPP; + + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BSS_COLOR)) + return -EOPNOTSUPP; + + if (wdev->iftype != NL80211_IFTYPE_AP) + return -EOPNOTSUPP; + + if (!info->attrs[NL80211_ATTR_COLOR_CHANGE_COUNT] || + !info->attrs[NL80211_ATTR_COLOR_CHANGE_COLOR] || + !info->attrs[NL80211_ATTR_COLOR_CHANGE_ELEMS]) + return -EINVAL; + + params.count = nla_get_u8(info->attrs[NL80211_ATTR_COLOR_CHANGE_COUNT]); + params.color = nla_get_u8(info->attrs[NL80211_ATTR_COLOR_CHANGE_COLOR]); + + err = nl80211_parse_beacon(rdev, info->attrs, ¶ms.beacon_next); + if (err) + return err; + + tb = kcalloc(NL80211_ATTR_MAX + 1, sizeof(*tb), GFP_KERNEL); + if (!tb) + return -ENOMEM; + + err = nla_parse_nested(tb, NL80211_ATTR_MAX, + info->attrs[NL80211_ATTR_COLOR_CHANGE_ELEMS], + nl80211_policy, info->extack); + if (err) + goto out; + + err = nl80211_parse_beacon(rdev, tb, ¶ms.beacon_color_change); + if (err) + goto out; + + if (!tb[NL80211_ATTR_CNTDWN_OFFS_BEACON]) { + err = -EINVAL; + goto out; + } + + if (nla_len(tb[NL80211_ATTR_CNTDWN_OFFS_BEACON]) != sizeof(u16)) { + err = -EINVAL; + goto out; + } + + offset = nla_get_u16(tb[NL80211_ATTR_CNTDWN_OFFS_BEACON]); + if (offset >= params.beacon_color_change.tail_len) { + err = -EINVAL; + goto out; + } + + if (params.beacon_color_change.tail[offset] != params.count) { + err = -EINVAL; + goto out; + } + + params.counter_offset_beacon = offset; + + if (tb[NL80211_ATTR_CNTDWN_OFFS_PRESP]) { + if (nla_len(tb[NL80211_ATTR_CNTDWN_OFFS_PRESP]) != + sizeof(u16)) { + err = -EINVAL; + goto out; + } + + offset = nla_get_u16(tb[NL80211_ATTR_CNTDWN_OFFS_PRESP]); + if (offset >= params.beacon_color_change.probe_resp_len) { + err = -EINVAL; + goto out; + } + + if (params.beacon_color_change.probe_resp[offset] != + params.count) { + err = -EINVAL; + goto out; + } + + params.counter_offset_presp = offset; + } + + wdev_lock(wdev); + err = rdev_color_change(rdev, dev, ¶ms); + wdev_unlock(wdev); + +out: + kfree(tb); + return err; +} + #define NL80211_FLAG_NEED_WIPHY 0x01 #define NL80211_FLAG_NEED_NETDEV 0x02 #define NL80211_FLAG_NEED_RTNL 0x04 @@ -15795,6 +15899,14 
@@ static const struct genl_small_ops nl80211_small_ops[] = { .internal_flags = NL80211_FLAG_NEED_WIPHY | NL80211_FLAG_NEED_RTNL, }, + { + .cmd = NL80211_CMD_COLOR_CHANGE_REQUEST, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = nl80211_color_change, + .flags = GENL_UNS_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, }; static struct genl_family nl80211_fam __ro_after_init = { @@ -17424,6 +17536,51 @@ void cfg80211_ch_switch_started_notify(struct net_device *dev, } EXPORT_SYMBOL(cfg80211_ch_switch_started_notify); +int cfg80211_bss_color_notify(struct net_device *dev, gfp_t gfp, + enum nl80211_commands cmd, u8 count, + u64 color_bitmap) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct sk_buff *msg; + void *hdr; + + ASSERT_WDEV_LOCK(wdev); + + trace_cfg80211_bss_color_notify(dev, cmd, count, color_bitmap); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return -ENOMEM; + + hdr = nl80211hdr_put(msg, 0, 0, 0, cmd); + if (!hdr) + goto nla_put_failure; + + if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex)) + goto nla_put_failure; + + if (cmd == NL80211_CMD_COLOR_CHANGE_STARTED && + nla_put_u32(msg, NL80211_ATTR_COLOR_CHANGE_COUNT, count)) + goto nla_put_failure; + + if (cmd == NL80211_CMD_OBSS_COLOR_COLLISION && + nla_put_u64_64bit(msg, NL80211_ATTR_OBSS_COLOR_BITMAP, + color_bitmap, NL80211_ATTR_PAD)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + return genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), + msg, 0, NL80211_MCGRP_MLME, gfp); + +nla_put_failure: + nlmsg_free(msg); + return -EINVAL; +} +EXPORT_SYMBOL(cfg80211_bss_color_notify); + void nl80211_radar_notify(struct cfg80211_registered_device *rdev, const struct cfg80211_chan_def *chandef, diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index b1d37f582dc6..ce6bf218a1a3 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -1368,4 +1368,17 @@ static inline int rdev_set_sar_specs(struct cfg80211_registered_device *rdev, return ret; } +static inline int rdev_color_change(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_color_change_settings *params) +{ + int ret; + + trace_rdev_color_change(&rdev->wiphy, dev, params); + ret = rdev->ops->color_change(&rdev->wiphy, dev, params); + trace_rdev_return_int(&rdev->wiphy, ret); + + return ret; +} + #endif /* __CFG80211_RDEV_OPS */ diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 440bce5f0274..911bb3b8969e 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -3597,6 +3597,52 @@ TRACE_EVENT(rdev_set_sar_specs, WIPHY_PR_ARG, __entry->type, __entry->num) ); +TRACE_EVENT(rdev_color_change, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct cfg80211_color_change_settings *params), + TP_ARGS(wiphy, netdev, params), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + __field(u8, count) + __field(u16, bcn_ofs) + __field(u16, pres_ofs) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + __entry->count = params->count; + __entry->bcn_ofs = params->counter_offset_beacon; + __entry->pres_ofs = params->counter_offset_presp; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT + ", count: %u", + WIPHY_PR_ARG, NETDEV_PR_ARG, + __entry->count) +); + +TRACE_EVENT(cfg80211_bss_color_notify, + TP_PROTO(struct net_device *netdev, + enum nl80211_commands cmd, + u8 count, u64 
color_bitmap), + TP_ARGS(netdev, cmd, count, color_bitmap), + TP_STRUCT__entry( + NETDEV_ENTRY + __field(enum nl80211_bss_scan_width, cmd) + __field(u8, count) + __field(u64, color_bitmap) + ), + TP_fast_assign( + NETDEV_ASSIGN; + __entry->cmd = cmd; + __entry->count = count; + __entry->color_bitmap = color_bitmap; + ), + TP_printk(NETDEV_PR_FMT ", cmd: %x, count: %u, bitmap: %llx", + NETDEV_PR_ARG, __entry->cmd, __entry->count, + __entry->color_bitmap) +); + #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3-71-gd317 From ea49dc79002c416a9003f3204bc14f846a0dbcae Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 28 Jul 2021 08:56:09 +1000 Subject: NFSD: remove vanity comments Including one's name in copyright claims is appropriate. Including it in random comments is just vanity. After 2 decades, it is time for these to be gone. Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/vfs.c | 1 - include/uapi/linux/nfsd/nfsfh.h | 1 - 2 files changed, 2 deletions(-) (limited to 'include/uapi') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 7732a384f949..7851cf30a75d 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -244,7 +244,6 @@ out_nfserr: * returned. Otherwise the covered directory is returned. * NOTE: this mountpoint crossing is not supported properly by all * clients and is explicitly disallowed for NFSv3 - * NeilBrown */ __be32 nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, diff --git a/include/uapi/linux/nfsd/nfsfh.h b/include/uapi/linux/nfsd/nfsfh.h index 427294dd56a1..e29e8accc4f4 100644 --- a/include/uapi/linux/nfsd/nfsfh.h +++ b/include/uapi/linux/nfsd/nfsfh.h @@ -33,7 +33,6 @@ struct nfs_fhbase_old { /* * This is the new flexible, extensible style NFSv2/v3/v4 file handle. - * by Neil Brown - March 2000 * * The file handle starts with a sequence of four-byte words. * The first word contains a version number (1) and three descriptor bytes -- cgit v1.2.3-71-gd317 From 2843ff6f36db7074e17bf5d637a14da08c54aed8 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 17 Aug 2021 15:07:23 -0700 Subject: mptcp: remote addresses fullmesh This patch adds and manages a new per-endpoint flag, named MPTCP_PM_ADDR_FLAG_FULLMESH. In mptcp_pm_create_subflow_or_signal_addr(), if such flag is set, instead of: remote_address((struct sock_common *)sk, &remote); fill a temporarily allocated array with all known remote addresses. After releasing the pm lock, loop on such array and create a subflow for each remote address from the given local address. Note that we could still use an array even for a non-'fullmesh' endpoint: with a single entry corresponding to the primary MPC subflow remote address. Suggested-by: Paolo Abeni Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S.
Miller --- include/uapi/linux/mptcp.h | 1 + net/mptcp/pm_netlink.c | 59 ++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 56 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 7b05f7102321..f66038b9551f 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -73,6 +73,7 @@ enum { #define MPTCP_PM_ADDR_FLAG_SIGNAL (1 << 0) #define MPTCP_PM_ADDR_FLAG_SUBFLOW (1 << 1) #define MPTCP_PM_ADDR_FLAG_BACKUP (1 << 2) +#define MPTCP_PM_ADDR_FLAG_FULLMESH (1 << 3) enum { MPTCP_PM_CMD_UNSPEC, diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 64a39f30659f..bf5b9b475bb7 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -410,6 +410,55 @@ void mptcp_pm_free_anno_list(struct mptcp_sock *msk) } } +static bool lookup_address_in_vec(struct mptcp_addr_info *addrs, unsigned int nr, + struct mptcp_addr_info *addr) +{ + int i; + + for (i = 0; i < nr; i++) { + if (addresses_equal(&addrs[i], addr, addr->port)) + return true; + } + + return false; +} + +/* Fill all the remote addresses into the array addrs[], + * and return the array size. + */ +static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullmesh, + struct mptcp_addr_info *addrs) +{ + struct sock *sk = (struct sock *)msk, *ssk; + struct mptcp_subflow_context *subflow; + struct mptcp_addr_info remote = { 0 }; + unsigned int subflows_max; + int i = 0; + + subflows_max = mptcp_pm_get_subflows_max(msk); + + /* Non-fullmesh endpoint, fill in the single entry + * corresponding to the primary MPC subflow remote address + */ + if (!fullmesh) { + remote_address((struct sock_common *)sk, &remote); + msk->pm.subflows++; + addrs[i++] = remote; + } else { + mptcp_for_each_subflow(msk, subflow) { + ssk = mptcp_subflow_tcp_sock(subflow); + remote_address((struct sock_common *)ssk, &remote); + if (!lookup_address_in_vec(addrs, i, &remote) && + msk->pm.subflows < subflows_max) { + msk->pm.subflows++; + addrs[i++] = remote; + } + } + } + + return i; +} + static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; @@ -455,14 +504,16 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) !READ_ONCE(msk->pm.remote_deny_join_id0)) { local = select_local_address(pernet, msk); if (local) { - struct mptcp_addr_info remote = { 0 }; + bool fullmesh = !!(local->flags & MPTCP_PM_ADDR_FLAG_FULLMESH); + struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX]; + int i, nr; msk->pm.local_addr_used++; - msk->pm.subflows++; check_work_pending(msk); - remote_address((struct sock_common *)sk, &remote); + nr = fill_remote_addresses_vec(msk, fullmesh, addrs); spin_unlock_bh(&msk->pm.lock); - __mptcp_subflow_connect(sk, &local->addr, &remote); + for (i = 0; i < nr; i++) + __mptcp_subflow_connect(sk, &local->addr, &addrs[i]); spin_lock_bh(&msk->pm.lock); return; } -- cgit v1.2.3-71-gd317 From 25bca50e523cbe96c0207fbb92f22ff2bc28e9aa Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 11 Aug 2021 12:36:58 +0900 Subject: block: improve ioprio class description comment In include/usapi/linux/ioprio.h, change the ioprio class enum comment to remove the outdated reference to CFQ and mention BFQ and mq-deadline instead. Also document the high priority NCQ command use for RT class IOs directed at ATA drives that support NCQ priority. 
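For illustration only (this snippet is not part of the patch): assuming the installed uapi header linux/ioprio.h and the raw ioprio_set() syscall (glibc provides no wrapper), a process could opt into the RT class described by the updated comment roughly as sketched below; on ATA disks with NCQ priority support, its I/O would then be issued as high priority NCQ commands. Setting the RT class normally requires elevated privileges (CAP_SYS_ADMIN or, on newer kernels, CAP_SYS_NICE).

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/ioprio.h>

int main(void)
{
	/* RT class, highest level (0); "who" == 0 means the calling process. */
	int prio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 0);

	if (syscall(SYS_ioprio_set, IOPRIO_WHO_PROCESS, 0, prio) < 0) {
		perror("ioprio_set");
		return 1;
	}
	/* Subsequent reads/writes from this process now use the RT class. */
	return 0;
}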
Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20210811033702.368488-3-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- include/uapi/linux/ioprio.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/ioprio.h b/include/uapi/linux/ioprio.h index 77b17e08b0da..6b735854aebd 100644 --- a/include/uapi/linux/ioprio.h +++ b/include/uapi/linux/ioprio.h @@ -13,10 +13,12 @@ #define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | data) /* - * These are the io priority groups as implemented by CFQ. RT is the realtime - * class, it always gets premium service. BE is the best-effort scheduling - * class, the default for any process. IDLE is the idle scheduling class, it - * is only served when no one else is using the disk. + * These are the io priority groups as implemented by the BFQ and mq-deadline + * schedulers. RT is the realtime class, it always gets premium service. For + * ATA disks supporting NCQ IO priority, RT class IOs will be processed using + * high priority NCQ commands. BE is the best-effort scheduling class, the + * default for any process. IDLE is the idle scheduling class, it is only + * served when no one else is using the disk. */ enum { IOPRIO_CLASS_NONE, -- cgit v1.2.3-71-gd317 From a553a835ca57668b0d9907d8ec2507ec51292d9a Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 11 Aug 2021 12:36:59 +0900 Subject: block: change ioprio_valid() to an inline function Change the ioprio_valid() macro in include/uapi/linux/ioprio.h to an inline function declared on the kernel side in include/linux/ioprio.h. Also improve the check on the class value by also verifying its upper bound. Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20210811033702.368488-4-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- include/linux/ioprio.h | 10 ++++++++++ include/uapi/linux/ioprio.h | 2 -- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index ef9ad4fb245f..2ee3373684b1 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -8,6 +8,16 @@ #include +/* + * Check that a priority value has a valid class. + */ +static inline bool ioprio_valid(unsigned short ioprio) +{ + unsigned short class = IOPRIO_PRIO_CLASS(ioprio); + + return class > IOPRIO_CLASS_NONE && class <= IOPRIO_CLASS_IDLE; +} + /* * if process has set io priority explicitly, use that. if not, convert * the cpu scheduler nice value to an io priority diff --git a/include/uapi/linux/ioprio.h b/include/uapi/linux/ioprio.h index 6b735854aebd..5064e230374c 100644 --- a/include/uapi/linux/ioprio.h +++ b/include/uapi/linux/ioprio.h @@ -27,8 +27,6 @@ enum { IOPRIO_CLASS_IDLE, }; -#define ioprio_valid(mask) (IOPRIO_PRIO_CLASS((mask)) != IOPRIO_CLASS_NONE) - /* * 8 best effort priority levels are supported */ -- cgit v1.2.3-71-gd317 From ba05200fcce0a73fa8db16c514fbaa476d1d9399 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 11 Aug 2021 12:37:00 +0900 Subject: block: fix IOPRIO_PRIO_CLASS() and IOPRIO_PRIO_VALUE() macros The ki_ioprio field of struct kiocb is 16 bits (u16) but often handled as an int in the block layer. E.g. ioprio_check_cap() takes an int as argument.
With such implicit int casting function calls, the upper 16 bits of the int argument may be left uninitialized by the compiler, resulting in invalid values for the IOPRIO_PRIO_CLASS() macro (garbage upper bits) and in an error return for functions such as ioprio_check_cap(). Fix this by masking the result of the shift by IOPRIO_CLASS_SHIFT bits in the IOPRIO_PRIO_CLASS() macro. The new macro IOPRIO_CLASS_MASK defines the 3-bit mask for the priority class. Similarly, apply the IOPRIO_PRIO_MASK mask to the data argument of the IOPRIO_PRIO_VALUE() macro to ignore the upper bits of the data value. The IOPRIO_CLASS_MASK mask is also applied to the class argument of this macro before shifting the result by IOPRIO_CLASS_SHIFT bits. While at it, also change the argument name of the IOPRIO_PRIO_CLASS() and IOPRIO_PRIO_DATA() macros from "mask" to "ioprio" to reflect the fact that a priority value should be passed rather than a mask. Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20210811033702.368488-5-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- include/uapi/linux/ioprio.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/ioprio.h b/include/uapi/linux/ioprio.h index 5064e230374c..936f0d8f30e1 100644 --- a/include/uapi/linux/ioprio.h +++ b/include/uapi/linux/ioprio.h @@ -5,12 +5,16 @@ /* * Gives us 8 prio classes with 13-bits of data for each class */ -#define IOPRIO_CLASS_SHIFT (13) +#define IOPRIO_CLASS_SHIFT 13 +#define IOPRIO_CLASS_MASK 0x07 #define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1) -#define IOPRIO_PRIO_CLASS(mask) ((mask) >> IOPRIO_CLASS_SHIFT) -#define IOPRIO_PRIO_DATA(mask) ((mask) & IOPRIO_PRIO_MASK) -#define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | data) +#define IOPRIO_PRIO_CLASS(ioprio) \ + (((ioprio) >> IOPRIO_CLASS_SHIFT) & IOPRIO_CLASS_MASK) +#define IOPRIO_PRIO_DATA(ioprio) ((ioprio) & IOPRIO_PRIO_MASK) +#define IOPRIO_PRIO_VALUE(class, data) \ + ((((class) & IOPRIO_CLASS_MASK) << IOPRIO_CLASS_SHIFT) | \ + ((data) & IOPRIO_PRIO_MASK)) /* * These are the io priority groups as implemented by the BFQ and mq-deadline -- cgit v1.2.3-71-gd317 From 202bc942c5cd4340d37b06c4e0b8b03f9925d818 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 11 Aug 2021 12:37:01 +0900 Subject: block: Introduce IOPRIO_NR_LEVELS The BFQ scheduler and ioprio_check_cap() both assume that the RT priority class (IOPRIO_CLASS_RT) can have up to 8 different priority levels, similarly to the BE class (IOPRIO_CLASS_BE). This is controlled using the IOPRIO_BE_NR macro, which is badly named as the number of levels also applies to the RT class. Introduce the class-independent IOPRIO_NR_LEVELS macro, defined to 8, to make things clear. Keep the old IOPRIO_BE_NR macro definition as an alias for IOPRIO_NR_LEVELS.
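As a rough standalone illustration of the problem being fixed (not part of the patch; the macros below are local stand-ins for the old and new uapi definitions), extracting the class from a priority value whose upper bits hold garbage returns nonsense without the mask and the intended class with it:

#include <stdio.h>

/* Local stand-ins mirroring the old and new uapi macros. */
#define IOPRIO_CLASS_SHIFT	13
#define IOPRIO_CLASS_MASK	0x07
#define OLD_IOPRIO_PRIO_CLASS(p)	((p) >> IOPRIO_CLASS_SHIFT)
#define NEW_IOPRIO_PRIO_CLASS(p)	(((p) >> IOPRIO_CLASS_SHIFT) & IOPRIO_CLASS_MASK)

int main(void)
{
	/* Class 2 (BE), level 0, with fake garbage left in bits 16..30. */
	int ioprio = 0x5ead0000 | (2 << IOPRIO_CLASS_SHIFT);

	/* Old macro leaks the garbage bits (prints 193898), new one prints 2. */
	printf("old=%d new=%d\n",
	       OLD_IOPRIO_PRIO_CLASS(ioprio), NEW_IOPRIO_PRIO_CLASS(ioprio));
	return 0;
}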
Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20210811033702.368488-6-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 8 ++++---- block/bfq-iosched.h | 4 ++-- block/bfq-wf2q.c | 6 +++--- block/ioprio.c | 3 +-- fs/f2fs/sysfs.c | 2 +- include/uapi/linux/ioprio.h | 5 +++-- 6 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include/uapi') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index e546a5f4bff9..4b434369e411 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2508,7 +2508,7 @@ void bfq_end_wr_async_queues(struct bfq_data *bfqd, int i, j; for (i = 0; i < 2; i++) - for (j = 0; j < IOPRIO_BE_NR; j++) + for (j = 0; j < IOPRIO_NR_LEVELS; j++) if (bfqg->async_bfqq[i][j]) bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); if (bfqg->async_idle_bfqq) @@ -5293,10 +5293,10 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) break; } - if (bfqq->new_ioprio >= IOPRIO_BE_NR) { + if (bfqq->new_ioprio >= IOPRIO_NR_LEVELS) { pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", bfqq->new_ioprio); - bfqq->new_ioprio = IOPRIO_BE_NR - 1; + bfqq->new_ioprio = IOPRIO_NR_LEVELS - 1; } bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); @@ -6825,7 +6825,7 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) int i, j; for (i = 0; i < 2; i++) - for (j = 0; j < IOPRIO_BE_NR; j++) + for (j = 0; j < IOPRIO_NR_LEVELS; j++) __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 99c2a3cb081e..385e28a843d1 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -931,7 +931,7 @@ struct bfq_group { void *bfqd; - struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS]; struct bfq_queue *async_idle_bfqq; struct bfq_entity *my_entity; @@ -948,7 +948,7 @@ struct bfq_group { struct bfq_entity entity; struct bfq_sched_data sched_data; - struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS]; struct bfq_queue *async_idle_bfqq; struct rb_root rq_pos_tree; diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 7a462df71f68..b74cc0da118e 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -505,7 +505,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, */ unsigned short bfq_ioprio_to_weight(int ioprio) { - return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF; + return (IOPRIO_NR_LEVELS - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF; } /** @@ -514,12 +514,12 @@ unsigned short bfq_ioprio_to_weight(int ioprio) * * To preserve as much as possible the old only-ioprio user interface, * 0 is used as an escape ioprio value for weights (numerically) equal or - * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF. + * larger than IOPRIO_NR_LEVELS * BFQ_WEIGHT_CONVERSION_COEFF. 
*/ static unsigned short bfq_weight_to_ioprio(int weight) { return max_t(int, 0, - IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight); + IOPRIO_NR_LEVELS * BFQ_WEIGHT_CONVERSION_COEFF - weight); } static void bfq_get_entity(struct bfq_entity *entity) diff --git a/block/ioprio.c b/block/ioprio.c index bee628f9f1b2..ca6b136c5586 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -74,9 +74,8 @@ int ioprio_check_cap(int ioprio) fallthrough; /* rt has prio field too */ case IOPRIO_CLASS_BE: - if (data >= IOPRIO_BE_NR || data < 0) + if (data >= IOPRIO_NR_LEVELS || data < 0) return -EINVAL; - break; case IOPRIO_CLASS_IDLE: break; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 6642246206bd..daad532a4e2b 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -378,7 +378,7 @@ out: ret = kstrtol(name, 10, &data); if (ret) return ret; - if (data >= IOPRIO_BE_NR || data < 0) + if (data >= IOPRIO_NR_LEVELS || data < 0) return -EINVAL; cprc->ckpt_thread_ioprio = IOPRIO_PRIO_VALUE(class, data); diff --git a/include/uapi/linux/ioprio.h b/include/uapi/linux/ioprio.h index 936f0d8f30e1..aac39338d02c 100644 --- a/include/uapi/linux/ioprio.h +++ b/include/uapi/linux/ioprio.h @@ -32,9 +32,10 @@ enum { }; /* - * 8 best effort priority levels are supported + * The RT and BE priority classes both support up to 8 priority levels. */ -#define IOPRIO_BE_NR (8) +#define IOPRIO_NR_LEVELS 8 +#define IOPRIO_BE_NR IOPRIO_NR_LEVELS enum { IOPRIO_WHO_PROCESS = 1, -- cgit v1.2.3-71-gd317 From e70344c05995a190a56bbd1a23dc2218bcc8c924 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 11 Aug 2021 12:37:02 +0900 Subject: block: fix default IO priority handling The default IO priority is the best effort (BE) class with the normal priority level IOPRIO_NORM (4). However, get_task_ioprio() returns IOPRIO_CLASS_NONE/IOPRIO_NORM as the default priority and get_current_ioprio() returns IOPRIO_CLASS_NONE/0. Let's be consistent with the defined default and have both of these functions return the default priority IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM) when the user did not define another default IO priority for the task. In include/uapi/linux/ioprio.h, introduce the IOPRIO_BE_NORM macro as an alias for IOPRIO_NORM to clarify that this default level applies to the BE priority class. In include/linux/ioprio.h, define the macro IOPRIO_DEFAULT as IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_BE_NORM) and use this new macro when setting a priority to the default.
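For illustration only (not part of the patch): with this change, a task that never set an I/O priority is expected to be reported as the documented default, IOPRIO_CLASS_BE at level IOPRIO_BE_NORM, i.e. IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4) == 0x4004, rather than IOPRIO_CLASS_NONE. A minimal userspace check, assuming the installed linux/ioprio.h uapi header and the raw ioprio_get() syscall:

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/ioprio.h>

int main(void)
{
	/* "who" == 0 means the calling process for IOPRIO_WHO_PROCESS. */
	int prio = syscall(SYS_ioprio_get, IOPRIO_WHO_PROCESS, 0);

	if (prio < 0) {
		perror("ioprio_get");
		return 1;
	}
	/* Expected with this change: class=2 (BE), level=4, raw=0x4004. */
	printf("class=%d level=%d raw=%#x\n",
	       IOPRIO_PRIO_CLASS(prio), IOPRIO_PRIO_DATA(prio), prio);
	return 0;
}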
Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20210811033702.368488-7-damien.lemoal@wdc.com [axboe: drop unnecessary lightnvm change] Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- block/ioprio.c | 6 +++--- include/linux/ioprio.h | 7 ++++++- include/uapi/linux/ioprio.h | 5 +++-- 4 files changed, 13 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 4b434369e411..e92bc0348433 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5411,7 +5411,7 @@ static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, case IOPRIO_CLASS_RT: return &bfqg->async_bfqq[0][ioprio]; case IOPRIO_CLASS_NONE: - ioprio = IOPRIO_NORM; + ioprio = IOPRIO_BE_NORM; fallthrough; case IOPRIO_CLASS_BE: return &bfqg->async_bfqq[1][ioprio]; diff --git a/block/ioprio.c b/block/ioprio.c index ca6b136c5586..0e4ff245f2bf 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -170,7 +170,7 @@ static int get_task_ioprio(struct task_struct *p) ret = security_task_getioprio(p); if (ret) goto out; - ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM); + ret = IOPRIO_DEFAULT; task_lock(p); if (p->io_context) ret = p->io_context->ioprio; @@ -182,9 +182,9 @@ out: int ioprio_best(unsigned short aprio, unsigned short bprio) { if (!ioprio_valid(aprio)) - aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); + aprio = IOPRIO_DEFAULT; if (!ioprio_valid(bprio)) - bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); + bprio = IOPRIO_DEFAULT; return min(aprio, bprio); } diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index 2ee3373684b1..3f53bc27a19b 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -8,6 +8,11 @@ #include +/* + * Default IO priority. + */ +#define IOPRIO_DEFAULT IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_BE_NORM) + /* * Check that a priority value has a valid class. */ @@ -51,7 +56,7 @@ static inline int get_current_ioprio(void) if (ioc) return ioc->ioprio; - return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); + return IOPRIO_DEFAULT; } /* diff --git a/include/uapi/linux/ioprio.h b/include/uapi/linux/ioprio.h index aac39338d02c..f70f2596a6bf 100644 --- a/include/uapi/linux/ioprio.h +++ b/include/uapi/linux/ioprio.h @@ -44,8 +44,9 @@ enum { }; /* - * Fallback BE priority + * Fallback BE priority level. */ -#define IOPRIO_NORM (4) +#define IOPRIO_NORM 4 +#define IOPRIO_BE_NORM IOPRIO_NORM #endif /* _UAPI_LINUX_IOPRIO_H */ -- cgit v1.2.3-71-gd317 From 3cfc88380413d20f777dc6648a38f683962e52bf Mon Sep 17 00:00:00 2001 From: Jie Deng Date: Fri, 23 Jul 2021 13:44:35 +0800 Subject: i2c: virtio: add a virtio i2c frontend driver Add an I2C bus driver for virtio para-virtualization. The controller can be emulated by the backend driver in any device model software by following the virtio protocol. The device specification can be found on https://lists.oasis-open.org/archives/virtio-comment/202101/msg00008.html. By following the specification, people may implement different backend drivers to emulate different controllers according to their needs. Co-developed-by: Conghui Chen Signed-off-by: Conghui Chen Signed-off-by: Jie Deng Reviewed-by: Viresh Kumar Tested-by: Viresh Kumar Acked-by: Michael S. 
Tsirkin Signed-off-by: Wolfram Sang --- MAINTAINERS | 9 ++ drivers/i2c/busses/Kconfig | 11 ++ drivers/i2c/busses/Makefile | 1 + drivers/i2c/busses/i2c-virtio.c | 290 ++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/virtio_i2c.h | 41 ++++++ include/uapi/linux/virtio_ids.h | 1 + 6 files changed, 353 insertions(+) create mode 100644 drivers/i2c/busses/i2c-virtio.c create mode 100644 include/uapi/linux/virtio_i2c.h (limited to 'include/uapi') diff --git a/MAINTAINERS b/MAINTAINERS index 19135a9d778e..6b2cabf8a64a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19708,6 +19708,15 @@ S: Maintained F: include/uapi/linux/virtio_snd.h F: sound/virtio/* +VIRTIO I2C DRIVER +M: Jie Deng +M: Viresh Kumar +L: linux-i2c@vger.kernel.org +L: virtualization@lists.linux-foundation.org +S: Maintained +F: drivers/i2c/busses/i2c-virtio.c +F: include/uapi/linux/virtio_i2c.h + VIRTUAL BOX GUEST DEVICE DRIVER M: Hans de Goede M: Arnd Bergmann diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index 23b2eca45bce..e17790fe35a7 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -1393,4 +1393,15 @@ config I2C_FSI This driver can also be built as a module. If so, the module will be called as i2c-fsi. +config I2C_VIRTIO + tristate "Virtio I2C Adapter" + select VIRTIO + help + If you say yes to this option, support will be included for the virtio + I2C adapter driver. The hardware can be emulated by any device model + software according to the virtio protocol. + + This driver can also be built as a module. If so, the module + will be called i2c-virtio. + endmenu diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile index afdb84902dd2..1336b04f40e2 100644 --- a/drivers/i2c/busses/Makefile +++ b/drivers/i2c/busses/Makefile @@ -145,5 +145,6 @@ obj-$(CONFIG_I2C_SIBYTE) += i2c-sibyte.o obj-$(CONFIG_I2C_XGENE_SLIMPRO) += i2c-xgene-slimpro.o obj-$(CONFIG_SCx200_ACB) += scx200_acb.o obj-$(CONFIG_I2C_FSI) += i2c-fsi.o +obj-$(CONFIG_I2C_VIRTIO) += i2c-virtio.o ccflags-$(CONFIG_I2C_DEBUG_BUS) := -DDEBUG diff --git a/drivers/i2c/busses/i2c-virtio.c b/drivers/i2c/busses/i2c-virtio.c new file mode 100644 index 000000000000..f10a603b13fb --- /dev/null +++ b/drivers/i2c/busses/i2c-virtio.c @@ -0,0 +1,290 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Virtio I2C Bus Driver + * + * The Virtio I2C Specification: + * https://raw.githubusercontent.com/oasis-tcs/virtio-spec/master/virtio-i2c.tex + * + * Copyright (c) 2021 Intel Corporation. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * struct virtio_i2c - virtio I2C data + * @vdev: virtio device for this controller + * @completion: completion of virtio I2C message + * @adap: I2C adapter for this controller + * @vq: the virtio virtqueue for communication + */ +struct virtio_i2c { + struct virtio_device *vdev; + struct completion completion; + struct i2c_adapter adap; + struct virtqueue *vq; +}; + +/** + * struct virtio_i2c_req - the virtio I2C request structure + * @out_hdr: the OUT header of the virtio I2C message + * @buf: the buffer into which data is read, or from which it's written + * @in_hdr: the IN header of the virtio I2C message + */ +struct virtio_i2c_req { + struct virtio_i2c_out_hdr out_hdr ____cacheline_aligned; + uint8_t *buf ____cacheline_aligned; + struct virtio_i2c_in_hdr in_hdr ____cacheline_aligned; +}; + +static void virtio_i2c_msg_done(struct virtqueue *vq) +{ + struct virtio_i2c *vi = vq->vdev->priv; + + complete(&vi->completion); +} + +static int virtio_i2c_prepare_reqs(struct virtqueue *vq, + struct virtio_i2c_req *reqs, + struct i2c_msg *msgs, int num) +{ + struct scatterlist *sgs[3], out_hdr, msg_buf, in_hdr; + int i; + + for (i = 0; i < num; i++) { + int outcnt = 0, incnt = 0; + + /* + * We don't support 0 length messages and so filter out + * 0 length transfers by using i2c_adapter_quirks. + */ + if (!msgs[i].len) + break; + + /* + * Only 7-bit mode supported for this moment. For the address + * format, Please check the Virtio I2C Specification. + */ + reqs[i].out_hdr.addr = cpu_to_le16(msgs[i].addr << 1); + + if (i != num - 1) + reqs[i].out_hdr.flags = cpu_to_le32(VIRTIO_I2C_FLAGS_FAIL_NEXT); + + sg_init_one(&out_hdr, &reqs[i].out_hdr, sizeof(reqs[i].out_hdr)); + sgs[outcnt++] = &out_hdr; + + reqs[i].buf = i2c_get_dma_safe_msg_buf(&msgs[i], 1); + if (!reqs[i].buf) + break; + + sg_init_one(&msg_buf, reqs[i].buf, msgs[i].len); + + if (msgs[i].flags & I2C_M_RD) + sgs[outcnt + incnt++] = &msg_buf; + else + sgs[outcnt++] = &msg_buf; + + sg_init_one(&in_hdr, &reqs[i].in_hdr, sizeof(reqs[i].in_hdr)); + sgs[outcnt + incnt++] = &in_hdr; + + if (virtqueue_add_sgs(vq, sgs, outcnt, incnt, &reqs[i], GFP_KERNEL)) { + i2c_put_dma_safe_msg_buf(reqs[i].buf, &msgs[i], false); + break; + } + } + + return i; +} + +static int virtio_i2c_complete_reqs(struct virtqueue *vq, + struct virtio_i2c_req *reqs, + struct i2c_msg *msgs, int num, + bool timedout) +{ + struct virtio_i2c_req *req; + bool failed = timedout; + unsigned int len; + int i, j = 0; + + for (i = 0; i < num; i++) { + /* Detach the ith request from the vq */ + req = virtqueue_get_buf(vq, &len); + + /* + * Condition req == &reqs[i] should always meet since we have + * total num requests in the vq. reqs[i] can never be NULL here. + */ + if (!failed && (WARN_ON(req != &reqs[i]) || + req->in_hdr.status != VIRTIO_I2C_MSG_OK)) + failed = true; + + i2c_put_dma_safe_msg_buf(reqs[i].buf, &msgs[i], !failed); + + if (!failed) + j++; + } + + return timedout ? -ETIMEDOUT : j; +} + +static int virtio_i2c_xfer(struct i2c_adapter *adap, struct i2c_msg *msgs, + int num) +{ + struct virtio_i2c *vi = i2c_get_adapdata(adap); + struct virtqueue *vq = vi->vq; + struct virtio_i2c_req *reqs; + unsigned long time_left; + int count; + + reqs = kcalloc(num, sizeof(*reqs), GFP_KERNEL); + if (!reqs) + return -ENOMEM; + + count = virtio_i2c_prepare_reqs(vq, reqs, msgs, num); + if (!count) + goto err_free; + + /* + * For the case where count < num, i.e. 
we weren't able to queue all the + * msgs, ideally we should abort right away and return early, but some + * of the messages are already sent to the remote I2C controller and the + * virtqueue will be left in undefined state in that case. We kick the + * remote here to clear the virtqueue, so we can try another set of + * messages later on. + */ + + reinit_completion(&vi->completion); + virtqueue_kick(vq); + + time_left = wait_for_completion_timeout(&vi->completion, adap->timeout); + if (!time_left) + dev_err(&adap->dev, "virtio i2c backend timeout.\n"); + + count = virtio_i2c_complete_reqs(vq, reqs, msgs, count, !time_left); + +err_free: + kfree(reqs); + return count; +} + +static void virtio_i2c_del_vqs(struct virtio_device *vdev) +{ + vdev->config->reset(vdev); + vdev->config->del_vqs(vdev); +} + +static int virtio_i2c_setup_vqs(struct virtio_i2c *vi) +{ + struct virtio_device *vdev = vi->vdev; + + vi->vq = virtio_find_single_vq(vdev, virtio_i2c_msg_done, "msg"); + return PTR_ERR_OR_ZERO(vi->vq); +} + +static u32 virtio_i2c_func(struct i2c_adapter *adap) +{ + return I2C_FUNC_I2C | (I2C_FUNC_SMBUS_EMUL & ~I2C_FUNC_SMBUS_QUICK); +} + +static struct i2c_algorithm virtio_algorithm = { + .master_xfer = virtio_i2c_xfer, + .functionality = virtio_i2c_func, +}; + +static const struct i2c_adapter_quirks virtio_i2c_quirks = { + .flags = I2C_AQ_NO_ZERO_LEN, +}; + +static int virtio_i2c_probe(struct virtio_device *vdev) +{ + struct virtio_i2c *vi; + int ret; + + vi = devm_kzalloc(&vdev->dev, sizeof(*vi), GFP_KERNEL); + if (!vi) + return -ENOMEM; + + vdev->priv = vi; + vi->vdev = vdev; + + init_completion(&vi->completion); + + ret = virtio_i2c_setup_vqs(vi); + if (ret) + return ret; + + vi->adap.owner = THIS_MODULE; + snprintf(vi->adap.name, sizeof(vi->adap.name), + "i2c_virtio at virtio bus %d", vdev->index); + vi->adap.algo = &virtio_algorithm; + vi->adap.quirks = &virtio_i2c_quirks; + vi->adap.dev.parent = &vdev->dev; + vi->adap.dev.of_node = vdev->dev.of_node; + i2c_set_adapdata(&vi->adap, vi); + + /* + * Setup ACPI node for controlled devices which will be probed through + * ACPI. 
+ */ + ACPI_COMPANION_SET(&vi->adap.dev, ACPI_COMPANION(vdev->dev.parent)); + + ret = i2c_add_adapter(&vi->adap); + if (ret) + virtio_i2c_del_vqs(vdev); + + return ret; +} + +static void virtio_i2c_remove(struct virtio_device *vdev) +{ + struct virtio_i2c *vi = vdev->priv; + + i2c_del_adapter(&vi->adap); + virtio_i2c_del_vqs(vdev); +} + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_I2C_ADAPTER, VIRTIO_DEV_ANY_ID }, + {} +}; +MODULE_DEVICE_TABLE(virtio, id_table); + +#ifdef CONFIG_PM_SLEEP +static int virtio_i2c_freeze(struct virtio_device *vdev) +{ + virtio_i2c_del_vqs(vdev); + return 0; +} + +static int virtio_i2c_restore(struct virtio_device *vdev) +{ + return virtio_i2c_setup_vqs(vdev->priv); +} +#endif + +static struct virtio_driver virtio_i2c_driver = { + .id_table = id_table, + .probe = virtio_i2c_probe, + .remove = virtio_i2c_remove, + .driver = { + .name = "i2c_virtio", + }, +#ifdef CONFIG_PM_SLEEP + .freeze = virtio_i2c_freeze, + .restore = virtio_i2c_restore, +#endif +}; +module_virtio_driver(virtio_i2c_driver); + +MODULE_AUTHOR("Jie Deng "); +MODULE_AUTHOR("Conghui Chen "); +MODULE_DESCRIPTION("Virtio i2c bus driver"); +MODULE_LICENSE("GPL"); diff --git a/include/uapi/linux/virtio_i2c.h b/include/uapi/linux/virtio_i2c.h new file mode 100644 index 000000000000..7c6a6fc01ad6 --- /dev/null +++ b/include/uapi/linux/virtio_i2c.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later WITH Linux-syscall-note */ +/* + * Definitions for virtio I2C Adpter + * + * Copyright (c) 2021 Intel Corporation. All rights reserved. + */ + +#ifndef _UAPI_LINUX_VIRTIO_I2C_H +#define _UAPI_LINUX_VIRTIO_I2C_H + +#include +#include + +/* The bit 0 of the @virtio_i2c_out_hdr.@flags, used to group the requests */ +#define VIRTIO_I2C_FLAGS_FAIL_NEXT _BITUL(0) + +/** + * struct virtio_i2c_out_hdr - the virtio I2C message OUT header + * @addr: the controlled device address + * @padding: used to pad to full dword + * @flags: used for feature extensibility + */ +struct virtio_i2c_out_hdr { + __le16 addr; + __le16 padding; + __le32 flags; +}; + +/** + * struct virtio_i2c_in_hdr - the virtio I2C message IN header + * @status: the processing result from the backend + */ +struct virtio_i2c_in_hdr { + __u8 status; +}; + +/* The final status written by the device */ +#define VIRTIO_I2C_MSG_OK 0 +#define VIRTIO_I2C_MSG_ERR 1 + +#endif /* _UAPI_LINUX_VIRTIO_I2C_H */ diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index 70a8057ad4bb..99aa27b100bc 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -55,6 +55,7 @@ #define VIRTIO_ID_FS 26 /* virtio filesystem */ #define VIRTIO_ID_PMEM 27 /* virtio pmem */ #define VIRTIO_ID_MAC80211_HWSIM 29 /* virtio mac80211-hwsim */ +#define VIRTIO_ID_I2C_ADAPTER 34 /* virtio i2c adapter */ #define VIRTIO_ID_BT 40 /* virtio bluetooth */ /* -- cgit v1.2.3-71-gd317 From 2796d846d74a18cc6563e96eff8bf28c5e06f912 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 20 Aug 2021 15:42:55 +0300 Subject: net: bridge: vlan: convert mcast router global option to per-vlan entry The per-vlan router option controls the port/vlan and host vlan entries' mcast router config. 
The global option controlled only the host vlan config, but that is unnecessary and incosistent as it's not really a global vlan option, but rather bridge option to control host router config, so convert BRIDGE_VLANDB_GOPTS_MCAST_ROUTER to BRIDGE_VLANDB_ENTRY_MCAST_ROUTER which can be used to control both host vlan and port vlan mcast router config. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 2 +- net/bridge/br_multicast.c | 13 +++++++++++ net/bridge/br_private.h | 15 +++++++++++++ net/bridge/br_vlan.c | 1 + net/bridge/br_vlan_options.c | 51 +++++++++++++++++++++++++++--------------- 5 files changed, 63 insertions(+), 19 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index f71a81fdbbc6..2711c3522010 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -506,6 +506,7 @@ enum { BRIDGE_VLANDB_ENTRY_STATE, BRIDGE_VLANDB_ENTRY_TUNNEL_INFO, BRIDGE_VLANDB_ENTRY_STATS, + BRIDGE_VLANDB_ENTRY_MCAST_ROUTER, __BRIDGE_VLANDB_ENTRY_MAX, }; #define BRIDGE_VLANDB_ENTRY_MAX (__BRIDGE_VLANDB_ENTRY_MAX - 1) @@ -561,7 +562,6 @@ enum { BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL, BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER, - BRIDGE_VLANDB_GOPTS_MCAST_ROUTER, BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_STATE, __BRIDGE_VLANDB_GOPTS_MAX diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index be9d1376e249..2c437d4bf632 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -4310,6 +4310,19 @@ unlock: return err; } +int br_multicast_set_vlan_router(struct net_bridge_vlan *v, u8 mcast_router) +{ + int err; + + if (br_vlan_is_master(v)) + err = br_multicast_set_router(&v->br_mcast_ctx, mcast_router); + else + err = br_multicast_set_port_router(&v->port_mcast_ctx, + mcast_router); + + return err; +} + static void br_multicast_start_querier(struct net_bridge_mcast *brmctx, struct bridge_mcast_own_query *query) { diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index fcc0fcf44a95..b4cef3a97f12 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -676,6 +676,20 @@ static inline bool br_vlan_valid_range(const struct bridge_vlan_info *cur, return true; } +static inline u8 br_vlan_multicast_router(const struct net_bridge_vlan *v) +{ + u8 mcast_router = MDB_RTR_TYPE_DISABLED; + +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + if (!br_vlan_is_master(v)) + mcast_router = v->port_mcast_ctx.multicast_router; + else + mcast_router = v->br_mcast_ctx.multicast_router; +#endif + + return mcast_router; +} + static inline int br_afspec_cmd_to_rtm(int cmd) { switch (cmd) { @@ -881,6 +895,7 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst, struct sk_buff *skb, int br_multicast_set_router(struct net_bridge_mcast *brmctx, unsigned long val); int br_multicast_set_port_router(struct net_bridge_mcast_port *pmctx, unsigned long val); +int br_multicast_set_vlan_router(struct net_bridge_vlan *v, u8 mcast_router); int br_multicast_toggle(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack); int br_multicast_set_querier(struct net_bridge_mcast *brmctx, unsigned long val); diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index e25e288e7a85..19f65ab91a02 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -2136,6 +2136,7 @@ static const struct nla_policy br_vlan_db_policy[BRIDGE_VLANDB_ENTRY_MAX + 1] = 
[BRIDGE_VLANDB_ENTRY_RANGE] = { .type = NLA_U16 }, [BRIDGE_VLANDB_ENTRY_STATE] = { .type = NLA_U8 }, [BRIDGE_VLANDB_ENTRY_TUNNEL_INFO] = { .type = NLA_NESTED }, + [BRIDGE_VLANDB_ENTRY_MCAST_ROUTER] = { .type = NLA_U8 }, }; static int br_vlan_rtm_process_one(struct net_device *dev, diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c index a3b8a086284b..8ffd4ed2563c 100644 --- a/net/bridge/br_vlan_options.c +++ b/net/bridge/br_vlan_options.c @@ -40,22 +40,38 @@ static bool __vlan_tun_can_enter_range(const struct net_bridge_vlan *v_curr, bool br_vlan_opts_eq_range(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *range_end) { + u8 range_mc_rtr = br_vlan_multicast_router(range_end); + u8 curr_mc_rtr = br_vlan_multicast_router(v_curr); + return v_curr->state == range_end->state && - __vlan_tun_can_enter_range(v_curr, range_end); + __vlan_tun_can_enter_range(v_curr, range_end) && + curr_mc_rtr == range_mc_rtr; } bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v) { - return !nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_STATE, - br_vlan_get_state(v)) && - __vlan_tun_put(skb, v); + if (nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_STATE, br_vlan_get_state(v)) || + !__vlan_tun_put(skb, v)) + return false; + +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + if (nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_MCAST_ROUTER, + br_vlan_multicast_router(v))) + return false; +#endif + + return true; } size_t br_vlan_opts_nl_size(void) { return nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_ENTRY_STATE */ + nla_total_size(0) /* BRIDGE_VLANDB_ENTRY_TUNNEL_INFO */ - + nla_total_size(sizeof(u32)); /* BRIDGE_VLANDB_TINFO_ID */ + + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_TINFO_ID */ +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_ENTRY_MCAST_ROUTER */ +#endif + + 0; } static int br_vlan_modify_state(struct net_bridge_vlan_group *vg, @@ -181,6 +197,18 @@ static int br_vlan_process_one_opts(const struct net_bridge *br, return err; } +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + if (tb[BRIDGE_VLANDB_ENTRY_MCAST_ROUTER]) { + u8 val; + + val = nla_get_u8(tb[BRIDGE_VLANDB_ENTRY_MCAST_ROUTER]); + err = br_multicast_set_vlan_router(v, val); + if (err) + return err; + *changed = true; + } +#endif + return 0; } @@ -298,8 +326,6 @@ bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, v_opts->br_mcast_ctx.multicast_startup_query_count) || nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER, v_opts->br_mcast_ctx.multicast_querier) || - nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_ROUTER, - v_opts->br_mcast_ctx.multicast_router) || br_multicast_dump_querier_state(skb, &v_opts->br_mcast_ctx, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_STATE)) goto out_err; @@ -380,7 +406,6 @@ static size_t rtnl_vlan_global_opts_nlmsg_size(const struct net_bridge_vlan *v) + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL */ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL */ + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER */ - + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_ROUTER */ + br_multicast_querier_state_size() /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_STATE */ + nla_total_size(0) /* BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS */ + br_rports_size(&v->br_mcast_ctx) /* BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS */ @@ -522,15 +547,6 @@ static int br_vlan_process_global_one_opts(const struct net_bridge *br, return err; *changed = true; } - if (tb[BRIDGE_VLANDB_GOPTS_MCAST_ROUTER]) { - u8 val; - - val = 
nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_ROUTER]); - err = br_multicast_set_router(&v->br_mcast_ctx, val); - if (err) - return err; - *changed = true; - } #if IS_ENABLED(CONFIG_IPV6) if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]) { u8 ver; @@ -554,7 +570,6 @@ static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = { [BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION] = { .type = NLA_U8 }, [BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL] = { .type = NLA_U64 }, [BRIDGE_VLANDB_GOPTS_MCAST_QUERIER] = { .type = NLA_U8 }, - [BRIDGE_VLANDB_GOPTS_MCAST_ROUTER] = { .type = NLA_U8 }, [BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION] = { .type = NLA_U8 }, [BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT] = { .type = NLA_U32 }, [BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT] = { .type = NLA_U32 }, -- cgit v1.2.3-71-gd317 From f95937ccf5bd5e0a6bbac2b8e65a87982ffae403 Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Mon, 2 Aug 2021 16:56:29 +0000 Subject: KVM: stats: Support linear and logarithmic histogram statistics Add new types of KVM stats, linear and logarithmic histogram. Histogram are very useful for observing the value distribution of time or size related stats. Signed-off-by: Jing Zhang Message-Id: <20210802165633.1866976-2-jingzhangos@google.com> Signed-off-by: Paolo Bonzini --- arch/arm64/kvm/guest.c | 4 --- arch/mips/kvm/mips.c | 4 --- arch/powerpc/kvm/book3s.c | 4 --- arch/powerpc/kvm/booke.c | 4 --- arch/s390/kvm/kvm-s390.c | 4 --- arch/x86/kvm/x86.c | 4 --- include/linux/kvm_host.h | 90 +++++++++++++++++++++++++++++++++++++++-------- include/uapi/linux/kvm.h | 11 +++--- 8 files changed, 82 insertions(+), 43 deletions(-) (limited to 'include/uapi') diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 1dfb83578277..5188184d25d0 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -31,8 +31,6 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS() }; -static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == - sizeof(struct kvm_vm_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -52,8 +50,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, mmio_exit_kernel), STATS_DESC_COUNTER(VCPU, exits) }; -static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == - sizeof(struct kvm_vcpu_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index af9dd029a4e1..75c6f264c626 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -41,8 +41,6 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS() }; -static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == - sizeof(struct kvm_vm_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -85,8 +83,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, vz_cpucfg_exits), #endif }; -static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == - sizeof(struct kvm_vcpu_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 79833f78d1da..5cc6e90095b0 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -43,8 +43,6 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { STATS_DESC_ICOUNTER(VM, num_2M_pages), STATS_DESC_ICOUNTER(VM, num_1G_pages) }; -static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == - 
sizeof(struct kvm_vm_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -88,8 +86,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, pthru_host), STATS_DESC_COUNTER(VCPU, pthru_bad_aff) }; -static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == - sizeof(struct kvm_vcpu_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 551b30d84aee..5ed6c235e059 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -41,8 +41,6 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { STATS_DESC_ICOUNTER(VM, num_2M_pages), STATS_DESC_ICOUNTER(VM, num_1G_pages) }; -static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == - sizeof(struct kvm_vm_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -79,8 +77,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, pthru_host), STATS_DESC_COUNTER(VCPU, pthru_bad_aff) }; -static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == - sizeof(struct kvm_vcpu_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 02574d7b3612..4dc7e966a720 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -66,8 +66,6 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { STATS_DESC_COUNTER(VM, inject_service_signal), STATS_DESC_COUNTER(VM, inject_virtio) }; -static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == - sizeof(struct kvm_vm_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -174,8 +172,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, instruction_diagnose_other), STATS_DESC_COUNTER(VCPU, pfault_sync) }; -static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == - sizeof(struct kvm_vcpu_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bf8cb1021d11..9425589f34ca 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -238,8 +238,6 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { STATS_DESC_PCOUNTER(VM, max_mmu_rmap_size), STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions) }; -static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == - sizeof(struct kvm_vm_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -279,8 +277,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, directed_yield_successful), STATS_DESC_ICOUNTER(VCPU, guest_mode) }; -static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == - sizeof(struct kvm_vcpu_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4e43843fe0d7..09fc0274b1eb 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1356,56 +1356,66 @@ struct _kvm_stats_desc { char name[KVM_STATS_NAME_SIZE]; }; -#define STATS_DESC_COMMON(type, unit, base, exp) \ +#define STATS_DESC_COMMON(type, unit, base, exp, sz, bsz) \ .flags = type | unit | base | \ BUILD_BUG_ON_ZERO(type & ~KVM_STATS_TYPE_MASK) | \ BUILD_BUG_ON_ZERO(unit & ~KVM_STATS_UNIT_MASK) | \ BUILD_BUG_ON_ZERO(base & 
~KVM_STATS_BASE_MASK), \ .exponent = exp, \ - .size = 1 + .size = sz, \ + .bucket_size = bsz -#define VM_GENERIC_STATS_DESC(stat, type, unit, base, exp) \ +#define VM_GENERIC_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ - STATS_DESC_COMMON(type, unit, base, exp), \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vm_stat, generic.stat) \ }, \ .name = #stat, \ } -#define VCPU_GENERIC_STATS_DESC(stat, type, unit, base, exp) \ +#define VCPU_GENERIC_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ - STATS_DESC_COMMON(type, unit, base, exp), \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vcpu_stat, generic.stat) \ }, \ .name = #stat, \ } -#define VM_STATS_DESC(stat, type, unit, base, exp) \ +#define VM_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ - STATS_DESC_COMMON(type, unit, base, exp), \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vm_stat, stat) \ }, \ .name = #stat, \ } -#define VCPU_STATS_DESC(stat, type, unit, base, exp) \ +#define VCPU_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ - STATS_DESC_COMMON(type, unit, base, exp), \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vcpu_stat, stat) \ }, \ .name = #stat, \ } /* SCOPE: VM, VM_GENERIC, VCPU, VCPU_GENERIC */ -#define STATS_DESC(SCOPE, stat, type, unit, base, exp) \ - SCOPE##_STATS_DESC(stat, type, unit, base, exp) +#define STATS_DESC(SCOPE, stat, type, unit, base, exp, sz, bsz) \ + SCOPE##_STATS_DESC(stat, type, unit, base, exp, sz, bsz) #define STATS_DESC_CUMULATIVE(SCOPE, name, unit, base, exponent) \ - STATS_DESC(SCOPE, name, KVM_STATS_TYPE_CUMULATIVE, unit, base, exponent) + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_CUMULATIVE, \ + unit, base, exponent, 1, 0) #define STATS_DESC_INSTANT(SCOPE, name, unit, base, exponent) \ - STATS_DESC(SCOPE, name, KVM_STATS_TYPE_INSTANT, unit, base, exponent) + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_INSTANT, \ + unit, base, exponent, 1, 0) #define STATS_DESC_PEAK(SCOPE, name, unit, base, exponent) \ - STATS_DESC(SCOPE, name, KVM_STATS_TYPE_PEAK, unit, base, exponent) + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_PEAK, \ + unit, base, exponent, 1, 0) +#define STATS_DESC_LINEAR_HIST(SCOPE, name, unit, base, exponent, sz, bsz) \ + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_LINEAR_HIST, \ + unit, base, exponent, sz, bsz) +#define STATS_DESC_LOG_HIST(SCOPE, name, unit, base, exponent, sz) \ + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_LOG_HIST, \ + unit, base, exponent, sz, 0) /* Cumulative counter, read/write */ #define STATS_DESC_COUNTER(SCOPE, name) \ @@ -1424,6 +1434,14 @@ struct _kvm_stats_desc { #define STATS_DESC_TIME_NSEC(SCOPE, name) \ STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS, \ KVM_STATS_BASE_POW10, -9) +/* Linear histogram for time in nanosecond */ +#define STATS_DESC_LINHIST_TIME_NSEC(SCOPE, name, sz, bsz) \ + STATS_DESC_LINEAR_HIST(SCOPE, name, KVM_STATS_UNIT_SECONDS, \ + KVM_STATS_BASE_POW10, -9, sz, bsz) +/* Logarithmic histogram for time in nanosecond */ +#define STATS_DESC_LOGHIST_TIME_NSEC(SCOPE, name, sz) \ + STATS_DESC_LOG_HIST(SCOPE, name, KVM_STATS_UNIT_SECONDS, \ + KVM_STATS_BASE_POW10, -9, sz) #define KVM_GENERIC_VM_STATS() \ STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush) @@ -1437,10 +1455,52 @@ struct _kvm_stats_desc { STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_ns) extern struct dentry *kvm_debugfs_dir; + ssize_t kvm_stats_read(char *id, const struct 
kvm_stats_header *header, const struct _kvm_stats_desc *desc, void *stats, size_t size_stats, char __user *user_buffer, size_t size, loff_t *offset); + +/** + * kvm_stats_linear_hist_update() - Update bucket value for linear histogram + * statistics data. + * + * @data: start address of the stats data + * @size: the number of bucket of the stats data + * @value: the new value used to update the linear histogram's bucket + * @bucket_size: the size (width) of a bucket + */ +static inline void kvm_stats_linear_hist_update(u64 *data, size_t size, + u64 value, size_t bucket_size) +{ + size_t index = div64_u64(value, bucket_size); + + index = min(index, size - 1); + ++data[index]; +} + +/** + * kvm_stats_log_hist_update() - Update bucket value for logarithmic histogram + * statistics data. + * + * @data: start address of the stats data + * @size: the number of bucket of the stats data + * @value: the new value used to update the logarithmic histogram's bucket + */ +static inline void kvm_stats_log_hist_update(u64 *data, size_t size, u64 value) +{ + size_t index = fls64(value); + + index = min(index, size - 1); + ++data[index]; +} + +#define KVM_STATS_LINEAR_HIST_UPDATE(array, value, bsize) \ + kvm_stats_linear_hist_update(array, ARRAY_SIZE(array), value, bsize) +#define KVM_STATS_LOG_HIST_UPDATE(array, value) \ + kvm_stats_log_hist_update(array, ARRAY_SIZE(array), value) + + extern const struct kvm_stats_header kvm_vm_stats_header; extern const struct _kvm_stats_desc kvm_vm_stats_desc[]; extern const struct kvm_stats_header kvm_vcpu_stats_header; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index d9e4aabcb31a..a067410ebea5 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1965,7 +1965,9 @@ struct kvm_stats_header { #define KVM_STATS_TYPE_CUMULATIVE (0x0 << KVM_STATS_TYPE_SHIFT) #define KVM_STATS_TYPE_INSTANT (0x1 << KVM_STATS_TYPE_SHIFT) #define KVM_STATS_TYPE_PEAK (0x2 << KVM_STATS_TYPE_SHIFT) -#define KVM_STATS_TYPE_MAX KVM_STATS_TYPE_PEAK +#define KVM_STATS_TYPE_LINEAR_HIST (0x3 << KVM_STATS_TYPE_SHIFT) +#define KVM_STATS_TYPE_LOG_HIST (0x4 << KVM_STATS_TYPE_SHIFT) +#define KVM_STATS_TYPE_MAX KVM_STATS_TYPE_LOG_HIST #define KVM_STATS_UNIT_SHIFT 4 #define KVM_STATS_UNIT_MASK (0xF << KVM_STATS_UNIT_SHIFT) @@ -1988,8 +1990,9 @@ struct kvm_stats_header { * @size: The number of data items for this stats. * Every data item is of type __u64. * @offset: The offset of the stats to the start of stat structure in - * struture kvm or kvm_vcpu. - * @unused: Unused field for future usage. Always 0 for now. + * structure kvm or kvm_vcpu. + * @bucket_size: A parameter value used for histogram stats. It is only used + * for linear histogram stats, specifying the size of the bucket; * @name: The name string for the stats. Its size is indicated by the * &kvm_stats_header->name_size. */ @@ -1998,7 +2001,7 @@ struct kvm_stats_desc { __s16 exponent; __u16 size; __u32 offset; - __u32 unused; + __u32 bucket_size; char name[]; }; -- cgit v1.2.3-71-gd317 From 3a29355a22c0275fe864100794fee58a73175d93 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 19 Aug 2021 10:00:22 +0530 Subject: gpio: Add virtio-gpio driver This patch adds a new driver for Virtio based GPIO devices. This allows a guest VM running Linux to access GPIO lines provided by the host. It supports all basic operations, except interrupts for the GPIO lines. Based on the initial work posted by: "Enrico Weigelt, metux IT consult" . 
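A rough standalone sketch of the wire format the driver speaks on its single request virtqueue may help: one out-buffer carries a virtio_gpio_request and one in-buffer receives a virtio_gpio_response. The structures and constants mirror the include/uapi/linux/virtio_gpio.h added by this patch; endianness conversion and the virtqueue plumbing are omitted, so this is an illustration rather than driver code.

/* Build a "set line 3 high" request and the response a device would return. */
#include <stdint.h>
#include <stdio.h>

#define VIRTIO_GPIO_MSG_SET_VALUE 0x0005
#define VIRTIO_GPIO_STATUS_OK     0x0

struct virtio_gpio_request {
	uint16_t type;   /* VIRTIO_GPIO_MSG_* */
	uint16_t gpio;   /* line number */
	uint32_t value;  /* level or direction, depending on type */
};

struct virtio_gpio_response {
	uint8_t status;  /* VIRTIO_GPIO_STATUS_OK or _ERR */
	uint8_t value;   /* returned level or direction */
};

int main(void)
{
	struct virtio_gpio_request req = {
		.type  = VIRTIO_GPIO_MSG_SET_VALUE,
		.gpio  = 3,
		.value = 1,
	};
	struct virtio_gpio_response res = { .status = VIRTIO_GPIO_STATUS_OK };

	printf("request: type=0x%04x gpio=%u value=%u -> status=%u\n",
	       (unsigned int)req.type, (unsigned int)req.gpio,
	       (unsigned int)req.value, (unsigned int)res.status);
	return 0;
}
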
Reviewed-by: Linus Walleij Signed-off-by: Viresh Kumar Signed-off-by: Bartosz Golaszewski --- MAINTAINERS | 7 + drivers/gpio/Kconfig | 9 + drivers/gpio/Makefile | 1 + drivers/gpio/gpio-virtio.c | 375 +++++++++++++++++++++++++++++++++++++++ include/uapi/linux/virtio_gpio.h | 47 +++++ include/uapi/linux/virtio_ids.h | 1 + 6 files changed, 440 insertions(+) create mode 100644 drivers/gpio/gpio-virtio.c create mode 100644 include/uapi/linux/virtio_gpio.h (limited to 'include/uapi') diff --git a/MAINTAINERS b/MAINTAINERS index 6ffb5cbbeb93..9904c5c1996c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19647,6 +19647,13 @@ F: Documentation/filesystems/virtiofs.rst F: fs/fuse/virtio_fs.c F: include/uapi/linux/virtio_fs.h +VIRTIO GPIO DRIVER +M: Enrico Weigelt, metux IT consult +M: Viresh Kumar +S: Maintained +F: drivers/gpio/gpio-virtio.c +F: include/uapi/linux/virtio_gpio.h + VIRTIO GPU DRIVER M: David Airlie M: Gerd Hoffmann diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index fab571016adf..e5993d6864fb 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -1669,6 +1669,15 @@ config GPIO_MOCKUP tools/testing/selftests/gpio/gpio-mockup.sh. Reference the usage in it. +config GPIO_VIRTIO + tristate "VirtIO GPIO support" + depends on VIRTIO + help + Say Y here to enable guest support for virtio-based GPIO controllers. + + These virtual GPIOs can be routed to real GPIOs or attached to + simulators on the host (like QEMU). + endmenu endif diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile index 32a32659866a..e0301cfedd8d 100644 --- a/drivers/gpio/Makefile +++ b/drivers/gpio/Makefile @@ -165,6 +165,7 @@ obj-$(CONFIG_GPIO_UCB1400) += gpio-ucb1400.o obj-$(CONFIG_GPIO_UNIPHIER) += gpio-uniphier.o obj-$(CONFIG_GPIO_VF610) += gpio-vf610.o obj-$(CONFIG_GPIO_VIPERBOARD) += gpio-viperboard.o +obj-$(CONFIG_GPIO_VIRTIO) += gpio-virtio.o obj-$(CONFIG_GPIO_VISCONTI) += gpio-visconti.o obj-$(CONFIG_GPIO_VR41XX) += gpio-vr41xx.o obj-$(CONFIG_GPIO_VX855) += gpio-vx855.o diff --git a/drivers/gpio/gpio-virtio.c b/drivers/gpio/gpio-virtio.c new file mode 100644 index 000000000000..d33eb237c0b9 --- /dev/null +++ b/drivers/gpio/gpio-virtio.c @@ -0,0 +1,375 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * GPIO driver for virtio-based virtual GPIO controllers + * + * Copyright (C) 2021 metux IT consult + * Enrico Weigelt, metux IT consult + * + * Copyright (C) 2021 Linaro. + * Viresh Kumar + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct virtio_gpio_line { + struct mutex lock; /* Protects line operation */ + struct completion completion; + struct virtio_gpio_request req ____cacheline_aligned; + struct virtio_gpio_response res ____cacheline_aligned; + unsigned int rxlen; +}; + +struct virtio_gpio { + struct virtio_device *vdev; + struct mutex lock; /* Protects virtqueue operation */ + struct gpio_chip gc; + struct virtio_gpio_config config; + struct virtio_gpio_line *lines; + struct virtqueue *request_vq; +}; + +static int _virtio_gpio_req(struct virtio_gpio *vgpio, u16 type, u16 gpio, + u8 txvalue, u8 *rxvalue, void *response, u32 rxlen) +{ + struct virtio_gpio_line *line = &vgpio->lines[gpio]; + struct virtio_gpio_request *req = &line->req; + struct virtio_gpio_response *res = response; + struct scatterlist *sgs[2], req_sg, res_sg; + struct device *dev = &vgpio->vdev->dev; + int ret; + + /* + * Prevent concurrent requests for the same line since we have + * pre-allocated request/response buffers for each GPIO line. 
Moreover + * Linux always accesses a GPIO line sequentially, so this locking shall + * always go through without any delays. + */ + mutex_lock(&line->lock); + + req->type = cpu_to_le16(type); + req->gpio = cpu_to_le16(gpio); + req->value = txvalue; + + sg_init_one(&req_sg, req, sizeof(*req)); + sg_init_one(&res_sg, res, rxlen); + sgs[0] = &req_sg; + sgs[1] = &res_sg; + + line->rxlen = 0; + reinit_completion(&line->completion); + + /* + * Virtqueue callers need to ensure they don't call its APIs with other + * virtqueue operations at the same time. + */ + mutex_lock(&vgpio->lock); + ret = virtqueue_add_sgs(vgpio->request_vq, sgs, 1, 1, line, GFP_KERNEL); + if (ret) { + dev_err(dev, "failed to add request to vq\n"); + mutex_unlock(&vgpio->lock); + goto out; + } + + virtqueue_kick(vgpio->request_vq); + mutex_unlock(&vgpio->lock); + + if (!wait_for_completion_timeout(&line->completion, HZ)) { + dev_err(dev, "GPIO operation timed out\n"); + ret = -ETIMEDOUT; + goto out; + } + + if (unlikely(res->status != VIRTIO_GPIO_STATUS_OK)) { + dev_err(dev, "GPIO request failed: %d\n", gpio); + ret = -EINVAL; + goto out; + } + + if (unlikely(line->rxlen != rxlen)) { + dev_err(dev, "GPIO operation returned incorrect len (%u : %u)\n", + rxlen, line->rxlen); + ret = -EINVAL; + goto out; + } + + if (rxvalue) + *rxvalue = res->value; + +out: + mutex_unlock(&line->lock); + return ret; +} + +static int virtio_gpio_req(struct virtio_gpio *vgpio, u16 type, u16 gpio, + u8 txvalue, u8 *rxvalue) +{ + struct virtio_gpio_line *line = &vgpio->lines[gpio]; + struct virtio_gpio_response *res = &line->res; + + return _virtio_gpio_req(vgpio, type, gpio, txvalue, rxvalue, res, + sizeof(*res)); +} + +static void virtio_gpio_free(struct gpio_chip *gc, unsigned int gpio) +{ + struct virtio_gpio *vgpio = gpiochip_get_data(gc); + + virtio_gpio_req(vgpio, VIRTIO_GPIO_MSG_SET_DIRECTION, gpio, + VIRTIO_GPIO_DIRECTION_NONE, NULL); +} + +static int virtio_gpio_get_direction(struct gpio_chip *gc, unsigned int gpio) +{ + struct virtio_gpio *vgpio = gpiochip_get_data(gc); + u8 direction; + int ret; + + ret = virtio_gpio_req(vgpio, VIRTIO_GPIO_MSG_GET_DIRECTION, gpio, 0, + &direction); + if (ret) + return ret; + + switch (direction) { + case VIRTIO_GPIO_DIRECTION_IN: + return GPIO_LINE_DIRECTION_IN; + case VIRTIO_GPIO_DIRECTION_OUT: + return GPIO_LINE_DIRECTION_OUT; + default: + return -EINVAL; + } +} + +static int virtio_gpio_direction_input(struct gpio_chip *gc, unsigned int gpio) +{ + struct virtio_gpio *vgpio = gpiochip_get_data(gc); + + return virtio_gpio_req(vgpio, VIRTIO_GPIO_MSG_SET_DIRECTION, gpio, + VIRTIO_GPIO_DIRECTION_IN, NULL); +} + +static int virtio_gpio_direction_output(struct gpio_chip *gc, unsigned int gpio, + int value) +{ + struct virtio_gpio *vgpio = gpiochip_get_data(gc); + int ret; + + ret = virtio_gpio_req(vgpio, VIRTIO_GPIO_MSG_SET_VALUE, gpio, value, NULL); + if (ret) + return ret; + + return virtio_gpio_req(vgpio, VIRTIO_GPIO_MSG_SET_DIRECTION, gpio, + VIRTIO_GPIO_DIRECTION_OUT, NULL); +} + +static int virtio_gpio_get(struct gpio_chip *gc, unsigned int gpio) +{ + struct virtio_gpio *vgpio = gpiochip_get_data(gc); + u8 value; + int ret; + + ret = virtio_gpio_req(vgpio, VIRTIO_GPIO_MSG_GET_VALUE, gpio, 0, &value); + return ret ? 
ret : value; +} + +static void virtio_gpio_set(struct gpio_chip *gc, unsigned int gpio, int value) +{ + struct virtio_gpio *vgpio = gpiochip_get_data(gc); + + virtio_gpio_req(vgpio, VIRTIO_GPIO_MSG_SET_VALUE, gpio, value, NULL); +} + +static void virtio_gpio_request_vq(struct virtqueue *vq) +{ + struct virtio_gpio_line *line; + unsigned int len; + + do { + line = virtqueue_get_buf(vq, &len); + if (!line) + return; + + line->rxlen = len; + complete(&line->completion); + } while (1); +} + +static void virtio_gpio_free_vqs(struct virtio_device *vdev) +{ + vdev->config->reset(vdev); + vdev->config->del_vqs(vdev); +} + +static int virtio_gpio_alloc_vqs(struct virtio_gpio *vgpio, + struct virtio_device *vdev) +{ + const char * const names[] = { "requestq" }; + vq_callback_t *cbs[] = { + virtio_gpio_request_vq, + }; + struct virtqueue *vqs[1] = { NULL }; + int ret; + + ret = virtio_find_vqs(vdev, 1, vqs, cbs, names, NULL); + if (ret) { + dev_err(&vdev->dev, "failed to find vqs: %d\n", ret); + return ret; + } + + if (!vqs[0]) { + dev_err(&vdev->dev, "failed to find requestq vq\n"); + return -ENODEV; + } + vgpio->request_vq = vqs[0]; + + return 0; +} + +static const char **virtio_gpio_get_names(struct virtio_gpio *vgpio) +{ + struct virtio_gpio_config *config = &vgpio->config; + struct virtio_gpio_response_get_names *res; + struct device *dev = &vgpio->vdev->dev; + u8 *gpio_names, *str; + const char **names; + int i, ret, len; + + if (!config->gpio_names_size) + return NULL; + + len = sizeof(*res) + config->gpio_names_size; + res = devm_kzalloc(dev, len, GFP_KERNEL); + if (!res) + return NULL; + gpio_names = res->value; + + ret = _virtio_gpio_req(vgpio, VIRTIO_GPIO_MSG_GET_NAMES, 0, 0, NULL, + res, len); + if (ret) { + dev_err(dev, "Failed to get GPIO names: %d\n", ret); + return NULL; + } + + names = devm_kcalloc(dev, config->ngpio, sizeof(*names), GFP_KERNEL); + if (!names) + return NULL; + + /* NULL terminate the string instead of checking it */ + gpio_names[config->gpio_names_size - 1] = '\0'; + + for (i = 0, str = gpio_names; i < config->ngpio; i++) { + names[i] = str; + str += strlen(str) + 1; /* zero-length strings are allowed */ + + if (str > gpio_names + config->gpio_names_size) { + dev_err(dev, "gpio_names block is too short (%d)\n", i); + return NULL; + } + } + + return names; +} + +static int virtio_gpio_probe(struct virtio_device *vdev) +{ + struct virtio_gpio_config *config; + struct device *dev = &vdev->dev; + struct virtio_gpio *vgpio; + int ret, i; + + vgpio = devm_kzalloc(dev, sizeof(*vgpio), GFP_KERNEL); + if (!vgpio) + return -ENOMEM; + + config = &vgpio->config; + + /* Read configuration */ + virtio_cread_bytes(vdev, 0, config, sizeof(*config)); + config->gpio_names_size = le32_to_cpu(config->gpio_names_size); + config->ngpio = le16_to_cpu(config->ngpio); + if (!config->ngpio) { + dev_err(dev, "Number of GPIOs can't be zero\n"); + return -EINVAL; + } + + vgpio->lines = devm_kcalloc(dev, config->ngpio, sizeof(*vgpio->lines), GFP_KERNEL); + if (!vgpio->lines) + return -ENOMEM; + + for (i = 0; i < config->ngpio; i++) { + mutex_init(&vgpio->lines[i].lock); + init_completion(&vgpio->lines[i].completion); + } + + mutex_init(&vgpio->lock); + vdev->priv = vgpio; + + vgpio->vdev = vdev; + vgpio->gc.free = virtio_gpio_free; + vgpio->gc.get_direction = virtio_gpio_get_direction; + vgpio->gc.direction_input = virtio_gpio_direction_input; + vgpio->gc.direction_output = virtio_gpio_direction_output; + vgpio->gc.get = virtio_gpio_get; + vgpio->gc.set = virtio_gpio_set; + vgpio->gc.ngpio = 
config->ngpio; + vgpio->gc.base = -1; /* Allocate base dynamically */ + vgpio->gc.label = dev_name(dev); + vgpio->gc.parent = dev; + vgpio->gc.owner = THIS_MODULE; + vgpio->gc.can_sleep = true; + + ret = virtio_gpio_alloc_vqs(vgpio, vdev); + if (ret) + return ret; + + /* Mark the device ready to perform operations from within probe() */ + virtio_device_ready(vdev); + + vgpio->gc.names = virtio_gpio_get_names(vgpio); + + ret = gpiochip_add_data(&vgpio->gc, vgpio); + if (ret) { + virtio_gpio_free_vqs(vdev); + dev_err(dev, "Failed to add virtio-gpio controller\n"); + } + + return ret; +} + +static void virtio_gpio_remove(struct virtio_device *vdev) +{ + struct virtio_gpio *vgpio = vdev->priv; + + gpiochip_remove(&vgpio->gc); + virtio_gpio_free_vqs(vdev); +} + +static const struct virtio_device_id id_table[] = { + { VIRTIO_ID_GPIO, VIRTIO_DEV_ANY_ID }, + {}, +}; +MODULE_DEVICE_TABLE(virtio, id_table); + +static struct virtio_driver virtio_gpio_driver = { + .id_table = id_table, + .probe = virtio_gpio_probe, + .remove = virtio_gpio_remove, + .driver = { + .name = KBUILD_MODNAME, + .owner = THIS_MODULE, + }, +}; +module_virtio_driver(virtio_gpio_driver); + +MODULE_AUTHOR("Enrico Weigelt, metux IT consult "); +MODULE_AUTHOR("Viresh Kumar "); +MODULE_DESCRIPTION("VirtIO GPIO driver"); +MODULE_LICENSE("GPL"); diff --git a/include/uapi/linux/virtio_gpio.h b/include/uapi/linux/virtio_gpio.h new file mode 100644 index 000000000000..844574acf095 --- /dev/null +++ b/include/uapi/linux/virtio_gpio.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ + +#ifndef _LINUX_VIRTIO_GPIO_H +#define _LINUX_VIRTIO_GPIO_H + +#include + +/* Virtio GPIO request types */ +#define VIRTIO_GPIO_MSG_GET_NAMES 0x0001 +#define VIRTIO_GPIO_MSG_GET_DIRECTION 0x0002 +#define VIRTIO_GPIO_MSG_SET_DIRECTION 0x0003 +#define VIRTIO_GPIO_MSG_GET_VALUE 0x0004 +#define VIRTIO_GPIO_MSG_SET_VALUE 0x0005 + +/* Possible values of the status field */ +#define VIRTIO_GPIO_STATUS_OK 0x0 +#define VIRTIO_GPIO_STATUS_ERR 0x1 + +/* Direction types */ +#define VIRTIO_GPIO_DIRECTION_NONE 0x00 +#define VIRTIO_GPIO_DIRECTION_OUT 0x01 +#define VIRTIO_GPIO_DIRECTION_IN 0x02 + +struct virtio_gpio_config { + __u16 ngpio; + __u8 padding[2]; + __u32 gpio_names_size; +} __packed; + +/* Virtio GPIO Request / Response */ +struct virtio_gpio_request { + __u16 type; + __u16 gpio; + __u32 value; +}; + +struct virtio_gpio_response { + __u8 status; + __u8 value; +}; + +struct virtio_gpio_response_get_names { + __u8 status; + __u8 value[]; +}; + +#endif /* _LINUX_VIRTIO_GPIO_H */ diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index 70a8057ad4bb..e04fa2bfc0eb 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -56,6 +56,7 @@ #define VIRTIO_ID_PMEM 27 /* virtio pmem */ #define VIRTIO_ID_MAC80211_HWSIM 29 /* virtio mac80211-hwsim */ #define VIRTIO_ID_BT 40 /* virtio bluetooth */ +#define VIRTIO_ID_GPIO 41 /* virtio gpio */ /* * Virtio Transitional IDs -- cgit v1.2.3-71-gd317 From 146054090b0859b28fc39015c7704ccc3c3a347f Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Wed, 30 Jun 2021 13:01:49 -0700 Subject: btrfs: initial fsverity support Add support for fsverity in btrfs. To support the generic interface in fs/verity, we add two new item types in the fs tree for inodes with verity enabled. One stores the per-file verity descriptor and btrfs verity item and the other stores the Merkle tree data itself. 
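As a concrete illustration of where that metadata lives, the sketch below spells out the btree keys involved, using the two key type values (36 and 37) that this patch adds to btrfs_tree.h; the inode number is made up and the key is reduced to its three components, so this is a simplified model rather than the real on-disk definitions.

#include <stdint.h>
#include <stdio.h>

#define BTRFS_VERITY_DESC_ITEM_KEY   36
#define BTRFS_VERITY_MERKLE_ITEM_KEY 37

struct key { uint64_t objectid; uint8_t type; uint64_t offset; };

int main(void)
{
	uint64_t ino = 257; /* hypothetical inode number */
	struct key k[] = {
		{ ino, BTRFS_VERITY_DESC_ITEM_KEY,   0 }, /* btrfs_verity_descriptor_item */
		{ ino, BTRFS_VERITY_DESC_ITEM_KEY,   1 }, /* fsverity descriptor bytes */
		{ ino, BTRFS_VERITY_MERKLE_ITEM_KEY, 0 }, /* Merkle tree bytes, keyed by byte offset */
	};

	for (int i = 0; i < 3; i++)
		printf("(%llu, %u, %llu)\n", (unsigned long long)k[i].objectid,
		       (unsigned int)k[i].type, (unsigned long long)k[i].offset);
	return 0;
}
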
Verity checking is done in end_page_read just before a page is marked uptodate. This naturally handles a variety of edge cases like holes, preallocated extents, and inline extents. Some care needs to be taken to not try to verity pages past the end of the file, which are accessed by the generic buffered file reading code under some circumstances like reading to the end of the last page and trying to read again. Direct IO on a verity file falls back to buffered reads. Verity relies on PageChecked for the Merkle tree data itself to avoid re-walking up shared paths in the tree. For this reason, we need to cache the Merkle tree data. Since the file is immutable after verity is turned on, we can cache it at an index past EOF. Use the new inode ro_flags to store verity on the inode item, so that we can enable verity on a file, then rollback to an older kernel and still mount the file system and read the file. Since we can't safely write the file anymore without ruining the invariants of the Merkle tree, we mark a ro_compat flag on the file system when a file has verity enabled. Acked-by: Eric Biggers Co-developed-by: Chris Mason Signed-off-by: Chris Mason Signed-off-by: Boris Burkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/Makefile | 1 + fs/btrfs/btrfs_inode.h | 7 + fs/btrfs/ctree.h | 31 +- fs/btrfs/extent_io.c | 26 +- fs/btrfs/file.c | 10 + fs/btrfs/inode.c | 6 + fs/btrfs/ioctl.c | 14 +- fs/btrfs/super.c | 3 + fs/btrfs/sysfs.c | 6 + fs/btrfs/verity.c | 738 ++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/btrfs.h | 1 + include/uapi/linux/btrfs_tree.h | 35 ++ 12 files changed, 859 insertions(+), 19 deletions(-) create mode 100644 fs/btrfs/verity.c (limited to 'include/uapi') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index cec88a66bd6c..3dcf9bcc2326 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -36,6 +36,7 @@ btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o btrfs-$(CONFIG_BLK_DEV_ZONED) += zoned.o +btrfs-$(CONFIG_FS_VERITY) += verity.o btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ tests/extent-buffer-tests.o tests/btrfs-tests.o \ diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 1093b00130be..76ee1452c57b 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -51,6 +51,13 @@ enum { * the file range, inode's io_tree). */ BTRFS_INODE_NO_DELALLOC_FLUSH, + /* + * Set when we are working on enabling verity for a file. Computing and + * writing the whole Merkle tree can take a while so we want to prevent + * races where two separate tasks attempt to simultaneously start verity + * on the same file. 
+ */ + BTRFS_INODE_VERITY_IN_PROGRESS, }; /* in memory btrfs inode */ diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9e3b7a56a78f..f17be4b023cb 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -281,7 +281,8 @@ struct btrfs_super_block { #define BTRFS_FEATURE_COMPAT_RO_SUPP \ (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \ - BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID) + BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \ + BTRFS_FEATURE_COMPAT_RO_VERITY) #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL @@ -1512,7 +1513,9 @@ do { \ BTRFS_INODE_COMPRESS | \ BTRFS_INODE_ROOT_ITEM_INIT) -#define BTRFS_INODE_RO_FLAG_MASK (0) +#define BTRFS_INODE_RO_VERITY (1U << 0) + +#define BTRFS_INODE_RO_FLAG_MASK (BTRFS_INODE_RO_VERITY) struct btrfs_map_token { struct extent_buffer *eb; @@ -3791,6 +3794,30 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) return signal_pending(current); } +/* verity.c */ +#ifdef CONFIG_FS_VERITY + +extern const struct fsverity_operations btrfs_verityops; +int btrfs_drop_verity_items(struct btrfs_inode *inode); + +BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item, + encryption, 8); +BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item, + size, 64); +BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption, + struct btrfs_verity_descriptor_item, encryption, 8); +BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size, + struct btrfs_verity_descriptor_item, size, 64); + +#else + +static inline int btrfs_drop_verity_items(struct btrfs_inode *inode) +{ + return 0; +} + +#endif + /* Sanity test specific functions */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_destroy_inode(struct inode *inode); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 66888b10b00d..96de6e70d06c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "misc.h" #include "extent_io.h" #include "extent-io-tree.h" @@ -2247,18 +2248,6 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, return bitset; } -/* - * helper function to set a given page up to date if all the - * extents in the tree for that page are up to date - */ -static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) -{ - u64 start = page_offset(page); - u64 end = start + PAGE_SIZE - 1; - if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) - SetPageUptodate(page); -} - int free_io_failure(struct extent_io_tree *failure_tree, struct extent_io_tree *io_tree, struct io_failure_record *rec) @@ -2690,7 +2679,15 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) start + len <= page_offset(page) + PAGE_SIZE); if (uptodate) { - btrfs_page_set_uptodate(fs_info, page, start, len); + if (fsverity_active(page->mapping->host) && + !PageError(page) && + !PageUptodate(page) && + start < i_size_read(page->mapping->host) && + !fsverity_verify_page(page)) { + btrfs_page_set_error(fs_info, page, start, len); + } else { + btrfs_page_set_uptodate(fs_info, page, start, len); + } } else { btrfs_page_clear_uptodate(fs_info, page, start, len); btrfs_page_set_error(fs_info, page, start, len); @@ -3105,7 +3102,7 @@ readpage_ok: /* Update page status and unlock */ end_page_read(page, uptodate, start, len); endio_readpage_release_extent(&processed, BTRFS_I(inode), - start, end, uptodate); + start, end, PageUptodate(page)); } /* Release the last extent */ 
endio_readpage_release_extent(&processed, NULL, 0, 0, false); @@ -3706,7 +3703,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, /* the get_extent function already copied into the page */ if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1, NULL)) { - check_page_uptodate(tree, page); unlock_extent(tree, cur, cur + iosize - 1); end_page_read(page, true, cur, iosize); cur = cur + iosize; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d3f2623a2af0..7ff577005d0f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -3615,7 +3616,13 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) static int btrfs_file_open(struct inode *inode, struct file *filp) { + int ret; + filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; + + ret = fsverity_file_open(inode, filp); + if (ret) + return ret; return generic_file_open(inode, filp); } @@ -3644,6 +3651,9 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; + if (fsverity_active(inode)) + return 0; + if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos)) return 0; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index cd5a67ba7e71..766cd35be33d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "misc.h" #include "ctree.h" #include "disk-io.h" @@ -5560,6 +5561,7 @@ void btrfs_evict_inode(struct inode *inode) trace_btrfs_inode_evict(inode); if (!root) { + fsverity_cleanup_inode(inode); clear_inode(inode); return; } @@ -5642,6 +5644,7 @@ no_delete: * to retry these periodically in the future. 
*/ btrfs_remove_delayed_node(BTRFS_I(inode)); + fsverity_cleanup_inode(inode); clear_inode(inode); } @@ -9250,6 +9253,7 @@ static int btrfs_getattr(struct user_namespace *mnt_userns, struct inode *inode = d_inode(path->dentry); u32 blocksize = inode->i_sb->s_blocksize; u32 bi_flags = BTRFS_I(inode)->flags; + u32 bi_ro_flags = BTRFS_I(inode)->ro_flags; stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec; @@ -9262,6 +9266,8 @@ static int btrfs_getattr(struct user_namespace *mnt_userns, stat->attributes |= STATX_ATTR_IMMUTABLE; if (bi_flags & BTRFS_INODE_NODUMP) stat->attributes |= STATX_ATTR_NODUMP; + if (bi_ro_flags & BTRFS_INODE_RO_VERITY) + stat->attributes |= STATX_ATTR_VERITY; stat->attributes_mask |= (STATX_ATTR_APPEND | STATX_ATTR_COMPRESSED | diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 17aefb5f08ea..85c8b5a87a6a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "ctree.h" #include "disk-io.h" #include "export.h" @@ -107,6 +108,7 @@ static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode) { unsigned int iflags = 0; u32 flags = binode->flags; + u32 ro_flags = binode->ro_flags; if (flags & BTRFS_INODE_SYNC) iflags |= FS_SYNC_FL; @@ -122,6 +124,8 @@ static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode) iflags |= FS_DIRSYNC_FL; if (flags & BTRFS_INODE_NODATACOW) iflags |= FS_NOCOW_FL; + if (ro_flags & BTRFS_INODE_RO_VERITY) + iflags |= FS_VERITY_FL; if (flags & BTRFS_INODE_NOCOMPRESS) iflags |= FS_NOCOMP_FL; @@ -149,10 +153,12 @@ void btrfs_sync_inode_flags_to_i_flags(struct inode *inode) new_fl |= S_NOATIME; if (binode->flags & BTRFS_INODE_DIRSYNC) new_fl |= S_DIRSYNC; + if (binode->ro_flags & BTRFS_INODE_RO_VERITY) + new_fl |= S_VERITY; set_mask_bits(&inode->i_flags, - S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC, - new_fl); + S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC | + S_VERITY, new_fl); } /* @@ -5020,6 +5026,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_get_subvol_rootref(file, argp); case BTRFS_IOC_INO_LOOKUP_USER: return btrfs_ioctl_ino_lookup_user(file, argp); + case FS_IOC_ENABLE_VERITY: + return fsverity_ioctl_enable(file, (const void __user *)argp); + case FS_IOC_MEASURE_VERITY: + return fsverity_ioctl_measure(file, argp); } return -ENOTTY; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0d2e3ab2fc31..2bdc544b4c95 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1353,6 +1353,9 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_op = &btrfs_super_ops; sb->s_d_op = &btrfs_dentry_operations; sb->s_export_op = &btrfs_export_ops; +#ifdef CONFIG_FS_VERITY + sb->s_vop = &btrfs_verityops; +#endif sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; #ifdef CONFIG_BTRFS_FS_POSIX_ACL diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index d9d53a255ef9..bfe5e27617b0 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -267,6 +267,9 @@ BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34); #ifdef CONFIG_BTRFS_DEBUG BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED); #endif +#ifdef CONFIG_FS_VERITY +BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY); +#endif static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(mixed_backref), @@ -284,6 +287,9 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(raid1c34), #ifdef CONFIG_BTRFS_DEBUG BTRFS_FEAT_ATTR_PTR(zoned), +#endif +#ifdef CONFIG_FS_VERITY + BTRFS_FEAT_ATTR_PTR(verity), #endif 
NULL }; diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c new file mode 100644 index 000000000000..ac4c2ca45925 --- /dev/null +++ b/fs/btrfs/verity.c @@ -0,0 +1,738 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "btrfs_inode.h" +#include "transaction.h" +#include "disk-io.h" +#include "locking.h" + +/* + * Implementation of the interface defined in struct fsverity_operations. + * + * The main question is how and where to store the verity descriptor and the + * Merkle tree. We store both in dedicated btree items in the filesystem tree, + * together with the rest of the inode metadata. This means we'll need to do + * extra work to encrypt them once encryption is supported in btrfs, but btrfs + * has a lot of careful code around i_size and it seems better to make a new key + * type than try and adjust all of our expectations for i_size. + * + * Note that this differs from the implementation in ext4 and f2fs, where + * this data is stored as if it were in the file, but past EOF. However, btrfs + * does not have a widespread mechanism for caching opaque metadata pages, so we + * do pretend that the Merkle tree pages themselves are past EOF for the + * purposes of caching them (as opposed to creating a virtual inode). + * + * fs verity items are stored under two different key types on disk. + * The descriptor items: + * [ inode objectid, BTRFS_VERITY_DESC_ITEM_KEY, offset ] + * + * At offset 0, we store a btrfs_verity_descriptor_item which tracks the + * size of the descriptor item and some extra data for encryption. + * Starting at offset 1, these hold the generic fs verity descriptor. + * The latter are opaque to btrfs, we just read and write them as a blob for + * the higher level verity code. The most common descriptor size is 256 bytes. + * + * The merkle tree items: + * [ inode objectid, BTRFS_VERITY_MERKLE_ITEM_KEY, offset ] + * + * These also start at offset 0, and correspond to the merkle tree bytes. + * So when fsverity asks for page 0 of the merkle tree, we pull up one page + * starting at offset 0 for this key type. These are also opaque to btrfs, + * we're blindly storing whatever fsverity sends down. + */ + +#define MERKLE_START_ALIGN 65536 + +/* + * Compute the logical file offset where we cache the Merkle tree. + * + * @inode: inode of the verity file + * + * For the purposes of caching the Merkle tree pages, as required by + * fs-verity, it is convenient to do size computations in terms of a file + * offset, rather than in terms of page indices. + * + * Use 64K to be sure it's past the last page in the file, even with 64K pages. + * That rounding operation itself can overflow loff_t, so we do it in u64 and + * check. + * + * Returns the file offset on success, negative error code on failure. + */ +static loff_t merkle_file_pos(const struct inode *inode) +{ + u64 sz = inode->i_size; + u64 rounded = round_up(sz, MERKLE_START_ALIGN); + + if (rounded > inode->i_sb->s_maxbytes) + return -EFBIG; + + return rounded; +} + +/* + * Drop all the items for this inode with this key_type. + * + * @inode: inode to drop items for + * @key_type: type of items to drop (BTRFS_VERITY_DESC_ITEM or + * BTRFS_VERITY_MERKLE_ITEM) + * + * Before doing a verity enable we cleanup any existing verity items. + * This is also used to clean up if a verity enable failed half way through. + * + * Returns number of dropped items on success, negative error code on failure. 
+ */ +static int drop_verity_items(struct btrfs_inode *inode, u8 key_type) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = inode->root; + struct btrfs_path *path; + struct btrfs_key key; + int count = 0; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (1) { + /* 1 for the item being dropped */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + /* + * Walk backwards through all the items until we find one that + * isn't from our key type or objectid + */ + key.objectid = btrfs_ino(inode); + key.type = key_type; + key.offset = (u64)-1; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + ret = 0; + /* No more keys of this type, we're done */ + if (path->slots[0] == 0) + break; + path->slots[0]--; + } else if (ret < 0) { + btrfs_end_transaction(trans); + goto out; + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + /* No more keys of this type, we're done */ + if (key.objectid != btrfs_ino(inode) || key.type != key_type) + break; + + /* + * This shouldn't be a performance sensitive function because + * it's not used as part of truncate. If it ever becomes + * perf sensitive, change this to walk forward and bulk delete + * items + */ + ret = btrfs_del_items(trans, root, path, path->slots[0], 1); + if (ret) { + btrfs_end_transaction(trans); + goto out; + } + count++; + btrfs_release_path(path); + btrfs_end_transaction(trans); + } + ret = count; + btrfs_end_transaction(trans); +out: + btrfs_free_path(path); + return ret; +} + +/* + * Drop all verity items + * + * @inode: inode to drop verity items for + * + * In most contexts where we are dropping verity items, we want to do it for all + * the types of verity items, not a particular one. + * + * Returns: 0 on success, negative error code on failure. + */ +int btrfs_drop_verity_items(struct btrfs_inode *inode) +{ + int ret; + + ret = drop_verity_items(inode, BTRFS_VERITY_DESC_ITEM_KEY); + if (ret < 0) + return ret; + ret = drop_verity_items(inode, BTRFS_VERITY_MERKLE_ITEM_KEY); + if (ret < 0) + return ret; + + return 0; +} + +/* + * Insert and write inode items with a given key type and offset. + * + * @inode: inode to insert for + * @key_type: key type to insert + * @offset: item offset to insert at + * @src: source data to write + * @len: length of source data to write + * + * Write len bytes from src into items of up to 2K length. + * The inserted items will have key (ino, key_type, offset + off) where off is + * consecutively increasing from 0 up to the last item ending at offset + len. + * + * Returns 0 on success and a negative error code on failure. 
+ */ +static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, + const char *src, u64 len) +{ + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + struct btrfs_root *root = inode->root; + struct extent_buffer *leaf; + struct btrfs_key key; + unsigned long copy_bytes; + unsigned long src_offset = 0; + void *data; + int ret = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (len > 0) { + /* 1 for the new item being inserted */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + + key.objectid = btrfs_ino(inode); + key.type = key_type; + key.offset = offset; + + /* + * Insert 2K at a time mostly to be friendly for smaller leaf + * size filesystems + */ + copy_bytes = min_t(u64, len, 2048); + + ret = btrfs_insert_empty_item(trans, root, path, &key, copy_bytes); + if (ret) { + btrfs_end_transaction(trans); + break; + } + + leaf = path->nodes[0]; + + data = btrfs_item_ptr(leaf, path->slots[0], void); + write_extent_buffer(leaf, src + src_offset, + (unsigned long)data, copy_bytes); + offset += copy_bytes; + src_offset += copy_bytes; + len -= copy_bytes; + + btrfs_release_path(path); + btrfs_end_transaction(trans); + } + + btrfs_free_path(path); + return ret; +} + +/* + * Read inode items of the given key type and offset from the btree. + * + * @inode: inode to read items of + * @key_type: key type to read + * @offset: item offset to read from + * @dest: Buffer to read into. This parameter has slightly tricky + * semantics. If it is NULL, the function will not do any copying + * and will just return the size of all the items up to len bytes. + * If dest_page is passed, then the function will kmap_local the + * page and ignore dest, but it must still be non-NULL to avoid the + * counting-only behavior. + * @len: length in bytes to read + * @dest_page: copy into this page instead of the dest buffer + * + * Helper function to read items from the btree. This returns the number of + * bytes read or < 0 for errors. We can return short reads if the items don't + * exist on disk or aren't big enough to fill the desired length. Supports + * reading into a provided buffer (dest) or into the page cache + * + * Returns number of bytes read or a negative error code on failure. 
+ */ +static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, + char *dest, u64 len, struct page *dest_page) +{ + struct btrfs_path *path; + struct btrfs_root *root = inode->root; + struct extent_buffer *leaf; + struct btrfs_key key; + u64 item_end; + u64 copy_end; + int copied = 0; + u32 copy_offset; + unsigned long copy_bytes; + unsigned long dest_offset = 0; + void *data; + char *kaddr = dest; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + if (dest_page) + path->reada = READA_FORWARD; + + key.objectid = btrfs_ino(inode); + key.type = key_type; + key.offset = offset; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + if (path->slots[0] == 0) + goto out; + path->slots[0]--; + } + + while (len > 0) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.objectid != btrfs_ino(inode) || key.type != key_type) + break; + + item_end = btrfs_item_size_nr(leaf, path->slots[0]) + key.offset; + + if (copied > 0) { + /* + * Once we've copied something, we want all of the items + * to be sequential + */ + if (key.offset != offset) + break; + } else { + /* + * Our initial offset might be in the middle of an + * item. Make sure it all makes sense. + */ + if (key.offset > offset) + break; + if (item_end <= offset) + break; + } + + /* desc = NULL to just sum all the item lengths */ + if (!dest) + copy_end = item_end; + else + copy_end = min(offset + len, item_end); + + /* Number of bytes in this item we want to copy */ + copy_bytes = copy_end - offset; + + /* Offset from the start of item for copying */ + copy_offset = offset - key.offset; + + if (dest) { + if (dest_page) + kaddr = kmap_local_page(dest_page); + + data = btrfs_item_ptr(leaf, path->slots[0], void); + read_extent_buffer(leaf, kaddr + dest_offset, + (unsigned long)data + copy_offset, + copy_bytes); + + if (dest_page) + kunmap_local(kaddr); + } + + offset += copy_bytes; + dest_offset += copy_bytes; + len -= copy_bytes; + copied += copy_bytes; + + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + /* + * We've reached the last slot in this leaf and we need + * to go to the next leaf. + */ + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + break; + } else if (ret > 0) { + ret = 0; + break; + } + } + } +out: + btrfs_free_path(path); + if (!ret) + ret = copied; + return ret; +} + +/* + * Rollback in-progress verity if we encounter an error. + * + * @inode: inode verity had an error for + * + * We try to handle recoverable errors while enabling verity by rolling it back + * and just failing the operation, rather than having an fs level error no + * matter what. However, any error in rollback is unrecoverable. + * + * Returns 0 on success, negative error code on failure. 
+ */ +static int rollback_verity(struct btrfs_inode *inode) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = inode->root; + int ret; + + ASSERT(inode_is_locked(&inode->vfs_inode)); + truncate_inode_pages(inode->vfs_inode.i_mapping, inode->vfs_inode.i_size); + clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags); + ret = btrfs_drop_verity_items(inode); + if (ret) { + btrfs_handle_fs_error(root->fs_info, ret, + "failed to drop verity items in rollback %llu", + (u64)inode->vfs_inode.i_ino); + goto out; + } + + /* 1 for updating the inode flag */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_handle_fs_error(root->fs_info, ret, + "failed to start transaction in verity rollback %llu", + (u64)inode->vfs_inode.i_ino); + goto out; + } + inode->ro_flags &= ~BTRFS_INODE_RO_VERITY; + btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + btrfs_end_transaction(trans); +out: + return ret; +} + +/* + * Finalize making the file a valid verity file + * + * @inode: inode to be marked as verity + * @desc: contents of the verity descriptor to write (not NULL) + * @desc_size: size of the verity descriptor + * + * Do the actual work of finalizing verity after successfully writing the Merkle + * tree: + * + * - write out the descriptor items + * - mark the inode with the verity flag + * - mark the ro compat bit + * - clear the in progress bit + * + * Returns 0 on success, negative error code on failure. + */ +static int finish_verity(struct btrfs_inode *inode, const void *desc, + size_t desc_size) +{ + struct btrfs_trans_handle *trans = NULL; + struct btrfs_root *root = inode->root; + struct btrfs_verity_descriptor_item item; + int ret; + + /* Write out the descriptor item */ + memset(&item, 0, sizeof(item)); + btrfs_set_stack_verity_descriptor_size(&item, desc_size); + ret = write_key_bytes(inode, BTRFS_VERITY_DESC_ITEM_KEY, 0, + (const char *)&item, sizeof(item)); + if (ret) + goto out; + + /* Write out the descriptor itself */ + ret = write_key_bytes(inode, BTRFS_VERITY_DESC_ITEM_KEY, 1, + desc, desc_size); + if (ret) + goto out; + + /* 1 for updating the inode flag */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + inode->ro_flags |= BTRFS_INODE_RO_VERITY; + btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); + ret = btrfs_update_inode(trans, root, inode); + if (ret) + goto end_trans; + clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags); + btrfs_set_fs_compat_ro(root->fs_info, VERITY); +end_trans: + btrfs_end_transaction(trans); +out: + return ret; + +} + +/* + * fsverity op that begins enabling verity. + * + * @filp: file to enable verity on + * + * Begin enabling fsverity for the file. We drop any existing verity items + * and set the in progress bit. + * + * Returns 0 on success, negative error code on failure. + */ +static int btrfs_begin_enable_verity(struct file *filp) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(filp)); + int ret; + + ASSERT(inode_is_locked(file_inode(filp))); + + if (test_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags)) + return -EBUSY; + + ret = btrfs_drop_verity_items(inode); + if (ret) + return ret; + + set_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags); + + return 0; +} + +/* + * fsverity op that ends enabling verity. 
+ * + * @filp: file we are finishing enabling verity on + * @desc: verity descriptor to write out (NULL in error conditions) + * @desc_size: size of the verity descriptor (variable with signatures) + * @merkle_tree_size: size of the merkle tree in bytes + * + * If desc is null, then VFS is signaling an error occurred during verity + * enable, and we should try to rollback. Otherwise, attempt to finish verity. + * + * Returns 0 on success, negative error code on error. + */ +static int btrfs_end_enable_verity(struct file *filp, const void *desc, + size_t desc_size, u64 merkle_tree_size) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(filp)); + int ret = 0; + int rollback_ret; + + ASSERT(inode_is_locked(file_inode(filp))); + + if (desc == NULL) + goto rollback; + + ret = finish_verity(inode, desc, desc_size); + if (ret) + goto rollback; + return ret; + +rollback: + rollback_ret = rollback_verity(inode); + if (rollback_ret) + btrfs_err(inode->root->fs_info, + "failed to rollback verity items: %d", rollback_ret); + return ret; +} + +/* + * fsverity op that gets the struct fsverity_descriptor. + * + * @inode: inode to get the descriptor of + * @buf: output buffer for the descriptor contents + * @buf_size: size of the output buffer. 0 to query the size + * + * fsverity does a two pass setup for reading the descriptor, in the first pass + * it calls with buf_size = 0 to query the size of the descriptor, and then in + * the second pass it actually reads the descriptor off disk. + * + * Returns the size on success or a negative error code on failure. + */ +static int btrfs_get_verity_descriptor(struct inode *inode, void *buf, + size_t buf_size) +{ + u64 true_size; + int ret = 0; + struct btrfs_verity_descriptor_item item; + + memset(&item, 0, sizeof(item)); + ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_DESC_ITEM_KEY, 0, + (char *)&item, sizeof(item), NULL); + if (ret < 0) + return ret; + + if (item.reserved[0] != 0 || item.reserved[1] != 0) + return -EUCLEAN; + + true_size = btrfs_stack_verity_descriptor_size(&item); + if (true_size > INT_MAX) + return -EUCLEAN; + + if (buf_size == 0) + return true_size; + if (buf_size < true_size) + return -ERANGE; + + ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_DESC_ITEM_KEY, 1, + buf, buf_size, NULL); + if (ret < 0) + return ret; + if (ret != true_size) + return -EIO; + + return true_size; +} + +/* + * fsverity op that reads and caches a merkle tree page. + * + * @inode: inode to read a merkle tree page for + * @index: page index relative to the start of the merkle tree + * @num_ra_pages: number of pages to readahead. Optional, we ignore it + * + * The Merkle tree is stored in the filesystem btree, but its pages are cached + * with a logical position past EOF in the inode's mapping. + * + * Returns the page we read, or an ERR_PTR on error. 
+ */ +static struct page *btrfs_read_merkle_tree_page(struct inode *inode, + pgoff_t index, + unsigned long num_ra_pages) +{ + struct page *page; + u64 off = (u64)index << PAGE_SHIFT; + loff_t merkle_pos = merkle_file_pos(inode); + int ret; + + if (merkle_pos < 0) + return ERR_PTR(merkle_pos); + if (merkle_pos > inode->i_sb->s_maxbytes - off - PAGE_SIZE) + return ERR_PTR(-EFBIG); + index += merkle_pos >> PAGE_SHIFT; +again: + page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED); + if (page) { + if (PageUptodate(page)) + return page; + + lock_page(page); + /* + * We only insert uptodate pages, so !Uptodate has to be + * an error + */ + if (!PageUptodate(page)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-EIO); + } + unlock_page(page); + return page; + } + + page = __page_cache_alloc(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS)); + if (!page) + return ERR_PTR(-ENOMEM); + + /* + * Merkle item keys are indexed from byte 0 in the merkle tree. + * They have the form: + * + * [ inode objectid, BTRFS_MERKLE_ITEM_KEY, offset in bytes ] + */ + ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, off, + page_address(page), PAGE_SIZE, page); + if (ret < 0) { + put_page(page); + return ERR_PTR(ret); + } + if (ret < PAGE_SIZE) + memzero_page(page, ret, PAGE_SIZE - ret); + + SetPageUptodate(page); + ret = add_to_page_cache_lru(page, inode->i_mapping, index, GFP_NOFS); + + if (!ret) { + /* Inserted and ready for fsverity */ + unlock_page(page); + } else { + put_page(page); + /* Did someone race us into inserting this page? */ + if (ret == -EEXIST) + goto again; + page = ERR_PTR(ret); + } + return page; +} + +/* + * fsverity op that writes a Merkle tree block into the btree. + * + * @inode: inode to write a Merkle tree block for + * @buf: Merkle tree data block to write + * @index: index of the block in the Merkle tree + * @log_blocksize: log base 2 of the Merkle tree block size + * + * Note that the block size could be different from the page size, so it is not + * safe to assume that index is a page index. + * + * Returns 0 on success or negative error code on failure + */ +static int btrfs_write_merkle_tree_block(struct inode *inode, const void *buf, + u64 index, int log_blocksize) +{ + u64 off = index << log_blocksize; + u64 len = 1ULL << log_blocksize; + loff_t merkle_pos = merkle_file_pos(inode); + + if (merkle_pos < 0) + return merkle_pos; + if (merkle_pos > inode->i_sb->s_maxbytes - off - len) + return -EFBIG; + + return write_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, + off, buf, len); +} + +const struct fsverity_operations btrfs_verityops = { + .begin_enable_verity = btrfs_begin_enable_verity, + .end_enable_verity = btrfs_end_enable_verity, + .get_verity_descriptor = btrfs_get_verity_descriptor, + .read_merkle_tree_page = btrfs_read_merkle_tree_page, + .write_merkle_tree_block = btrfs_write_merkle_tree_block, +}; diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 22cd037123fa..d7d3cfead056 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -288,6 +288,7 @@ struct btrfs_ioctl_fs_info_args { * first mount when booting older kernel versions. 
*/ #define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID (1ULL << 1) +#define BTRFS_FEATURE_COMPAT_RO_VERITY (1ULL << 2) #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index ccdb40fe40dc..e1c4c732aaba 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -118,6 +118,29 @@ #define BTRFS_INODE_REF_KEY 12 #define BTRFS_INODE_EXTREF_KEY 13 #define BTRFS_XATTR_ITEM_KEY 24 + +/* + * fs verity items are stored under two different key types on disk. + * The descriptor items: + * [ inode objectid, BTRFS_VERITY_DESC_ITEM_KEY, offset ] + * + * At offset 0, we store a btrfs_verity_descriptor_item which tracks the size + * of the descriptor item and some extra data for encryption. + * Starting at offset 1, these hold the generic fs verity descriptor. The + * latter are opaque to btrfs, we just read and write them as a blob for the + * higher level verity code. The most common descriptor size is 256 bytes. + * + * The merkle tree items: + * [ inode objectid, BTRFS_VERITY_MERKLE_ITEM_KEY, offset ] + * + * These also start at offset 0, and correspond to the merkle tree bytes. When + * fsverity asks for page 0 of the merkle tree, we pull up one page starting at + * offset 0 for this key type. These are also opaque to btrfs, we're blindly + * storing whatever fsverity sends down. + */ +#define BTRFS_VERITY_DESC_ITEM_KEY 36 +#define BTRFS_VERITY_MERKLE_ITEM_KEY 37 + #define BTRFS_ORPHAN_ITEM_KEY 48 /* reserve 2-15 close to the inode for later flexibility */ @@ -991,4 +1014,16 @@ struct btrfs_qgroup_limit_item { __le64 rsv_excl; } __attribute__ ((__packed__)); +struct btrfs_verity_descriptor_item { + /* Size of the verity descriptor in bytes */ + __le64 size; + /* + * When we implement support for fscrypt, we will need to encrypt the + * Merkle tree for encrypted verity files. These 128 bits are for the + * eventual storage of an fscrypt initialization vector. + */ + __le64 reserved[2]; + __u8 encryption; +} __attribute__ ((__packed__)); + #endif /* _BTRFS_CTREE_H_ */ -- cgit v1.2.3-71-gd317 From a4ad42d28618eef83bee02e0a19af0d467bd9722 Mon Sep 17 00:00:00 2001 From: Kareem Shaik Date: Wed, 18 Aug 2021 09:57:39 +0200 Subject: ASoC: Intel: Skylake: Support multiple format configs A module can have two kinds of set params, as per topology requirements. For example, one pre-init and one post-init. But currently, there is support for just one type, as the format_config. This patch extends the format_configs to 4, so as to be able to support pre-init, post-init and post-bind type of set params, for the same module, simultaneously. 
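As the topology diff below shows, formats_config grows from a single struct skl_specific_cfg into an array of SKL_MAX_PARAMS_TYPES entries, and the new SKL_TKN_U32_FMT_CFG_IDX token selects (via mconfig->fmt_cfg_idx) which slot the subsequent caps tokens fill. A minimal sketch of the resulting access pattern, assuming the skl-topology.h declarations from this patch; the helper name and bounds check are illustrative only:

/* Illustrative only: pick the blob for one set-params stage. */
static struct skl_specific_cfg *
example_get_stage_cfg(struct skl_module_cfg *mconfig, int stage)
{
	/* stage is SKL_PARAM_INIT, SKL_PARAM_SET or SKL_PARAM_BIND */
	if (stage < 0 || stage >= SKL_MAX_PARAMS_TYPES)
		return NULL;
	return &mconfig->formats_config[stage];
}

Old topologies that never emit the new token keep working because fmt_cfg_idx is defaulted to SKL_PARAM_INIT before the private data is parsed.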
Signed-off-by: Kareem Shaik Signed-off-by: Gustaw Lewandowski Signed-off-by: Cezary Rojewski Tested-by: Lukasz Majczak Link: https://lore.kernel.org/r/20210818075742.1515155-9-cezary.rojewski@intel.com Signed-off-by: Mark Brown --- include/uapi/sound/snd_sst_tokens.h | 6 ++- sound/soc/intel/skylake/skl-messages.c | 19 ++++--- sound/soc/intel/skylake/skl-topology.c | 99 ++++++++++++++++++++++------------ sound/soc/intel/skylake/skl-topology.h | 5 +- 4 files changed, 82 insertions(+), 47 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/sound/snd_sst_tokens.h b/include/uapi/sound/snd_sst_tokens.h index 8ba0112e5336..ff3748e9308a 100644 --- a/include/uapi/sound/snd_sst_tokens.h +++ b/include/uapi/sound/snd_sst_tokens.h @@ -233,6 +233,8 @@ * * %SKL_TKN_U32_ASTATE_CLK_SRC: Clock source for A-State entry * + * %SKL_TKN_U32_FMT_CFG_IDX: Format config index + * * module_id and loadable flags dont have tokens as these values will be * read from the DSP FW manifest * @@ -324,7 +326,9 @@ enum SKL_TKNS { SKL_TKN_U32_ASTATE_COUNT, SKL_TKN_U32_ASTATE_KCPS, SKL_TKN_U32_ASTATE_CLK_SRC, - SKL_TKN_MAX = SKL_TKN_U32_ASTATE_CLK_SRC, + + SKL_TKN_U32_FMT_CFG_IDX = 96, + SKL_TKN_MAX = SKL_TKN_U32_FMT_CFG_IDX, }; #endif diff --git a/sound/soc/intel/skylake/skl-messages.c b/sound/soc/intel/skylake/skl-messages.c index 79c6cf2c14bf..7257acf0b6ee 100644 --- a/sound/soc/intel/skylake/skl-messages.c +++ b/sound/soc/intel/skylake/skl-messages.c @@ -479,15 +479,15 @@ static void skl_set_base_module_format(struct skl_dev *skl, static void skl_copy_copier_caps(struct skl_module_cfg *mconfig, struct skl_cpr_cfg *cpr_mconfig) { - if (mconfig->formats_config.caps_size == 0) + if (mconfig->formats_config[SKL_PARAM_INIT].caps_size == 0) return; memcpy(cpr_mconfig->gtw_cfg.config_data, - mconfig->formats_config.caps, - mconfig->formats_config.caps_size); + mconfig->formats_config[SKL_PARAM_INIT].caps, + mconfig->formats_config[SKL_PARAM_INIT].caps_size); cpr_mconfig->gtw_cfg.config_length = - (mconfig->formats_config.caps_size) / 4; + (mconfig->formats_config[SKL_PARAM_INIT].caps_size) / 4; } #define SKL_NON_GATEWAY_CPR_NODE_ID 0xFFFFFFFF @@ -749,13 +749,12 @@ static void skl_set_algo_format(struct skl_dev *skl, struct skl_base_cfg *base_cfg = (struct skl_base_cfg *)algo_mcfg; skl_set_base_module_format(skl, mconfig, base_cfg); - - if (mconfig->formats_config.caps_size == 0) + if (mconfig->formats_config[SKL_PARAM_INIT].caps_size == 0) return; memcpy(algo_mcfg->params, - mconfig->formats_config.caps, - mconfig->formats_config.caps_size); + mconfig->formats_config[SKL_PARAM_INIT].caps, + mconfig->formats_config[SKL_PARAM_INIT].caps_size); } @@ -786,7 +785,7 @@ static u16 skl_get_module_param_size(struct skl_dev *skl, switch (mconfig->m_type) { case SKL_MODULE_TYPE_COPIER: param_size = sizeof(struct skl_cpr_cfg); - param_size += mconfig->formats_config.caps_size; + param_size += mconfig->formats_config[SKL_PARAM_INIT].caps_size; return param_size; case SKL_MODULE_TYPE_SRCINT: @@ -797,7 +796,7 @@ static u16 skl_get_module_param_size(struct skl_dev *skl, case SKL_MODULE_TYPE_ALGO: param_size = sizeof(struct skl_base_cfg); - param_size += mconfig->formats_config.caps_size; + param_size += mconfig->formats_config[SKL_PARAM_INIT].caps_size; return param_size; case SKL_MODULE_TYPE_BASE_OUTFMT: diff --git a/sound/soc/intel/skylake/skl-topology.c b/sound/soc/intel/skylake/skl-topology.c index 51fb594bb172..04fbf83f3835 100644 --- a/sound/soc/intel/skylake/skl-topology.c +++ b/sound/soc/intel/skylake/skl-topology.c @@ 
-292,7 +292,7 @@ static int skl_tplg_update_be_blob(struct snd_soc_dapm_widget *w, struct skl_module_iface *m_iface = &m_cfg->module->formats[fmt_idx]; /* check if we already have blob */ - if (m_cfg->formats_config.caps_size > 0) + if (m_cfg->formats_config[SKL_PARAM_INIT].caps_size > 0) return 0; dev_dbg(skl->dev, "Applying default cfg blob\n"); @@ -328,8 +328,8 @@ static int skl_tplg_update_be_blob(struct snd_soc_dapm_widget *w, cfg = skl_get_ep_blob(skl, m_cfg->vbus_id, link_type, s_fmt, ch, s_freq, dir, dev_type); if (cfg) { - m_cfg->formats_config.caps_size = cfg->size; - m_cfg->formats_config.caps = (u32 *) &cfg->caps; + m_cfg->formats_config[SKL_PARAM_INIT].caps_size = cfg->size; + m_cfg->formats_config[SKL_PARAM_INIT].caps = (u32 *)&cfg->caps; } else { dev_err(skl->dev, "Blob NULL for id %x type %d dirn %d\n", m_cfg->vbus_id, link_type, dir); @@ -386,9 +386,9 @@ static int skl_tplg_set_module_params(struct snd_soc_dapm_widget *w, struct skl_algo_data *bc; struct skl_specific_cfg *sp_cfg; - if (mconfig->formats_config.caps_size > 0 && - mconfig->formats_config.set_params == SKL_PARAM_SET) { - sp_cfg = &mconfig->formats_config; + if (mconfig->formats_config[SKL_PARAM_SET].caps_size > 0 && + mconfig->formats_config[SKL_PARAM_SET].set_params == SKL_PARAM_SET) { + sp_cfg = &mconfig->formats_config[SKL_PARAM_SET]; ret = skl_set_module_params(skl, sp_cfg->caps, sp_cfg->caps_size, sp_cfg->param_id, mconfig); @@ -438,8 +438,10 @@ static int skl_tplg_set_module_init_data(struct snd_soc_dapm_widget *w) if (bc->set_params != SKL_PARAM_INIT) continue; - mconfig->formats_config.caps = (u32 *)bc->params; - mconfig->formats_config.caps_size = bc->size; + mconfig->formats_config[SKL_PARAM_INIT].caps = + (u32 *)bc->params; + mconfig->formats_config[SKL_PARAM_INIT].caps_size = + bc->size; break; } @@ -798,9 +800,10 @@ static int skl_tplg_set_module_bind_params(struct snd_soc_dapm_widget *w, return 0; } - if (mconfig->formats_config.caps_size > 0 && - mconfig->formats_config.set_params == SKL_PARAM_BIND) { - sp_cfg = &mconfig->formats_config; + if (mconfig->formats_config[SKL_PARAM_BIND].caps_size > 0 && + mconfig->formats_config[SKL_PARAM_BIND].set_params == + SKL_PARAM_BIND) { + sp_cfg = &mconfig->formats_config[SKL_PARAM_BIND]; ret = skl_set_module_params(skl, sp_cfg->caps, sp_cfg->caps_size, sp_cfg->param_id, mconfig); @@ -1496,7 +1499,8 @@ static int skl_tplg_mic_control_get(struct snd_kcontrol *kcontrol, static int skl_fill_mic_sel_params(struct skl_module_cfg *mconfig, struct skl_mic_sel_config *mic_cfg, struct device *dev) { - struct skl_specific_cfg *sp_cfg = &mconfig->formats_config; + struct skl_specific_cfg *sp_cfg = + &mconfig->formats_config[SKL_PARAM_INIT]; sp_cfg->caps_size = sizeof(struct skl_mic_sel_config); sp_cfg->set_params = SKL_PARAM_SET; @@ -1849,8 +1853,8 @@ static int skl_tplg_be_fill_pipe_params(struct snd_soc_dai *dai, pipe_fmt->freq, pipe->direction, dev_type); if (cfg) { - mconfig->formats_config.caps_size = cfg->size; - mconfig->formats_config.caps = (u32 *) &cfg->caps; + mconfig->formats_config[SKL_PARAM_INIT].caps_size = cfg->size; + mconfig->formats_config[SKL_PARAM_INIT].caps = (u32 *)&cfg->caps; } else { dev_err(dai->dev, "Blob NULL for id:%d type:%d dirn:%d ch:%d, freq:%d, fmt:%d\n", mconfig->vbus_id, link_type, params->stream, @@ -2567,19 +2571,26 @@ static int skl_tplg_get_token(struct device *dev, break; + case SKL_TKN_U32_FMT_CFG_IDX: + if (tkn_elem->value > SKL_MAX_PARAMS_TYPES) + return -EINVAL; + + mconfig->fmt_cfg_idx = tkn_elem->value; + break; + case 
SKL_TKN_U32_CAPS_SIZE: - mconfig->formats_config.caps_size = + mconfig->formats_config[mconfig->fmt_cfg_idx].caps_size = tkn_elem->value; break; case SKL_TKN_U32_CAPS_SET_PARAMS: - mconfig->formats_config.set_params = + mconfig->formats_config[mconfig->fmt_cfg_idx].set_params = tkn_elem->value; break; case SKL_TKN_U32_CAPS_PARAMS_ID: - mconfig->formats_config.param_id = + mconfig->formats_config[mconfig->fmt_cfg_idx].param_id = tkn_elem->value; break; @@ -2793,6 +2804,7 @@ static int skl_tplg_get_pvt_data_v4(struct snd_soc_tplg_dapm_widget *tplg_w, struct skl_dfw_v4_module *dfw = (struct skl_dfw_v4_module *)tplg_w->priv.data; int ret; + int idx = mconfig->fmt_cfg_idx; dev_dbg(dev, "Parsing Skylake v4 widget topology data\n"); @@ -2826,7 +2838,7 @@ static int skl_tplg_get_pvt_data_v4(struct snd_soc_tplg_dapm_widget *tplg_w, mconfig->dev_type = dfw->dev_type; mconfig->hw_conn_type = dfw->hw_conn_type; mconfig->time_slot = dfw->time_slot; - mconfig->formats_config.caps_size = dfw->caps.caps_size; + mconfig->formats_config[idx].caps_size = dfw->caps.caps_size; mconfig->m_in_pin = devm_kcalloc(dev, MAX_IN_QUEUE, sizeof(*mconfig->m_in_pin), @@ -2847,21 +2859,39 @@ static int skl_tplg_get_pvt_data_v4(struct snd_soc_tplg_dapm_widget *tplg_w, dfw->is_dynamic_out_pin, mconfig->module->max_output_pins); - if (mconfig->formats_config.caps_size) { - mconfig->formats_config.set_params = dfw->caps.set_params; - mconfig->formats_config.param_id = dfw->caps.param_id; - mconfig->formats_config.caps = - devm_kzalloc(dev, mconfig->formats_config.caps_size, + if (mconfig->formats_config[idx].caps_size) { + mconfig->formats_config[idx].set_params = dfw->caps.set_params; + mconfig->formats_config[idx].param_id = dfw->caps.param_id; + mconfig->formats_config[idx].caps = + devm_kzalloc(dev, mconfig->formats_config[idx].caps_size, GFP_KERNEL); - if (!mconfig->formats_config.caps) + if (!mconfig->formats_config[idx].caps) return -ENOMEM; - memcpy(mconfig->formats_config.caps, dfw->caps.caps, + memcpy(mconfig->formats_config[idx].caps, dfw->caps.caps, dfw->caps.caps_size); } return 0; } +static int skl_tplg_get_caps_data(struct device *dev, char *data, + struct skl_module_cfg *mconfig) +{ + int idx = mconfig->fmt_cfg_idx; + + if (mconfig->formats_config[idx].caps_size > 0) { + mconfig->formats_config[idx].caps = + devm_kzalloc(dev, mconfig->formats_config[idx].caps_size, + GFP_KERNEL); + if (!mconfig->formats_config[idx].caps) + return -ENOMEM; + memcpy(mconfig->formats_config[idx].caps, data, + mconfig->formats_config[idx].caps_size); + } + + return mconfig->formats_config[idx].caps_size; +} + /* * Parse the private data for the token and corresponding value. * The private data can have multiple data blocks. 
So, a data block @@ -2922,18 +2952,14 @@ static int skl_tplg_get_pvt_data(struct snd_soc_tplg_dapm_widget *tplg_w, if (block_type == SKL_TYPE_TUPLE) { ret = skl_tplg_get_tokens(dev, data, skl, mconfig, block_size); - - if (ret < 0) - return ret; - - --num_blocks; } else { - if (mconfig->formats_config.caps_size > 0) - memcpy(mconfig->formats_config.caps, data, - mconfig->formats_config.caps_size); - --num_blocks; - ret = mconfig->formats_config.caps_size; + ret = skl_tplg_get_caps_data(dev, data, mconfig); } + + if (ret < 0) + return ret; + + --num_blocks; off += ret; } @@ -3024,6 +3050,9 @@ static int skl_tplg_widget_load(struct snd_soc_component *cmpnt, int index, */ mconfig->id.module_id = -1; + /* To provide backward compatibility, set default as SKL_PARAM_INIT */ + mconfig->fmt_cfg_idx = SKL_PARAM_INIT; + /* Parse private data for tuples */ ret = skl_tplg_get_pvt_data(tplg_w, skl, bus->dev, mconfig); if (ret < 0) diff --git a/sound/soc/intel/skylake/skl-topology.h b/sound/soc/intel/skylake/skl-topology.h index ad230fd64758..ef332c83d5c0 100644 --- a/sound/soc/intel/skylake/skl-topology.h +++ b/sound/soc/intel/skylake/skl-topology.h @@ -81,6 +81,8 @@ enum skl_s_freq { SKL_FS_INVALID }; +#define SKL_MAX_PARAMS_TYPES 4 + enum skl_widget_type { SKL_WIDGET_VMIXER = 1, SKL_WIDGET_MIXER = 2, @@ -371,6 +373,7 @@ struct skl_module_cfg { struct skl_module *module; int res_idx; int fmt_idx; + int fmt_cfg_idx; u8 domain; bool homogenous_inputs; bool homogenous_outputs; @@ -401,7 +404,7 @@ struct skl_module_cfg { enum skl_hw_conn_type hw_conn_type; enum skl_module_state m_state; struct skl_pipe *pipe; - struct skl_specific_cfg formats_config; + struct skl_specific_cfg formats_config[SKL_MAX_PARAMS_TYPES]; struct skl_pipe_mcfg mod_cfg[SKL_MAX_MODULES_IN_PIPE]; }; -- cgit v1.2.3-71-gd317 From d5ae8d7f85b7f6f6e60f1af8ff4be52b0926fde1 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 23 Aug 2021 09:49:09 -0700 Subject: Revert "media: dvb header files: move some headers to staging" This reverts commit 819fbd3d8ef36c09576c2a0ffea503f5c46e9177. It turns out that some user-space applications use these uapi header files, so even though the only user of the interface is an old driver that was moved to staging, moving the header files causes unnecessary pain. Generally, we really don't want user space to use kernel headers directly (exactly because it causes pain when we re-organize), and instead copy them as needed. But these things happen, and the headers were in the uapi directory, so I guess it's not entirely unreasonable. 
Link: https://lore.kernel.org/lkml/4e3e0d40-df4a-94f8-7c2d-85010b0873c4@web.de/ Reported-by: Soeren Moch Cc: stable@kernel.org # 5.13 Cc: Mauro Carvalho Chehab Signed-off-by: Linus Torvalds --- drivers/staging/media/av7110/audio.h | 101 ---------------- drivers/staging/media/av7110/av7110.h | 7 +- drivers/staging/media/av7110/osd.h | 181 ---------------------------- drivers/staging/media/av7110/video.h | 220 ---------------------------------- include/uapi/linux/dvb/audio.h | 101 ++++++++++++++++ include/uapi/linux/dvb/osd.h | 181 ++++++++++++++++++++++++++++ include/uapi/linux/dvb/video.h | 220 ++++++++++++++++++++++++++++++++++ 7 files changed, 505 insertions(+), 506 deletions(-) delete mode 100644 drivers/staging/media/av7110/audio.h delete mode 100644 drivers/staging/media/av7110/osd.h delete mode 100644 drivers/staging/media/av7110/video.h create mode 100644 include/uapi/linux/dvb/audio.h create mode 100644 include/uapi/linux/dvb/osd.h create mode 100644 include/uapi/linux/dvb/video.h (limited to 'include/uapi') diff --git a/drivers/staging/media/av7110/audio.h b/drivers/staging/media/av7110/audio.h deleted file mode 100644 index 2f869da69171..000000000000 --- a/drivers/staging/media/av7110/audio.h +++ /dev/null @@ -1,101 +0,0 @@ -/* SPDX-License-Identifier: LGPL-2.1+ WITH Linux-syscall-note */ -/* - * audio.h - DEPRECATED MPEG-TS audio decoder API - * - * NOTE: should not be used on future drivers - * - * Copyright (C) 2000 Ralph Metzler - * & Marcus Metzler - * for convergence integrated media GmbH - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Lesser Public License - * as published by the Free Software Foundation; either version 2.1 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - */ - -#ifndef _DVBAUDIO_H_ -#define _DVBAUDIO_H_ - -#include - -typedef enum { - AUDIO_SOURCE_DEMUX, /* Select the demux as the main source */ - AUDIO_SOURCE_MEMORY /* Select internal memory as the main source */ -} audio_stream_source_t; - - -typedef enum { - AUDIO_STOPPED, /* Device is stopped */ - AUDIO_PLAYING, /* Device is currently playing */ - AUDIO_PAUSED /* Device is paused */ -} audio_play_state_t; - - -typedef enum { - AUDIO_STEREO, - AUDIO_MONO_LEFT, - AUDIO_MONO_RIGHT, - AUDIO_MONO, - AUDIO_STEREO_SWAPPED -} audio_channel_select_t; - - -typedef struct audio_mixer { - unsigned int volume_left; - unsigned int volume_right; - /* what else do we need? bass, pass-through, ... */ -} audio_mixer_t; - - -typedef struct audio_status { - int AV_sync_state; /* sync audio and video? 
*/ - int mute_state; /* audio is muted */ - audio_play_state_t play_state; /* current playback state */ - audio_stream_source_t stream_source; /* current stream source */ - audio_channel_select_t channel_select; /* currently selected channel */ - int bypass_mode; /* pass on audio data to */ - audio_mixer_t mixer_state; /* current mixer state */ -} audio_status_t; /* separate decoder hardware */ - - -/* for GET_CAPABILITIES and SET_FORMAT, the latter should only set one bit */ -#define AUDIO_CAP_DTS 1 -#define AUDIO_CAP_LPCM 2 -#define AUDIO_CAP_MP1 4 -#define AUDIO_CAP_MP2 8 -#define AUDIO_CAP_MP3 16 -#define AUDIO_CAP_AAC 32 -#define AUDIO_CAP_OGG 64 -#define AUDIO_CAP_SDDS 128 -#define AUDIO_CAP_AC3 256 - -#define AUDIO_STOP _IO('o', 1) -#define AUDIO_PLAY _IO('o', 2) -#define AUDIO_PAUSE _IO('o', 3) -#define AUDIO_CONTINUE _IO('o', 4) -#define AUDIO_SELECT_SOURCE _IO('o', 5) -#define AUDIO_SET_MUTE _IO('o', 6) -#define AUDIO_SET_AV_SYNC _IO('o', 7) -#define AUDIO_SET_BYPASS_MODE _IO('o', 8) -#define AUDIO_CHANNEL_SELECT _IO('o', 9) -#define AUDIO_GET_STATUS _IOR('o', 10, audio_status_t) - -#define AUDIO_GET_CAPABILITIES _IOR('o', 11, unsigned int) -#define AUDIO_CLEAR_BUFFER _IO('o', 12) -#define AUDIO_SET_ID _IO('o', 13) -#define AUDIO_SET_MIXER _IOW('o', 14, audio_mixer_t) -#define AUDIO_SET_STREAMTYPE _IO('o', 15) -#define AUDIO_BILINGUAL_CHANNEL_SELECT _IO('o', 20) - -#endif /* _DVBAUDIO_H_ */ diff --git a/drivers/staging/media/av7110/av7110.h b/drivers/staging/media/av7110/av7110.h index b8e8fc8ddbe9..809d938ae166 100644 --- a/drivers/staging/media/av7110/av7110.h +++ b/drivers/staging/media/av7110/av7110.h @@ -9,12 +9,11 @@ #include #include -#include "video.h" -#include "audio.h" -#include "osd.h" - +#include +#include #include #include +#include #include #include diff --git a/drivers/staging/media/av7110/osd.h b/drivers/staging/media/av7110/osd.h deleted file mode 100644 index 858997c74043..000000000000 --- a/drivers/staging/media/av7110/osd.h +++ /dev/null @@ -1,181 +0,0 @@ -/* SPDX-License-Identifier: LGPL-2.1+ WITH Linux-syscall-note */ -/* - * osd.h - DEPRECATED On Screen Display API - * - * NOTE: should not be used on future drivers - * - * Copyright (C) 2001 Ralph Metzler - * & Marcus Metzler - * for convergence integrated media GmbH - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Lesser Public License - * as published by the Free Software Foundation; either version 2.1 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
- * - */ - -#ifndef _DVBOSD_H_ -#define _DVBOSD_H_ - -#include - -typedef enum { - /* All functions return -2 on "not open" */ - OSD_Close = 1, /* () */ - /* - * Disables OSD and releases the buffers - * returns 0 on success - */ - OSD_Open, /* (x0,y0,x1,y1,BitPerPixel[2/4/8](color&0x0F),mix[0..15](color&0xF0)) */ - /* - * Opens OSD with this size and bit depth - * returns 0 on success, -1 on DRAM allocation error, -2 on "already open" - */ - OSD_Show, /* () */ - /* - * enables OSD mode - * returns 0 on success - */ - OSD_Hide, /* () */ - /* - * disables OSD mode - * returns 0 on success - */ - OSD_Clear, /* () */ - /* - * Sets all pixel to color 0 - * returns 0 on success - */ - OSD_Fill, /* (color) */ - /* - * Sets all pixel to color - * returns 0 on success - */ - OSD_SetColor, /* (color,R{x0},G{y0},B{x1},opacity{y1}) */ - /* - * set palette entry to , and apply - * R,G,B: 0..255 - * R=Red, G=Green, B=Blue - * opacity=0: pixel opacity 0% (only video pixel shows) - * opacity=1..254: pixel opacity as specified in header - * opacity=255: pixel opacity 100% (only OSD pixel shows) - * returns 0 on success, -1 on error - */ - OSD_SetPalette, /* (firstcolor{color},lastcolor{x0},data) */ - /* - * Set a number of entries in the palette - * sets the entries "firstcolor" through "lastcolor" from the array "data" - * data has 4 byte for each color: - * R,G,B, and a opacity value: 0->transparent, 1..254->mix, 255->pixel - */ - OSD_SetTrans, /* (transparency{color}) */ - /* - * Sets transparency of mixed pixel (0..15) - * returns 0 on success - */ - OSD_SetPixel, /* (x0,y0,color) */ - /* - * sets pixel , to color number - * returns 0 on success, -1 on error - */ - OSD_GetPixel, /* (x0,y0) */ - /* returns color number of pixel ,, or -1 */ - OSD_SetRow, /* (x0,y0,x1,data) */ - /* - * fills pixels x0,y through x1,y with the content of data[] - * returns 0 on success, -1 on clipping all pixel (no pixel drawn) - */ - OSD_SetBlock, /* (x0,y0,x1,y1,increment{color},data) */ - /* - * fills pixels x0,y0 through x1,y1 with the content of data[] - * inc contains the width of one line in the data block, - * inc<=0 uses blockwidth as linewidth - * returns 0 on success, -1 on clipping all pixel - */ - OSD_FillRow, /* (x0,y0,x1,color) */ - /* - * fills pixels x0,y through x1,y with the color - * returns 0 on success, -1 on clipping all pixel - */ - OSD_FillBlock, /* (x0,y0,x1,y1,color) */ - /* - * fills pixels x0,y0 through x1,y1 with the color - * returns 0 on success, -1 on clipping all pixel - */ - OSD_Line, /* (x0,y0,x1,y1,color) */ - /* - * draw a line from x0,y0 to x1,y1 with the color - * returns 0 on success - */ - OSD_Query, /* (x0,y0,x1,y1,xasp{color}}), yasp=11 */ - /* - * fills parameters with the picture dimensions and the pixel aspect ratio - * returns 0 on success - */ - OSD_Test, /* () */ - /* - * draws a test picture. for debugging purposes only - * returns 0 on success - * TODO: remove "test" in final version - */ - OSD_Text, /* (x0,y0,size,color,text) */ - OSD_SetWindow, /* (x0) set window with number 0 - * & Ralph Metzler - * for convergence integrated media GmbH - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public License - * as published by the Free Software Foundation; either version 2.1 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - */ - -#ifndef _UAPI_DVBVIDEO_H_ -#define _UAPI_DVBVIDEO_H_ - -#include -#ifndef __KERNEL__ -#include -#endif - -typedef enum { - VIDEO_FORMAT_4_3, /* Select 4:3 format */ - VIDEO_FORMAT_16_9, /* Select 16:9 format. */ - VIDEO_FORMAT_221_1 /* 2.21:1 */ -} video_format_t; - - -typedef enum { - VIDEO_PAN_SCAN, /* use pan and scan format */ - VIDEO_LETTER_BOX, /* use letterbox format */ - VIDEO_CENTER_CUT_OUT /* use center cut out format */ -} video_displayformat_t; - -typedef struct { - int w; - int h; - video_format_t aspect_ratio; -} video_size_t; - -typedef enum { - VIDEO_SOURCE_DEMUX, /* Select the demux as the main source */ - VIDEO_SOURCE_MEMORY /* If this source is selected, the stream - comes from the user through the write - system call */ -} video_stream_source_t; - - -typedef enum { - VIDEO_STOPPED, /* Video is stopped */ - VIDEO_PLAYING, /* Video is currently playing */ - VIDEO_FREEZED /* Video is freezed */ -} video_play_state_t; - - -/* Decoder commands */ -#define VIDEO_CMD_PLAY (0) -#define VIDEO_CMD_STOP (1) -#define VIDEO_CMD_FREEZE (2) -#define VIDEO_CMD_CONTINUE (3) - -/* Flags for VIDEO_CMD_FREEZE */ -#define VIDEO_CMD_FREEZE_TO_BLACK (1 << 0) - -/* Flags for VIDEO_CMD_STOP */ -#define VIDEO_CMD_STOP_TO_BLACK (1 << 0) -#define VIDEO_CMD_STOP_IMMEDIATELY (1 << 1) - -/* Play input formats: */ -/* The decoder has no special format requirements */ -#define VIDEO_PLAY_FMT_NONE (0) -/* The decoder requires full GOPs */ -#define VIDEO_PLAY_FMT_GOP (1) - -/* The structure must be zeroed before use by the application - This ensures it can be extended safely in the future. */ -struct video_command { - __u32 cmd; - __u32 flags; - union { - struct { - __u64 pts; - } stop; - - struct { - /* 0 or 1000 specifies normal speed, - 1 specifies forward single stepping, - -1 specifies backward single stepping, - >1: playback at speed/1000 of the normal speed, - <-1: reverse playback at (-speed/1000) of the normal speed. */ - __s32 speed; - __u32 format; - } play; - - struct { - __u32 data[16]; - } raw; - }; -}; - -/* FIELD_UNKNOWN can be used if the hardware does not know whether - the Vsync is for an odd, even or progressive (i.e. non-interlaced) - field. */ -#define VIDEO_VSYNC_FIELD_UNKNOWN (0) -#define VIDEO_VSYNC_FIELD_ODD (1) -#define VIDEO_VSYNC_FIELD_EVEN (2) -#define VIDEO_VSYNC_FIELD_PROGRESSIVE (3) - -struct video_event { - __s32 type; -#define VIDEO_EVENT_SIZE_CHANGED 1 -#define VIDEO_EVENT_FRAME_RATE_CHANGED 2 -#define VIDEO_EVENT_DECODER_STOPPED 3 -#define VIDEO_EVENT_VSYNC 4 - /* unused, make sure to use atomic time for y2038 if it ever gets used */ - long timestamp; - union { - video_size_t size; - unsigned int frame_rate; /* in frames per 1000sec */ - unsigned char vsync_field; /* unknown/odd/even/progressive */ - } u; -}; - - -struct video_status { - int video_blank; /* blank video on freeze? 
*/ - video_play_state_t play_state; /* current state of playback */ - video_stream_source_t stream_source; /* current source (demux/memory) */ - video_format_t video_format; /* current aspect ratio of stream*/ - video_displayformat_t display_format;/* selected cropping mode */ -}; - - -struct video_still_picture { - char __user *iFrame; /* pointer to a single iframe in memory */ - __s32 size; -}; - - -typedef __u16 video_attributes_t; -/* bits: descr. */ -/* 15-14 Video compression mode (0=MPEG-1, 1=MPEG-2) */ -/* 13-12 TV system (0=525/60, 1=625/50) */ -/* 11-10 Aspect ratio (0=4:3, 3=16:9) */ -/* 9- 8 permitted display mode on 4:3 monitor (0=both, 1=only pan-sca */ -/* 7 line 21-1 data present in GOP (1=yes, 0=no) */ -/* 6 line 21-2 data present in GOP (1=yes, 0=no) */ -/* 5- 3 source resolution (0=720x480/576, 1=704x480/576, 2=352x480/57 */ -/* 2 source letterboxed (1=yes, 0=no) */ -/* 0 film/camera mode (0= - *camera, 1=film (625/50 only)) */ - - -/* bit definitions for capabilities: */ -/* can the hardware decode MPEG1 and/or MPEG2? */ -#define VIDEO_CAP_MPEG1 1 -#define VIDEO_CAP_MPEG2 2 -/* can you send a system and/or program stream to video device? - (you still have to open the video and the audio device but only - send the stream to the video device) */ -#define VIDEO_CAP_SYS 4 -#define VIDEO_CAP_PROG 8 -/* can the driver also handle SPU, NAVI and CSS encoded data? - (CSS API is not present yet) */ -#define VIDEO_CAP_SPU 16 -#define VIDEO_CAP_NAVI 32 -#define VIDEO_CAP_CSS 64 - - -#define VIDEO_STOP _IO('o', 21) -#define VIDEO_PLAY _IO('o', 22) -#define VIDEO_FREEZE _IO('o', 23) -#define VIDEO_CONTINUE _IO('o', 24) -#define VIDEO_SELECT_SOURCE _IO('o', 25) -#define VIDEO_SET_BLANK _IO('o', 26) -#define VIDEO_GET_STATUS _IOR('o', 27, struct video_status) -#define VIDEO_GET_EVENT _IOR('o', 28, struct video_event) -#define VIDEO_SET_DISPLAY_FORMAT _IO('o', 29) -#define VIDEO_STILLPICTURE _IOW('o', 30, struct video_still_picture) -#define VIDEO_FAST_FORWARD _IO('o', 31) -#define VIDEO_SLOWMOTION _IO('o', 32) -#define VIDEO_GET_CAPABILITIES _IOR('o', 33, unsigned int) -#define VIDEO_CLEAR_BUFFER _IO('o', 34) -#define VIDEO_SET_STREAMTYPE _IO('o', 36) -#define VIDEO_SET_FORMAT _IO('o', 37) -#define VIDEO_GET_SIZE _IOR('o', 55, video_size_t) - -/** - * VIDEO_GET_PTS - * - * Read the 33 bit presentation time stamp as defined - * in ITU T-REC-H.222.0 / ISO/IEC 13818-1. - * - * The PTS should belong to the currently played - * frame if possible, but may also be a value close to it - * like the PTS of the last decoded frame or the last PTS - * extracted by the PES parser. 
- */ -#define VIDEO_GET_PTS _IOR('o', 57, __u64) - -/* Read the number of displayed frames since the decoder was started */ -#define VIDEO_GET_FRAME_COUNT _IOR('o', 58, __u64) - -#define VIDEO_COMMAND _IOWR('o', 59, struct video_command) -#define VIDEO_TRY_COMMAND _IOWR('o', 60, struct video_command) - -#endif /* _UAPI_DVBVIDEO_H_ */ diff --git a/include/uapi/linux/dvb/audio.h b/include/uapi/linux/dvb/audio.h new file mode 100644 index 000000000000..2f869da69171 --- /dev/null +++ b/include/uapi/linux/dvb/audio.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: LGPL-2.1+ WITH Linux-syscall-note */ +/* + * audio.h - DEPRECATED MPEG-TS audio decoder API + * + * NOTE: should not be used on future drivers + * + * Copyright (C) 2000 Ralph Metzler + * & Marcus Metzler + * for convergence integrated media GmbH + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Lesser Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + */ + +#ifndef _DVBAUDIO_H_ +#define _DVBAUDIO_H_ + +#include + +typedef enum { + AUDIO_SOURCE_DEMUX, /* Select the demux as the main source */ + AUDIO_SOURCE_MEMORY /* Select internal memory as the main source */ +} audio_stream_source_t; + + +typedef enum { + AUDIO_STOPPED, /* Device is stopped */ + AUDIO_PLAYING, /* Device is currently playing */ + AUDIO_PAUSED /* Device is paused */ +} audio_play_state_t; + + +typedef enum { + AUDIO_STEREO, + AUDIO_MONO_LEFT, + AUDIO_MONO_RIGHT, + AUDIO_MONO, + AUDIO_STEREO_SWAPPED +} audio_channel_select_t; + + +typedef struct audio_mixer { + unsigned int volume_left; + unsigned int volume_right; + /* what else do we need? bass, pass-through, ... */ +} audio_mixer_t; + + +typedef struct audio_status { + int AV_sync_state; /* sync audio and video? 
*/ + int mute_state; /* audio is muted */ + audio_play_state_t play_state; /* current playback state */ + audio_stream_source_t stream_source; /* current stream source */ + audio_channel_select_t channel_select; /* currently selected channel */ + int bypass_mode; /* pass on audio data to */ + audio_mixer_t mixer_state; /* current mixer state */ +} audio_status_t; /* separate decoder hardware */ + + +/* for GET_CAPABILITIES and SET_FORMAT, the latter should only set one bit */ +#define AUDIO_CAP_DTS 1 +#define AUDIO_CAP_LPCM 2 +#define AUDIO_CAP_MP1 4 +#define AUDIO_CAP_MP2 8 +#define AUDIO_CAP_MP3 16 +#define AUDIO_CAP_AAC 32 +#define AUDIO_CAP_OGG 64 +#define AUDIO_CAP_SDDS 128 +#define AUDIO_CAP_AC3 256 + +#define AUDIO_STOP _IO('o', 1) +#define AUDIO_PLAY _IO('o', 2) +#define AUDIO_PAUSE _IO('o', 3) +#define AUDIO_CONTINUE _IO('o', 4) +#define AUDIO_SELECT_SOURCE _IO('o', 5) +#define AUDIO_SET_MUTE _IO('o', 6) +#define AUDIO_SET_AV_SYNC _IO('o', 7) +#define AUDIO_SET_BYPASS_MODE _IO('o', 8) +#define AUDIO_CHANNEL_SELECT _IO('o', 9) +#define AUDIO_GET_STATUS _IOR('o', 10, audio_status_t) + +#define AUDIO_GET_CAPABILITIES _IOR('o', 11, unsigned int) +#define AUDIO_CLEAR_BUFFER _IO('o', 12) +#define AUDIO_SET_ID _IO('o', 13) +#define AUDIO_SET_MIXER _IOW('o', 14, audio_mixer_t) +#define AUDIO_SET_STREAMTYPE _IO('o', 15) +#define AUDIO_BILINGUAL_CHANNEL_SELECT _IO('o', 20) + +#endif /* _DVBAUDIO_H_ */ diff --git a/include/uapi/linux/dvb/osd.h b/include/uapi/linux/dvb/osd.h new file mode 100644 index 000000000000..858997c74043 --- /dev/null +++ b/include/uapi/linux/dvb/osd.h @@ -0,0 +1,181 @@ +/* SPDX-License-Identifier: LGPL-2.1+ WITH Linux-syscall-note */ +/* + * osd.h - DEPRECATED On Screen Display API + * + * NOTE: should not be used on future drivers + * + * Copyright (C) 2001 Ralph Metzler + * & Marcus Metzler + * for convergence integrated media GmbH + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Lesser Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + */ + +#ifndef _DVBOSD_H_ +#define _DVBOSD_H_ + +#include + +typedef enum { + /* All functions return -2 on "not open" */ + OSD_Close = 1, /* () */ + /* + * Disables OSD and releases the buffers + * returns 0 on success + */ + OSD_Open, /* (x0,y0,x1,y1,BitPerPixel[2/4/8](color&0x0F),mix[0..15](color&0xF0)) */ + /* + * Opens OSD with this size and bit depth + * returns 0 on success, -1 on DRAM allocation error, -2 on "already open" + */ + OSD_Show, /* () */ + /* + * enables OSD mode + * returns 0 on success + */ + OSD_Hide, /* () */ + /* + * disables OSD mode + * returns 0 on success + */ + OSD_Clear, /* () */ + /* + * Sets all pixel to color 0 + * returns 0 on success + */ + OSD_Fill, /* (color) */ + /* + * Sets all pixel to color + * returns 0 on success + */ + OSD_SetColor, /* (color,R{x0},G{y0},B{x1},opacity{y1}) */ + /* + * set palette entry to , and apply + * R,G,B: 0..255 + * R=Red, G=Green, B=Blue + * opacity=0: pixel opacity 0% (only video pixel shows) + * opacity=1..254: pixel opacity as specified in header + * opacity=255: pixel opacity 100% (only OSD pixel shows) + * returns 0 on success, -1 on error + */ + OSD_SetPalette, /* (firstcolor{color},lastcolor{x0},data) */ + /* + * Set a number of entries in the palette + * sets the entries "firstcolor" through "lastcolor" from the array "data" + * data has 4 byte for each color: + * R,G,B, and a opacity value: 0->transparent, 1..254->mix, 255->pixel + */ + OSD_SetTrans, /* (transparency{color}) */ + /* + * Sets transparency of mixed pixel (0..15) + * returns 0 on success + */ + OSD_SetPixel, /* (x0,y0,color) */ + /* + * sets pixel , to color number + * returns 0 on success, -1 on error + */ + OSD_GetPixel, /* (x0,y0) */ + /* returns color number of pixel ,, or -1 */ + OSD_SetRow, /* (x0,y0,x1,data) */ + /* + * fills pixels x0,y through x1,y with the content of data[] + * returns 0 on success, -1 on clipping all pixel (no pixel drawn) + */ + OSD_SetBlock, /* (x0,y0,x1,y1,increment{color},data) */ + /* + * fills pixels x0,y0 through x1,y1 with the content of data[] + * inc contains the width of one line in the data block, + * inc<=0 uses blockwidth as linewidth + * returns 0 on success, -1 on clipping all pixel + */ + OSD_FillRow, /* (x0,y0,x1,color) */ + /* + * fills pixels x0,y through x1,y with the color + * returns 0 on success, -1 on clipping all pixel + */ + OSD_FillBlock, /* (x0,y0,x1,y1,color) */ + /* + * fills pixels x0,y0 through x1,y1 with the color + * returns 0 on success, -1 on clipping all pixel + */ + OSD_Line, /* (x0,y0,x1,y1,color) */ + /* + * draw a line from x0,y0 to x1,y1 with the color + * returns 0 on success + */ + OSD_Query, /* (x0,y0,x1,y1,xasp{color}}), yasp=11 */ + /* + * fills parameters with the picture dimensions and the pixel aspect ratio + * returns 0 on success + */ + OSD_Test, /* () */ + /* + * draws a test picture. for debugging purposes only + * returns 0 on success + * TODO: remove "test" in final version + */ + OSD_Text, /* (x0,y0,size,color,text) */ + OSD_SetWindow, /* (x0) set window with number 0 + * & Ralph Metzler + * for convergence integrated media GmbH + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + */ + +#ifndef _UAPI_DVBVIDEO_H_ +#define _UAPI_DVBVIDEO_H_ + +#include +#ifndef __KERNEL__ +#include +#endif + +typedef enum { + VIDEO_FORMAT_4_3, /* Select 4:3 format */ + VIDEO_FORMAT_16_9, /* Select 16:9 format. */ + VIDEO_FORMAT_221_1 /* 2.21:1 */ +} video_format_t; + + +typedef enum { + VIDEO_PAN_SCAN, /* use pan and scan format */ + VIDEO_LETTER_BOX, /* use letterbox format */ + VIDEO_CENTER_CUT_OUT /* use center cut out format */ +} video_displayformat_t; + +typedef struct { + int w; + int h; + video_format_t aspect_ratio; +} video_size_t; + +typedef enum { + VIDEO_SOURCE_DEMUX, /* Select the demux as the main source */ + VIDEO_SOURCE_MEMORY /* If this source is selected, the stream + comes from the user through the write + system call */ +} video_stream_source_t; + + +typedef enum { + VIDEO_STOPPED, /* Video is stopped */ + VIDEO_PLAYING, /* Video is currently playing */ + VIDEO_FREEZED /* Video is freezed */ +} video_play_state_t; + + +/* Decoder commands */ +#define VIDEO_CMD_PLAY (0) +#define VIDEO_CMD_STOP (1) +#define VIDEO_CMD_FREEZE (2) +#define VIDEO_CMD_CONTINUE (3) + +/* Flags for VIDEO_CMD_FREEZE */ +#define VIDEO_CMD_FREEZE_TO_BLACK (1 << 0) + +/* Flags for VIDEO_CMD_STOP */ +#define VIDEO_CMD_STOP_TO_BLACK (1 << 0) +#define VIDEO_CMD_STOP_IMMEDIATELY (1 << 1) + +/* Play input formats: */ +/* The decoder has no special format requirements */ +#define VIDEO_PLAY_FMT_NONE (0) +/* The decoder requires full GOPs */ +#define VIDEO_PLAY_FMT_GOP (1) + +/* The structure must be zeroed before use by the application + This ensures it can be extended safely in the future. */ +struct video_command { + __u32 cmd; + __u32 flags; + union { + struct { + __u64 pts; + } stop; + + struct { + /* 0 or 1000 specifies normal speed, + 1 specifies forward single stepping, + -1 specifies backward single stepping, + >1: playback at speed/1000 of the normal speed, + <-1: reverse playback at (-speed/1000) of the normal speed. */ + __s32 speed; + __u32 format; + } play; + + struct { + __u32 data[16]; + } raw; + }; +}; + +/* FIELD_UNKNOWN can be used if the hardware does not know whether + the Vsync is for an odd, even or progressive (i.e. non-interlaced) + field. */ +#define VIDEO_VSYNC_FIELD_UNKNOWN (0) +#define VIDEO_VSYNC_FIELD_ODD (1) +#define VIDEO_VSYNC_FIELD_EVEN (2) +#define VIDEO_VSYNC_FIELD_PROGRESSIVE (3) + +struct video_event { + __s32 type; +#define VIDEO_EVENT_SIZE_CHANGED 1 +#define VIDEO_EVENT_FRAME_RATE_CHANGED 2 +#define VIDEO_EVENT_DECODER_STOPPED 3 +#define VIDEO_EVENT_VSYNC 4 + /* unused, make sure to use atomic time for y2038 if it ever gets used */ + long timestamp; + union { + video_size_t size; + unsigned int frame_rate; /* in frames per 1000sec */ + unsigned char vsync_field; /* unknown/odd/even/progressive */ + } u; +}; + + +struct video_status { + int video_blank; /* blank video on freeze? 
*/ + video_play_state_t play_state; /* current state of playback */ + video_stream_source_t stream_source; /* current source (demux/memory) */ + video_format_t video_format; /* current aspect ratio of stream*/ + video_displayformat_t display_format;/* selected cropping mode */ +}; + + +struct video_still_picture { + char __user *iFrame; /* pointer to a single iframe in memory */ + __s32 size; +}; + + +typedef __u16 video_attributes_t; +/* bits: descr. */ +/* 15-14 Video compression mode (0=MPEG-1, 1=MPEG-2) */ +/* 13-12 TV system (0=525/60, 1=625/50) */ +/* 11-10 Aspect ratio (0=4:3, 3=16:9) */ +/* 9- 8 permitted display mode on 4:3 monitor (0=both, 1=only pan-sca */ +/* 7 line 21-1 data present in GOP (1=yes, 0=no) */ +/* 6 line 21-2 data present in GOP (1=yes, 0=no) */ +/* 5- 3 source resolution (0=720x480/576, 1=704x480/576, 2=352x480/57 */ +/* 2 source letterboxed (1=yes, 0=no) */ +/* 0 film/camera mode (0= + *camera, 1=film (625/50 only)) */ + + +/* bit definitions for capabilities: */ +/* can the hardware decode MPEG1 and/or MPEG2? */ +#define VIDEO_CAP_MPEG1 1 +#define VIDEO_CAP_MPEG2 2 +/* can you send a system and/or program stream to video device? + (you still have to open the video and the audio device but only + send the stream to the video device) */ +#define VIDEO_CAP_SYS 4 +#define VIDEO_CAP_PROG 8 +/* can the driver also handle SPU, NAVI and CSS encoded data? + (CSS API is not present yet) */ +#define VIDEO_CAP_SPU 16 +#define VIDEO_CAP_NAVI 32 +#define VIDEO_CAP_CSS 64 + + +#define VIDEO_STOP _IO('o', 21) +#define VIDEO_PLAY _IO('o', 22) +#define VIDEO_FREEZE _IO('o', 23) +#define VIDEO_CONTINUE _IO('o', 24) +#define VIDEO_SELECT_SOURCE _IO('o', 25) +#define VIDEO_SET_BLANK _IO('o', 26) +#define VIDEO_GET_STATUS _IOR('o', 27, struct video_status) +#define VIDEO_GET_EVENT _IOR('o', 28, struct video_event) +#define VIDEO_SET_DISPLAY_FORMAT _IO('o', 29) +#define VIDEO_STILLPICTURE _IOW('o', 30, struct video_still_picture) +#define VIDEO_FAST_FORWARD _IO('o', 31) +#define VIDEO_SLOWMOTION _IO('o', 32) +#define VIDEO_GET_CAPABILITIES _IOR('o', 33, unsigned int) +#define VIDEO_CLEAR_BUFFER _IO('o', 34) +#define VIDEO_SET_STREAMTYPE _IO('o', 36) +#define VIDEO_SET_FORMAT _IO('o', 37) +#define VIDEO_GET_SIZE _IOR('o', 55, video_size_t) + +/** + * VIDEO_GET_PTS + * + * Read the 33 bit presentation time stamp as defined + * in ITU T-REC-H.222.0 / ISO/IEC 13818-1. + * + * The PTS should belong to the currently played + * frame if possible, but may also be a value close to it + * like the PTS of the last decoded frame or the last PTS + * extracted by the PES parser. + */ +#define VIDEO_GET_PTS _IOR('o', 57, __u64) + +/* Read the number of displayed frames since the decoder was started */ +#define VIDEO_GET_FRAME_COUNT _IOR('o', 58, __u64) + +#define VIDEO_COMMAND _IOWR('o', 59, struct video_command) +#define VIDEO_TRY_COMMAND _IOWR('o', 60, struct video_command) + +#endif /* _UAPI_DVBVIDEO_H_ */ -- cgit v1.2.3-71-gd317 From e34a02dc40c95d126bb6486dcf802bbb8d1624a0 Mon Sep 17 00:00:00 2001 From: Dmitry Kadashev Date: Thu, 8 Jul 2021 13:34:45 +0700 Subject: io_uring: add support for IORING_OP_MKDIRAT IORING_OP_MKDIRAT behaves like mkdirat(2) and takes the same flags and arguments. 
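As a userspace illustration (not part of this patch), the new opcode can be driven by filling the SQE by hand; the field mapping mirrors io_mkdirat_prep() in the diff below (fd carries the directory fd, addr the pathname pointer, len the mode). This sketch assumes a liburing and uapi headers new enough to define IORING_OP_MKDIRAT, and deliberately relies on no opcode-specific prep helper:

#include <liburing.h>
#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <sys/types.h>

/* Sketch: queue mkdirat(AT_FDCWD, path, mode) through io_uring. */
static int queue_mkdirat(struct io_uring *ring, const char *path, mode_t mode)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return -EAGAIN;

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_MKDIRAT;
	sqe->fd = AT_FDCWD;                     /* dfd */
	sqe->addr = (uint64_t)(uintptr_t)path;  /* pathname */
	sqe->len = mode;                        /* mode */

	return io_uring_submit(ring);
}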
Acked-by: Linus Torvalds Signed-off-by: Dmitry Kadashev Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20210708063447.3556403-10-dkadashev@gmail.com [axboe: add splice_fd_in check] Signed-off-by: Jens Axboe --- fs/io_uring.c | 60 +++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/io_uring.h | 1 + 2 files changed, 61 insertions(+) (limited to 'include/uapi') diff --git a/fs/io_uring.c b/fs/io_uring.c index 3898f7ab14f6..8e14e71bf6ac 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -662,6 +662,13 @@ struct io_unlink { struct filename *filename; }; +struct io_mkdir { + struct file *file; + int dfd; + umode_t mode; + struct filename *filename; +}; + struct io_completion { struct file *file; u32 cflags; @@ -821,6 +828,7 @@ struct io_kiocb { struct io_shutdown shutdown; struct io_rename rename; struct io_unlink unlink; + struct io_mkdir mkdir; /* use only after cleaning per-op data, see io_clean_op() */ struct io_completion compl; }; @@ -1032,6 +1040,7 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_RENAMEAT] = {}, [IORING_OP_UNLINKAT] = {}, + [IORING_OP_MKDIRAT] = {}, }; /* requests with any of those set should undergo io_disarm_next() */ @@ -3603,6 +3612,49 @@ static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) return 0; } +static int io_mkdirat_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_mkdir *mkd = &req->mkdir; + const char __user *fname; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->ioprio || sqe->off || sqe->rw_flags || sqe->buf_index || + sqe->splice_fd_in) + return -EINVAL; + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + mkd->dfd = READ_ONCE(sqe->fd); + mkd->mode = READ_ONCE(sqe->len); + + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + mkd->filename = getname(fname); + if (IS_ERR(mkd->filename)) + return PTR_ERR(mkd->filename); + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +static int io_mkdirat(struct io_kiocb *req, int issue_flags) +{ + struct io_mkdir *mkd = &req->mkdir; + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode); + + req->flags &= ~REQ_F_NEED_CLEANUP; + if (ret < 0) + req_set_fail(req); + io_req_complete(req, ret); + return 0; +} + static int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -6000,6 +6052,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_renameat_prep(req, sqe); case IORING_OP_UNLINKAT: return io_unlinkat_prep(req, sqe); + case IORING_OP_MKDIRAT: + return io_mkdirat_prep(req, sqe); } printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -6163,6 +6217,9 @@ static void io_clean_op(struct io_kiocb *req) case IORING_OP_UNLINKAT: putname(req->unlink.filename); break; + case IORING_OP_MKDIRAT: + putname(req->mkdir.filename); + break; } } if ((req->flags & REQ_F_POLLED) && req->apoll) { @@ -6291,6 +6348,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) case IORING_OP_UNLINKAT: ret = io_unlinkat(req, issue_flags); break; + case IORING_OP_MKDIRAT: + ret = io_mkdirat(req, issue_flags); + break; default: ret = -EINVAL; break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 79126d5cd289..a926407c230e 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -133,6 +133,7 @@ enum { IORING_OP_SHUTDOWN, IORING_OP_RENAMEAT, IORING_OP_UNLINKAT, + IORING_OP_MKDIRAT, /* this goes 
last, obviously */ IORING_OP_LAST, -- cgit v1.2.3-71-gd317 From 7a8721f84fcb3b2946a92380b6fc311e017ff02c Mon Sep 17 00:00:00 2001 From: Dmitry Kadashev Date: Thu, 8 Jul 2021 13:34:46 +0700 Subject: io_uring: add support for IORING_OP_SYMLINKAT IORING_OP_SYMLINKAT behaves like symlinkat(2) and takes the same flags and arguments. Acked-by: Linus Torvalds Suggested-by: Christian Brauner Link: https://lore.kernel.org/io-uring/20210514145259.wtl4xcsp52woi6ab@wittgenstein/ Signed-off-by: Dmitry Kadashev Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20210708063447.3556403-11-dkadashev@gmail.com [axboe: add splice_fd_in check] Signed-off-by: Jens Axboe --- fs/internal.h | 1 + fs/io_uring.c | 67 +++++++++++++++++++++++++++++++++++++++++++ fs/namei.c | 3 +- include/uapi/linux/io_uring.h | 1 + 4 files changed, 70 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/fs/internal.h b/fs/internal.h index d6b15dad1310..2f9750aefbd6 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -77,6 +77,7 @@ int may_linkat(struct user_namespace *mnt_userns, struct path *link); int do_renameat2(int olddfd, struct filename *oldname, int newdfd, struct filename *newname, unsigned int flags); int do_mkdirat(int dfd, struct filename *name, umode_t mode); +int do_symlinkat(struct filename *from, int newdfd, struct filename *to); /* * namespace.c diff --git a/fs/io_uring.c b/fs/io_uring.c index 8e14e71bf6ac..33941df9084b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -669,6 +669,13 @@ struct io_mkdir { struct filename *filename; }; +struct io_symlink { + struct file *file; + int new_dfd; + struct filename *oldpath; + struct filename *newpath; +}; + struct io_completion { struct file *file; u32 cflags; @@ -829,6 +836,7 @@ struct io_kiocb { struct io_rename rename; struct io_unlink unlink; struct io_mkdir mkdir; + struct io_symlink symlink; /* use only after cleaning per-op data, see io_clean_op() */ struct io_completion compl; }; @@ -1041,6 +1049,7 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_RENAMEAT] = {}, [IORING_OP_UNLINKAT] = {}, [IORING_OP_MKDIRAT] = {}, + [IORING_OP_SYMLINKAT] = {}, }; /* requests with any of those set should undergo io_disarm_next() */ @@ -3655,6 +3664,55 @@ static int io_mkdirat(struct io_kiocb *req, int issue_flags) return 0; } +static int io_symlinkat_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_symlink *sl = &req->symlink; + const char __user *oldpath, *newpath; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->ioprio || sqe->len || sqe->rw_flags || sqe->buf_index || + sqe->splice_fd_in) + return -EINVAL; + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + sl->new_dfd = READ_ONCE(sqe->fd); + oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr)); + newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + + sl->oldpath = getname(oldpath); + if (IS_ERR(sl->oldpath)) + return PTR_ERR(sl->oldpath); + + sl->newpath = getname(newpath); + if (IS_ERR(sl->newpath)) { + putname(sl->oldpath); + return PTR_ERR(sl->newpath); + } + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +static int io_symlinkat(struct io_kiocb *req, int issue_flags) +{ + struct io_symlink *sl = &req->symlink; + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath); + + req->flags &= ~REQ_F_NEED_CLEANUP; + if (ret < 0) + req_set_fail(req); + io_req_complete(req, ret); + return 0; +} + static int io_shutdown_prep(struct io_kiocb *req, 
const struct io_uring_sqe *sqe) { @@ -6054,6 +6112,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_unlinkat_prep(req, sqe); case IORING_OP_MKDIRAT: return io_mkdirat_prep(req, sqe); + case IORING_OP_SYMLINKAT: + return io_symlinkat_prep(req, sqe); } printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -6220,6 +6280,10 @@ static void io_clean_op(struct io_kiocb *req) case IORING_OP_MKDIRAT: putname(req->mkdir.filename); break; + case IORING_OP_SYMLINKAT: + putname(req->symlink.oldpath); + putname(req->symlink.newpath); + break; } } if ((req->flags & REQ_F_POLLED) && req->apoll) { @@ -6351,6 +6415,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) case IORING_OP_MKDIRAT: ret = io_mkdirat(req, issue_flags); break; + case IORING_OP_SYMLINKAT: + ret = io_symlinkat(req, issue_flags); + break; default: ret = -EINVAL; break; diff --git a/fs/namei.c b/fs/namei.c index e2425c0a406e..803fc95b7658 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4223,8 +4223,7 @@ int vfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, } EXPORT_SYMBOL(vfs_symlink); -static int do_symlinkat(struct filename *from, int newdfd, - struct filename *to) +int do_symlinkat(struct filename *from, int newdfd, struct filename *to) { int error; struct dentry *dentry; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index a926407c230e..61fd347ab176 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -134,6 +134,7 @@ enum { IORING_OP_RENAMEAT, IORING_OP_UNLINKAT, IORING_OP_MKDIRAT, + IORING_OP_SYMLINKAT, /* this goes last, obviously */ IORING_OP_LAST, -- cgit v1.2.3-71-gd317 From cf30da90bc3a26911d369f199411f38b701394de Mon Sep 17 00:00:00 2001 From: Dmitry Kadashev Date: Thu, 8 Jul 2021 13:34:47 +0700 Subject: io_uring: add support for IORING_OP_LINKAT IORING_OP_LINKAT behaves like linkat(2) and takes the same flags and arguments. In some internal places 'hardlink' is used instead of 'link' to avoid confusion with the SQE links. Name 'link' conflicts with the existing 'link' member of io_kiocb. 
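As with mkdirat, the SQE layout for the two new path opcodes follows directly from the prep functions: io_symlinkat_prep() in the previous patch and io_linkat_prep() in the diff below. A userspace sketch, assuming uapi headers that already contain these additions; the helper names are illustrative only:

#include <linux/io_uring.h>
#include <stdint.h>
#include <string.h>

/* Sketch of IORING_OP_SYMLINKAT: fd = newdfd, addr = target, addr2 = linkpath. */
static void prep_symlinkat(struct io_uring_sqe *sqe, const char *target,
			   int newdfd, const char *linkpath)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_SYMLINKAT;
	sqe->fd = newdfd;
	sqe->addr = (uint64_t)(uintptr_t)target;
	sqe->addr2 = (uint64_t)(uintptr_t)linkpath;
}

/*
 * Sketch of IORING_OP_LINKAT: fd = olddfd, len = newdfd, addr = oldpath,
 * addr2 = newpath, hardlink_flags = flags (e.g. AT_SYMLINK_FOLLOW).
 */
static void prep_linkat(struct io_uring_sqe *sqe, int olddfd, const char *oldpath,
			int newdfd, const char *newpath, unsigned int flags)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_LINKAT;
	sqe->fd = olddfd;
	sqe->len = newdfd;
	sqe->addr = (uint64_t)(uintptr_t)oldpath;
	sqe->addr2 = (uint64_t)(uintptr_t)newpath;
	sqe->hardlink_flags = flags;
}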
Acked-by: Linus Torvalds Suggested-by: Christian Brauner Link: https://lore.kernel.org/io-uring/20210514145259.wtl4xcsp52woi6ab@wittgenstein/ Signed-off-by: Dmitry Kadashev Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20210708063447.3556403-12-dkadashev@gmail.com [axboe: add splice_fd_in check] Signed-off-by: Jens Axboe --- fs/internal.h | 2 ++ fs/io_uring.c | 71 +++++++++++++++++++++++++++++++++++++++++++ fs/namei.c | 2 +- include/uapi/linux/io_uring.h | 2 ++ 4 files changed, 76 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/fs/internal.h b/fs/internal.h index 2f9750aefbd6..2bb444600852 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -78,6 +78,8 @@ int do_renameat2(int olddfd, struct filename *oldname, int newdfd, struct filename *newname, unsigned int flags); int do_mkdirat(int dfd, struct filename *name, umode_t mode); int do_symlinkat(struct filename *from, int newdfd, struct filename *to); +int do_linkat(int olddfd, struct filename *old, int newdfd, + struct filename *new, int flags); /* * namespace.c diff --git a/fs/io_uring.c b/fs/io_uring.c index 33941df9084b..a89bbffbe042 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -676,6 +676,15 @@ struct io_symlink { struct filename *newpath; }; +struct io_hardlink { + struct file *file; + int old_dfd; + int new_dfd; + struct filename *oldpath; + struct filename *newpath; + int flags; +}; + struct io_completion { struct file *file; u32 cflags; @@ -837,6 +846,7 @@ struct io_kiocb { struct io_unlink unlink; struct io_mkdir mkdir; struct io_symlink symlink; + struct io_hardlink hardlink; /* use only after cleaning per-op data, see io_clean_op() */ struct io_completion compl; }; @@ -1050,6 +1060,7 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_UNLINKAT] = {}, [IORING_OP_MKDIRAT] = {}, [IORING_OP_SYMLINKAT] = {}, + [IORING_OP_LINKAT] = {}, }; /* requests with any of those set should undergo io_disarm_next() */ @@ -3713,6 +3724,57 @@ static int io_symlinkat(struct io_kiocb *req, int issue_flags) return 0; } +static int io_linkat_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_hardlink *lnk = &req->hardlink; + const char __user *oldf, *newf; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->ioprio || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) + return -EINVAL; + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + lnk->old_dfd = READ_ONCE(sqe->fd); + lnk->new_dfd = READ_ONCE(sqe->len); + oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); + newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + lnk->flags = READ_ONCE(sqe->hardlink_flags); + + lnk->oldpath = getname(oldf); + if (IS_ERR(lnk->oldpath)) + return PTR_ERR(lnk->oldpath); + + lnk->newpath = getname(newf); + if (IS_ERR(lnk->newpath)) { + putname(lnk->oldpath); + return PTR_ERR(lnk->newpath); + } + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +static int io_linkat(struct io_kiocb *req, int issue_flags) +{ + struct io_hardlink *lnk = &req->hardlink; + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd, + lnk->newpath, lnk->flags); + + req->flags &= ~REQ_F_NEED_CLEANUP; + if (ret < 0) + req_set_fail(req); + io_req_complete(req, ret); + return 0; +} + static int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -6114,6 +6176,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_mkdirat_prep(req, sqe); case 
IORING_OP_SYMLINKAT: return io_symlinkat_prep(req, sqe); + case IORING_OP_LINKAT: + return io_linkat_prep(req, sqe); } printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -6284,6 +6348,10 @@ static void io_clean_op(struct io_kiocb *req) putname(req->symlink.oldpath); putname(req->symlink.newpath); break; + case IORING_OP_LINKAT: + putname(req->hardlink.oldpath); + putname(req->hardlink.newpath); + break; } } if ((req->flags & REQ_F_POLLED) && req->apoll) { @@ -6418,6 +6486,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) case IORING_OP_SYMLINKAT: ret = io_symlinkat(req, issue_flags); break; + case IORING_OP_LINKAT: + ret = io_linkat(req, issue_flags); + break; default: ret = -EINVAL; break; diff --git a/fs/namei.c b/fs/namei.c index 803fc95b7658..0718e1e87eb4 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4368,7 +4368,7 @@ EXPORT_SYMBOL(vfs_link); * with linux 2.0, and to avoid hard-linking to directories * and other special files. --ADM */ -static int do_linkat(int olddfd, struct filename *old, int newdfd, +int do_linkat(int olddfd, struct filename *old, int newdfd, struct filename *new, int flags) { struct user_namespace *mnt_userns; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 61fd347ab176..10eb38d2864f 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -44,6 +44,7 @@ struct io_uring_sqe { __u32 splice_flags; __u32 rename_flags; __u32 unlink_flags; + __u32 hardlink_flags; }; __u64 user_data; /* data to be passed back at completion time */ /* pack this to avoid bogus arm OABI complaints */ @@ -135,6 +136,7 @@ enum { IORING_OP_UNLINKAT, IORING_OP_MKDIRAT, IORING_OP_SYMLINKAT, + IORING_OP_LINKAT, /* this goes last, obviously */ IORING_OP_LAST, -- cgit v1.2.3-71-gd317 From 6fc88c354f3af83ffa2c285b86e76c759755693f Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Thu, 19 Aug 2021 02:24:20 -0700 Subject: bpf: Migrate cgroup_bpf to internal cgroup_bpf_attach_type enum Add an enum (cgroup_bpf_attach_type) containing only valid cgroup_bpf attach types and a function to map bpf_attach_type values to the new enum. Inspired by netns_bpf_attach_type. Then, migrate cgroup_bpf to use cgroup_bpf_attach_type wherever possible. Functionality is unchanged as attach_type_to_prog_type switches in bpf/syscall.c were preventing non-cgroup programs from making use of the invalid cgroup_bpf array slots. As a result struct cgroup_bpf uses 504 fewer bytes relative to when its arrays were sized using MAX_BPF_ATTACH_TYPE. bpf_cgroup_storage is notably not migrated as struct bpf_cgroup_storage_key is part of uapi and contains a bpf_attach_type member which is not meant to be opaque. Similarly, bpf_cgroup_link continues to report its bpf_attach_type member to userspace via fdinfo and bpf_link_info. To ease disambiguation, bpf_attach_type variables are renamed from 'type' to 'atype' when changed to cgroup_bpf_attach_type. 
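For a rough cross-check of that number: on a 64-bit build, and ignoring padding between the arrays, each attach-type slot in struct cgroup_bpf costs one effective prog-array pointer (8 bytes), one list_head (16 bytes) and one u32 flags word (4 bytes), i.e. 28 bytes per slot, so the 504 bytes saved correspond to 28 * 18 = 504, i.e. 18 array slots that no longer have to be reserved for attach types that can never be used with cgroups.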
Signed-off-by: Dave Marchevsky Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210819092420.1984861-2-davemarchevsky@fb.com --- include/linux/bpf-cgroup.h | 182 ++++++++++++++++++++++++++++------------- include/uapi/linux/bpf.h | 2 +- kernel/bpf/cgroup.c | 156 +++++++++++++++++++++-------------- net/ipv4/af_inet.c | 6 +- net/ipv4/udp.c | 2 +- net/ipv6/af_inet6.c | 6 +- net/ipv6/udp.c | 2 +- tools/include/uapi/linux/bpf.h | 2 +- 8 files changed, 226 insertions(+), 132 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index a74cd1c3bd87..2746fd804216 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -23,9 +23,73 @@ struct ctl_table_header; struct task_struct; #ifdef CONFIG_CGROUP_BPF +enum cgroup_bpf_attach_type { + CGROUP_BPF_ATTACH_TYPE_INVALID = -1, + CGROUP_INET_INGRESS = 0, + CGROUP_INET_EGRESS, + CGROUP_INET_SOCK_CREATE, + CGROUP_SOCK_OPS, + CGROUP_DEVICE, + CGROUP_INET4_BIND, + CGROUP_INET6_BIND, + CGROUP_INET4_CONNECT, + CGROUP_INET6_CONNECT, + CGROUP_INET4_POST_BIND, + CGROUP_INET6_POST_BIND, + CGROUP_UDP4_SENDMSG, + CGROUP_UDP6_SENDMSG, + CGROUP_SYSCTL, + CGROUP_UDP4_RECVMSG, + CGROUP_UDP6_RECVMSG, + CGROUP_GETSOCKOPT, + CGROUP_SETSOCKOPT, + CGROUP_INET4_GETPEERNAME, + CGROUP_INET6_GETPEERNAME, + CGROUP_INET4_GETSOCKNAME, + CGROUP_INET6_GETSOCKNAME, + CGROUP_INET_SOCK_RELEASE, + MAX_CGROUP_BPF_ATTACH_TYPE +}; + +#define CGROUP_ATYPE(type) \ + case BPF_##type: return type + +static inline enum cgroup_bpf_attach_type +to_cgroup_bpf_attach_type(enum bpf_attach_type attach_type) +{ + switch (attach_type) { + CGROUP_ATYPE(CGROUP_INET_INGRESS); + CGROUP_ATYPE(CGROUP_INET_EGRESS); + CGROUP_ATYPE(CGROUP_INET_SOCK_CREATE); + CGROUP_ATYPE(CGROUP_SOCK_OPS); + CGROUP_ATYPE(CGROUP_DEVICE); + CGROUP_ATYPE(CGROUP_INET4_BIND); + CGROUP_ATYPE(CGROUP_INET6_BIND); + CGROUP_ATYPE(CGROUP_INET4_CONNECT); + CGROUP_ATYPE(CGROUP_INET6_CONNECT); + CGROUP_ATYPE(CGROUP_INET4_POST_BIND); + CGROUP_ATYPE(CGROUP_INET6_POST_BIND); + CGROUP_ATYPE(CGROUP_UDP4_SENDMSG); + CGROUP_ATYPE(CGROUP_UDP6_SENDMSG); + CGROUP_ATYPE(CGROUP_SYSCTL); + CGROUP_ATYPE(CGROUP_UDP4_RECVMSG); + CGROUP_ATYPE(CGROUP_UDP6_RECVMSG); + CGROUP_ATYPE(CGROUP_GETSOCKOPT); + CGROUP_ATYPE(CGROUP_SETSOCKOPT); + CGROUP_ATYPE(CGROUP_INET4_GETPEERNAME); + CGROUP_ATYPE(CGROUP_INET6_GETPEERNAME); + CGROUP_ATYPE(CGROUP_INET4_GETSOCKNAME); + CGROUP_ATYPE(CGROUP_INET6_GETSOCKNAME); + CGROUP_ATYPE(CGROUP_INET_SOCK_RELEASE); + default: + return CGROUP_BPF_ATTACH_TYPE_INVALID; + } +} + +#undef CGROUP_ATYPE -extern struct static_key_false cgroup_bpf_enabled_key[MAX_BPF_ATTACH_TYPE]; -#define cgroup_bpf_enabled(type) static_branch_unlikely(&cgroup_bpf_enabled_key[type]) +extern struct static_key_false cgroup_bpf_enabled_key[MAX_CGROUP_BPF_ATTACH_TYPE]; +#define cgroup_bpf_enabled(atype) static_branch_unlikely(&cgroup_bpf_enabled_key[atype]) #define for_each_cgroup_storage_type(stype) \ for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++) @@ -67,15 +131,15 @@ struct bpf_prog_array; struct cgroup_bpf { /* array of effective progs in this cgroup */ - struct bpf_prog_array __rcu *effective[MAX_BPF_ATTACH_TYPE]; + struct bpf_prog_array __rcu *effective[MAX_CGROUP_BPF_ATTACH_TYPE]; /* attached progs to this cgroup and attach flags * when flags == 0 or BPF_F_ALLOW_OVERRIDE the progs list will * have either zero or one element * when BPF_F_ALLOW_MULTI the list can have up to BPF_CGROUP_MAX_PROGS */ - struct list_head progs[MAX_BPF_ATTACH_TYPE]; - u32 
flags[MAX_BPF_ATTACH_TYPE]; + struct list_head progs[MAX_CGROUP_BPF_ATTACH_TYPE]; + u32 flags[MAX_CGROUP_BPF_ATTACH_TYPE]; /* list of cgroup shared storages */ struct list_head storages; @@ -115,28 +179,28 @@ int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, - enum bpf_attach_type type); + enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_sk(struct sock *sk, - enum bpf_attach_type type); + enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, - enum bpf_attach_type type, + enum cgroup_bpf_attach_type atype, void *t_ctx, u32 *flags); int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, - enum bpf_attach_type type); + enum cgroup_bpf_attach_type atype); int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, - short access, enum bpf_attach_type type); + short access, enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, char **buf, size_t *pcount, loff_t *ppos, - enum bpf_attach_type type); + enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level, int *optname, char __user *optval, @@ -179,9 +243,9 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_INET_INGRESS)) \ + if (cgroup_bpf_enabled(CGROUP_INET_INGRESS)) \ __ret = __cgroup_bpf_run_filter_skb(sk, skb, \ - BPF_CGROUP_INET_INGRESS); \ + CGROUP_INET_INGRESS); \ \ __ret; \ }) @@ -189,54 +253,54 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_INET_EGRESS) && sk && sk == skb->sk) { \ + if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk && sk == skb->sk) { \ typeof(sk) __sk = sk_to_full_sk(sk); \ if (sk_fullsock(__sk)) \ __ret = __cgroup_bpf_run_filter_skb(__sk, skb, \ - BPF_CGROUP_INET_EGRESS); \ + CGROUP_INET_EGRESS); \ } \ __ret; \ }) -#define BPF_CGROUP_RUN_SK_PROG(sk, type) \ +#define BPF_CGROUP_RUN_SK_PROG(sk, atype) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(type)) { \ - __ret = __cgroup_bpf_run_filter_sk(sk, type); \ + if (cgroup_bpf_enabled(atype)) { \ + __ret = __cgroup_bpf_run_filter_sk(sk, atype); \ } \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ - BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_CREATE) + BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET_SOCK_CREATE) #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) \ - BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_RELEASE) + BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET_SOCK_RELEASE) #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) \ - BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET4_POST_BIND) + BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET4_POST_BIND) #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) \ - BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET6_POST_BIND) + BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET6_POST_BIND) -#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type) \ +#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype) \ ({ \ u32 __unused_flags; \ int __ret = 0; \ - if (cgroup_bpf_enabled(type)) \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ + if (cgroup_bpf_enabled(atype)) \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype, \ NULL, \ &__unused_flags); \ __ret; \ 
}) -#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) \ +#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) \ ({ \ u32 __unused_flags; \ int __ret = 0; \ - if (cgroup_bpf_enabled(type)) { \ + if (cgroup_bpf_enabled(atype)) { \ lock_sock(sk); \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype, \ t_ctx, \ &__unused_flags); \ release_sock(sk); \ @@ -249,13 +313,13 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, * (at bit position 0) is to indicate CAP_NET_BIND_SERVICE capability check * should be bypassed (BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE). */ -#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, type, bind_flags) \ +#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, atype, bind_flags) \ ({ \ u32 __flags = 0; \ int __ret = 0; \ - if (cgroup_bpf_enabled(type)) { \ + if (cgroup_bpf_enabled(atype)) { \ lock_sock(sk); \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype, \ NULL, &__flags); \ release_sock(sk); \ if (__flags & BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE) \ @@ -265,33 +329,33 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, }) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) \ - ((cgroup_bpf_enabled(BPF_CGROUP_INET4_CONNECT) || \ - cgroup_bpf_enabled(BPF_CGROUP_INET6_CONNECT)) && \ + ((cgroup_bpf_enabled(CGROUP_INET4_CONNECT) || \ + cgroup_bpf_enabled(CGROUP_INET6_CONNECT)) && \ (sk)->sk_prot->pre_connect) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_CONNECT) + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, CGROUP_INET4_CONNECT) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_CONNECT) + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, CGROUP_INET6_CONNECT) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT, NULL) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_INET4_CONNECT, NULL) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT, NULL) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_INET6_CONNECT, NULL) #define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_SENDMSG, t_ctx) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP4_SENDMSG, t_ctx) #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_SENDMSG, t_ctx) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP6_SENDMSG, t_ctx) #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_RECVMSG, NULL) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP4_RECVMSG, NULL) #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_RECVMSG, NULL) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP6_RECVMSG, NULL) /* The SOCK_OPS"_SK" macro should be used when sock_ops->sk is not a * fullsock and its parent fullsock cannot be traced by @@ -311,33 +375,33 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(sock_ops, sk) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_SOCK_OPS)) \ + if (cgroup_bpf_enabled(CGROUP_SOCK_OPS)) \ __ret = __cgroup_bpf_run_filter_sock_ops(sk, \ 
sock_ops, \ - BPF_CGROUP_SOCK_OPS); \ + CGROUP_SOCK_OPS); \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_SOCK_OPS) && (sock_ops)->sk) { \ + if (cgroup_bpf_enabled(CGROUP_SOCK_OPS) && (sock_ops)->sk) { \ typeof(sk) __sk = sk_to_full_sk((sock_ops)->sk); \ if (__sk && sk_fullsock(__sk)) \ __ret = __cgroup_bpf_run_filter_sock_ops(__sk, \ sock_ops, \ - BPF_CGROUP_SOCK_OPS); \ + CGROUP_SOCK_OPS); \ } \ __ret; \ }) -#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access) \ +#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_DEVICE)) \ - __ret = __cgroup_bpf_check_dev_permission(type, major, minor, \ + if (cgroup_bpf_enabled(CGROUP_DEVICE)) \ + __ret = __cgroup_bpf_check_dev_permission(atype, major, minor, \ access, \ - BPF_CGROUP_DEVICE); \ + CGROUP_DEVICE); \ \ __ret; \ }) @@ -346,10 +410,10 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_SYSCTL)) \ + if (cgroup_bpf_enabled(CGROUP_SYSCTL)) \ __ret = __cgroup_bpf_run_filter_sysctl(head, table, write, \ buf, count, pos, \ - BPF_CGROUP_SYSCTL); \ + CGROUP_SYSCTL); \ __ret; \ }) @@ -357,7 +421,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, kernel_optval) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_SETSOCKOPT)) \ + if (cgroup_bpf_enabled(CGROUP_SETSOCKOPT)) \ __ret = __cgroup_bpf_run_filter_setsockopt(sock, level, \ optname, optval, \ optlen, \ @@ -368,7 +432,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT)) \ + if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT)) \ get_user(__ret, optlen); \ __ret; \ }) @@ -377,7 +441,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, max_optlen, retval) \ ({ \ int __ret = retval; \ - if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT)) \ + if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT)) \ if (!(sock)->sk_prot->bpf_bypass_getsockopt || \ !INDIRECT_CALL_INET_1((sock)->sk_prot->bpf_bypass_getsockopt, \ tcp_bpf_bypass_getsockopt, \ @@ -392,7 +456,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, optlen, retval) \ ({ \ int __ret = retval; \ - if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT)) \ + if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT)) \ __ret = __cgroup_bpf_run_filter_getsockopt_kern( \ sock, level, optname, optval, optlen, retval); \ __ret; \ @@ -451,14 +515,14 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, return 0; } -#define cgroup_bpf_enabled(type) (0) -#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) ({ 0; }) +#define cgroup_bpf_enabled(atype) (0) +#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) ({ 0; }) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, type, flags) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, atype, flags) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; }) 
#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; }) @@ -470,7 +534,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos) ({ 0; }) #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c4f7892edb2b..191f0b286ee3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -84,7 +84,7 @@ struct bpf_lpm_trie_key { struct bpf_cgroup_storage_key { __u64 cgroup_inode_id; /* cgroup inode id */ - __u32 attach_type; /* program attach type */ + __u32 attach_type; /* program attach type (enum bpf_attach_type) */ }; union bpf_iter_link_info { diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 8e9d99e2ade4..03145d45e3d5 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -19,7 +19,7 @@ #include "../cgroup/cgroup-internal.h" -DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_BPF_ATTACH_TYPE); +DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE); EXPORT_SYMBOL(cgroup_bpf_enabled_key); void cgroup_bpf_offline(struct cgroup *cgrp) @@ -113,12 +113,12 @@ static void cgroup_bpf_release(struct work_struct *work) struct list_head *storages = &cgrp->bpf.storages; struct bpf_cgroup_storage *storage, *stmp; - unsigned int type; + unsigned int atype; mutex_lock(&cgroup_mutex); - for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { - struct list_head *progs = &cgrp->bpf.progs[type]; + for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) { + struct list_head *progs = &cgrp->bpf.progs[atype]; struct bpf_prog_list *pl, *pltmp; list_for_each_entry_safe(pl, pltmp, progs, node) { @@ -128,10 +128,10 @@ static void cgroup_bpf_release(struct work_struct *work) if (pl->link) bpf_cgroup_link_auto_detach(pl->link); kfree(pl); - static_branch_dec(&cgroup_bpf_enabled_key[type]); + static_branch_dec(&cgroup_bpf_enabled_key[atype]); } old_array = rcu_dereference_protected( - cgrp->bpf.effective[type], + cgrp->bpf.effective[atype], lockdep_is_held(&cgroup_mutex)); bpf_prog_array_free(old_array); } @@ -196,7 +196,7 @@ static u32 prog_list_length(struct list_head *head) * if parent has overridable or multi-prog, allow attaching */ static bool hierarchy_allows_attach(struct cgroup *cgrp, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { struct cgroup *p; @@ -204,12 +204,12 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp, if (!p) return true; do { - u32 flags = p->bpf.flags[type]; + u32 flags = p->bpf.flags[atype]; u32 cnt; if (flags & BPF_F_ALLOW_MULTI) return true; - cnt = prog_list_length(&p->bpf.progs[type]); + cnt = prog_list_length(&p->bpf.progs[atype]); WARN_ON_ONCE(cnt > 1); if (cnt == 1) return !!(flags & BPF_F_ALLOW_OVERRIDE); @@ -225,7 +225,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp, * to programs in this cgroup */ static int compute_effective_progs(struct cgroup *cgrp, - enum bpf_attach_type type, + enum cgroup_bpf_attach_type atype, struct bpf_prog_array **array) { struct bpf_prog_array_item *item; @@ -236,8 
+236,8 @@ static int compute_effective_progs(struct cgroup *cgrp, /* count number of effective programs by walking parents */ do { - if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) - cnt += prog_list_length(&p->bpf.progs[type]); + if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) + cnt += prog_list_length(&p->bpf.progs[atype]); p = cgroup_parent(p); } while (p); @@ -249,10 +249,10 @@ static int compute_effective_progs(struct cgroup *cgrp, cnt = 0; p = cgrp; do { - if (cnt > 0 && !(p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) continue; - list_for_each_entry(pl, &p->bpf.progs[type], node) { + list_for_each_entry(pl, &p->bpf.progs[atype], node) { if (!prog_list_prog(pl)) continue; @@ -269,10 +269,10 @@ static int compute_effective_progs(struct cgroup *cgrp, } static void activate_effective_progs(struct cgroup *cgrp, - enum bpf_attach_type type, + enum cgroup_bpf_attach_type atype, struct bpf_prog_array *old_array) { - old_array = rcu_replace_pointer(cgrp->bpf.effective[type], old_array, + old_array = rcu_replace_pointer(cgrp->bpf.effective[atype], old_array, lockdep_is_held(&cgroup_mutex)); /* free prog array after grace period, since __cgroup_bpf_run_*() * might be still walking the array @@ -328,7 +328,7 @@ cleanup: } static int update_effective_progs(struct cgroup *cgrp, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { struct cgroup_subsys_state *css; int err; @@ -340,7 +340,7 @@ static int update_effective_progs(struct cgroup *cgrp, if (percpu_ref_is_zero(&desc->bpf.refcnt)) continue; - err = compute_effective_progs(desc, type, &desc->bpf.inactive); + err = compute_effective_progs(desc, atype, &desc->bpf.inactive); if (err) goto cleanup; } @@ -357,7 +357,7 @@ static int update_effective_progs(struct cgroup *cgrp, continue; } - activate_effective_progs(desc, type, desc->bpf.inactive); + activate_effective_progs(desc, atype, desc->bpf.inactive); desc->bpf.inactive = NULL; } @@ -436,11 +436,12 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, enum bpf_attach_type type, u32 flags) { u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)); - struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; + enum cgroup_bpf_attach_type atype; struct bpf_prog_list *pl; + struct list_head *progs; int err; if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) || @@ -454,10 +455,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, /* replace_prog implies BPF_F_REPLACE, and vice versa */ return -EINVAL; - if (!hierarchy_allows_attach(cgrp, type)) + atype = to_cgroup_bpf_attach_type(type); + if (atype < 0) + return -EINVAL; + + progs = &cgrp->bpf.progs[atype]; + + if (!hierarchy_allows_attach(cgrp, atype)) return -EPERM; - if (!list_empty(progs) && cgrp->bpf.flags[type] != saved_flags) + if (!list_empty(progs) && cgrp->bpf.flags[atype] != saved_flags) /* Disallow attaching non-overridable on top * of existing overridable in this cgroup. 
* Disallow attaching multi-prog if overridable or none @@ -490,16 +497,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, pl->prog = prog; pl->link = link; bpf_cgroup_storages_assign(pl->storage, storage); - cgrp->bpf.flags[type] = saved_flags; + cgrp->bpf.flags[atype] = saved_flags; - err = update_effective_progs(cgrp, type); + err = update_effective_progs(cgrp, atype); if (err) goto cleanup; if (old_prog) bpf_prog_put(old_prog); else - static_branch_inc(&cgroup_bpf_enabled_key[type]); + static_branch_inc(&cgroup_bpf_enabled_key[atype]); bpf_cgroup_storages_link(new_storage, cgrp, type); return 0; @@ -520,7 +527,7 @@ cleanup: * all descendant cgroups. This function is guaranteed to succeed. */ static void replace_effective_prog(struct cgroup *cgrp, - enum bpf_attach_type type, + enum cgroup_bpf_attach_type atype, struct bpf_cgroup_link *link) { struct bpf_prog_array_item *item; @@ -539,10 +546,10 @@ static void replace_effective_prog(struct cgroup *cgrp, /* find position of link in effective progs array */ for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) { - if (pos && !(cg->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) continue; - head = &cg->bpf.progs[type]; + head = &cg->bpf.progs[atype]; list_for_each_entry(pl, head, node) { if (!prog_list_prog(pl)) continue; @@ -554,7 +561,7 @@ static void replace_effective_prog(struct cgroup *cgrp, found: BUG_ON(!cg); progs = rcu_dereference_protected( - desc->bpf.effective[type], + desc->bpf.effective[atype], lockdep_is_held(&cgroup_mutex)); item = &progs->items[pos]; WRITE_ONCE(item->prog, link->link.prog); @@ -574,11 +581,18 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link, struct bpf_prog *new_prog) { - struct list_head *progs = &cgrp->bpf.progs[link->type]; + enum cgroup_bpf_attach_type atype; struct bpf_prog *old_prog; struct bpf_prog_list *pl; + struct list_head *progs; bool found = false; + atype = to_cgroup_bpf_attach_type(link->type); + if (atype < 0) + return -EINVAL; + + progs = &cgrp->bpf.progs[atype]; + if (link->link.prog->type != new_prog->type) return -EINVAL; @@ -592,7 +606,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, return -ENOENT; old_prog = xchg(&link->link.prog, new_prog); - replace_effective_prog(cgrp, link->type, link); + replace_effective_prog(cgrp, atype, link); bpf_prog_put(old_prog); return 0; } @@ -667,12 +681,20 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs, int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_cgroup_link *link, enum bpf_attach_type type) { - struct list_head *progs = &cgrp->bpf.progs[type]; - u32 flags = cgrp->bpf.flags[type]; - struct bpf_prog_list *pl; + enum cgroup_bpf_attach_type atype; struct bpf_prog *old_prog; + struct bpf_prog_list *pl; + struct list_head *progs; + u32 flags; int err; + atype = to_cgroup_bpf_attach_type(type); + if (atype < 0) + return -EINVAL; + + progs = &cgrp->bpf.progs[atype]; + flags = cgrp->bpf.flags[atype]; + if (prog && link) /* only one of prog or link can be specified */ return -EINVAL; @@ -686,7 +708,7 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, pl->prog = NULL; pl->link = NULL; - err = update_effective_progs(cgrp, type); + err = update_effective_progs(cgrp, atype); if (err) goto cleanup; @@ -695,10 +717,10 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, kfree(pl); if (list_empty(progs)) /* last program was detached, reset flags to zero */ - cgrp->bpf.flags[type] 
= 0; + cgrp->bpf.flags[atype] = 0; if (old_prog) bpf_prog_put(old_prog); - static_branch_dec(&cgroup_bpf_enabled_key[type]); + static_branch_dec(&cgroup_bpf_enabled_key[atype]); return 0; cleanup: @@ -714,13 +736,21 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, { __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); enum bpf_attach_type type = attr->query.attach_type; - struct list_head *progs = &cgrp->bpf.progs[type]; - u32 flags = cgrp->bpf.flags[type]; + enum cgroup_bpf_attach_type atype; struct bpf_prog_array *effective; + struct list_head *progs; struct bpf_prog *prog; int cnt, ret = 0, i; + u32 flags; + + atype = to_cgroup_bpf_attach_type(type); + if (atype < 0) + return -EINVAL; + + progs = &cgrp->bpf.progs[atype]; + flags = cgrp->bpf.flags[atype]; - effective = rcu_dereference_protected(cgrp->bpf.effective[type], + effective = rcu_dereference_protected(cgrp->bpf.effective[atype], lockdep_is_held(&cgroup_mutex)); if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) @@ -925,14 +955,14 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) link->cgroup = cgrp; link->type = attr->link_create.attach_type; - err = bpf_link_prime(&link->link, &link_primer); + err = bpf_link_prime(&link->link, &link_primer); if (err) { kfree(link); goto out_put_cgroup; } - err = cgroup_bpf_attach(cgrp, NULL, NULL, link, link->type, - BPF_F_ALLOW_MULTI); + err = cgroup_bpf_attach(cgrp, NULL, NULL, link, + link->type, BPF_F_ALLOW_MULTI); if (err) { bpf_link_cleanup(&link_primer); goto out_put_cgroup; @@ -986,7 +1016,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, */ int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { unsigned int offset = skb->data - skb_network_header(skb); struct sock *save_sk; @@ -1008,11 +1038,11 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, /* compute pointers for the bpf prog */ bpf_compute_and_save_data_end(skb, &saved_data_end); - if (type == BPF_CGROUP_INET_EGRESS) { + if (atype == CGROUP_INET_EGRESS) { ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY( - cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb); + cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb); } else { - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], skb, + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb); ret = (ret == 1 ? 0 : -EPERM); } @@ -1038,12 +1068,12 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); * and if it returned != 1 during execution. In all other cases, 0 is returned. */ int __cgroup_bpf_run_filter_sk(struct sock *sk, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); int ret; - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], sk, bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk, bpf_prog_run); return ret == 1 ? 
0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); @@ -1065,7 +1095,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); */ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, - enum bpf_attach_type type, + enum cgroup_bpf_attach_type atype, void *t_ctx, u32 *flags) { @@ -1090,7 +1120,7 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, } cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[type], &ctx, + ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx, bpf_prog_run, flags); return ret == 1 ? 0 : -EPERM; @@ -1115,19 +1145,19 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); */ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); int ret; - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], sock_ops, + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops, bpf_prog_run); return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, - short access, enum bpf_attach_type type) + short access, enum cgroup_bpf_attach_type atype) { struct cgroup *cgrp; struct bpf_cgroup_dev_ctx ctx = { @@ -1139,7 +1169,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, rcu_read_lock(); cgrp = task_dfl_cgroup(current); - allow = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], &ctx, + allow = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, bpf_prog_run); rcu_read_unlock(); @@ -1231,7 +1261,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, char **buf, size_t *pcount, loff_t *ppos, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { struct bpf_sysctl_kern ctx = { .head = head, @@ -1271,7 +1301,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, rcu_read_lock(); cgrp = task_dfl_cgroup(current); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], &ctx, bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, bpf_prog_run); rcu_read_unlock(); kfree(ctx.cur_val); @@ -1289,7 +1319,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, #ifdef CONFIG_NET static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, - enum bpf_attach_type attach_type) + enum cgroup_bpf_attach_type attach_type) { struct bpf_prog_array *prog_array; bool empty; @@ -1364,7 +1394,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, * attached to the hook so we don't waste time allocating * memory and locking the socket. */ - if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) + if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_SETSOCKOPT)) return 0; /* Allocate a bit more than the initial user buffer for @@ -1385,7 +1415,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, } lock_sock(sk); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT], + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_SETSOCKOPT], &ctx, bpf_prog_run); release_sock(sk); @@ -1460,7 +1490,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, * attached to the hook so we don't waste time allocating * memory and locking the socket. 
*/ - if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) + if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_GETSOCKOPT)) return retval; ctx.optlen = max_optlen; @@ -1495,7 +1525,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, } lock_sock(sk); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT], &ctx, bpf_prog_run); release_sock(sk); @@ -1556,7 +1586,7 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, * be called if that data shouldn't be "exported". */ - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT], &ctx, bpf_prog_run); if (!ret) return -EPERM; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 0e4d758c2585..1d816a5fd3eb 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -452,7 +452,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) * changes context in a wrong way it will be caught. */ err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, - BPF_CGROUP_INET4_BIND, &flags); + CGROUP_INET4_BIND, &flags); if (err) return err; @@ -781,7 +781,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin_port = inet->inet_dport; sin->sin_addr.s_addr = inet->inet_daddr; BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - BPF_CGROUP_INET4_GETPEERNAME, + CGROUP_INET4_GETPEERNAME, NULL); } else { __be32 addr = inet->inet_rcv_saddr; @@ -790,7 +790,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin_port = inet->inet_sport; sin->sin_addr.s_addr = addr; BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - BPF_CGROUP_INET4_GETSOCKNAME, + CGROUP_INET4_GETSOCKNAME, NULL); } memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1a742b710e54..8851c9463b4b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1143,7 +1143,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_unlock(); } - if (cgroup_bpf_enabled(BPF_CGROUP_UDP4_SENDMSG) && !connected) { + if (cgroup_bpf_enabled(CGROUP_UDP4_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, (struct sockaddr *)usin, &ipc.addr); if (err) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index d92c90d97763..b5878bb8e419 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -455,7 +455,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) * changes context in a wrong way it will be caught. 
*/ err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, - BPF_CGROUP_INET6_BIND, &flags); + CGROUP_INET6_BIND, &flags); if (err) return err; @@ -532,7 +532,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, if (np->sndflow) sin->sin6_flowinfo = np->flow_label; BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - BPF_CGROUP_INET6_GETPEERNAME, + CGROUP_INET6_GETPEERNAME, NULL); } else { if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) @@ -541,7 +541,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin6_addr = sk->sk_v6_rcv_saddr; sin->sin6_port = inet->inet_sport; BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - BPF_CGROUP_INET6_GETSOCKNAME, + CGROUP_INET6_GETSOCKNAME, NULL); } sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index c5e15e94bb00..ea53847b5b7e 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1475,7 +1475,7 @@ do_udp_sendmsg: fl6.saddr = np->saddr; fl6.fl6_sport = inet->inet_sport; - if (cgroup_bpf_enabled(BPF_CGROUP_UDP6_SENDMSG) && !connected) { + if (cgroup_bpf_enabled(CGROUP_UDP6_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, (struct sockaddr *)sin6, &fl6.saddr); if (err) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index c4f7892edb2b..191f0b286ee3 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -84,7 +84,7 @@ struct bpf_lpm_trie_key { struct bpf_cgroup_storage_key { __u64 cgroup_inode_id; /* cgroup inode id */ - __u32 attach_type; /* program attach type */ + __u32 attach_type; /* program attach type (enum bpf_attach_type) */ }; union bpf_iter_link_info { -- cgit v1.2.3-71-gd317 From 029ee6b14356b94120bedb852dcdaefc0a282cf1 Mon Sep 17 00:00:00 2001 From: Yufeng Mo Date: Fri, 20 Aug 2021 15:35:17 +0800 Subject: ethtool: add two coalesce attributes for CQE mode Currently, many drivers support CQE mode configuration: some configure it as fixed when initialized, while others provide an interface to change it via ethtool private flags. In order to make it more generic, add two new 'ETHTOOL_A_COALESCE_USE_CQE_TX' and 'ETHTOOL_A_COALESCE_USE_CQE_RX' coalesce attributes, so that these parameters can be accessed via the ethtool netlink coalesce uAPI. Also add a new structure kernel_ethtool_coalesce, so that the new parameters can be added to this struct.
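As an illustration (not part of this patch), a userspace sketch of setting the new attributes through the ethtool netlink interface could look like the code below. It uses libmnl, takes the attribute names from the uapi header added below (ETHTOOL_A_COALESCE_USE_CQE_MODE_TX/RX), and assumes the "ethtool" genetlink family id has already been resolved into family_id; socket setup, sending the message and ACK handling are omitted, and buf is expected to be at least MNL_SOCKET_BUFFER_SIZE bytes.

#include <stdint.h>
#include <linux/ethtool_netlink.h>
#include <linux/genetlink.h>
#include <linux/netlink.h>
#include <libmnl/libmnl.h>

/* Build an ETHTOOL_MSG_COALESCE_SET request enabling CQE mode on "eth0". */
static void build_cqe_mode_set(char *buf, uint16_t family_id)
{
        struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf);
        struct genlmsghdr *ghdr;
        struct nlattr *nest;

        nlh->nlmsg_type = family_id;
        nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;

        ghdr = mnl_nlmsg_put_extra_header(nlh, sizeof(*ghdr));
        ghdr->cmd = ETHTOOL_MSG_COALESCE_SET;
        ghdr->version = ETHTOOL_GENL_VERSION;

        nest = mnl_attr_nest_start(nlh, ETHTOOL_A_COALESCE_HEADER);
        mnl_attr_put_strz(nlh, ETHTOOL_A_HEADER_DEV_NAME, "eth0");
        mnl_attr_nest_end(nlh, nest);

        mnl_attr_put_u8(nlh, ETHTOOL_A_COALESCE_USE_CQE_MODE_TX, 1);
        mnl_attr_put_u8(nlh, ETHTOOL_A_COALESCE_USE_CQE_MODE_RX, 1);
}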
Signed-off-by: Yufeng Mo Signed-off-by: Huazhong Tan Signed-off-by: Jakub Kicinski --- Documentation/networking/ethtool-netlink.rst | 15 +++++++++++++++ include/linux/ethtool.h | 11 ++++++++++- include/uapi/linux/ethtool_netlink.h | 2 ++ net/ethtool/coalesce.c | 19 +++++++++++++++++-- net/ethtool/netlink.h | 2 +- 5 files changed, 45 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index c690bb37430d..d9b55b7a1a4d 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -947,12 +947,25 @@ Kernel response contents: ``ETHTOOL_A_COALESCE_TX_USECS_HIGH`` u32 delay (us), high Tx ``ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH`` u32 max packets, high Tx ``ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL`` u32 rate sampling interval + ``ETHTOOL_A_COALESCE_USE_CQE_TX`` bool timer reset mode, Tx + ``ETHTOOL_A_COALESCE_USE_CQE_RX`` bool timer reset mode, Rx =========================================== ====== ======================= Attributes are only included in reply if their value is not zero or the corresponding bit in ``ethtool_ops::supported_coalesce_params`` is set (i.e. they are declared as supported by driver). +Timer reset mode (``ETHTOOL_A_COALESCE_USE_CQE_TX`` and +``ETHTOOL_A_COALESCE_USE_CQE_RX``) controls the interaction between packet +arrival and the various time based delay parameters. By default timers are +expected to limit the max delay between any packet arrival/departure and a +corresponding interrupt. In this mode timer should be started by packet +arrival (sometimes delivery of previous interrupt) and reset when interrupt +is delivered. +Setting the appropriate attribute to 1 will enable ``CQE`` mode, where +each packet event resets the timer. In this mode timer is used to force +the interrupt if queue goes idle, while busy queues depend on the packet +limit to trigger interrupts. COALESCE_SET ============ @@ -985,6 +998,8 @@ Request contents: ``ETHTOOL_A_COALESCE_TX_USECS_HIGH`` u32 delay (us), high Tx ``ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH`` u32 max packets, high Tx ``ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL`` u32 rate sampling interval + ``ETHTOOL_A_COALESCE_USE_CQE_TX`` bool timer reset mode, Tx + ``ETHTOOL_A_COALESCE_USE_CQE_RX`` bool timer reset mode, Rx =========================================== ====== ======================= Request is rejected if it attributes declared as unsupported by driver (i.e. 
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 4711b96dae0c..a9d77a6a3e00 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -172,6 +172,11 @@ extern int __ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *link_ksettings); +struct kernel_ethtool_coalesce { + u8 use_cqe_mode_tx; + u8 use_cqe_mode_rx; +}; + /** * ethtool_intersect_link_masks - Given two link masks, AND them together * @dst: first mask and where result is stored @@ -211,7 +216,9 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, #define ETHTOOL_COALESCE_TX_USECS_HIGH BIT(19) #define ETHTOOL_COALESCE_TX_MAX_FRAMES_HIGH BIT(20) #define ETHTOOL_COALESCE_RATE_SAMPLE_INTERVAL BIT(21) -#define ETHTOOL_COALESCE_ALL_PARAMS GENMASK(21, 0) +#define ETHTOOL_COALESCE_USE_CQE_RX BIT(22) +#define ETHTOOL_COALESCE_USE_CQE_TX BIT(23) +#define ETHTOOL_COALESCE_ALL_PARAMS GENMASK(23, 0) #define ETHTOOL_COALESCE_USECS \ (ETHTOOL_COALESCE_RX_USECS | ETHTOOL_COALESCE_TX_USECS) @@ -237,6 +244,8 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, ETHTOOL_COALESCE_RX_USECS_LOW | ETHTOOL_COALESCE_RX_USECS_HIGH | \ ETHTOOL_COALESCE_PKT_RATE_LOW | ETHTOOL_COALESCE_PKT_RATE_HIGH | \ ETHTOOL_COALESCE_RATE_SAMPLE_INTERVAL) +#define ETHTOOL_COALESCE_USE_CQE \ + (ETHTOOL_COALESCE_USE_CQE_RX | ETHTOOL_COALESCE_USE_CQE_TX) #define ETHTOOL_STAT_NOT_SET (~0ULL) diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index b3b93710eff7..5545f1ca9237 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -377,6 +377,8 @@ enum { ETHTOOL_A_COALESCE_TX_USECS_HIGH, /* u32 */ ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH, /* u32 */ ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL, /* u32 */ + ETHTOOL_A_COALESCE_USE_CQE_MODE_TX, /* u8 */ + ETHTOOL_A_COALESCE_USE_CQE_MODE_RX, /* u8 */ /* add new constants above here */ __ETHTOOL_A_COALESCE_CNT, diff --git a/net/ethtool/coalesce.c b/net/ethtool/coalesce.c index 1d6bc132aa4d..e6bc53634e68 100644 --- a/net/ethtool/coalesce.c +++ b/net/ethtool/coalesce.c @@ -10,6 +10,7 @@ struct coalesce_req_info { struct coalesce_reply_data { struct ethnl_reply_data base; struct ethtool_coalesce coalesce; + struct kernel_ethtool_coalesce kernel_coalesce; u32 supported_params; }; @@ -100,7 +101,9 @@ static int coalesce_reply_size(const struct ethnl_req_info *req_base, nla_total_size(sizeof(u32)) + /* _RX_MAX_FRAMES_HIGH */ nla_total_size(sizeof(u32)) + /* _TX_USECS_HIGH */ nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES_HIGH */ - nla_total_size(sizeof(u32)); /* _RATE_SAMPLE_INTERVAL */ + nla_total_size(sizeof(u32)) + /* _RATE_SAMPLE_INTERVAL */ + nla_total_size(sizeof(u8)) + /* _USE_CQE_MODE_TX */ + nla_total_size(sizeof(u8)); /* _USE_CQE_MODE_RX */ } static bool coalesce_put_u32(struct sk_buff *skb, u16 attr_type, u32 val, @@ -124,6 +127,7 @@ static int coalesce_fill_reply(struct sk_buff *skb, const struct ethnl_reply_data *reply_base) { const struct coalesce_reply_data *data = COALESCE_REPDATA(reply_base); + const struct kernel_ethtool_coalesce *kcoal = &data->kernel_coalesce; const struct ethtool_coalesce *coal = &data->coalesce; u32 supported = data->supported_params; @@ -170,7 +174,11 @@ static int coalesce_fill_reply(struct sk_buff *skb, coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH, coal->tx_max_coalesced_frames_high, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL, - coal->rate_sample_interval, supported)) + coal->rate_sample_interval, 
supported) || + coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_CQE_MODE_TX, + kcoal->use_cqe_mode_tx, supported) || + coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_CQE_MODE_RX, + kcoal->use_cqe_mode_rx, supported)) return -EMSGSIZE; return 0; @@ -215,10 +223,13 @@ const struct nla_policy ethnl_coalesce_set_policy[] = { [ETHTOOL_A_COALESCE_TX_USECS_HIGH] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL] = { .type = NLA_U32 }, + [ETHTOOL_A_COALESCE_USE_CQE_MODE_TX] = NLA_POLICY_MAX(NLA_U8, 1), + [ETHTOOL_A_COALESCE_USE_CQE_MODE_RX] = NLA_POLICY_MAX(NLA_U8, 1), }; int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info) { + struct kernel_ethtool_coalesce kernel_coalesce = {}; struct ethtool_coalesce coalesce = {}; struct ethnl_req_info req_info = {}; struct nlattr **tb = info->attrs; @@ -303,6 +314,10 @@ int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info) tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH], &mod); ethnl_update_u32(&coalesce.rate_sample_interval, tb[ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL], &mod); + ethnl_update_u8(&kernel_coalesce.use_cqe_mode_tx, + tb[ETHTOOL_A_COALESCE_USE_CQE_MODE_TX], &mod); + ethnl_update_u8(&kernel_coalesce.use_cqe_mode_rx, + tb[ETHTOOL_A_COALESCE_USE_CQE_MODE_RX], &mod); ret = 0; if (!mod) goto out_ops; diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 077aac3929a8..e8987e28036f 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -359,7 +359,7 @@ extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_TX + 1]; extern const struct nla_policy ethnl_channels_get_policy[ETHTOOL_A_CHANNELS_HEADER + 1]; extern const struct nla_policy ethnl_channels_set_policy[ETHTOOL_A_CHANNELS_COMBINED_COUNT + 1]; extern const struct nla_policy ethnl_coalesce_get_policy[ETHTOOL_A_COALESCE_HEADER + 1]; -extern const struct nla_policy ethnl_coalesce_set_policy[ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL + 1]; +extern const struct nla_policy ethnl_coalesce_set_policy[ETHTOOL_A_COALESCE_MAX + 1]; extern const struct nla_policy ethnl_pause_get_policy[ETHTOOL_A_PAUSE_HEADER + 1]; extern const struct nla_policy ethnl_pause_set_policy[ETHTOOL_A_PAUSE_TX + 1]; extern const struct nla_policy ethnl_eee_get_policy[ETHTOOL_A_EEE_HEADER + 1]; -- cgit v1.2.3-71-gd317 From 54404d357284d1405d98c424951357d970f41168 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 16 Aug 2021 09:28:46 -0700 Subject: scsi: fc: Add EDC ELS definition Add Exchange Diagnostic Capabilities (EDC) ELS definition and the following capability descriptors: - Link Fault Capability Descriptor - Congestion Signaling Capability Descriptor Definitions taken from FC-LS-5 r5.01 Link: https://lore.kernel.org/r/20210816162901.121235-2-jsmart2021@gmail.com Signed-off-by: James Smart Signed-off-by: Martin K. 
Petersen --- include/uapi/scsi/fc/fc_els.h | 106 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/scsi/fc/fc_els.h b/include/uapi/scsi/fc/fc_els.h index 91d4be987220..c9812c5c2fc4 100644 --- a/include/uapi/scsi/fc/fc_els.h +++ b/include/uapi/scsi/fc/fc_els.h @@ -41,6 +41,7 @@ enum fc_els_cmd { ELS_REC = 0x13, /* read exchange concise */ ELS_SRR = 0x14, /* sequence retransmission request */ ELS_FPIN = 0x16, /* Fabric Performance Impact Notification */ + ELS_EDC = 0x17, /* Exchange Diagnostic Capabilities */ ELS_RDP = 0x18, /* Read Diagnostic Parameters */ ELS_RDF = 0x19, /* Register Diagnostic Functions */ ELS_PRLI = 0x20, /* process login */ @@ -111,6 +112,7 @@ enum fc_els_cmd { [ELS_REC] = "REC", \ [ELS_SRR] = "SRR", \ [ELS_FPIN] = "FPIN", \ + [ELS_EDC] = "EDC", \ [ELS_RDP] = "RDP", \ [ELS_RDF] = "RDF", \ [ELS_PRLI] = "PRLI", \ @@ -218,6 +220,10 @@ enum fc_els_rjt_explan { enum fc_ls_tlv_dtag { ELS_DTAG_LS_REQ_INFO = 0x00000001, /* Link Service Request Information Descriptor */ + ELS_DTAG_LNK_FAULT_CAP = 0x0001000D, + /* Link Fault Capability Descriptor */ + ELS_DTAG_CG_SIGNAL_CAP = 0x0001000F, + /* Congestion Signaling Capability Descriptor */ ELS_DTAG_LNK_INTEGRITY = 0x00020001, /* Link Integrity Notification Descriptor */ ELS_DTAG_DELIVERY = 0x00020002, @@ -236,6 +242,8 @@ enum fc_ls_tlv_dtag { */ #define FC_LS_TLV_DTAG_INIT { \ { ELS_DTAG_LS_REQ_INFO, "Link Service Request Information" }, \ + { ELS_DTAG_LNK_FAULT_CAP, "Link Fault Capability" }, \ + { ELS_DTAG_CG_SIGNAL_CAP, "Congestion Signaling Capability" }, \ { ELS_DTAG_LNK_INTEGRITY, "Link Integrity Notification" }, \ { ELS_DTAG_DELIVERY, "Delivery Notification Present" }, \ { ELS_DTAG_PEER_CONGEST, "Peer Congestion Notification" }, \ @@ -1144,4 +1152,102 @@ struct fc_els_rdf_resp { }; +/* + * Diagnostic Capability Descriptors for EDC ELS + */ + +/* + * Diagnostic: Link Fault Capability Descriptor + */ +struct fc_diag_lnkflt_desc { + __be32 desc_tag; /* Descriptor Tag (0x0001000D) */ + __be32 desc_len; /* Length of Descriptor (in bytes). + * Size of descriptor excluding + * desc_tag and desc_len fields. + * 12 bytes + */ + __be32 degrade_activate_threshold; + __be32 degrade_deactivate_threshold; + __be32 fec_degrade_interval; +}; + +enum fc_edc_cg_signal_cap_types { + /* Note: Capability: bits 31:4 Rsvd; bits 3:0 are capabilities */ + EDC_CG_SIG_NOTSUPPORTED = 0x00, /* neither supported */ + EDC_CG_SIG_WARN_ONLY = 0x01, + EDC_CG_SIG_WARN_ALARM = 0x02, /* both supported */ +}; + +/* + * Initializer useful for decoding table. + * Please keep this in sync with the above definitions. + */ +#define FC_EDC_CG_SIGNAL_CAP_TYPES_INIT { \ + { EDC_CG_SIG_NOTSUPPORTED, "Signaling Not Supported" }, \ + { EDC_CG_SIG_WARN_ONLY, "Warning Signal" }, \ + { EDC_CG_SIG_WARN_ALARM, "Warning and Alarm Signals" }, \ +} + +enum fc_diag_cg_sig_freq_types { + EDC_CG_SIGFREQ_CNT_MIN = 1, /* Min Frequency Count */ + EDC_CG_SIGFREQ_CNT_MAX = 999, /* Max Frequency Count */ + + EDC_CG_SIGFREQ_SEC = 0x1, /* Units: seconds */ + EDC_CG_SIGFREQ_MSEC = 0x2, /* Units: milliseconds */ +}; + +struct fc_diag_cg_sig_freq { + __be16 count; /* Time between signals + * note: upper 6 bits rsvd + */ + __be16 units; /* Time unit for count + * note: upper 12 bits rsvd + */ +}; + +/* + * Diagnostic: Congestion Signaling Capability Descriptor + */ +struct fc_diag_cg_sig_desc { + __be32 desc_tag; /* Descriptor Tag (0x0001000F) */ + __be32 desc_len; /* Length of Descriptor (in bytes). 
+ * Size of descriptor excluding + * desc_tag and desc_len fields. + * 16 bytes + */ + __be32 xmt_signal_capability; + struct fc_diag_cg_sig_freq xmt_signal_frequency; + __be32 rcv_signal_capability; + struct fc_diag_cg_sig_freq rcv_signal_frequency; +}; + +/* + * ELS_EDC - Exchange Diagnostic Capabilities + */ +struct fc_els_edc { + __u8 edc_cmd; /* command (0x17) */ + __u8 edc_zero[3]; /* specified as zero - part of cmd */ + __be32 desc_len; /* Length of Descriptor List (in bytes). + * Size of ELS excluding edc_cmd, + * edc_zero and desc_len fields. + */ + struct fc_tlv_desc desc[0]; + /* Diagnostic Descriptor list */ +}; + +/* + * ELS EDC LS_ACC Response. + */ +struct fc_els_edc_resp { + struct fc_els_ls_acc acc_hdr; + __be32 desc_list_len; /* Length of response (in + * bytes). Excludes acc_hdr + * and desc_list_len fields. + */ + struct fc_els_lsri_desc lsri; + struct fc_tlv_desc desc[0]; + /* Supported Diagnostic Descriptor list */ +}; + + #endif /* _FC_ELS_H_ */ -- cgit v1.2.3-71-gd317 From b9445598d8c60a1379887b957024b71343965f74 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 25 Aug 2021 12:25:45 +0100 Subject: io_uring: openat directly into fixed fd table Instead of opening a file into a process's file table as usual and then registering the fd within io_uring, some users may want to skip the first step and place it directly into io_uring's fixed file table. This patch adds such a capability for IORING_OP_OPENAT and IORING_OP_OPENAT2. The behaviour is controlled by setting sqe->file_index, where 0 implies the old behaviour using normal file tables. If non-zero value is specified, then it will behave as described and place the file into a fixed file slot sqe->file_index - 1. A file table should be already created, the slot should be valid and empty, otherwise the operation will fail. Keep the error codes consistent with IORING_OP_FILES_UPDATE, ENXIO and EINVAL on inappropriate fixed tables, and return EBADF on collision with already registered file. Note: IOSQE_FIXED_FILE can't be used to switch between modes, because accept takes a file, and it already uses the flag with a different meaning. 
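As an illustration (not part of this patch), a minimal userspace sketch of requesting a direct open into a fixed slot could look like the code below. Only ->file_index is new here; the dfd/pathname/flags/mode field usage follows the pre-existing IORING_OP_OPENAT convention (not visible in this diff), and the helper name prep_openat_fixed() plus the omitted ring setup and IORING_REGISTER_FILES call are assumptions.

#include <linux/io_uring.h>
#include <string.h>

/* Open "path" relative to dfd straight into fixed-file slot 'slot'.
 * A fixed file table must already be registered and the slot empty;
 * O_CLOEXEC is rejected in this mode.
 */
static void prep_openat_fixed(struct io_uring_sqe *sqe, int dfd,
                              const char *path, int flags, unsigned int mode,
                              unsigned int slot)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_OPENAT;
        sqe->fd = dfd;
        sqe->addr = (unsigned long)path;
        sqe->open_flags = flags;
        sqe->len = mode;
        sqe->file_index = slot + 1;   /* 0 keeps the old "return a normal fd" behaviour */
}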
Suggested-by: Josh Triplett Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/e9b33d1163286f51ea707f87d95bd596dada1e65.1629888991.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 74 ++++++++++++++++++++++++++++++++++++++----- include/uapi/linux/io_uring.h | 5 ++- 2 files changed, 70 insertions(+), 9 deletions(-) (limited to 'include/uapi') diff --git a/fs/io_uring.c b/fs/io_uring.c index 6859438c4e09..62da427d614d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -580,6 +580,7 @@ struct io_sr_msg { struct io_open { struct file *file; int dfd; + u32 file_slot; struct filename *filename; struct open_how how; unsigned long nofile; @@ -1063,6 +1064,9 @@ static void io_req_task_queue(struct io_kiocb *req); static void io_submit_flush_completions(struct io_ring_ctx *ctx); static int io_req_prep_async(struct io_kiocb *req); +static int io_install_fixed_file(struct io_kiocb *req, struct file *file, + unsigned int issue_flags, u32 slot_index); + static struct kmem_cache *req_cachep; static const struct file_operations io_uring_fops; @@ -3864,7 +3868,7 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (unlikely(sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)) + if (unlikely(sqe->ioprio || sqe->buf_index)) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -3881,6 +3885,11 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe req->open.filename = NULL; return ret; } + + req->open.file_slot = READ_ONCE(sqe->file_index); + if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC)) + return -EINVAL; + req->open.nofile = rlimit(RLIMIT_NOFILE); req->flags |= REQ_F_NEED_CLEANUP; return 0; @@ -3918,8 +3927,8 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) { struct open_flags op; struct file *file; - bool nonblock_set; - bool resolve_nonblock; + bool resolve_nonblock, nonblock_set; + bool fixed = !!req->open.file_slot; int ret; ret = build_open_flags(&req->open.how, &op); @@ -3938,9 +3947,11 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) op.open_flag |= O_NONBLOCK; } - ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile); - if (ret < 0) - goto err; + if (!fixed) { + ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile); + if (ret < 0) + goto err; + } file = do_filp_open(req->open.dfd, req->open.filename, &op); if (IS_ERR(file)) { @@ -3949,7 +3960,8 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) * marginal gain for something that is now known to be a slower * path. So just put it, and we'll get a new one when we retry. 
*/ - put_unused_fd(ret); + if (!fixed) + put_unused_fd(ret); ret = PTR_ERR(file); /* only retry if RESOLVE_CACHED wasn't already set by application */ @@ -3962,7 +3974,12 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set) file->f_flags &= ~O_NONBLOCK; fsnotify_open(file); - fd_install(ret, file); + + if (!fixed) + fd_install(ret, file); + else + ret = io_install_fixed_file(req, file, issue_flags, + req->open.file_slot - 1); err: putname(req->open.filename); req->flags &= ~REQ_F_NEED_CLEANUP; @@ -7899,6 +7916,46 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, #endif } +static int io_install_fixed_file(struct io_kiocb *req, struct file *file, + unsigned int issue_flags, u32 slot_index) +{ + struct io_ring_ctx *ctx = req->ctx; + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + struct io_fixed_file *file_slot; + int ret = -EBADF; + + io_ring_submit_lock(ctx, !force_nonblock); + if (file->f_op == &io_uring_fops) + goto err; + ret = -ENXIO; + if (!ctx->file_data) + goto err; + ret = -EINVAL; + if (slot_index >= ctx->nr_user_files) + goto err; + + slot_index = array_index_nospec(slot_index, ctx->nr_user_files); + file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); + ret = -EBADF; + if (file_slot->file_ptr) + goto err; + + *io_get_tag_slot(ctx->file_data, slot_index) = 0; + io_fixed_file_set(file_slot, file); + ret = io_sqe_file_register(ctx, file, slot_index); + if (ret) { + file_slot->file_ptr = 0; + goto err; + } + + ret = 0; +err: + io_ring_submit_unlock(ctx, !force_nonblock); + if (ret) + fput(file); + return ret; +} + static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, struct io_rsrc_node *node, void *rsrc) { @@ -10366,6 +10423,7 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(40, __u16, buf_group); BUILD_BUG_SQE_ELEM(42, __u16, personality); BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); + BUILD_BUG_SQE_ELEM(44, __u32, file_index); BUILD_BUG_ON(sizeof(struct io_uring_files_update) != sizeof(struct io_uring_rsrc_update)); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 79126d5cd289..45a4f2373694 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -55,7 +55,10 @@ struct io_uring_sqe { } __attribute__((packed)); /* personality to use, if used */ __u16 personality; - __s32 splice_fd_in; + union { + __s32 splice_fd_in; + __u32 file_index; + }; __u64 __pad2[2]; }; -- cgit v1.2.3-71-gd317 From dd6e10fbd9fb86a571d925602c8a24bb4d09a2a7 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Mon, 23 Aug 2021 19:43:49 -0700 Subject: bpf: Add bpf_task_pt_regs() helper The motivation behind this helper is to access userspace pt_regs in a kprobe handler. uprobe's ctx is the userspace pt_regs. kprobe's ctx is the kernelspace pt_regs. bpf_task_pt_regs() allows accessing userspace pt_regs in a kprobe handler. The final case (kernelspace pt_regs in uprobe) is pretty rare (usermode helper) so I think that can be solved later if necessary. More concretely, this helper is useful in doing BPF-based DWARF stack unwinding. Currently the kernel can only do framepointer based stack unwinds for userspace code. This is because the DWARF state machines are too fragile to be computed in kernelspace [0]. The idea behind DWARF-based stack unwinds w/ BPF is to copy a chunk of the userspace stack (while in prog context) and send it up to userspace for unwinding (probably with libunwind) [1]. 
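As an illustration (not part of this patch), a minimal kprobe program using the new helper could look like the sketch below. It assumes a libbpf/CO-RE build with a generated vmlinux.h and an x86_64 kernel (both the traced __x64_sys_nanosleep symbol and the pt_regs 'sp' field name are arch-specific); the program and section names are made up. A real DWARF unwinder would copy a chunk of user stack starting at that stack pointer rather than just printing it.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_core_read.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "GPL";

SEC("kprobe/__x64_sys_nanosleep")
int BPF_KPROBE(probe_nanosleep)
{
        /* ctx here is the kernel pt_regs; ask for the saved *user* regs. */
        struct task_struct *task = bpf_get_current_task_btf();
        struct pt_regs *uregs = (struct pt_regs *)bpf_task_pt_regs(task);
        u64 user_sp = BPF_CORE_READ(uregs, sp);

        bpf_printk("user sp = %llx", user_sp);
        return 0;
}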
This would effectively enable profiling applications with -fomit-frame-pointer using kprobes and uprobes. [0]: https://lkml.org/lkml/2012/2/10/356 [1]: https://github.com/danobi/bpf-dwarf-walk Signed-off-by: Daniel Xu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/e2718ced2d51ef4268590ab8562962438ab82815.1629772842.git.dxu@dxuuu.xyz --- include/uapi/linux/bpf.h | 7 +++++++ kernel/bpf/helpers.c | 3 +++ kernel/trace/bpf_trace.c | 19 +++++++++++++++++++ tools/include/uapi/linux/bpf.h | 7 +++++++ 4 files changed, 36 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 191f0b286ee3..791f31dd0abe 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4871,6 +4871,12 @@ union bpf_attr { * Return * Value specified by user at BPF link creation/attachment time * or 0, if it was not specified. + * + * long bpf_task_pt_regs(struct task_struct *task) + * Description + * Get the struct pt_regs associated with **task**. + * Return + * A pointer to struct pt_regs. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5048,6 +5054,7 @@ union bpf_attr { FN(timer_cancel), \ FN(get_func_ip), \ FN(get_attach_cookie), \ + FN(task_pt_regs), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 609674f409ed..c227b7d4f56c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1327,6 +1327,7 @@ const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; +const struct bpf_func_proto bpf_task_pt_regs_proto __weak; const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) @@ -1424,6 +1425,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_snprintf_btf_proto; case BPF_FUNC_snprintf: return &bpf_snprintf_proto; + case BPF_FUNC_task_pt_regs: + return &bpf_task_pt_regs_proto; default: return NULL; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4e54f3dc209f..580e14ee7ff9 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -723,6 +723,23 @@ const struct bpf_func_proto bpf_get_current_task_btf_proto = { .ret_btf_id = &btf_task_struct_ids[0], }; +BPF_CALL_1(bpf_task_pt_regs, struct task_struct *, task) +{ + return (unsigned long) task_pt_regs(task); +} + +BTF_ID_LIST(bpf_task_pt_regs_ids) +BTF_ID(struct, pt_regs) + +const struct bpf_func_proto bpf_task_pt_regs_proto = { + .func = bpf_task_pt_regs, + .gpl_only = true, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_task_struct_ids[0], + .ret_type = RET_PTR_TO_BTF_ID, + .ret_btf_id = &bpf_task_pt_regs_ids[0], +}; + BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) { struct bpf_array *array = container_of(map, struct bpf_array, map); @@ -1032,6 +1049,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_task_proto; case BPF_FUNC_get_current_task_btf: return &bpf_get_current_task_btf_proto; + case BPF_FUNC_task_pt_regs: + return &bpf_task_pt_regs_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_current_comm: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 191f0b286ee3..791f31dd0abe 100644 --- a/tools/include/uapi/linux/bpf.h +++ 
b/tools/include/uapi/linux/bpf.h @@ -4871,6 +4871,12 @@ union bpf_attr { * Return * Value specified by user at BPF link creation/attachment time * or 0, if it was not specified. + * + * long bpf_task_pt_regs(struct task_struct *task) + * Description + * Get the struct pt_regs associated with **task**. + * Return + * A pointer to struct pt_regs. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5048,6 +5054,7 @@ union bpf_attr { FN(timer_cancel), \ FN(get_func_ip), \ FN(get_attach_cookie), \ + FN(task_pt_regs), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3-71-gd317 From 6a241d2923c2c0f6893c78c79421ceb3935691fd Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 17 Aug 2021 17:53:29 +0200 Subject: um: virt-pci: fix uapi documentation The identifier names in the documentation here didn't match the real ones, and the reserved entry was missing. Fix that. Reported-by: Bjorn Helgaas Fixes: 68f5d3f3b654 ("um: add PCI over virtio emulation driver") Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- include/uapi/linux/virtio_pcidev.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_pcidev.h b/include/uapi/linux/virtio_pcidev.h index 89daa88bcfef..668b07ce515b 100644 --- a/include/uapi/linux/virtio_pcidev.h +++ b/include/uapi/linux/virtio_pcidev.h @@ -9,13 +9,14 @@ /** * enum virtio_pcidev_ops - virtual PCI device operations + * @VIRTIO_PCIDEV_OP_RESERVED: reserved to catch errors * @VIRTIO_PCIDEV_OP_CFG_READ: read config space, size is 1, 2, 4 or 8; * the @data field should be filled in by the device (in little endian). * @VIRTIO_PCIDEV_OP_CFG_WRITE: write config space, size is 1, 2, 4 or 8; * the @data field contains the data to write (in little endian). - * @VIRTIO_PCIDEV_OP_BAR_READ: read BAR mem/pio, size can be variable; + * @VIRTIO_PCIDEV_OP_MMIO_READ: read BAR mem/pio, size can be variable; * the @data field should be filled in by the device (in little endian). - * @VIRTIO_PCIDEV_OP_BAR_WRITE: write BAR mem/pio, size can be variable; + * @VIRTIO_PCIDEV_OP_MMIO_WRITE: write BAR mem/pio, size can be variable; * the @data field contains the data to write (in little endian). * @VIRTIO_PCIDEV_OP_MMIO_MEMSET: memset MMIO, size is variable but * the @data field only has one byte (unlike @VIRTIO_PCIDEV_OP_MMIO_WRITE) -- cgit v1.2.3-71-gd317 From 49b99da2c9ce13ffcd93fe3a0f5670791c1d76f7 Mon Sep 17 00:00:00 2001 From: Rocco Yue Date: Fri, 27 Aug 2021 23:04:12 +0800 Subject: ipv6: add IFLA_INET6_RA_MTU to expose mtu value The kernel provides a "/proc/sys/net/ipv6/conf/<iface>/mtu" file, which can temporarily record the mtu value of the last received RA message when the RA mtu value is lower than the interface mtu, but this proc has the following limitations: (1) when the interface mtu (/sys/class/net/<iface>/mtu) is updated, mtu6 (/proc/sys/net/ipv6/conf/<iface>/mtu) will be updated to the value of the interface mtu; (2) mtu6 (/proc/sys/net/ipv6/conf/<iface>/mtu) only affects ipv6 connections, not ipv4. Therefore, when the mtu option is carried in the RA message, the user sometimes cannot reliably obtain the RA mtu value by reading mtu6. After this patch set, if an RA message carries the mtu option, userspace can send a netlink message with nlmsg_type RTM_GETLINK and parse the IFLA_INET6_RA_MTU attribute to get the mtu value carried in the RA message received on the inet6 device.
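For example (illustrative only, not part of this patch), with the RTM_GETLINK request/receive plumbing handled by libmnl, the new attribute can be picked out of the returned RTM_NEWLINK message roughly as follows; it sits nested under IFLA_AF_SPEC -> AF_INET6:

#include <libmnl/libmnl.h>
#include <linux/if_link.h>
#include <linux/rtnetlink.h>
#include <sys/socket.h>
#include <stdio.h>

/* mnl_cb_t callback run on each RTM_NEWLINK message in the reply */
static int parse_link_msg(const struct nlmsghdr *nlh, void *data)
{
        const struct nlattr *attr, *af, *in6;

        mnl_attr_for_each(attr, nlh, sizeof(struct ifinfomsg)) {
                if (mnl_attr_get_type(attr) != IFLA_AF_SPEC)
                        continue;
                /* IFLA_AF_SPEC nests one attribute set per address family */
                mnl_attr_for_each_nested(af, attr) {
                        if (mnl_attr_get_type(af) != AF_INET6)
                                continue;
                        mnl_attr_for_each_nested(in6, af) {
                                if (mnl_attr_get_type(in6) == IFLA_INET6_RA_MTU)
                                        printf("ra_mtu: %u\n",
                                               mnl_attr_get_u32(in6));
                        }
                }
        }
        return MNL_CB_OK;
}

The same callback also works for the unsolicited RTM_NEWLINK notifications described next.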
In addition, you can also get a link notification when ra_mtu is updated so it doesn't have to poll. In this way, if the MTU values that the device receives from the network in the PCO IPv4 and the RA IPv6 procedures are different, the user can obtain the correct ipv6 ra_mtu value and compare the value of ra_mtu and ipv4 mtu, then the device can use the lower MTU value for both IPv4 and IPv6. Signed-off-by: Rocco Yue Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20210827150412.9267-1-rocco.yue@mediatek.com Signed-off-by: Jakub Kicinski --- include/net/if_inet6.h | 2 ++ include/uapi/linux/if_link.h | 1 + net/ipv6/addrconf.c | 10 ++++++++++ net/ipv6/ndisc.c | 17 +++++++++++------ tools/include/uapi/linux/if_link.h | 1 + 5 files changed, 25 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index 42235c178b06..653e7d0f65cb 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -210,6 +210,8 @@ struct inet6_dev { unsigned long tstamp; /* ipv6InterfaceTable update timestamp */ struct rcu_head rcu; + + unsigned int ra_mtu; }; static inline void ipv6_eth_mc_map(const struct in6_addr *addr, char *buf) diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 8aad65b69054..eebd3894fe89 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -417,6 +417,7 @@ enum { IFLA_INET6_ICMP6STATS, /* statistics (icmpv6) */ IFLA_INET6_TOKEN, /* device token */ IFLA_INET6_ADDR_GEN_MODE, /* implicit address generator mode */ + IFLA_INET6_RA_MTU, /* mtu carried in the RA message */ __IFLA_INET6_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 8381288a0d6e..17756f3ed33b 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -394,6 +394,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) ndev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY; ndev->cnf.mtu6 = dev->mtu; + ndev->ra_mtu = 0; ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl); if (!ndev->nd_parms) { kfree(ndev); @@ -3849,6 +3850,7 @@ restart: } idev->tstamp = jiffies; + idev->ra_mtu = 0; /* Last: Shot the device (if unregistered) */ if (unregister) { @@ -5543,6 +5545,7 @@ static inline size_t inet6_ifla6_size(void) + nla_total_size(ICMP6_MIB_MAX * 8) /* IFLA_INET6_ICMP6STATS */ + nla_total_size(sizeof(struct in6_addr)) /* IFLA_INET6_TOKEN */ + nla_total_size(1) /* IFLA_INET6_ADDR_GEN_MODE */ + + nla_total_size(4) /* IFLA_INET6_RA_MTU */ + 0; } @@ -5651,6 +5654,10 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev, if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->cnf.addr_gen_mode)) goto nla_put_failure; + if (idev->ra_mtu && + nla_put_u32(skb, IFLA_INET6_RA_MTU, idev->ra_mtu)) + goto nla_put_failure; + return 0; nla_put_failure: @@ -5767,6 +5774,9 @@ update_lft: static const struct nla_policy inet6_af_policy[IFLA_INET6_MAX + 1] = { [IFLA_INET6_ADDR_GEN_MODE] = { .type = NLA_U8 }, [IFLA_INET6_TOKEN] = { .len = sizeof(struct in6_addr) }, + [IFLA_INET6_RA_MTU] = { .type = NLA_REJECT, + .reject_message = + "IFLA_INET6_RA_MTU can not be set" }, }; static int check_addr_gen_mode(int mode) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index c467c6419893..4b098521a44c 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1391,12 +1391,6 @@ skip_defrtr: } } - /* - * Send a notify if RA changed managed/otherconf flags or timer settings - */ - if (send_ifinfo_notify) - inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); - skip_linkparms: /* @@ -1496,6 
+1490,11 @@ skip_routeinfo: memcpy(&n, ((u8 *)(ndopts.nd_opts_mtu+1))+2, sizeof(mtu)); mtu = ntohl(n); + if (in6_dev->ra_mtu != mtu) { + in6_dev->ra_mtu = mtu; + send_ifinfo_notify = true; + } + if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) { ND_PRINTK(2, warn, "RA: invalid mtu: %d\n", mtu); } else if (in6_dev->cnf.mtu6 != mtu) { @@ -1519,6 +1518,12 @@ skip_routeinfo: ND_PRINTK(2, warn, "RA: invalid RA options\n"); } out: + /* Send a notify if RA changed managed/otherconf flags or + * timer settings or ra_mtu value + */ + if (send_ifinfo_notify) + inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); + fib6_info_release(rt); if (neigh) neigh_release(neigh); diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index eb15f319aa57..b3610fdd1fee 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -230,6 +230,7 @@ enum { IFLA_INET6_ICMP6STATS, /* statistics (icmpv6) */ IFLA_INET6_TOKEN, /* device token */ IFLA_INET6_ADDR_GEN_MODE, /* implicit address generator mode */ + IFLA_INET6_RA_MTU, /* mtu carried in the RA message */ __IFLA_INET6_MAX }; -- cgit v1.2.3-71-gd317 From 486e19795f2ee11f0334e2e3fcf8951d4981ff88 Mon Sep 17 00:00:00 2001 From: Yuri Nudelman Date: Thu, 3 Jun 2021 17:51:58 +0300 Subject: habanalabs: allow fail on inability to respect hint A new user flag is required to make memory map hint mandatory, in contrast to the current situation where it is best effort. This is due to the requirement to map certain data to specific pre-determined device virtual address ranges. Signed-off-by: Yuri Nudelman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/memory.c | 45 ++++++++++++++++++++++++++++++--- include/uapi/misc/habanalabs.h | 1 + 2 files changed, 43 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c index d9429b98313a..d54fdd04be5b 100644 --- a/drivers/misc/habanalabs/common/memory.c +++ b/drivers/misc/habanalabs/common/memory.c @@ -564,6 +564,7 @@ static inline bool is_hint_crossing_range(enum hl_va_range_type range_type, * @hint_addr: hint for requested address by the user. * @va_block_align: required alignment of the virtual block start address. * @range_type: va range type (host, dram) + * @flags: additional memory flags, currently only uses HL_MEM_FORCE_HINT * * This function does the following: * - Iterate on the virtual block list to find a suitable virtual block for the @@ -574,7 +575,8 @@ static inline bool is_hint_crossing_range(enum hl_va_range_type range_type, static u64 get_va_block(struct hl_device *hdev, struct hl_va_range *va_range, u64 size, u64 hint_addr, u32 va_block_align, - enum hl_va_range_type range_type) + enum hl_va_range_type range_type, + u32 flags) { struct hl_vm_va_block *va_block, *new_va_block = NULL; struct asic_fixed_properties *prop = &hdev->asic_prop; @@ -584,6 +586,7 @@ static u64 get_va_block(struct hl_device *hdev, bool add_prev = false; bool is_align_pow_2 = is_power_of_2(va_range->page_size); bool is_hint_dram_addr = hl_is_dram_va(hdev, hint_addr); + bool force_hint = flags & HL_MEM_FORCE_HINT; if (is_align_pow_2) align_mask = ~((u64)va_block_align - 1); @@ -603,6 +606,15 @@ static u64 get_va_block(struct hl_device *hdev, (!is_align_pow_2 && is_hint_dram_addr && do_div(tmp_hint_addr, va_range->page_size))) { + if (force_hint) { + /* Hint must be repected, so here we just fail. 
+ */ + dev_err(hdev->dev, + "Hint address 0x%llx is not page aligned - cannot be respected\n", + hint_addr); + return 0; + } + dev_dbg(hdev->dev, "Hint address 0x%llx will be ignored because it is not aligned\n", hint_addr); @@ -660,6 +672,17 @@ static u64 get_va_block(struct hl_device *hdev, goto out; } + if (force_hint && reserved_valid_start != hint_addr) { + /* Hint address must be respected. If we are here - this means + * we could not respect it. + */ + dev_err(hdev->dev, + "Hint address 0x%llx could not be respected\n", + hint_addr); + reserved_valid_start = 0; + goto out; + } + /* * Check if there is some leftover range due to reserving the new * va block, then return it to the main virtual addresses list. @@ -712,7 +735,8 @@ u64 hl_reserve_va_block(struct hl_device *hdev, struct hl_ctx *ctx, enum hl_va_range_type type, u32 size, u32 alignment) { return get_va_block(hdev, ctx->va_range[type], size, 0, - max(alignment, ctx->va_range[type]->page_size), type); + max(alignment, ctx->va_range[type]->page_size), + type, 0); } /** @@ -1145,9 +1169,24 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args, goto hnode_err; } + if (hint_addr && phys_pg_pack->offset) { + if (args->flags & HL_MEM_FORCE_HINT) { + /* If hint must be repected, since we can't - just fail. + */ + dev_err(hdev->dev, + "Hint address 0x%llx cannot be respected because source memory is not aligned 0x%x\n", + hint_addr, phys_pg_pack->offset); + rc = -EINVAL; + goto va_block_err; + } + dev_dbg(hdev->dev, + "Hint address 0x%llx will be ignored because source memory is not aligned 0x%x\n", + hint_addr, phys_pg_pack->offset); + } + ret_vaddr = get_va_block(hdev, va_range, phys_pg_pack->total_size, hint_addr, va_block_align, - va_range_type); + va_range_type, args->flags); if (!ret_vaddr) { dev_err(hdev->dev, "no available va block for handle %u\n", handle); diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index a47a731e4527..18765eb75b65 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -813,6 +813,7 @@ union hl_wait_cs_args { #define HL_MEM_CONTIGUOUS 0x1 #define HL_MEM_SHARED 0x2 #define HL_MEM_USERPTR 0x4 +#define HL_MEM_FORCE_HINT 0x8 struct hl_mem_in { union { -- cgit v1.2.3-71-gd317 From 215f0c1775d5506c8a833b5c85a77b5fb65bf26b Mon Sep 17 00:00:00 2001 From: Ohad Sharabi Date: Mon, 14 Jun 2021 22:18:41 +0300 Subject: habanalabs: add wait-for-multi-CS uAPI When user sends multiple CSs, waiting for each CS is not efficient as it involves many user-kernel context switches. In order to address this issue we add support to "wait on multiple CSs" using a new uAPI which can wait on maximum of 32 CSs. The new uAPI is defined using a new flag - WAIT_FOR_MULTI_CS - in the wait_for_cs IOCTL. The input parameters for this uAPI will be: @seq: user pointer to an array of up to 32 CS's sequence numbers. @seq_array_len: length of sequence array. @timeout_us: timeout for waiting for any CS. The output paramateres for this API will be: @status: multi CS ioctl completion status (dedicated status was added as well). @flags: bitmap of output flags of the CS. @cs_completion_map: bitmap for multi CS, if CS sequence that was placed in index N in input seq array has completed- the N-th bit in cs_completion_map will be 1, otherwise it will be 0. 
@timestamp_nsec: timestamp of the first completed CS Signed-off-by: Ohad Sharabi Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- .../misc/habanalabs/common/command_submission.c | 443 ++++++++++++++++++++- drivers/misc/habanalabs/common/context.c | 22 +- drivers/misc/habanalabs/common/device.c | 4 + drivers/misc/habanalabs/common/habanalabs.h | 63 ++- drivers/misc/habanalabs/common/hw_queue.c | 4 + drivers/misc/habanalabs/gaudi/gaudi.c | 1 + drivers/misc/habanalabs/goya/goya.c | 1 + include/uapi/misc/habanalabs.h | 23 +- 8 files changed, 556 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index b373a52a47a7..ba0c854b2ed4 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -482,6 +482,91 @@ static void cs_handle_tdr(struct hl_device *hdev, struct hl_cs *cs) spin_unlock(&hdev->cs_mirror_lock); } +/* + * force_complete_multi_cs - complete all contexts that wait on multi-CS + * + * @hdev: pointer to habanalabs device structure + */ +static void force_complete_multi_cs(struct hl_device *hdev) +{ + int i; + + for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) { + struct multi_cs_completion *mcs_compl; + + mcs_compl = &hdev->multi_cs_completion[i]; + + spin_lock(&mcs_compl->lock); + + if (!mcs_compl->used) { + spin_unlock(&mcs_compl->lock); + continue; + } + + /* when calling force complete no context should be waiting on + * multi-cS. + * We are calling the function as a protection for such case + * to free any pending context and print error message + */ + dev_err(hdev->dev, + "multi-CS completion context %d still waiting when calling force completion\n", + i); + complete_all(&mcs_compl->completion); + spin_unlock(&mcs_compl->lock); + } +} + +/* + * complete_multi_cs - complete all waiting entities on multi-CS + * + * @hdev: pointer to habanalabs device structure + * @cs: CS structure + * + * The function signals waiting entity that its waiting stream has common + * stream with the completed CS. + * For example: + * - a completed CS worked on streams 0 and 1, multi CS completion + * is actively waiting on stream 3. don't send signal as no common stream + * - a completed CS worked on streams 0 and 1, multi CS completion + * is actively waiting on streams 1 and 3. send signal as stream 1 is common + */ +static void complete_multi_cs(struct hl_device *hdev, struct hl_cs *cs) +{ + struct hl_fence *fence = cs->fence; + int i; + + /* in case of multi CS check for completion only for the first CS */ + if (cs->staged_cs && !cs->staged_first) + return; + + for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) { + struct multi_cs_completion *mcs_compl; + + mcs_compl = &hdev->multi_cs_completion[i]; + if (!mcs_compl->used) + continue; + + spin_lock(&mcs_compl->lock); + + /* + * complete if: + * 1. still waiting for completion + * 2. 
the completed CS has at least one overlapping stream + * with the streams in the completion + */ + if (mcs_compl->used && + (fence->stream_map & mcs_compl->stream_map)) { + /* extract the timestamp only of first completed CS */ + if (!mcs_compl->timestamp) + mcs_compl->timestamp = + ktime_to_ns(fence->timestamp); + complete_all(&mcs_compl->completion); + } + + spin_unlock(&mcs_compl->lock); + } +} + static void cs_do_release(struct kref *ref) { struct hl_cs *cs = container_of(ref, struct hl_cs, refcount); @@ -575,6 +660,7 @@ out: if (cs->timestamp) cs->fence->timestamp = ktime_get(); complete_all(&cs->fence->completion); + complete_multi_cs(hdev, cs); hl_fence_put(cs->fence); kfree(cs->jobs_in_queue_cnt); @@ -804,6 +890,8 @@ void hl_cs_rollback_all(struct hl_device *hdev) cs_rollback(hdev, cs); cs_put(cs); } + + force_complete_multi_cs(hdev); } void hl_pending_cb_list_flush(struct hl_ctx *ctx) @@ -1134,6 +1222,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, struct hl_cs *cs; struct hl_cb *cb; u64 user_sequence; + u8 stream_map = 0; int rc, i; cntr = &hdev->aggregated_cs_counters; @@ -1192,9 +1281,18 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, cb = (struct hl_cb *) (uintptr_t) chunk->cb_handle; } - if (queue_type == QUEUE_TYPE_EXT || queue_type == QUEUE_TYPE_HW) + if (queue_type == QUEUE_TYPE_EXT || + queue_type == QUEUE_TYPE_HW) { int_queues_only = false; + /* + * store which stream are being used for external/HW + * queues of this CS + */ + if (hdev->supports_wait_for_multi_cs) + stream_map |= BIT((chunk->queue_index % 4)); + } + job = hl_cs_allocate_job(hdev, queue_type, is_kernel_allocated_cb); if (!job) { @@ -1255,6 +1353,13 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, goto free_cs_object; } + /* + * store the (external/HW queues) streams used by the CS in the + * fence object for multi-CS completion + */ + if (hdev->supports_wait_for_multi_cs) + cs->fence->stream_map = stream_map; + rc = hl_hw_queue_schedule_cs(cs); if (rc) { if (rc != -EAGAIN) @@ -1959,6 +2064,95 @@ static int hl_wait_for_fence(struct hl_ctx *ctx, u64 seq, struct hl_fence *fence return rc; } +/* + * hl_cs_poll_fences - iterate CS fences to check for CS completion + * + * @mcs_data: multi-CS internal data + * + * @return 0 on success, otherwise non 0 error code + * + * The function iterates on all CS sequence in the list and set bit in + * completion_bitmap for each completed CS. + * while iterating, the function can extracts the stream map to be later + * used by the waiting function. + * this function shall be called after taking context ref + */ +static int hl_cs_poll_fences(struct multi_cs_data *mcs_data) +{ + struct hl_fence **fence_ptr = mcs_data->fence_arr; + struct hl_device *hdev = mcs_data->ctx->hdev; + int i, rc, arr_len = mcs_data->arr_len; + u64 *seq_arr = mcs_data->seq_arr; + ktime_t max_ktime, first_cs_time; + enum hl_cs_wait_status status; + + memset(fence_ptr, 0, arr_len * sizeof(*fence_ptr)); + + /* get all fences under the same lock */ + rc = hl_ctx_get_fences(mcs_data->ctx, seq_arr, fence_ptr, arr_len); + if (rc) + return rc; + + /* + * set to maximum time to verify timestamp is valid: if at the end + * this value is maintained- no timestamp was updated + */ + max_ktime = ktime_set(KTIME_SEC_MAX, 0); + first_cs_time = max_ktime; + + for (i = 0; i < arr_len; i++, fence_ptr++) { + struct hl_fence *fence = *fence_ptr; + + /* + * function won't sleep as it is called with timeout 0 (i.e. 
+ * poll the fence) + */ + rc = hl_wait_for_fence(mcs_data->ctx, seq_arr[i], fence, + &status, 0, NULL); + if (rc) { + dev_err(hdev->dev, + "wait_for_fence error :%d for CS seq %llu\n", + rc, seq_arr[i]); + break; + } + + mcs_data->stream_map |= fence->stream_map; + + if (status == CS_WAIT_STATUS_BUSY) + continue; + + mcs_data->completion_bitmap |= BIT(i); + + /* + * best effort to extract timestamp. few notes: + * - if even single fence is gone we cannot extract timestamp + * (as fence not exist anymore) + * - for all completed CSs we take the earliest timestamp. + * for this we have to validate that: + * 1. given timestamp was indeed set + * 2. the timestamp is earliest of all timestamps so far + */ + + if (status == CS_WAIT_STATUS_GONE) { + mcs_data->update_ts = false; + mcs_data->gone_cs = true; + } else if (mcs_data->update_ts && + (ktime_compare(fence->timestamp, + ktime_set(0, 0)) > 0) && + (ktime_compare(fence->timestamp, first_cs_time) < 0)) { + first_cs_time = fence->timestamp; + } + } + + hl_fences_put(mcs_data->fence_arr, arr_len); + + if (mcs_data->update_ts && + (ktime_compare(first_cs_time, max_ktime) != 0)) + mcs_data->timestamp = ktime_to_ns(first_cs_time); + + return rc; +} + static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, u64 timeout_us, u64 seq, enum hl_cs_wait_status *status, s64 *timestamp) @@ -1980,6 +2174,251 @@ static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, return rc; } +/* + * hl_wait_multi_cs_completion_init - init completion structure + * + * @hdev: pointer to habanalabs device structure + * @stream_map: stream map, set bit indicates stream to wait on + * + * @return valid completion struct pointer on success, otherwise error pointer + * + * up to MULTI_CS_MAX_USER_CTX calls can be done concurrently to the driver. + * the function gets the first available completion (by marking it "used") + * and initialize its values. 
+ */ +static struct multi_cs_completion *hl_wait_multi_cs_completion_init( + struct hl_device *hdev, + u8 stream_map) +{ + struct multi_cs_completion *mcs_compl; + int i; + + /* find free multi_cs completion structure */ + for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) { + mcs_compl = &hdev->multi_cs_completion[i]; + spin_lock(&mcs_compl->lock); + if (!mcs_compl->used) { + mcs_compl->used = 1; + mcs_compl->timestamp = 0; + mcs_compl->stream_map = stream_map; + reinit_completion(&mcs_compl->completion); + spin_unlock(&mcs_compl->lock); + break; + } + spin_unlock(&mcs_compl->lock); + } + + if (i == MULTI_CS_MAX_USER_CTX) { + dev_err(hdev->dev, + "no available multi-CS completion structure\n"); + return ERR_PTR(-ENOMEM); + } + return mcs_compl; +} + +/* + * hl_wait_multi_cs_completion_fini - return completion structure and set as + * unused + * + * @mcs_compl: pointer to the completion structure + */ +static void hl_wait_multi_cs_completion_fini( + struct multi_cs_completion *mcs_compl) +{ + /* + * free completion structure, do it under lock to be in-sync with the + * thread that signals completion + */ + spin_lock(&mcs_compl->lock); + mcs_compl->used = 0; + spin_unlock(&mcs_compl->lock); +} + +/* + * hl_wait_multi_cs_completion - wait for first CS to complete + * + * @mcs_data: multi-CS internal data + * + * @return 0 on success, otherwise non 0 error code + */ +static int hl_wait_multi_cs_completion(struct multi_cs_data *mcs_data) +{ + struct hl_device *hdev = mcs_data->ctx->hdev; + struct multi_cs_completion *mcs_compl; + long completion_rc; + + mcs_compl = hl_wait_multi_cs_completion_init(hdev, + mcs_data->stream_map); + if (IS_ERR(mcs_compl)) + return PTR_ERR(mcs_compl); + + completion_rc = wait_for_completion_interruptible_timeout( + &mcs_compl->completion, + usecs_to_jiffies(mcs_data->timeout_us)); + + /* update timestamp */ + if (completion_rc > 0) + mcs_data->timestamp = mcs_compl->timestamp; + + hl_wait_multi_cs_completion_fini(mcs_compl); + + mcs_data->wait_status = completion_rc; + + return 0; +} + +/* + * hl_multi_cs_completion_init - init array of multi-CS completion structures + * + * @hdev: pointer to habanalabs device structure + */ +void hl_multi_cs_completion_init(struct hl_device *hdev) +{ + struct multi_cs_completion *mcs_cmpl; + int i; + + for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) { + mcs_cmpl = &hdev->multi_cs_completion[i]; + mcs_cmpl->used = 0; + spin_lock_init(&mcs_cmpl->lock); + init_completion(&mcs_cmpl->completion); + } +} + +/* + * hl_multi_cs_wait_ioctl - implementation of the multi-CS wait ioctl + * + * @hpriv: pointer to the private data of the fd + * @data: pointer to multi-CS wait ioctl in/out args + * + */ +static int hl_multi_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data) +{ + struct hl_device *hdev = hpriv->hdev; + struct multi_cs_data mcs_data = {0}; + union hl_wait_cs_args *args = data; + struct hl_ctx *ctx = hpriv->ctx; + struct hl_fence **fence_arr; + void __user *seq_arr; + u32 size_to_copy; + u64 *cs_seq_arr; + u8 seq_arr_len; + int rc; + + if (!hdev->supports_wait_for_multi_cs) { + dev_err(hdev->dev, "Wait for multi CS is not supported\n"); + return -EPERM; + } + + seq_arr_len = args->in.seq_arr_len; + + if (seq_arr_len > HL_WAIT_MULTI_CS_LIST_MAX_LEN) { + dev_err(hdev->dev, "Can wait only up to %d CSs, input sequence is of length %u\n", + HL_WAIT_MULTI_CS_LIST_MAX_LEN, seq_arr_len); + return -EINVAL; + } + + /* allocate memory for sequence array */ + cs_seq_arr = + kmalloc_array(seq_arr_len, sizeof(*cs_seq_arr), GFP_KERNEL); + if (!cs_seq_arr) + 
return -ENOMEM; + + /* copy CS sequence array from user */ + seq_arr = (void __user *) (uintptr_t) args->in.seq; + size_to_copy = seq_arr_len * sizeof(*cs_seq_arr); + if (copy_from_user(cs_seq_arr, seq_arr, size_to_copy)) { + dev_err(hdev->dev, "Failed to copy multi-cs sequence array from user\n"); + rc = -EFAULT; + goto free_seq_arr; + } + + /* allocate array for the fences */ + fence_arr = kmalloc_array(seq_arr_len, sizeof(*fence_arr), GFP_KERNEL); + if (!fence_arr) { + rc = -ENOMEM; + goto free_seq_arr; + } + + /* initialize the multi-CS internal data */ + mcs_data.ctx = ctx; + mcs_data.seq_arr = cs_seq_arr; + mcs_data.fence_arr = fence_arr; + mcs_data.arr_len = seq_arr_len; + + hl_ctx_get(hdev, ctx); + + /* poll all CS fences, extract timestamp */ + mcs_data.update_ts = true; + rc = hl_cs_poll_fences(&mcs_data); + /* + * skip wait for CS completion when one of the below is true: + * - an error on the poll function + * - one or more CS in the list completed + * - the user called ioctl with timeout 0 + */ + if (rc || mcs_data.completion_bitmap || !args->in.timeout_us) + goto put_ctx; + + /* wait (with timeout) for the first CS to be completed */ + mcs_data.timeout_us = args->in.timeout_us; + rc = hl_wait_multi_cs_completion(&mcs_data); + if (rc) + goto put_ctx; + + if (mcs_data.wait_status > 0) { + /* + * poll fences once again to update the CS map. + * no timestamp should be updated this time. + */ + mcs_data.update_ts = false; + rc = hl_cs_poll_fences(&mcs_data); + + /* + * if hl_wait_multi_cs_completion returned before timeout (i.e. + * it got a completion) we expect to see at least one CS + * completed after the poll function. + */ + if (!mcs_data.completion_bitmap) { + dev_err(hdev->dev, "Multi-CS got completion on wait but no CS completed\n"); + rc = -EFAULT; + } + } + +put_ctx: + hl_ctx_put(ctx); + kfree(fence_arr); + +free_seq_arr: + kfree(cs_seq_arr); + + /* update output args */ + memset(args, 0, sizeof(*args)); + if (rc) + return rc; + + if (mcs_data.completion_bitmap) { + args->out.status = HL_WAIT_CS_STATUS_COMPLETED; + args->out.cs_completion_map = mcs_data.completion_bitmap; + + /* if timestamp not 0- it's valid */ + if (mcs_data.timestamp) { + args->out.timestamp_nsec = mcs_data.timestamp; + args->out.flags |= HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD; + } + + /* update if some CS was gone */ + if (mcs_data.timestamp) + args->out.flags |= HL_WAIT_CS_STATUS_FLAG_GONE; + } else if (mcs_data.wait_status == -ERESTARTSYS) { + args->out.status = HL_WAIT_CS_STATUS_INTERRUPTED; + } else { + args->out.status = HL_WAIT_CS_STATUS_BUSY; + } + + return 0; +} + static int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data) { struct hl_device *hdev = hpriv->hdev; @@ -2221,6 +2660,8 @@ int hl_wait_ioctl(struct hl_fpriv *hpriv, void *data) if (flags & HL_WAIT_CS_FLAGS_INTERRUPT) rc = hl_interrupt_wait_ioctl(hpriv, data); + else if (flags & HL_WAIT_CS_FLAGS_MULTI_CS) + rc = hl_multi_cs_wait_ioctl(hpriv, data); else rc = hl_cs_wait_ioctl(hpriv, data); diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c index 1001c65c5e7a..4d922e4d0393 100644 --- a/drivers/misc/habanalabs/common/context.c +++ b/drivers/misc/habanalabs/common/context.c @@ -229,7 +229,17 @@ int hl_ctx_put(struct hl_ctx *ctx) return kref_put(&ctx->refcount, hl_ctx_do_release); } -/* this function shall be called with cs_lock locked */ +/* + * hl_ctx_get_fence_locked - get CS fence under CS lock + * + * @ctx: pointer to the context structure. 
+ * @seq: CS sequences number + * + * @return valid fence pointer on success, NULL if fence is gone, otherwise + * error pointer. + * + * NOTE: this function shall be called with cs_lock locked + */ static struct hl_fence *hl_ctx_get_fence_locked(struct hl_ctx *ctx, u64 seq) { struct asic_fixed_properties *asic_prop = &ctx->hdev->asic_prop; @@ -259,6 +269,16 @@ struct hl_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq) return fence; } +/* + * hl_ctx_get_fences - get multiple CS fences under the same CS lock + * + * @ctx: pointer to the context structure. + * @seq_arr: array of CS sequences to wait for + * @fence: fence array to store the CS fences + * @arr_len: length of seq_arr and fence_arr + * + * @return 0 on success, otherwise non 0 error code + */ int hl_ctx_get_fences(struct hl_ctx *ctx, u64 *seq_arr, struct hl_fence **fence, u32 arr_len) { diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 4f76c7a51605..3751c915f731 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -1297,6 +1297,10 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass) if (rc) goto user_interrupts_fini; + + /* initialize completion structure for multi CS wait */ + hl_multi_cs_completion_init(hdev); + /* * Initialize the H/W queues. Must be done before hw_init, because * there the addresses of the kernel queue are being written to the diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 26f75d070f72..6d5154434637 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -585,7 +585,8 @@ struct asic_fixed_properties { * @cs_sequence: sequence of the corresponding command submission * @error: mark this fence with error * @timestamp: timestamp upon completion - * + * @stream_map: streams bitmap to represent all streams that multi cs is + * waiting on */ struct hl_fence { struct completion completion; @@ -593,6 +594,7 @@ struct hl_fence { u64 cs_sequence; int error; ktime_t timestamp; + u8 stream_map; }; /** @@ -2234,6 +2236,58 @@ struct hl_mmu_funcs { u64 virt_addr, struct hl_mmu_hop_info *hops); }; +/** + * number of user contexts allowed to call wait_for_multi_cs ioctl in + * parallel + */ +#define MULTI_CS_MAX_USER_CTX 2 + +/** + * struct multi_cs_completion - multi CS wait completion. + * @completion: completion of any of the CS in the list + * @lock: spinlock for the completion structure + * @timestamp: timestamp for the multi-CS completion + * @used: 1 if in use, otherwise 0 + * @stream_map: bitmap of all HW/external queues streams on which the multi-CS + * is waiting + */ +struct multi_cs_completion { + struct completion completion; + spinlock_t lock; + s64 timestamp; + u8 used; + u8 stream_map; +}; + +/** + * struct multi_cs_data - internal data for multi CS call + * @ctx: pointer to the context structure + * @fence_arr: array of fences of all CSs + * @seq_arr: array of CS sequence numbers + * @timeout_us: timeout in usec for waiting for CS to complete + * @timestamp: timestamp of first completed CS + * @wait_status: wait for CS status + * @completion_bitmap: bitmap of completed CSs (1- completed, otherwise 0) + * @arr_len: fence_arr and seq_arr array length + * @stream_map: bitmap of all HW/external queues streams on which the multi-CS + * is waiting + * @gone_cs: indication of gone CS (1- there was gone CS, otherwise 0) + * @update_ts: update timestamp. 
1- update the timestamp, otherwise 0. + */ +struct multi_cs_data { + struct hl_ctx *ctx; + struct hl_fence **fence_arr; + u64 *seq_arr; + s64 timeout_us; + s64 timestamp; + long wait_status; + u32 completion_bitmap; + u8 arr_len; + u8 stream_map; + u8 gone_cs; + u8 update_ts; +}; + /** * struct hl_device - habanalabs device structure. * @pdev: pointer to PCI device, can be NULL in case of simulator device. @@ -2300,6 +2354,7 @@ struct hl_mmu_funcs { * @fw_loader: FW loader manager. * @pci_mem_region: array of memory regions in the PCI * @state_dump_specs: constants and dictionaries needed to dump system state. + * @multi_cs_completion: array of multi-CS completion. * @dram_used_mem: current DRAM memory consumption. * @timeout_jiffies: device CS timeout value. * @max_power: the max power of the device, as configured by the sysadmin. This @@ -2376,6 +2431,7 @@ struct hl_mmu_funcs { * halted. We can't halt it again because the COMMS * protocol will throw an error. Relevant only for * cases where Linux was not loaded to device CPU + * @supports_wait_for_multi_cs: true if wait for multi CS is supported */ struct hl_device { struct pci_dev *pdev; @@ -2446,6 +2502,9 @@ struct hl_device { struct hl_state_dump_specs state_dump_specs; + struct multi_cs_completion multi_cs_completion[ + MULTI_CS_MAX_USER_CTX]; + atomic64_t dram_used_mem; u64 timeout_jiffies; u64 max_power; @@ -2495,6 +2554,7 @@ struct hl_device { u8 curr_reset_cause; u8 skip_reset_on_timeout; u8 device_cpu_is_halted; + u8 supports_wait_for_multi_cs; /* Parameters for bring-up */ u64 nic_ports_mask; @@ -2701,6 +2761,7 @@ bool cs_needs_completion(struct hl_cs *cs); bool cs_needs_timeout(struct hl_cs *cs); bool is_staged_cs_last_exists(struct hl_device *hdev, struct hl_cs *cs); struct hl_cs *hl_staged_cs_find_first(struct hl_device *hdev, u64 cs_seq); +void hl_multi_cs_completion_init(struct hl_device *hdev); void goya_set_asic_funcs(struct hl_device *hdev); void gaudi_set_asic_funcs(struct hl_device *hdev); diff --git a/drivers/misc/habanalabs/common/hw_queue.c b/drivers/misc/habanalabs/common/hw_queue.c index 0afead229e97..f05a0dbd0990 100644 --- a/drivers/misc/habanalabs/common/hw_queue.c +++ b/drivers/misc/habanalabs/common/hw_queue.c @@ -603,6 +603,10 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs) } list_add_tail(&cs->staged_cs_node, &staged_cs->staged_cs_node); + + /* update stream map of the first CS */ + if (hdev->supports_wait_for_multi_cs) + staged_cs->fence->stream_map |= cs->fence->stream_map; } list_add_tail(&cs->mirror_node, &hdev->cs_mirror_list); diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 561485dcddaf..a4b33b0b17d4 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -1814,6 +1814,7 @@ static int gaudi_sw_init(struct hl_device *hdev) hdev->supports_sync_stream = true; hdev->supports_coresight = true; hdev->supports_staged_submission = true; + hdev->supports_wait_for_multi_cs = true; gaudi_set_pci_memory_regions(hdev); diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index c8d58dd2c041..f6251d8663b2 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -958,6 +958,7 @@ static int goya_sw_init(struct hl_device *hdev) hdev->supports_coresight = true; hdev->supports_soft_reset = true; hdev->allow_external_soft_reset = true; + hdev->supports_wait_for_multi_cs = false; goya_set_pci_memory_regions(hdev); diff --git a/include/uapi/misc/habanalabs.h 
b/include/uapi/misc/habanalabs.h index 18765eb75b65..49c737c4a2f6 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -735,11 +735,18 @@ union hl_cs_args { #define HL_WAIT_CS_FLAGS_INTERRUPT 0x2 #define HL_WAIT_CS_FLAGS_INTERRUPT_MASK 0xFFF00000 +#define HL_WAIT_CS_FLAGS_MULTI_CS 0x4 + +#define HL_WAIT_MULTI_CS_LIST_MAX_LEN 32 struct hl_wait_cs_in { union { struct { - /* Command submission sequence number */ + /* + * In case of wait_cs holds the CS sequence number. + * In case of wait for multi CS hold a user pointer to + * an array of CS sequence numbers + */ __u64 seq; /* Absolute timeout to wait for command submission * in microseconds @@ -767,12 +774,17 @@ struct hl_wait_cs_in { /* Context ID - Currently not in use */ __u32 ctx_id; + /* HL_WAIT_CS_FLAGS_* * If HL_WAIT_CS_FLAGS_INTERRUPT is set, this field should include * interrupt id according to HL_WAIT_CS_FLAGS_INTERRUPT_MASK, in order * not to specify an interrupt id ,set mask to all 1s. */ __u32 flags; + + /* Multi CS API info- valid entries in multi-CS array */ + __u8 seq_arr_len; + __u8 pad[7]; }; #define HL_WAIT_CS_STATUS_COMPLETED 0 @@ -789,8 +801,15 @@ struct hl_wait_cs_out { __u32 status; /* HL_WAIT_CS_STATUS_FLAG* */ __u32 flags; - /* valid only if HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD is set */ + /* + * valid only if HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD is set + * for wait_cs: timestamp of CS completion + * for wait_multi_cs: timestamp of FIRST CS completion + */ __s64 timestamp_nsec; + /* multi CS completion bitmap */ + __u32 cs_completion_map; + __u32 pad; }; union hl_wait_cs_args { -- cgit v1.2.3-71-gd317 From dadf17abb7245d9556591d8cc78bf57462e3b20a Mon Sep 17 00:00:00 2001 From: farah kassabri Date: Mon, 24 May 2021 18:09:22 +0300 Subject: habanalabs: add support for encapsulated signals reservation The signaling from within encapsulated OP capability is merged into the existing stream architecture, such that one can trigger multiple signaling from an encapsulated op, according to the time the event was done in the graph execution and avoid the need to wait for the whole encapsulated OP execution to be complete before the stream can signal. This commit implements only the reserve/unreserve part. 
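For illustration only (not part of this commit; error handling and the later submission that consumes the reservation are omitted), a user-space reserve/unreserve sequence against the new flags could look roughly like the sketch below. HL_IOCTL_CS is the existing command submission ioctl, the header path depends on how the uAPI headers are installed, and the queue index and signal count are arbitrary:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/types.h>
#include <misc/habanalabs.h>    /* installed uapi header; path may differ */

static int reserve_signals(int fd, __u32 q_idx, __u32 count,
                           __u32 *handle_id, __u32 *sob_addr)
{
        union hl_cs_args args;

        memset(&args, 0, sizeof(args));
        args.in.cs_flags = HL_CS_FLAGS_RESERVE_SIGNALS_ONLY;
        args.in.encaps_signals_q_idx = q_idx;   /* stream queue to reserve on */
        args.in.encaps_signals_count = count;   /* number of signals to reserve */

        if (ioctl(fd, HL_IOCTL_CS, &args))
                return -1;

        *handle_id = args.out.handle_id;                /* used later to submit or unreserve */
        *sob_addr = args.out.sob_base_addr_offset;      /* SOB backing the reservation */
        return 0;
}

static int unreserve_signals(int fd, __u32 handle_id)
{
        union hl_cs_args args;

        memset(&args, 0, sizeof(args));
        args.in.cs_flags = HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY;
        args.in.encaps_sig_handle_id = handle_id;

        return ioctl(fd, HL_IOCTL_CS, &args) ? -1 : 0;
}

A later CS would reference the returned handle_id when signaling from within the encapsulated op; that submission path is outside the scope of this commit.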
Signed-off-by: farah kassabri Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- .../misc/habanalabs/common/command_submission.c | 255 ++++++++++++++++++++- drivers/misc/habanalabs/common/context.c | 56 +++++ drivers/misc/habanalabs/common/habanalabs.h | 55 ++++- drivers/misc/habanalabs/common/habanalabs_drv.c | 1 - drivers/misc/habanalabs/common/hw_queue.c | 5 +- drivers/misc/habanalabs/gaudi/gaudi.c | 8 +- drivers/misc/habanalabs/goya/goya.c | 6 + include/uapi/misc/habanalabs.h | 110 +++++++-- 8 files changed, 468 insertions(+), 28 deletions(-) (limited to 'include/uapi') diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index 458cdf2ddab5..84032b1bae5c 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -38,7 +38,11 @@ static void hl_sob_reset(struct kref *ref) kref); struct hl_device *hdev = hw_sob->hdev; + dev_dbg(hdev->dev, "reset sob id %u\n", hw_sob->sob_id); + hdev->asic_funcs->reset_sob(hdev, hw_sob); + + hw_sob->need_reset = false; } void hl_sob_reset_error(struct kref *ref) @@ -52,7 +56,7 @@ void hl_sob_reset_error(struct kref *ref) hw_sob->q_idx, hw_sob->sob_id); } -static void hw_sob_put(struct hl_hw_sob *hw_sob) +void hw_sob_put(struct hl_hw_sob *hw_sob) { if (hw_sob) kref_put(&hw_sob->kref, hl_sob_reset); @@ -64,7 +68,7 @@ static void hw_sob_put_err(struct hl_hw_sob *hw_sob) kref_put(&hw_sob->kref, hl_sob_reset_error); } -static void hw_sob_get(struct hl_hw_sob *hw_sob) +void hw_sob_get(struct hl_hw_sob *hw_sob) { if (hw_sob) kref_get(&hw_sob->kref); @@ -576,7 +580,8 @@ static inline void cs_release_sob_reset_handler(struct hl_device *hdev, if ((hl_cs_cmpl->type == CS_TYPE_SIGNAL) || (hl_cs_cmpl->type == CS_TYPE_WAIT) || - (hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT)) { + (hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT) || + (!!hl_cs_cmpl->encaps_signals)) { dev_dbg(hdev->dev, "CS 0x%llx type %d finished, sob_id: %d, sob_val: 0x%x\n", hl_cs_cmpl->cs_seq, @@ -829,6 +834,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx, cs_cmpl->hdev = hdev; cs_cmpl->type = cs->type; + cs_cmpl->encaps_signals = false; spin_lock_init(&cs_cmpl->lock); INIT_WORK(&cs_cmpl->sob_reset_work, sob_reset_work); cs->fence = &cs_cmpl->base_fence; @@ -1115,6 +1121,10 @@ static enum hl_cs_type hl_cs_get_cs_type(u32 cs_type_flags) return CS_TYPE_WAIT; else if (cs_type_flags & HL_CS_FLAGS_COLLECTIVE_WAIT) return CS_TYPE_COLLECTIVE_WAIT; + else if (cs_type_flags & HL_CS_FLAGS_RESERVE_SIGNALS_ONLY) + return CS_RESERVE_SIGNALS; + else if (cs_type_flags & HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY) + return CS_UNRESERVE_SIGNALS; else return CS_TYPE_DEFAULT; } @@ -1652,10 +1662,17 @@ out: * hl_cs_signal_sob_wraparound_handler: handle SOB value wrapaound case. * if the SOB value reaches the max value move to the other SOB reserved * to the queue. + * @hdev: pointer to device structure + * @q_idx: stream queue index + * @hw_sob: the H/W SOB used in this signal CS. + * @count: signals count + * @encaps_sig: tells whether it's reservation for encaps signals or not. + * * Note that this function must be called while hw_queues_lock is taken. 
*/ int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx, - struct hl_hw_sob **hw_sob, u32 count) + struct hl_hw_sob **hw_sob, u32 count, bool encaps_sig) + { struct hl_sync_stream_properties *prop; struct hl_hw_sob *sob = *hw_sob, *other_sob; @@ -1688,12 +1705,34 @@ int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx, return -EINVAL; } - prop->next_sob_val = 1; + prop->next_sob_val = count; /* only two SOBs are currently in use */ prop->curr_sob_offset = other_sob_offset; *hw_sob = other_sob; + /* + * check if other_sob needs reset, then do it before using it + * for the reservation or the next signal cs. + * we do it here, and for both encaps and regular signal cs + * cases in order to avoid possible races of two kref_put + * of the sob which can occur at the same time if we move the + * sob reset(kref_put) to cs_do_release function. + * in addition, if we have combination of cs signal and + * encaps, and at the point we need to reset the sob there was + * no more reservations and only signal cs keep coming, + * in such case we need to signal_cs to put the refcount and + * reset the sob. + */ + if (other_sob->need_reset) + kref_put(&other_sob->kref, hl_sob_reset); + + if (encaps_sig) { + /* set reset indication for the sob */ + sob->need_reset = true; + hw_sob_get(other_sob); + } + dev_dbg(hdev->dev, "switched to SOB %d, q_idx: %d\n", prop->curr_sob_offset, q_idx); } else { @@ -1817,6 +1856,187 @@ static int cs_ioctl_signal_wait_create_jobs(struct hl_device *hdev, return 0; } +static int cs_ioctl_reserve_signals(struct hl_fpriv *hpriv, + u32 q_idx, u32 count, + u32 *handle_id, u32 *sob_addr, + u32 *signals_count) +{ + struct hw_queue_properties *hw_queue_prop; + struct hl_sync_stream_properties *prop; + struct hl_device *hdev = hpriv->hdev; + struct hl_cs_encaps_sig_handle *handle; + struct hl_encaps_signals_mgr *mgr; + struct hl_hw_sob *hw_sob; + int hdl_id; + int rc = 0; + + if (count >= HL_MAX_SOB_VAL) { + dev_err(hdev->dev, "signals count(%u) exceeds the max SOB value\n", + count); + rc = -EINVAL; + goto out; + } + + if (q_idx >= hdev->asic_prop.max_queues) { + dev_err(hdev->dev, "Queue index %d is invalid\n", + q_idx); + rc = -EINVAL; + goto out; + } + + hw_queue_prop = &hdev->asic_prop.hw_queues_props[q_idx]; + + if (!hw_queue_prop->supports_sync_stream) { + dev_err(hdev->dev, + "Queue index %d does not support sync stream operations\n", + q_idx); + rc = -EINVAL; + goto out; + } + + prop = &hdev->kernel_queues[q_idx].sync_stream_prop; + + handle = kzalloc(sizeof(*handle), GFP_KERNEL); + if (!handle) { + rc = -ENOMEM; + goto out; + } + + handle->count = count; + mgr = &hpriv->ctx->sig_mgr; + + spin_lock(&mgr->lock); + hdl_id = idr_alloc(&mgr->handles, handle, 1, 0, GFP_KERNEL); + spin_unlock(&mgr->lock); + + if (hdl_id < 0) { + dev_err(hdev->dev, "Failed to allocate IDR for a new signal reservation\n"); + rc = -EINVAL; + goto out; + } + + handle->id = hdl_id; + handle->q_idx = q_idx; + handle->hdev = hdev; + kref_init(&handle->refcount); + + hdev->asic_funcs->hw_queues_lock(hdev); + + hw_sob = &prop->hw_sob[prop->curr_sob_offset]; + + /* + * Increment the SOB value by count by user request + * to reserve those signals + * check if the signals amount to reserve is not exceeding the max sob + * value, if yes then switch sob. 
+ */ + rc = hl_cs_signal_sob_wraparound_handler(hdev, q_idx, &hw_sob, count, + true); + if (rc) { + dev_err(hdev->dev, "Failed to switch SOB\n"); + hdev->asic_funcs->hw_queues_unlock(hdev); + rc = -EINVAL; + goto remove_idr; + } + + /* set the hw_sob to the handle after calling the sob wraparound handler + * since sob could have changed. + */ + handle->hw_sob = hw_sob; + + /* store the current sob value for unreserve validity check, and + * signal offset support + */ + handle->pre_sob_val = prop->next_sob_val - handle->count; + + *signals_count = prop->next_sob_val; + hdev->asic_funcs->hw_queues_unlock(hdev); + + *sob_addr = handle->hw_sob->sob_addr; + *handle_id = hdl_id; + + dev_dbg(hdev->dev, + "Signals reserved, sob_id: %d, sob addr: 0x%x, sob val: 0x%x, q_idx: %d, hdl_id: %d\n", + hw_sob->sob_id, handle->hw_sob->sob_addr, + prop->next_sob_val, q_idx, hdl_id); + goto out; + +remove_idr: + spin_lock(&mgr->lock); + idr_remove(&mgr->handles, hdl_id); + spin_unlock(&mgr->lock); + + kfree(handle); +out: + return rc; +} + +static int cs_ioctl_unreserve_signals(struct hl_fpriv *hpriv, u32 handle_id) +{ + struct hl_cs_encaps_sig_handle *encaps_sig_hdl; + struct hl_sync_stream_properties *prop; + struct hl_device *hdev = hpriv->hdev; + struct hl_encaps_signals_mgr *mgr; + struct hl_hw_sob *hw_sob; + u32 q_idx, sob_addr; + int rc = 0; + + mgr = &hpriv->ctx->sig_mgr; + + spin_lock(&mgr->lock); + encaps_sig_hdl = idr_find(&mgr->handles, handle_id); + if (encaps_sig_hdl) { + dev_dbg(hdev->dev, "unreserve signals, handle: %u, SOB:0x%x, count: %u\n", + handle_id, encaps_sig_hdl->hw_sob->sob_addr, + encaps_sig_hdl->count); + + hdev->asic_funcs->hw_queues_lock(hdev); + + q_idx = encaps_sig_hdl->q_idx; + prop = &hdev->kernel_queues[q_idx].sync_stream_prop; + hw_sob = &prop->hw_sob[prop->curr_sob_offset]; + sob_addr = hdev->asic_funcs->get_sob_addr(hdev, hw_sob->sob_id); + + /* Check if sob_val got out of sync due to other + * signal submission requests which were handled + * between the reserve-unreserve calls or SOB switch + * upon reaching SOB max value. 
+ */ + if (encaps_sig_hdl->pre_sob_val + encaps_sig_hdl->count + != prop->next_sob_val || + sob_addr != encaps_sig_hdl->hw_sob->sob_addr) { + dev_err(hdev->dev, "Cannot unreserve signals, SOB val ran out of sync, expected: %u, actual val: %u\n", + encaps_sig_hdl->pre_sob_val, + (prop->next_sob_val - encaps_sig_hdl->count)); + + hdev->asic_funcs->hw_queues_unlock(hdev); + rc = -EINVAL; + goto out; + } + + /* + * Decrement the SOB value by count by user request + * to unreserve those signals + */ + prop->next_sob_val -= encaps_sig_hdl->count; + + hdev->asic_funcs->hw_queues_unlock(hdev); + + hw_sob_put(hw_sob); + + /* Release the id and free allocated memory of the handle */ + idr_remove(&mgr->handles, handle_id); + kfree(encaps_sig_hdl); + } else { + rc = -EINVAL; + dev_err(hdev->dev, "failed to unreserve signals, cannot find handler\n"); + } +out: + spin_unlock(&mgr->lock); + + return rc; +} + static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, void __user *chunks, u32 num_chunks, u64 *cs_seq, u32 flags, u32 timeout) @@ -1996,10 +2216,11 @@ out: int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data) { union hl_cs_args *args = data; - enum hl_cs_type cs_type; + enum hl_cs_type cs_type = 0; u64 cs_seq = ULONG_MAX; void __user *chunks; - u32 num_chunks, flags, timeout; + u32 num_chunks, flags, timeout, + signals_count = 0, sob_addr = 0, handle_id = 0; int rc; rc = hl_cs_sanity_checks(hpriv, args); @@ -2036,17 +2257,33 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data) rc = cs_ioctl_signal_wait(hpriv, cs_type, chunks, num_chunks, &cs_seq, args->in.cs_flags, timeout); break; + case CS_RESERVE_SIGNALS: + rc = cs_ioctl_reserve_signals(hpriv, + args->in.encaps_signals_q_idx, + args->in.encaps_signals_count, + &handle_id, &sob_addr, &signals_count); + break; + case CS_UNRESERVE_SIGNALS: + rc = cs_ioctl_unreserve_signals(hpriv, + args->in.encaps_sig_handle_id); + break; default: rc = cs_ioctl_default(hpriv, chunks, num_chunks, &cs_seq, args->in.cs_flags, timeout); break; } - out: if (rc != -EAGAIN) { memset(args, 0, sizeof(*args)); + + if (cs_type == CS_RESERVE_SIGNALS) { + args->out.handle_id = handle_id; + args->out.sob_base_addr_offset = sob_addr; + args->out.count = signals_count; + } else { + args->out.seq = cs_seq; + } args->out.status = rc; - args->out.seq = cs_seq; } return rc; diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c index 4d922e4d0393..abbba4194d3b 100644 --- a/drivers/misc/habanalabs/common/context.c +++ b/drivers/misc/habanalabs/common/context.c @@ -9,6 +9,59 @@ #include +void hl_encaps_handle_do_release(struct kref *ref) +{ + struct hl_cs_encaps_sig_handle *handle = + container_of(ref, struct hl_cs_encaps_sig_handle, refcount); + struct hl_ctx *ctx = handle->hdev->compute_ctx; + struct hl_encaps_signals_mgr *mgr = &ctx->sig_mgr; + + idr_remove(&mgr->handles, handle->id); + kfree(handle); +} + +static void hl_encaps_handle_do_release_sob(struct kref *ref) +{ + struct hl_cs_encaps_sig_handle *handle = + container_of(ref, struct hl_cs_encaps_sig_handle, refcount); + struct hl_ctx *ctx = handle->hdev->compute_ctx; + struct hl_encaps_signals_mgr *mgr = &ctx->sig_mgr; + + /* if we're here, then there was a signals reservation but cs with + * encaps signals wasn't submitted, so need to put refcount + * to hw_sob taken at the reservation. 
+ */ + hw_sob_put(handle->hw_sob); + + idr_remove(&mgr->handles, handle->id); + kfree(handle); +} + +static void hl_encaps_sig_mgr_init(struct hl_encaps_signals_mgr *mgr) +{ + spin_lock_init(&mgr->lock); + idr_init(&mgr->handles); +} + +static void hl_encaps_sig_mgr_fini(struct hl_device *hdev, + struct hl_encaps_signals_mgr *mgr) +{ + struct hl_cs_encaps_sig_handle *handle; + struct idr *idp; + u32 id; + + idp = &mgr->handles; + + if (!idr_is_empty(idp)) { + dev_warn(hdev->dev, "device released while some encaps signals handles are still allocated\n"); + idr_for_each_entry(idp, handle, id) + kref_put(&handle->refcount, + hl_encaps_handle_do_release_sob); + } + + idr_destroy(&mgr->handles); +} + static void hl_ctx_fini(struct hl_ctx *ctx) { struct hl_device *hdev = ctx->hdev; @@ -53,6 +106,7 @@ static void hl_ctx_fini(struct hl_ctx *ctx) hl_cb_va_pool_fini(ctx); hl_vm_ctx_fini(ctx); hl_asid_free(hdev, ctx->asid); + hl_encaps_sig_mgr_fini(hdev, &ctx->sig_mgr); /* Scrub both SRAM and DRAM */ hdev->asic_funcs->scrub_device_mem(hdev, 0, 0); @@ -200,6 +254,8 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx) goto err_cb_va_pool_fini; } + hl_encaps_sig_mgr_init(&ctx->sig_mgr); + dev_dbg(hdev->dev, "create user context %d\n", ctx->asid); } diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index bf327cb7ddd6..81b6825e0c1c 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -242,7 +242,9 @@ enum hl_cs_type { CS_TYPE_DEFAULT, CS_TYPE_SIGNAL, CS_TYPE_WAIT, - CS_TYPE_COLLECTIVE_WAIT + CS_TYPE_COLLECTIVE_WAIT, + CS_RESERVE_SIGNALS, + CS_UNRESERVE_SIGNALS }; /* @@ -287,13 +289,17 @@ enum queue_cb_alloc_flags { * @hdev: habanalabs device structure. * @kref: refcount of this SOB. The SOB will reset once the refcount is zero. * @sob_id: id of this SOB. + * @sob_addr: the sob offset from the base address. * @q_idx: the H/W queue that uses this SOB. + * @need_reset: reset indication set when switching to the other sob. */ struct hl_hw_sob { struct hl_device *hdev; struct kref kref; u32 sob_id; + u32 sob_addr; u32 q_idx; + bool need_reset; }; enum hl_collective_mode { @@ -608,6 +614,8 @@ struct hl_fence { * @type: type of the CS - signal/wait. * @sob_val: the SOB value that is used in this signal/wait CS. * @sob_group: the SOB group that is used in this collective wait CS. + * @encaps_signals: indication whether it's a completion object of cs with + * encaps signals or not. */ struct hl_cs_compl { struct work_struct sob_reset_work; @@ -619,6 +627,7 @@ struct hl_cs_compl { enum hl_cs_type type; u16 sob_val; u16 sob_group; + bool encaps_signals; }; /* @@ -730,6 +739,17 @@ struct hl_sync_stream_properties { u8 curr_sob_offset; }; +/** + * struct hl_encaps_signals_mgr - describes sync stream encapsulated signals + * handlers manager + * @lock: protects handles. + * @handles: an idr to hold all encapsulated signals handles. + */ +struct hl_encaps_signals_mgr { + spinlock_t lock; + struct idr handles; +}; + /** * struct hl_hw_queue - describes a H/W transport queue. * @shadow_queue: pointer to a shadow queue that holds pointers to jobs. @@ -1135,6 +1155,7 @@ struct fw_load_mgr { * @init_firmware_loader: initialize data for FW loader. * @init_cpu_scrambler_dram: Enable CPU specific DRAM scrambling * @state_dump_init: initialize constants required for state dump + * @get_sob_addr: get SOB base address offset. 
*/ struct hl_asic_funcs { int (*early_init)(struct hl_device *hdev); @@ -1261,6 +1282,7 @@ struct hl_asic_funcs { void (*init_firmware_loader)(struct hl_device *hdev); void (*init_cpu_scrambler_dram)(struct hl_device *hdev); void (*state_dump_init)(struct hl_device *hdev); + u32 (*get_sob_addr)(struct hl_device *hdev, u32 sob_id); }; @@ -1353,6 +1375,7 @@ struct hl_pending_cb { * @cs_counters: context command submission counters. * @cb_va_pool: device VA pool for command buffers which are mapped to the * device's MMU. + * @sig_mgr: encaps signals handle manager. * @cs_sequence: sequence number for CS. Value is assigned to a CS and passed * to user so user could inquire about CS. It is used as * index to cs_pending array. @@ -1392,6 +1415,7 @@ struct hl_ctx { struct list_head hw_block_mem_list; struct hl_cs_counters_atomic cs_counters; struct gen_pool *cb_va_pool; + struct hl_encaps_signals_mgr sig_mgr; u64 cs_sequence; u64 *dram_default_hops; spinlock_t pending_cb_lock; @@ -2504,7 +2528,6 @@ struct hl_device { struct multi_cs_completion multi_cs_completion[ MULTI_CS_MAX_USER_CTX]; - atomic64_t dram_used_mem; u64 timeout_jiffies; u64 max_power; @@ -2576,6 +2599,29 @@ struct hl_device { }; +/** + * struct hl_cs_encaps_sig_handle - encapsulated signals handle structure + * @refcount: refcount used to protect removing this id when several + * wait cs are used to wait of the reserved encaps signals. + * @hdev: pointer to habanalabs device structure. + * @hw_sob: pointer to H/W SOB used in the reservation. + * @cs_seq: staged cs sequence which contains encapsulated signals + * @id: idr handler id to be used to fetch the handler info + * @q_idx: stream queue index + * @pre_sob_val: current SOB value before reservation + * @count: signals number + */ +struct hl_cs_encaps_sig_handle { + struct kref refcount; + struct hl_device *hdev; + struct hl_hw_sob *hw_sob; + u64 cs_seq; + u32 id; + u32 q_idx; + u32 pre_sob_val; + u32 count; +}; + /* * IOCTLs */ @@ -2889,9 +2935,12 @@ int hl_set_voltage(struct hl_device *hdev, int sensor_index, u32 attr, long value); int hl_set_current(struct hl_device *hdev, int sensor_index, u32 attr, long value); +void hl_encaps_handle_do_release(struct kref *ref); +void hw_sob_get(struct hl_hw_sob *hw_sob); +void hw_sob_put(struct hl_hw_sob *hw_sob); void hl_release_pending_user_interrupts(struct hl_device *hdev); int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx, - struct hl_hw_sob **hw_sob, u32 count); + struct hl_hw_sob **hw_sob, u32 count, bool encaps_sig); int hl_state_dump(struct hl_device *hdev); const char *hl_state_dump_get_sync_name(struct hl_device *hdev, u32 sync_id); diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c index bffca119946b..3df4313d72cd 100644 --- a/drivers/misc/habanalabs/common/habanalabs_drv.c +++ b/drivers/misc/habanalabs/common/habanalabs_drv.c @@ -194,7 +194,6 @@ int hl_device_open(struct inode *inode, struct file *filp) out_err: mutex_unlock(&hdev->fpriv_list_lock); - hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr); hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr); filp->private_data = NULL; diff --git a/drivers/misc/habanalabs/common/hw_queue.c b/drivers/misc/habanalabs/common/hw_queue.c index 2494bd6e9358..9a59b8e9bf53 100644 --- a/drivers/misc/habanalabs/common/hw_queue.c +++ b/drivers/misc/habanalabs/common/hw_queue.c @@ -426,7 +426,8 @@ static int init_signal_cs(struct hl_device *hdev, hdev->asic_funcs->gen_signal_cb(hdev, job->patched_cb, 
cs_cmpl->hw_sob->sob_id, 0, true); - rc = hl_cs_signal_sob_wraparound_handler(hdev, q_idx, &hw_sob, 1); + rc = hl_cs_signal_sob_wraparound_handler(hdev, q_idx, &hw_sob, 1, + false); return rc; } @@ -850,6 +851,8 @@ static void sync_stream_queue_init(struct hl_device *hdev, u32 q_idx) hw_sob = &sync_stream_prop->hw_sob[sob]; hw_sob->hdev = hdev; hw_sob->sob_id = sync_stream_prop->base_sob_id + sob; + hw_sob->sob_addr = + hdev->asic_funcs->get_sob_addr(hdev, hw_sob->sob_id); hw_sob->q_idx = q_idx; kref_init(&hw_sob->kref); } diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 5b7a5692cd21..ae1a8b4e694c 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -8720,6 +8720,11 @@ static u32 gaudi_get_wait_cb_size(struct hl_device *hdev) sizeof(struct packet_msg_prot) * 2; } +static u32 gaudi_get_sob_addr(struct hl_device *hdev, u32 sob_id) +{ + return mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0 + (sob_id * 4); +} + static u32 gaudi_gen_signal_cb(struct hl_device *hdev, void *data, u16 sob_id, u32 size, bool eb) { @@ -9424,7 +9429,8 @@ static const struct hl_asic_funcs gaudi_funcs = { .map_pll_idx_to_fw_idx = gaudi_map_pll_idx_to_fw_idx, .init_firmware_loader = gaudi_init_firmware_loader, .init_cpu_scrambler_dram = gaudi_init_scrambler_hbm, - .state_dump_init = gaudi_state_dump_init + .state_dump_init = gaudi_state_dump_init, + .get_sob_addr = gaudi_get_sob_addr }; /** diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index dd218a4bb62e..8a689bf42397 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -5575,6 +5575,11 @@ static void goya_state_dump_init(struct hl_device *hdev) hdev->state_dump_specs.funcs = goya_state_dump_funcs; } +static u32 goya_get_sob_addr(struct hl_device *hdev, u32 sob_id) +{ + return 0; +} + static const struct hl_asic_funcs goya_funcs = { .early_init = goya_early_init, .early_fini = goya_early_fini, @@ -5662,6 +5667,7 @@ static const struct hl_asic_funcs goya_funcs = { .init_firmware_loader = goya_init_firmware_loader, .init_cpu_scrambler_dram = goya_cpu_init_scrambler_dram, .state_dump_init = goya_state_dump_init, + .get_sob_addr = &goya_get_sob_addr }; /* diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index 49c737c4a2f6..eca86c545916 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -628,12 +628,21 @@ struct hl_cs_chunk { __u64 cb_handle; /* Relevant only when HL_CS_FLAGS_WAIT or - * HL_CS_FLAGS_COLLECTIVE_WAIT is set. + * HL_CS_FLAGS_COLLECTIVE_WAIT is set * This holds address of array of u64 values that contain - * signal CS sequence numbers. The wait described by this job - * will listen on all those signals (wait event per signal) + * signal CS sequence numbers. The wait described by + * this job will listen on all those signals + * (wait event per signal) */ __u64 signal_seq_arr; + + /* + * Relevant only when HL_CS_FLAGS_WAIT or + * HL_CS_FLAGS_COLLECTIVE_WAIT is set + * along with HL_CS_FLAGS_ENCAP_SIGNALS. + * This is the CS sequence which has the encapsulated signals. 
+ */ + __u64 encaps_signal_seq; }; /* Index of queue to put the CB on */ @@ -651,6 +660,17 @@ struct hl_cs_chunk { * Number of entries in signal_seq_arr */ __u32 num_signal_seq_arr; + + /* Relevant only when HL_CS_FLAGS_WAIT or + * HL_CS_FLAGS_COLLECTIVE_WAIT is set along + * with HL_CS_FLAGS_ENCAP_SIGNALS + * This set the signals range that the user want to wait for + * out of the whole reserved signals range. + * e.g if the signals range is 20, and user don't want + * to wait for signal 8, so he set this offset to 7, then + * he call the API again with 9 and so on till 20. + */ + __u32 encaps_signal_offset; }; /* HL_CS_CHUNK_FLAGS_* */ @@ -678,6 +698,28 @@ struct hl_cs_chunk { #define HL_CS_FLAGS_CUSTOM_TIMEOUT 0x200 #define HL_CS_FLAGS_SKIP_RESET_ON_TIMEOUT 0x400 +/* + * The encapsulated signals CS is merged into the existing CS ioctls. + * In order to use this feature need to follow the below procedure: + * 1. Reserve signals, set the CS type to HL_CS_FLAGS_RESERVE_SIGNALS_ONLY + * the output of this API will be the SOB offset from CFG_BASE. + * this address will be used to patch CB cmds to do the signaling for this + * SOB by incrementing it's value. + * for reverting the reservation use HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY + * CS type, note that this might fail if out-of-sync happened to the SOB + * value, in case other signaling request to the same SOB occurred between + * reserve-unreserve calls. + * 2. Use the staged CS to do the encapsulated signaling jobs. + * use HL_CS_FLAGS_STAGED_SUBMISSION and HL_CS_FLAGS_STAGED_SUBMISSION_FIRST + * along with HL_CS_FLAGS_ENCAP_SIGNALS flag, and set encaps_signal_offset + * field. This offset allows app to wait on part of the reserved signals. + * 3. Use WAIT/COLLECTIVE WAIT CS along with HL_CS_FLAGS_ENCAP_SIGNALS flag + * to wait for the encapsulated signals. + */ +#define HL_CS_FLAGS_ENCAP_SIGNALS 0x800 +#define HL_CS_FLAGS_RESERVE_SIGNALS_ONLY 0x1000 +#define HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY 0x2000 + #define HL_CS_STATUS_SUCCESS 0 #define HL_MAX_JOBS_PER_CS 512 @@ -690,10 +732,35 @@ struct hl_cs_in { /* holds address of array of hl_cs_chunk for execution phase */ __u64 chunks_execute; - /* Sequence number of a staged submission CS - * valid only if HL_CS_FLAGS_STAGED_SUBMISSION is set - */ - __u64 seq; + union { + /* + * Sequence number of a staged submission CS + * valid only if HL_CS_FLAGS_STAGED_SUBMISSION is set and + * HL_CS_FLAGS_STAGED_SUBMISSION_FIRST is unset. + */ + __u64 seq; + + /* + * Encapsulated signals handle id + * Valid for two flows: + * 1. CS with encapsulated signals: + * when HL_CS_FLAGS_STAGED_SUBMISSION and + * HL_CS_FLAGS_STAGED_SUBMISSION_FIRST + * and HL_CS_FLAGS_ENCAP_SIGNALS are set. + * 2. unreserve signals: + * valid when HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY is set. + */ + __u32 encaps_sig_handle_id; + + /* Valid only when HL_CS_FLAGS_RESERVE_SIGNALS_ONLY is set */ + struct { + /* Encapsulated signals number */ + __u32 encaps_signals_count; + + /* Encapsulated signals queue index (stream) */ + __u32 encaps_signals_q_idx; + }; + }; /* Number of chunks in restore phase array. Maximum number is * HL_MAX_JOBS_PER_CS @@ -718,14 +785,31 @@ struct hl_cs_in { }; struct hl_cs_out { + union { + /* + * seq holds the sequence number of the CS to pass to wait + * ioctl. 
All values are valid except for 0 and ULLONG_MAX + */ + __u64 seq; + + /* Valid only when HL_CS_FLAGS_RESERVE_SIGNALS_ONLY is set */ + struct { + /* This is the resereved signal handle id */ + __u32 handle_id; + + /* This is the signals count */ + __u32 count; + }; + }; + + /* HL_CS_STATUS */ + __u32 status; + /* - * seq holds the sequence number of the CS to pass to wait ioctl. All - * values are valid except for 0 and ULLONG_MAX + * SOB base address offset + * Valid only when HL_CS_FLAGS_RESERVE_SIGNALS_ONLY is set */ - __u64 seq; - /* HL_CS_STATUS_* */ - __u32 status; - __u32 pad; + __u32 sob_base_addr_offset; }; union hl_cs_args { -- cgit v1.2.3-71-gd317 From d8071323c5632bdf0a8ef9b9e5662fac43649f9d Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 19 Aug 2021 09:34:06 -0700 Subject: dmaengine: idxd: fix setting up priv mode for dwq DSA spec says WQ priv bit is 0 if the Privileged Mode Enable field of the PCI Express PASID capability is 0 and pasid is enabled. Make sure that the WQCFG priv field is set correctly according to usage type. Reject config if setting up kernel WQ type and no support. Also add the correct priv setup for a descriptor. Fixes: 484f910e93b4 ("dmaengine: idxd: fix wq config registers offset programming") Cc: Ramesh Thomas Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/162939084657.903168.14160019185148244596.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- drivers/dma/Kconfig | 1 + drivers/dma/idxd/device.c | 29 ++++++++++++++++++++++++++++- drivers/dma/idxd/dma.c | 6 +++++- include/uapi/linux/idxd.h | 1 + 4 files changed, 35 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 87f2efb4fa65..ca153384ebf0 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -285,6 +285,7 @@ config INTEL_IDXD tristate "Intel Data Accelerators support" depends on PCI && X86_64 && !UML depends on PCI_MSI + depends on PCI_PASID depends on SBITMAP select DMA_ENGINE help diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index e093cf225a5c..241df74fc047 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -818,6 +818,15 @@ static int idxd_groups_config_write(struct idxd_device *idxd) return 0; } +static bool idxd_device_pasid_priv_enabled(struct idxd_device *idxd) +{ + struct pci_dev *pdev = idxd->pdev; + + if (pdev->pasid_enabled && (pdev->pasid_features & PCI_PASID_CAP_PRIV)) + return true; + return false; +} + static int idxd_wq_config_write(struct idxd_wq *wq) { struct idxd_device *idxd = wq->idxd; @@ -850,7 +859,6 @@ static int idxd_wq_config_write(struct idxd_wq *wq) wq->wqcfg->wq_thresh = wq->threshold; /* byte 8-11 */ - wq->wqcfg->priv = !!(wq->type == IDXD_WQT_KERNEL); if (wq_dedicated(wq)) wq->wqcfg->mode = 1; @@ -860,6 +868,25 @@ static int idxd_wq_config_write(struct idxd_wq *wq) wq->wqcfg->pasid = idxd->pasid; } + /* + * Here the priv bit is set depending on the WQ type. priv = 1 if the + * WQ type is kernel to indicate privileged access. This setting only + * matters for dedicated WQ. According to the DSA spec: + * If the WQ is in dedicated mode, WQ PASID Enable is 1, and the + * Privileged Mode Enable field of the PCI Express PASID capability + * is 0, this field must be 0. + * + * In the case of a dedicated kernel WQ that is not able to support + * the PASID cap, then the configuration will be rejected. 
+ */ + wq->wqcfg->priv = !!(wq->type == IDXD_WQT_KERNEL); + if (wq_dedicated(wq) && wq->wqcfg->pasid_en && + !idxd_device_pasid_priv_enabled(idxd) && + wq->type == IDXD_WQT_KERNEL) { + idxd->cmd_status = IDXD_SCMD_WQ_NO_PRIV; + return -EOPNOTSUPP; + } + wq->wqcfg->priority = wq->priority; if (idxd->hw.gen_cap.block_on_fault && diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c index 5c0a4d8a31f5..e0f056c1d1f5 100644 --- a/drivers/dma/idxd/dma.c +++ b/drivers/dma/idxd/dma.c @@ -69,7 +69,11 @@ static inline void idxd_prep_desc_common(struct idxd_wq *wq, hw->src_addr = addr_f1; hw->dst_addr = addr_f2; hw->xfer_size = len; - hw->priv = !!(wq->type == IDXD_WQT_KERNEL); + /* + * For dedicated WQ, this field is ignored and HW will use the WQCFG.priv + * field instead. This field should be set to 1 for kernel descriptors. + */ + hw->priv = 1; hw->completion_addr = compl; } diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index ca24c25252fb..c750eac09fc9 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -27,6 +27,7 @@ enum idxd_scmd_stat { IDXD_SCMD_WQ_NO_SWQ_SUPPORT = 0x800c0000, IDXD_SCMD_WQ_NONE_CONFIGURED = 0x800d0000, IDXD_SCMD_WQ_NO_SIZE = 0x800e0000, + IDXD_SCMD_WQ_NO_PRIV = 0x800f0000, }; #define IDXD_SCMD_SOFTERR_MASK 0x80000000 -- cgit v1.2.3-71-gd317 From 2e480058ddc21ec53a10e8b41623e245e908bdbc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 27 Aug 2021 11:33:19 -0600 Subject: io-wq: provide a way to limit max number of workers io-wq divides work into two categories: 1) Work that completes in a bounded time, like reading from a regular file or a block device. This type of work is limited based on the size of the SQ ring. 2) Work that may never complete, we call this unbounded work. The amount of workers here is just limited by RLIMIT_NPROC. For various uses cases, it's handy to have the kernel limit the maximum amount of pending workers for both categories. Provide a way to do with with a new IORING_REGISTER_IOWQ_MAX_WORKERS operation. IORING_REGISTER_IOWQ_MAX_WORKERS takes an array of two integers and sets the max worker count to what is being passed in for each category. The old values are returned into that same array. If 0 is being passed in for either category, it simply returns the current value. The value is capped at RLIMIT_NPROC. This actually isn't that important as it's more of a hint, if we're exceeding the value then our attempt to fork a new worker will fail. This happens naturally already if more than one node is in the system, as these values are per-node internally for io-wq. Reported-by: Johannes Lundberg Link: https://github.com/axboe/liburing/issues/420 Signed-off-by: Jens Axboe --- fs/io-wq.c | 29 +++++++++++++++++++++++++++++ fs/io-wq.h | 1 + fs/io_uring.c | 32 ++++++++++++++++++++++++++++++++ include/uapi/linux/io_uring.h | 3 +++ 4 files changed, 65 insertions(+) (limited to 'include/uapi') diff --git a/fs/io-wq.c b/fs/io-wq.c index 8da9bb103916..4b5fc621ab39 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -1152,6 +1152,35 @@ int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask) return 0; } +/* + * Set max number of unbounded workers, returns old value. If new_count is 0, + * then just return the old value. 
+ */ +int io_wq_max_workers(struct io_wq *wq, int *new_count) +{ + int i, node, prev = 0; + + for (i = 0; i < 2; i++) { + if (new_count[i] > task_rlimit(current, RLIMIT_NPROC)) + new_count[i] = task_rlimit(current, RLIMIT_NPROC); + } + + rcu_read_lock(); + for_each_node(node) { + struct io_wqe_acct *acct; + + for (i = 0; i < 2; i++) { + acct = &wq->wqes[node]->acct[i]; + prev = max_t(int, acct->max_workers, prev); + if (new_count[i]) + acct->max_workers = new_count[i]; + new_count[i] = prev; + } + } + rcu_read_unlock(); + return 0; +} + static __init int io_wq_init(void) { int ret; diff --git a/fs/io-wq.h b/fs/io-wq.h index 308af3928424..bf5c4c533760 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -128,6 +128,7 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); void io_wq_hash_work(struct io_wq_work *work, void *val); int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask); +int io_wq_max_workers(struct io_wq *wq, int *new_count); static inline bool io_wq_is_hashed(struct io_wq_work *work) { diff --git a/fs/io_uring.c b/fs/io_uring.c index 53326449d685..edbda88142f9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -10233,6 +10233,31 @@ static int io_unregister_iowq_aff(struct io_ring_ctx *ctx) return io_wq_cpu_affinity(tctx->io_wq, NULL); } +static int io_register_iowq_max_workers(struct io_ring_ctx *ctx, + void __user *arg) +{ + struct io_uring_task *tctx = current->io_uring; + __u32 new_count[2]; + int i, ret; + + if (!tctx || !tctx->io_wq) + return -EINVAL; + if (copy_from_user(new_count, arg, sizeof(new_count))) + return -EFAULT; + for (i = 0; i < ARRAY_SIZE(new_count); i++) + if (new_count[i] > INT_MAX) + return -EINVAL; + + ret = io_wq_max_workers(tctx->io_wq, new_count); + if (ret) + return ret; + + if (copy_to_user(arg, new_count, sizeof(new_count))) + return -EFAULT; + + return 0; +} + static bool io_register_op_must_quiesce(int op) { switch (op) { @@ -10250,6 +10275,7 @@ static bool io_register_op_must_quiesce(int op) case IORING_REGISTER_BUFFERS_UPDATE: case IORING_REGISTER_IOWQ_AFF: case IORING_UNREGISTER_IOWQ_AFF: + case IORING_REGISTER_IOWQ_MAX_WORKERS: return false; default: return true; @@ -10406,6 +10432,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_unregister_iowq_aff(ctx); break; + case IORING_REGISTER_IOWQ_MAX_WORKERS: + ret = -EINVAL; + if (!arg || nr_args != 2) + break; + ret = io_register_iowq_max_workers(ctx, arg); + break; default: ret = -EINVAL; break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 45a4f2373694..64fe809c4e36 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -309,6 +309,9 @@ enum { IORING_REGISTER_IOWQ_AFF = 17, IORING_UNREGISTER_IOWQ_AFF = 18, + /* set/get max number of workers */ + IORING_REGISTER_IOWQ_MAX_WORKERS = 19, + /* this goes last */ IORING_REGISTER_LAST }; -- cgit v1.2.3-71-gd317 From 50c1df2b56e0f581b1dbf334dbf807d6fb8f77b2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 27 Aug 2021 17:11:06 -0600 Subject: io_uring: support CLOCK_BOOTTIME/REALTIME for timeouts Certain use cases want to use CLOCK_BOOTTIME or CLOCK_REALTIME rather than CLOCK_MONOTONIC, instead of the default CLOCK_MONOTONIC. Add an IORING_TIMEOUT_BOOTTIME and IORING_TIMEOUT_REALTIME flag that allows timeouts and linked timeouts to use the selected clock source. Only one clock source may be selected, and we -EINVAL the request if more than one is given. 
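As a rough userspace sketch (not part of this patch: it assumes liburing for ring setup and submission, and the helper name is made up for illustration), arming a timeout against CLOCK_BOOTTIME only needs the new flag in sqe->timeout_flags:

  #include <liburing.h>

  /* Arm a 5 second timeout measured against CLOCK_BOOTTIME. */
  static int arm_boottime_timeout(struct io_uring *ring)
  {
          struct __kernel_timespec ts = { .tv_sec = 5, .tv_nsec = 0 };
          struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

          if (!sqe)
                  return -EBUSY;

          /* Passing both BOOTTIME and REALTIME here is rejected with -EINVAL. */
          io_uring_prep_timeout(sqe, &ts, 0, IORING_TIMEOUT_BOOTTIME);
          return io_uring_submit(ring);
  }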
If neither BOOTIME nor REALTIME are selected, the previous default of MONOTONIC is used. Link: https://github.com/axboe/liburing/issues/369 Signed-off-by: Jens Axboe --- fs/io_uring.c | 27 ++++++++++++++++++++++++--- include/uapi/linux/io_uring.h | 3 +++ 2 files changed, 27 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/fs/io_uring.c b/fs/io_uring.c index edbda88142f9..1c99f0143b57 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -508,6 +508,7 @@ struct io_timeout_data { struct hrtimer timer; struct timespec64 ts; enum hrtimer_mode mode; + u32 flags; }; struct io_accept { @@ -5712,6 +5713,22 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) return 0; } +static clockid_t io_timeout_get_clock(struct io_timeout_data *data) +{ + switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) { + case IORING_TIMEOUT_BOOTTIME: + return CLOCK_BOOTTIME; + case IORING_TIMEOUT_REALTIME: + return CLOCK_REALTIME; + default: + /* can't happen, vetted at prep time */ + WARN_ON_ONCE(1); + fallthrough; + case 0: + return CLOCK_MONOTONIC; + } +} + static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, struct timespec64 *ts, enum hrtimer_mode mode) __must_hold(&ctx->timeout_lock) @@ -5725,7 +5742,7 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, req->timeout.off = 0; /* noseq */ data = req->async_data; list_add_tail(&req->timeout.list, &ctx->timeout_list); - hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode); + hrtimer_init(&data->timer, io_timeout_get_clock(data), mode); data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode); return 0; @@ -5807,7 +5824,10 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (off && is_timeout_link) return -EINVAL; flags = READ_ONCE(sqe->timeout_flags); - if (flags & ~IORING_TIMEOUT_ABS) + if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK)) + return -EINVAL; + /* more than one clock specified is invalid, obviously */ + if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) return -EINVAL; req->timeout.off = off; @@ -5819,12 +5839,13 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, data = req->async_data; data->req = req; + data->flags = flags; if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) return -EFAULT; data->mode = io_translate_timeout_mode(flags); - hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode); + hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); if (is_timeout_link) { struct io_submit_link *link = &req->ctx->submit_state.link; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 64fe809c4e36..b6d28d927a3f 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -151,6 +151,9 @@ enum { */ #define IORING_TIMEOUT_ABS (1U << 0) #define IORING_TIMEOUT_UPDATE (1U << 1) +#define IORING_TIMEOUT_BOOTTIME (1U << 2) +#define IORING_TIMEOUT_REALTIME (1U << 3) +#define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME) /* * sqe->splice_flags -- cgit v1.2.3-71-gd317 From f1042b6ccb887f07301f6b096b3d0cfcf9189323 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 28 Aug 2021 19:54:39 -0600 Subject: io_uring: allow updating linked timeouts We allow updating normal timeouts, add support for adjusting timings of linked timeouts as well. 
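A minimal userspace sketch of such an update (illustrative only: the helper name and user_data tag are ours, and liburing is assumed just for ring setup and submission) fills the IORING_OP_TIMEOUT_REMOVE SQE the way io_timeout_remove_prep() reads it:

  #include <string.h>
  #include <liburing.h>

  /* Re-arm an already linked timeout, identified by its user_data, with a new timespec. */
  static int update_linked_timeout(struct io_uring *ring, __u64 timeout_user_data,
                                   struct __kernel_timespec *ts)
  {
          struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

          if (!sqe)
                  return -EBUSY;

          memset(sqe, 0, sizeof(*sqe));
          sqe->opcode = IORING_OP_TIMEOUT_REMOVE;
          sqe->fd = -1;                          /* no file backs this request */
          sqe->addr = timeout_user_data;         /* which linked timeout to adjust */
          sqe->addr2 = (unsigned long)ts;        /* new expiration, read via get_timespec64() */
          sqe->timeout_flags = IORING_LINK_TIMEOUT_UPDATE;
          sqe->user_data = 0x1234;               /* tag for this update request's own CQE */
          return io_uring_submit(ring);
  }

Regular (non-linked) timeouts keep using IORING_TIMEOUT_UPDATE as before.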
Reported-by: Victor Stewart Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 44 +++++++++++++++++++++++++++++++++++++++---- include/uapi/linux/io_uring.h | 11 ++++++----- 2 files changed, 46 insertions(+), 9 deletions(-) (limited to 'include/uapi') diff --git a/fs/io_uring.c b/fs/io_uring.c index aa978292f34b..7cc458e0b636 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -552,6 +552,7 @@ struct io_timeout_rem { /* timeout update */ struct timespec64 ts; u32 flags; + bool ltimeout; }; struct io_rw { @@ -1069,6 +1070,7 @@ static int io_req_prep_async(struct io_kiocb *req); static int io_install_fixed_file(struct io_kiocb *req, struct file *file, unsigned int issue_flags, u32 slot_index); +static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); static struct kmem_cache *req_cachep; @@ -5732,6 +5734,31 @@ static clockid_t io_timeout_get_clock(struct io_timeout_data *data) } } +static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, + struct timespec64 *ts, enum hrtimer_mode mode) + __must_hold(&ctx->timeout_lock) +{ + struct io_timeout_data *io; + struct io_kiocb *req; + bool found = false; + + list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) { + found = user_data == req->user_data; + if (found) + break; + } + if (!found) + return -ENOENT; + + io = req->async_data; + if (hrtimer_try_to_cancel(&io->timer) == -1) + return -EALREADY; + hrtimer_init(&io->timer, io_timeout_get_clock(io), mode); + io->timer.function = io_link_timeout_fn; + hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode); + return 0; +} + static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, struct timespec64 *ts, enum hrtimer_mode mode) __must_hold(&ctx->timeout_lock) @@ -5763,10 +5790,15 @@ static int io_timeout_remove_prep(struct io_kiocb *req, if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in) return -EINVAL; + tr->ltimeout = false; tr->addr = READ_ONCE(sqe->addr); tr->flags = READ_ONCE(sqe->timeout_flags); - if (tr->flags & IORING_TIMEOUT_UPDATE) { - if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS)) + if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) { + if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1) + return -EINVAL; + if (tr->flags & IORING_LINK_TIMEOUT_UPDATE) + tr->ltimeout = true; + if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS)) return -EINVAL; if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2))) return -EFAULT; @@ -5800,9 +5832,13 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) spin_unlock_irq(&ctx->timeout_lock); spin_unlock(&ctx->completion_lock); } else { + enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); + spin_lock_irq(&ctx->timeout_lock); - ret = io_timeout_update(ctx, tr->addr, &tr->ts, - io_translate_timeout_mode(tr->flags)); + if (tr->ltimeout) + ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode); + else + ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); spin_unlock_irq(&ctx->timeout_lock); } diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index b6d28d927a3f..3caec9199658 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -149,12 +149,13 @@ enum { /* * sqe->timeout_flags */ -#define IORING_TIMEOUT_ABS (1U << 0) -#define IORING_TIMEOUT_UPDATE (1U << 1) -#define IORING_TIMEOUT_BOOTTIME (1U << 2) -#define IORING_TIMEOUT_REALTIME (1U << 3) +#define IORING_TIMEOUT_ABS (1U << 0) +#define IORING_TIMEOUT_UPDATE (1U << 1) +#define 
IORING_TIMEOUT_BOOTTIME (1U << 2) +#define IORING_TIMEOUT_REALTIME (1U << 3) +#define IORING_LINK_TIMEOUT_UPDATE (1U << 4) #define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME) - +#define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE) /* * sqe->splice_flags * extends splice(2) flags -- cgit v1.2.3-71-gd317 From d7e7747ac5c2496c98291944c6066adaa9f3b975 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 26 Aug 2021 15:54:22 +0200 Subject: netfilter: refuse insertion if chain has grown too large Also add a stat counter for this that gets exported both via old /proc interface and ctnetlink. Assuming the old default size of 16536 buckets and max hash occupancy of 64k, this results in 128k insertions (origin+reply), so ~8 entries per chain on average. The revised settings in this series will result in about two entries per bucket on average. This allows a hard-limit ceiling of 64. This is not tunable at the moment, but its possible to either increase nf_conntrack_buckets or decrease nf_conntrack_max to reduce average lengths. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_common.h | 1 + include/uapi/linux/netfilter/nfnetlink_conntrack.h | 1 + net/netfilter/nf_conntrack_core.c | 42 ++++++++++++++++++---- net/netfilter/nf_conntrack_netlink.c | 4 ++- net/netfilter/nf_conntrack_standalone.c | 4 +-- 5 files changed, 42 insertions(+), 10 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index 0c7d8d1e945d..700ea077ce2d 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -18,6 +18,7 @@ struct ip_conntrack_stat { unsigned int expect_create; unsigned int expect_delete; unsigned int search_restart; + unsigned int chaintoolong; }; #define NFCT_INFOMASK 7UL diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h index d8484be72fdc..5ade231f497b 100644 --- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h +++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h @@ -257,6 +257,7 @@ enum ctattr_stats_cpu { CTA_STATS_ERROR, CTA_STATS_SEARCH_RESTART, CTA_STATS_CLASH_RESOLVE, + CTA_STATS_CHAIN_TOOLONG, __CTA_STATS_MAX, }; #define CTA_STATS_MAX (__CTA_STATS_MAX - 1) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index da2650f872e1..94e18fb9690d 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -77,6 +77,8 @@ static __read_mostly bool nf_conntrack_locks_all; #define GC_SCAN_INTERVAL (120u * HZ) #define GC_SCAN_MAX_DURATION msecs_to_jiffies(10) +#define MAX_CHAINLEN 64u + static struct conntrack_gc_work conntrack_gc_work; void nf_conntrack_lock(spinlock_t *lock) __acquires(lock) @@ -840,7 +842,9 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; + unsigned int chainlen = 0; unsigned int sequence; + int err = -EEXIST; zone = nf_ct_zone(ct); @@ -854,15 +858,24 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); /* See if there's one in the list already, including reverse */ - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) { if 
(nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, zone, net)) goto out; - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) + if (chainlen++ > MAX_CHAINLEN) + goto chaintoolong; + } + + chainlen = 0; + + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) { if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, zone, net)) goto out; + if (chainlen++ > MAX_CHAINLEN) + goto chaintoolong; + } smp_wmb(); /* The caller holds a reference to this object */ @@ -872,11 +885,13 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) NF_CT_STAT_INC(net, insert); local_bh_enable(); return 0; - +chaintoolong: + NF_CT_STAT_INC(net, chaintoolong); + err = -ENOSPC; out: nf_conntrack_double_unlock(hash, reply_hash); local_bh_enable(); - return -EEXIST; + return err; } EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); @@ -1089,6 +1104,7 @@ int __nf_conntrack_confirm(struct sk_buff *skb) { const struct nf_conntrack_zone *zone; + unsigned int chainlen = 0, sequence; unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; @@ -1096,7 +1112,6 @@ __nf_conntrack_confirm(struct sk_buff *skb) struct hlist_nulls_node *n; enum ip_conntrack_info ctinfo; struct net *net; - unsigned int sequence; int ret = NF_DROP; ct = nf_ct_get(skb, &ctinfo); @@ -1156,15 +1171,28 @@ __nf_conntrack_confirm(struct sk_buff *skb) /* See if there's one in the list already, including reverse: NAT could have grabbed it without realizing, since we're not in the hash. If there is, we lost race. */ - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) { if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, zone, net)) goto out; + if (chainlen++ > MAX_CHAINLEN) + goto chaintoolong; + } - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) + chainlen = 0; + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) { if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, zone, net)) goto out; + if (chainlen++ > MAX_CHAINLEN) { +chaintoolong: + nf_ct_add_to_dying_list(ct); + NF_CT_STAT_INC(net, chaintoolong); + NF_CT_STAT_INC(net, insert_failed); + ret = NF_DROP; + goto dying; + } + } /* Timer relative to confirmation time, not original setting time, otherwise we'd get timer wrap in diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index e81af33b233b..3f081ae08266 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2484,7 +2484,9 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq, nla_put_be32(skb, CTA_STATS_SEARCH_RESTART, htonl(st->search_restart)) || nla_put_be32(skb, CTA_STATS_CLASH_RESOLVE, - htonl(st->clash_resolve))) + htonl(st->clash_resolve)) || + nla_put_be32(skb, CTA_STATS_CHAIN_TOOLONG, + htonl(st->chaintoolong))) goto nla_put_failure; nlmsg_end(skb, nlh); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index e84b499b7bfa..f94ebd5194b5 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -429,7 +429,7 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v) unsigned int nr_conntracks; if (v == SEQ_START_TOKEN) { - seq_puts(seq, "entries clashres found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n"); + 
seq_puts(seq, "entries clashres found new invalid ignore delete chainlength insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n"); return 0; } @@ -444,7 +444,7 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v) st->invalid, 0, 0, - 0, + st->chaintoolong, st->insert, st->insert_failed, st->drop, -- cgit v1.2.3-71-gd317 From 17395d7742baa4737e9d3b4672cc3d10e5970998 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 31 Aug 2021 10:59:25 +0530 Subject: gpio: virtio: Fix sparse warnings Fix warnings reported by sparse, related to type mismatch between u16 and __le16. Reported-by: kernel test robot Fixes: 3a29355a22c0 ("gpio: Add virtio-gpio driver") Signed-off-by: Viresh Kumar Acked-by: Michael S. Tsirkin Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-virtio.c | 41 ++++++++++++++++++++-------------------- include/uapi/linux/virtio_gpio.h | 10 +++++----- 2 files changed, 25 insertions(+), 26 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpio/gpio-virtio.c b/drivers/gpio/gpio-virtio.c index d33eb237c0b9..d24f1c9264bc 100644 --- a/drivers/gpio/gpio-virtio.c +++ b/drivers/gpio/gpio-virtio.c @@ -32,7 +32,6 @@ struct virtio_gpio { struct virtio_device *vdev; struct mutex lock; /* Protects virtqueue operation */ struct gpio_chip gc; - struct virtio_gpio_config config; struct virtio_gpio_line *lines; struct virtqueue *request_vq; }; @@ -57,7 +56,7 @@ static int _virtio_gpio_req(struct virtio_gpio *vgpio, u16 type, u16 gpio, req->type = cpu_to_le16(type); req->gpio = cpu_to_le16(gpio); - req->value = txvalue; + req->value = cpu_to_le32(txvalue); sg_init_one(&req_sg, req, sizeof(*req)); sg_init_one(&res_sg, res, rxlen); @@ -233,19 +232,19 @@ static int virtio_gpio_alloc_vqs(struct virtio_gpio *vgpio, return 0; } -static const char **virtio_gpio_get_names(struct virtio_gpio *vgpio) +static const char **virtio_gpio_get_names(struct virtio_gpio *vgpio, + u32 gpio_names_size, u16 ngpio) { - struct virtio_gpio_config *config = &vgpio->config; struct virtio_gpio_response_get_names *res; struct device *dev = &vgpio->vdev->dev; u8 *gpio_names, *str; const char **names; int i, ret, len; - if (!config->gpio_names_size) + if (!gpio_names_size) return NULL; - len = sizeof(*res) + config->gpio_names_size; + len = sizeof(*res) + gpio_names_size; res = devm_kzalloc(dev, len, GFP_KERNEL); if (!res) return NULL; @@ -258,18 +257,18 @@ static const char **virtio_gpio_get_names(struct virtio_gpio *vgpio) return NULL; } - names = devm_kcalloc(dev, config->ngpio, sizeof(*names), GFP_KERNEL); + names = devm_kcalloc(dev, ngpio, sizeof(*names), GFP_KERNEL); if (!names) return NULL; /* NULL terminate the string instead of checking it */ - gpio_names[config->gpio_names_size - 1] = '\0'; + gpio_names[gpio_names_size - 1] = '\0'; - for (i = 0, str = gpio_names; i < config->ngpio; i++) { + for (i = 0, str = gpio_names; i < ngpio; i++) { names[i] = str; str += strlen(str) + 1; /* zero-length strings are allowed */ - if (str > gpio_names + config->gpio_names_size) { + if (str > gpio_names + gpio_names_size) { dev_err(dev, "gpio_names block is too short (%d)\n", i); return NULL; } @@ -280,31 +279,31 @@ static const char **virtio_gpio_get_names(struct virtio_gpio *vgpio) static int virtio_gpio_probe(struct virtio_device *vdev) { - struct virtio_gpio_config *config; + struct virtio_gpio_config config; struct device *dev = &vdev->dev; struct virtio_gpio *vgpio; + u32 gpio_names_size; + u16 ngpio; int ret, i; vgpio = devm_kzalloc(dev, sizeof(*vgpio), 
GFP_KERNEL); if (!vgpio) return -ENOMEM; - config = &vgpio->config; - /* Read configuration */ - virtio_cread_bytes(vdev, 0, config, sizeof(*config)); - config->gpio_names_size = le32_to_cpu(config->gpio_names_size); - config->ngpio = le16_to_cpu(config->ngpio); - if (!config->ngpio) { + virtio_cread_bytes(vdev, 0, &config, sizeof(config)); + gpio_names_size = le32_to_cpu(config.gpio_names_size); + ngpio = le16_to_cpu(config.ngpio); + if (!ngpio) { dev_err(dev, "Number of GPIOs can't be zero\n"); return -EINVAL; } - vgpio->lines = devm_kcalloc(dev, config->ngpio, sizeof(*vgpio->lines), GFP_KERNEL); + vgpio->lines = devm_kcalloc(dev, ngpio, sizeof(*vgpio->lines), GFP_KERNEL); if (!vgpio->lines) return -ENOMEM; - for (i = 0; i < config->ngpio; i++) { + for (i = 0; i < ngpio; i++) { mutex_init(&vgpio->lines[i].lock); init_completion(&vgpio->lines[i].completion); } @@ -319,7 +318,7 @@ static int virtio_gpio_probe(struct virtio_device *vdev) vgpio->gc.direction_output = virtio_gpio_direction_output; vgpio->gc.get = virtio_gpio_get; vgpio->gc.set = virtio_gpio_set; - vgpio->gc.ngpio = config->ngpio; + vgpio->gc.ngpio = ngpio; vgpio->gc.base = -1; /* Allocate base dynamically */ vgpio->gc.label = dev_name(dev); vgpio->gc.parent = dev; @@ -333,7 +332,7 @@ static int virtio_gpio_probe(struct virtio_device *vdev) /* Mark the device ready to perform operations from within probe() */ virtio_device_ready(vdev); - vgpio->gc.names = virtio_gpio_get_names(vgpio); + vgpio->gc.names = virtio_gpio_get_names(vgpio, gpio_names_size, ngpio); ret = gpiochip_add_data(&vgpio->gc, vgpio); if (ret) { diff --git a/include/uapi/linux/virtio_gpio.h b/include/uapi/linux/virtio_gpio.h index 844574acf095..0445f905d8cc 100644 --- a/include/uapi/linux/virtio_gpio.h +++ b/include/uapi/linux/virtio_gpio.h @@ -22,16 +22,16 @@ #define VIRTIO_GPIO_DIRECTION_IN 0x02 struct virtio_gpio_config { - __u16 ngpio; + __le16 ngpio; __u8 padding[2]; - __u32 gpio_names_size; + __le32 gpio_names_size; } __packed; /* Virtio GPIO Request / Response */ struct virtio_gpio_request { - __u16 type; - __u16 gpio; - __u32 value; + __le16 type; + __le16 gpio; + __le32 value; }; struct virtio_gpio_response { -- cgit v1.2.3-71-gd317 From 5dc9ffaff1420676827c4054bc789a7710f2a272 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Thu, 15 Jul 2021 10:48:43 +0300 Subject: habanalabs: expose server type in INFO IOCTL Add the server type property to the hl_info_hw_ip_info structure that is exposed to the user via the INFO IOCTL. This is needed by the userspace s/w stack to know the connections map of the internal links that connect the ASIC among themselves inside the server. The F/W will tell us, as part of the NIC information, the server type that the GAUDI is located in. 
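A rough sketch of how userspace could pick this up through the existing INFO IOCTL (field names follow the driver's uapi header; the header install path and error handling are simplified, so treat this as illustration rather than reference code):

  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>
  #include <sys/ioctl.h>
  #include <misc/habanalabs.h>    /* uapi header; install location may differ */

  static int print_server_type(int dev_fd)
  {
          struct hl_info_hw_ip_info hw_ip;
          struct hl_info_args args;

          memset(&hw_ip, 0, sizeof(hw_ip));
          memset(&args, 0, sizeof(args));
          args.op = HL_INFO_HW_IP_INFO;
          args.return_pointer = (uint64_t)(uintptr_t)&hw_ip;
          args.return_size = sizeof(hw_ip);

          if (ioctl(dev_fd, HL_IOCTL_INFO, &args))
                  return -1;

          switch (hw_ip.server_type) {
          case HL_SERVER_GAUDI_HLS1:
                  printf("server: HLS1\n");
                  break;
          case HL_SERVER_GAUDI_HLS1H:
                  printf("server: HLS1-H\n");
                  break;
          case HL_SERVER_GAUDI_TYPE1:
                  printf("server: type 1\n");
                  break;
          case HL_SERVER_GAUDI_TYPE2:
                  printf("server: type 2\n");
                  break;
          default:
                  printf("server: unknown\n");
                  break;
          }
          return 0;
  }

The server_type values used here are the hl_server_type enum added to the uapi header by this patch.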
Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/firmware_if.c | 2 +- drivers/misc/habanalabs/common/habanalabs.h | 3 ++ drivers/misc/habanalabs/common/habanalabs_ioctl.c | 2 + drivers/misc/habanalabs/gaudi/gaudi.c | 2 + drivers/misc/habanalabs/goya/goya.c | 2 + drivers/misc/habanalabs/include/common/cpucp_if.h | 11 ++++++ include/uapi/misc/habanalabs.h | 48 +++++++++++++++++++++-- 7 files changed, 65 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c index 869c6057ae31..0be3f5414f0b 100644 --- a/drivers/misc/habanalabs/common/firmware_if.c +++ b/drivers/misc/habanalabs/common/firmware_if.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright 2016-2019 HabanaLabs, Ltd. + * Copyright 2016-2021 HabanaLabs, Ltd. * All Rights Reserved. */ diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index b72fcc9255aa..6c5c7c832af3 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -489,6 +489,8 @@ struct hl_hints_range { * reserved for the user * @first_available_cq: first available CQ for the user. * @user_interrupt_count: number of user interrupts. + * @server_type: Server type that the ASIC is currently installed in. + * The value is according to enum hl_server_type in uapi file. * @tpc_enabled_mask: which TPCs are enabled. * @completion_queues_count: number of completion queues. * @fw_security_enabled: true if security measures are enabled in firmware, @@ -570,6 +572,7 @@ struct asic_fixed_properties { u16 first_available_user_msix_interrupt; u16 first_available_cq[HL_MAX_DCORES]; u16 user_interrupt_count; + u16 server_type; u8 tpc_enabled_mask; u8 completion_queues_count; u8 fw_security_enabled; diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c index f4dda7b4acdd..86c3257d9ae1 100644 --- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c @@ -94,6 +94,8 @@ static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args) hw_ip.first_available_interrupt_id = prop->first_available_user_msix_interrupt; + hw_ip.server_type = prop->server_type; + return copy_to_user(out, &hw_ip, min((size_t) size, sizeof(hw_ip))) ? 
-EFAULT : 0; } diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 6187e2e802bc..d3bd58404c38 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -642,6 +642,8 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev) prop->hard_reset_done_by_fw = false; prop->gic_interrupts_enable = true; + prop->server_type = HL_SERVER_TYPE_UNKNOWN; + return 0; } diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index c070cd14753e..8d890c7cce08 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -469,6 +469,8 @@ int goya_set_fixed_properties(struct hl_device *hdev) prop->hard_reset_done_by_fw = false; prop->gic_interrupts_enable = true; + prop->server_type = HL_SERVER_TYPE_UNKNOWN; + return 0; } diff --git a/drivers/misc/habanalabs/include/common/cpucp_if.h b/drivers/misc/habanalabs/include/common/cpucp_if.h index 80b1d5a9d9f1..2d6f8ea35375 100644 --- a/drivers/misc/habanalabs/include/common/cpucp_if.h +++ b/drivers/misc/habanalabs/include/common/cpucp_if.h @@ -700,6 +700,15 @@ struct cpucp_mac_addr { __u8 mac_addr[ETH_ALEN]; }; +enum cpucp_serdes_type { + TYPE_1_SERDES_TYPE, + TYPE_2_SERDES_TYPE, + HLS1_SERDES_TYPE, + HLS1H_SERDES_TYPE, + UNKNOWN_SERDES_TYPE, + MAX_NUM_SERDES_TYPE = UNKNOWN_SERDES_TYPE +}; + struct cpucp_nic_info { struct cpucp_mac_addr mac_addrs[CPUCP_MAX_NICS]; __le64 link_mask[CPUCP_NIC_MASK_ARR_LEN]; @@ -708,6 +717,8 @@ struct cpucp_nic_info { __le64 link_ext_mask[CPUCP_NIC_MASK_ARR_LEN]; __u8 qsfp_eeprom[CPUCP_NIC_QSFP_EEPROM_MAX_LEN]; __le64 auto_neg_mask[CPUCP_NIC_MASK_ARR_LEN]; + __le16 serdes_type; /* enum cpucp_serdes_type */ + __u8 reserved[6]; }; #endif /* CPUCP_IF_H */ diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index eca86c545916..6686b73a0834 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -279,6 +279,14 @@ enum hl_device_status { HL_DEVICE_STATUS_NEEDS_RESET }; +enum hl_server_type { + HL_SERVER_TYPE_UNKNOWN = 0, + HL_SERVER_GAUDI_HLS1 = 1, + HL_SERVER_GAUDI_HLS1H = 2, + HL_SERVER_GAUDI_TYPE1 = 3, + HL_SERVER_GAUDI_TYPE2 = 4 +}; + /* Opcode for management ioctl * * HW_IP_INFO - Receive information about different IP blocks in the @@ -337,17 +345,49 @@ enum hl_device_status { #define HL_INFO_VERSION_MAX_LEN 128 #define HL_INFO_CARD_NAME_MAX_LEN 16 +/** + * struct hl_info_hw_ip_info - hardware information on various IPs in the ASIC + * @sram_base_address: The first SRAM physical base address that is free to be + * used by the user. + * @dram_base_address: The first DRAM virtual or physical base address that is + * free to be used by the user. + * @dram_size: The DRAM size that is available to the user. + * @sram_size: The SRAM size that is available to the user. + * @num_of_events: The number of events that can be received from the f/w. This + * is needed so the user can what is the size of the h/w events + * array he needs to pass to the kernel when he wants to fetch + * the event counters. + * @device_id: PCI device ID of the ASIC. + * @module_id: Module ID of the ASIC for mezzanine cards in servers + * (From OCP spec). + * @first_available_interrupt_id: The first available interrupt ID for the user + * to be used when it works with user interrupts. + * @server_type: Server type that the Gaudi ASIC is currently installed in. + * The value is according to enum hl_server_type + * @cpld_version: CPLD version on the board. 
+ * @psoc_pci_pll_nr: PCI PLL NR value. Needed by the profiler in some ASICs. + * @psoc_pci_pll_nf: PCI PLL NF value. Needed by the profiler in some ASICs. + * @psoc_pci_pll_od: PCI PLL OD value. Needed by the profiler in some ASICs. + * @psoc_pci_pll_div_factor: PCI PLL DIV factor value. Needed by the profiler + * in some ASICs. + * @tpc_enabled_mask: Bit-mask that represents which TPCs are enabled. Relevant + * for Goya/Gaudi only. + * @dram_enabled: Whether the DRAM is enabled. + * @cpucp_version: The CPUCP f/w version. + * @card_name: The card name as passed by the f/w. + * @dram_page_size: The DRAM physical page size. + */ struct hl_info_hw_ip_info { __u64 sram_base_address; __u64 dram_base_address; __u64 dram_size; __u32 sram_size; __u32 num_of_events; - __u32 device_id; /* PCI Device ID */ - __u32 module_id; /* For mezzanine cards in servers (From OCP spec.) */ + __u32 device_id; + __u32 module_id; __u32 reserved; __u16 first_available_interrupt_id; - __u16 reserved2; + __u16 server_type; __u32 cpld_version; __u32 psoc_pci_pll_nr; __u32 psoc_pci_pll_nf; @@ -358,7 +398,7 @@ struct hl_info_hw_ip_info { __u8 pad[2]; __u8 cpucp_version[HL_INFO_VERSION_MAX_LEN]; __u8 card_name[HL_INFO_CARD_NAME_MAX_LEN]; - __u64 reserved3; + __u64 reserved2; __u64 dram_page_size; }; -- cgit v1.2.3-71-gd317 From 71731090ab17a208a58020e4b342fdfee280458a Mon Sep 17 00:00:00 2001 From: Omer Shpigelman Date: Mon, 16 Aug 2021 13:27:12 +0300 Subject: habanalabs: add "in device creation" status On init, the disabled state is cleared right before hw_init and that causes the device to report on "Operational" state before the device initialization is finished. Although the char device is not yet exposed to the user at this stage, the sysfs entries are exposed. This can cause errors in monitoring applications that use the sysfs entries. In order to avoid this, a new state "in device creation" is introduced to ne reported when the device is not disabled but is still in init flow. 
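A monitoring-side sketch (the sysfs path below is an assumption made for illustration; the strings themselves come from the status table set up in this patch) would now treat the new state as "not ready yet" instead of a malfunction:

  #include <stdio.h>
  #include <string.h>

  /* The exact sysfs layout is assumed here for illustration purposes only. */
  static int hl_device_ready(int dev_idx)
  {
          char path[64], status[32] = "";
          FILE *f;

          snprintf(path, sizeof(path), "/sys/class/habanalabs/hl%d/status", dev_idx);
          f = fopen(path, "r");
          if (!f)
                  return 0;
          if (!fgets(status, sizeof(status), f))
                  status[0] = '\0';
          fclose(f);

          /* During init the attribute now reads "In device creation", not "Operational". */
          return strncmp(status, "Operational", strlen("Operational")) == 0;
  }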
Signed-off-by: Omer Shpigelman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 3 +++ drivers/misc/habanalabs/common/habanalabs.h | 2 +- drivers/misc/habanalabs/common/habanalabs_drv.c | 8 ++++++-- drivers/misc/habanalabs/common/sysfs.c | 20 +++++++------------- include/uapi/misc/habanalabs.h | 4 +++- 5 files changed, 20 insertions(+), 17 deletions(-) (limited to 'include/uapi') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 3751c915f731..c2641030d9ff 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -23,6 +23,8 @@ enum hl_device_status hl_device_status(struct hl_device *hdev) status = HL_DEVICE_STATUS_NEEDS_RESET; else if (hdev->disabled) status = HL_DEVICE_STATUS_MALFUNCTION; + else if (!hdev->init_done) + status = HL_DEVICE_STATUS_IN_DEVICE_CREATION; else status = HL_DEVICE_STATUS_OPERATIONAL; @@ -44,6 +46,7 @@ bool hl_device_operational(struct hl_device *hdev, case HL_DEVICE_STATUS_NEEDS_RESET: return false; case HL_DEVICE_STATUS_OPERATIONAL: + case HL_DEVICE_STATUS_IN_DEVICE_CREATION: default: return true; } diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 1ca3a920ea6d..7f4548f1d7e1 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -1990,7 +1990,7 @@ struct hl_state_dump_specs { #define HL_STR_MAX 32 -#define HL_DEV_STS_MAX (HL_DEVICE_STATUS_NEEDS_RESET + 1) +#define HL_DEV_STS_MAX (HL_DEVICE_STATUS_LAST + 1) /* Theoretical limit only. A single host can only contain up to 4 or 8 PCIe * x16 cards. In extreme cases, there are hosts that can accommodate 16 cards. diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c index 3df4313d72cd..2ef59fd465ba 100644 --- a/drivers/misc/habanalabs/common/habanalabs_drv.c +++ b/drivers/misc/habanalabs/common/habanalabs_drv.c @@ -317,12 +317,16 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev, hdev->asic_prop.fw_security_enabled = false; /* Assign status description string */ - strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION], - "disabled", HL_STR_MAX); + strncpy(hdev->status[HL_DEVICE_STATUS_OPERATIONAL], + "operational", HL_STR_MAX); strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET], "in reset", HL_STR_MAX); + strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION], + "disabled", HL_STR_MAX); strncpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET], "needs reset", HL_STR_MAX); + strncpy(hdev->status[HL_DEVICE_STATUS_IN_DEVICE_CREATION], + "in device creation", HL_STR_MAX); hdev->major = hl_major; hdev->reset_on_lockup = reset_on_lockup; diff --git a/drivers/misc/habanalabs/common/sysfs.c b/drivers/misc/habanalabs/common/sysfs.c index db72df282ef8..34f9f2779962 100644 --- a/drivers/misc/habanalabs/common/sysfs.c +++ b/drivers/misc/habanalabs/common/sysfs.c @@ -9,8 +9,7 @@ #include -long hl_get_frequency(struct hl_device *hdev, u32 pll_index, - bool curr) +long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr) { struct cpucp_packet pkt; u32 used_pll_idx; @@ -44,8 +43,7 @@ long hl_get_frequency(struct hl_device *hdev, u32 pll_index, return (long) result; } -void hl_set_frequency(struct hl_device *hdev, u32 pll_index, - u64 freq) +void hl_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq) { struct cpucp_packet pkt; u32 used_pll_idx; @@ -285,16 +283,12 @@ static ssize_t status_show(struct device 
*dev, struct device_attribute *attr, char *buf) { struct hl_device *hdev = dev_get_drvdata(dev); - char *str; + char str[HL_STR_MAX]; - if (atomic_read(&hdev->in_reset)) - str = "In reset"; - else if (hdev->disabled) - str = "Malfunction"; - else if (hdev->needs_reset) - str = "Needs Reset"; - else - str = "Operational"; + strscpy(str, hdev->status[hl_device_status(hdev)], HL_STR_MAX); + + /* use uppercase for backward compatibility */ + str[0] = 'A' + (str[0] - 'a'); return sprintf(buf, "%s\n", str); } diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index 6686b73a0834..7cc2a0f3f2f5 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -276,7 +276,9 @@ enum hl_device_status { HL_DEVICE_STATUS_OPERATIONAL, HL_DEVICE_STATUS_IN_RESET, HL_DEVICE_STATUS_MALFUNCTION, - HL_DEVICE_STATUS_NEEDS_RESET + HL_DEVICE_STATUS_NEEDS_RESET, + HL_DEVICE_STATUS_IN_DEVICE_CREATION, + HL_DEVICE_STATUS_LAST = HL_DEVICE_STATUS_IN_DEVICE_CREATION }; enum hl_server_type { -- cgit v1.2.3-71-gd317 From b27abaccf8e8b012f126da0c2a1ab32723ec8b9f Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 2 Sep 2021 15:00:06 -0700 Subject: mm/mempolicy: add MPOL_PREFERRED_MANY for multiple preferred nodes Patch series "Introduce multi-preference mempolicy", v7. This patch series introduces the concept of the MPOL_PREFERRED_MANY mempolicy. This mempolicy mode can be used with either the set_mempolicy(2) or mbind(2) interfaces. Like the MPOL_PREFERRED interface, it allows an application to set a preference for nodes which will fulfil memory allocation requests. Unlike the MPOL_PREFERRED mode, it takes a set of nodes. Like the MPOL_BIND interface, it works over a set of nodes. Unlike MPOL_BIND, it will not cause a SIGSEGV or invoke the OOM killer if those preferred nodes are not available. Along with these patches are patches for libnuma, numactl, numademo, and memhog. They still need some polish, but can be found here: https://gitlab.com/bwidawsk/numactl/-/tree/prefer-many It allows new usage: `numactl -P 0,3,4` The goal of the new mode is to enable some use-cases when using tiered memory usage models which I've lovingly named. 1a. The Hare - The interconnect is fast enough to meet bandwidth and latency requirements allowing preference to be given to all nodes with "fast" memory. 1b. The Indiscriminate Hare - An application knows it wants fast memory (or perhaps slow memory), but doesn't care which node it runs on. The application can prefer a set of nodes and then xpu bind to the local node (cpu, accelerator, etc). This reverses the nodes are chosen today where the kernel attempts to use local memory to the CPU whenever possible. This will attempt to use the local accelerator to the memory. 2. The Tortoise - The administrator (or the application itself) is aware it only needs slow memory, and so can prefer that. Much of this is almost achievable with the bind interface, but the bind interface suffers from an inability to fallback to another set of nodes if binding fails to all nodes in the nodemask. Like MPOL_BIND a nodemask is given. Inherently this removes ordering from the preference. > /* Set first two nodes as preferred in an 8 node system. */ > const unsigned long nodes = 0x3 > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); > /* Mimic interleave policy, but have fallback *. > const unsigned long nodes = 0xaa > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); Some internal discussion took place around the interface. 
There are two alternatives which we have discussed, plus one I stuck in: 1. Ordered list of nodes. Currently it's believed that the added complexity is nod needed for expected usecases. 2. A flag for bind to allow falling back to other nodes. This confuses the notion of binding and is less flexible than the current solution. 3. Create flags or new modes that helps with some ordering. This offers both a friendlier API as well as a solution for more customized usage. It's unknown if it's worth the complexity to support this. Here is sample code for how this might work: > // Prefer specific nodes for some something wacky > set_mempolicy(MPOL_PREFER_MANY, 0x17c, 1024); > > // Default > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_SOCKET, NULL, 0); > // which is the same as > set_mempolicy(MPOL_DEFAULT, NULL, 0); > > // The Hare > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, NULL, 0); > > // The Tortoise > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE_REV, NULL, 0); > > // Prefer the fast memory of the first two sockets > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, -1, 2); > This patch (of 5): The NUMA APIs currently allow passing in a "preferred node" as a single bit set in a nodemask. If more than one bit it set, bits after the first are ignored. This single node is generally OK for location-based NUMA where memory being allocated will eventually be operated on by a single CPU. However, in systems with multiple memory types, folks want to target a *type* of memory instead of a location. For instance, someone might want some high-bandwidth memory but do not care about the CPU next to which it is allocated. Or, they want a cheap, high capacity allocation and want to target all NUMA nodes which have persistent memory in volatile mode. In both of these cases, the application wants to target a *set* of nodes, but does not want strict MPOL_BIND behavior as that could lead to OOM killer or SIGSEGV. So add MPOL_PREFERRED_MANY policy to support the multiple preferred nodes requirement. This is not a pie-in-the-sky dream for an API. This was a response to a specific ask of more than one group at Intel. Specifically: 1. There are existing libraries that target memory types such as https://github.com/memkind/memkind. These are known to suffer from SIGSEGV's when memory is low on targeted memory "kinds" that span more than one node. The MCDRAM on a Xeon Phi in "Cluster on Die" mode is an example of this. 2. Volatile-use persistent memory users want to have a memory policy which is targeted at either "cheap and slow" (PMEM) or "expensive and fast" (DRAM). However, they do not want to experience allocation failures when the targeted type is unavailable. 3. Allocate-then-run. Generally, we let the process scheduler decide on which physical CPU to run a task. That location provides a default allocation policy, and memory availability is not generally considered when placing tasks. For situations where memory is valuable and constrained, some users want to allocate memory first, *then* allocate close compute resources to the allocation. This is the reverse of the normal (CPU) model. Accelerators such as GPUs that operate on core-mm-managed memory are interested in this model. A check is added in sanitize_mpol_flags() to not permit 'prefer_many' policy to be used for now, and will be removed in later patch after all implementations for 'prefer_many' are ready, as suggested by Michal Hocko. 
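For completeness, a compilable version of the earlier sample, using the raw syscall so it only depends on the uapi header touched by this patch (and keeping in mind that this particular patch still rejects the mode in sanitize_mpol_flags() until the rest of the series lands):

  #include <unistd.h>
  #include <sys/syscall.h>
  #include <linux/mempolicy.h>

  /* Prefer the first two nodes, but allow fallback elsewhere instead of OOM/SIGSEGV. */
  static long prefer_first_two_nodes(void)
  {
          unsigned long nodemask = (1UL << 0) | (1UL << 1);

          /* maxnode is a bit count and must cover the highest node that is set. */
          return syscall(SYS_set_mempolicy, MPOL_PREFERRED_MANY,
                         &nodemask, 8 * sizeof(nodemask));
  }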
[mhocko@kernel.org: suggest to refine policy_node/policy_nodemask handling] Link: https://lkml.kernel.org/r/1627970362-61305-1-git-send-email-feng.tang@intel.com Link: https://lore.kernel.org/r/20200630212517.308045-4-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-2-git-send-email-feng.tang@intel.com Co-developed-by: Ben Widawsky Signed-off-by: Ben Widawsky Signed-off-by: Dave Hansen Signed-off-by: Feng Tang Cc: Michal Hocko Acked-by: Michal Hocko Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Cc: Andi Kleen Cc: Dan Williams Cc: Huang Ying b Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/mempolicy.h | 1 + mm/mempolicy.c | 73 ++++++++++++++++++++++++++++++++++-------- 2 files changed, 60 insertions(+), 14 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 19a00bc7fe86..046d0ccba4cd 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -22,6 +22,7 @@ enum { MPOL_BIND, MPOL_INTERLEAVE, MPOL_LOCAL, + MPOL_PREFERRED_MANY, MPOL_MAX, /* always last member of enum */ }; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c473d2931708..f8f2c18e61cb 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -31,6 +31,9 @@ * but useful to set in a VMA when you have a non default * process policy. * + * preferred many Try a set of nodes first before normal fallback. This is + * similar to preferred without the special case. + * * default Allocate on the local node first, or when on a VMA * use the process policy. This is what Linux always did * in a NUMA aware kernel and still does by, ahem, default. @@ -207,6 +210,14 @@ static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) return 0; } +static int mpol_new_preferred_many(struct mempolicy *pol, const nodemask_t *nodes) +{ + if (nodes_empty(*nodes)) + return -EINVAL; + pol->nodes = *nodes; + return 0; +} + static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) { if (nodes_empty(*nodes)) @@ -408,6 +419,10 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { [MPOL_LOCAL] = { .rebind = mpol_rebind_default, }, + [MPOL_PREFERRED_MANY] = { + .create = mpol_new_preferred_many, + .rebind = mpol_rebind_preferred, + }, }; static int migrate_page_add(struct page *page, struct list_head *pagelist, @@ -900,6 +915,7 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_PREFERRED: + case MPOL_PREFERRED_MANY: *nodes = p->nodes; break; case MPOL_LOCAL: @@ -1446,7 +1462,13 @@ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) { *flags = *mode & MPOL_MODE_FLAGS; *mode &= ~MPOL_MODE_FLAGS; - if ((unsigned int)(*mode) >= MPOL_MAX) + + /* + * The check should be 'mode >= MPOL_MAX', but as 'prefer_many' + * is not fully implemented, don't permit it to be used for now, + * and the logic will be restored in following patch + */ + if ((unsigned int)(*mode) >= MPOL_PREFERRED_MANY) return -EINVAL; if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES)) return -EINVAL; @@ -1875,16 +1897,27 @@ static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) */ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) { + int mode = policy->mode; + /* Lower zones don't get a nodemask applied for MPOL_BIND */ - if (unlikely(policy->mode == MPOL_BIND) && - apply_policy_zone(policy, gfp_zone(gfp)) 
&& - cpuset_nodemask_valid_mems_allowed(&policy->nodes)) + if (unlikely(mode == MPOL_BIND) && + apply_policy_zone(policy, gfp_zone(gfp)) && + cpuset_nodemask_valid_mems_allowed(&policy->nodes)) + return &policy->nodes; + + if (mode == MPOL_PREFERRED_MANY) return &policy->nodes; return NULL; } -/* Return the node id preferred by the given mempolicy, or the given id */ +/* + * Return the preferred node id for 'prefer' mempolicy, and return + * the given id for all other policies. + * + * policy_node() is always coupled with policy_nodemask(), which + * secures the nodemask limit for 'bind' and 'prefer-many' policy. + */ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) { if (policy->mode == MPOL_PREFERRED) { @@ -1936,7 +1969,9 @@ unsigned int mempolicy_slab_node(void) case MPOL_INTERLEAVE: return interleave_nodes(policy); - case MPOL_BIND: { + case MPOL_BIND: + case MPOL_PREFERRED_MANY: + { struct zoneref *z; /* @@ -2008,12 +2043,12 @@ static inline unsigned interleave_nid(struct mempolicy *pol, * @addr: address in @vma for shared policy lookup and interleave policy * @gfp_flags: for requested zone * @mpol: pointer to mempolicy pointer for reference counted mempolicy - * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask + * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy * * Returns a nid suitable for a huge page allocation and a pointer * to the struct mempolicy for conditional unref after allocation. - * If the effective policy is 'BIND, returns a pointer to the mempolicy's - * @nodemask for filtering the zonelist. + * If the effective policy is 'bind' or 'prefer-many', returns a pointer + * to the mempolicy's @nodemask for filtering the zonelist. * * Must be protected by read_mems_allowed_begin() */ @@ -2021,16 +2056,18 @@ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask) { int nid; + int mode; *mpol = get_vma_policy(vma, addr); - *nodemask = NULL; /* assume !MPOL_BIND */ + *nodemask = NULL; + mode = (*mpol)->mode; - if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { + if (unlikely(mode == MPOL_INTERLEAVE)) { nid = interleave_nid(*mpol, vma, addr, huge_page_shift(hstate_vma(vma))); } else { nid = policy_node(gfp_flags, *mpol, numa_node_id()); - if ((*mpol)->mode == MPOL_BIND) + if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY) *nodemask = &(*mpol)->nodes; } return nid; @@ -2063,6 +2100,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) mempolicy = current->mempolicy; switch (mempolicy->mode) { case MPOL_PREFERRED: + case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: *mask = mempolicy->nodes; @@ -2173,7 +2211,7 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, * node and don't fall back to other nodes, as the cost of * remote accesses would likely offset THP benefits. * - * If the policy is interleave, or does not allow the current + * If the policy is interleave or does not allow the current * node in its nodemask, we allocate the standard way. 
*/ if (pol->mode == MPOL_PREFERRED) @@ -2311,6 +2349,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_PREFERRED: + case MPOL_PREFERRED_MANY: return !!nodes_equal(a->nodes, b->nodes); case MPOL_LOCAL: return true; @@ -2451,6 +2490,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long break; case MPOL_PREFERRED: + if (node_isset(curnid, pol->nodes)) + goto out; polnid = first_node(pol->nodes); break; @@ -2465,9 +2506,10 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long break; goto out; } + fallthrough; + case MPOL_PREFERRED_MANY: /* - * allows binding to multiple nodes. * use current page if in policy nodemask, * else select nearest allowed node, if any. * If no allowed nodes, use current [!misplaced]. @@ -2829,6 +2871,7 @@ static const char * const policy_modes[] = [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", [MPOL_LOCAL] = "local", + [MPOL_PREFERRED_MANY] = "prefer (many)", }; @@ -2907,6 +2950,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) if (!nodelist) err = 0; goto out; + case MPOL_PREFERRED_MANY: case MPOL_BIND: /* * Insist on a nodelist @@ -2993,6 +3037,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) case MPOL_LOCAL: break; case MPOL_PREFERRED: + case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: nodes = pol->nodes; -- cgit v1.2.3-71-gd317 From dce49103962840dd61423d7627748d6c558d58c5 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 2 Sep 2021 15:00:33 -0700 Subject: mm: wire up syscall process_mrelease Split off from prev patch in the series that implements the syscall. Link: https://lkml.kernel.org/r/20210809185259.405936-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Geert Uytterhoeven Cc: Andy Lutomirski Cc: Christian Brauner Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Rientjes Cc: Florian Weimer Cc: Jan Engelhardt Cc: Jann Horn Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Tim Murray Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/syscalls/syscall.tbl | 2 ++ arch/arm/tools/syscall.tbl | 2 ++ arch/arm64/include/asm/unistd.h | 2 +- arch/arm64/include/asm/unistd32.h | 2 ++ arch/ia64/kernel/syscalls/syscall.tbl | 2 ++ arch/m68k/kernel/syscalls/syscall.tbl | 2 ++ arch/microblaze/kernel/syscalls/syscall.tbl | 2 ++ arch/mips/kernel/syscalls/syscall_n32.tbl | 2 ++ arch/mips/kernel/syscalls/syscall_n64.tbl | 2 ++ arch/mips/kernel/syscalls/syscall_o32.tbl | 2 ++ arch/parisc/kernel/syscalls/syscall.tbl | 2 ++ arch/powerpc/kernel/syscalls/syscall.tbl | 2 ++ arch/s390/kernel/syscalls/syscall.tbl | 2 ++ arch/sh/kernel/syscalls/syscall.tbl | 2 ++ arch/sparc/kernel/syscalls/syscall.tbl | 2 ++ arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + arch/xtensa/kernel/syscalls/syscall.tbl | 2 ++ include/linux/syscalls.h | 1 + include/uapi/asm-generic/unistd.h | 4 +++- kernel/sys_ni.c | 1 + 21 files changed, 38 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index a17687ed4b51..605645eae04c 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -486,3 +486,5 @@ 554 common landlock_create_ruleset sys_landlock_create_ruleset 555 common landlock_add_rule 
sys_landlock_add_rule 556 common landlock_restrict_self sys_landlock_restrict_self +# 557 reserved for memfd_secret +558 common process_mrelease sys_process_mrelease diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index c5df1179fc5d..2f32eb8beca8 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -460,3 +460,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 727bfc3be99b..3cb206aea3db 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -38,7 +38,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 447 +#define __NR_compat_syscalls 449 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 99ffcafc736c..0f49cdb180dd 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -901,6 +901,8 @@ __SYSCALL(__NR_landlock_create_ruleset, sys_landlock_create_ruleset) __SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule) #define __NR_landlock_restrict_self 446 __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self) +#define __NR_process_mrelease 448 +__SYSCALL(__NR_process_mrelease, sys_process_mrelease) /* * Please add new compat syscalls above this comment and update diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index 6d07742c57b8..9bf45f2be966 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -367,3 +367,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 541bc1b3a8f9..f1f98ee6c82d 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -446,3 +446,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index a176faca2927..da49ddd4bb54 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -452,3 +452,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index c2d2e19abea8..56c8d3cf42ed 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -385,3 +385,5 @@ 444 n32 landlock_create_ruleset sys_landlock_create_ruleset 445 n32 landlock_add_rule sys_landlock_add_rule 446 n32 
landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 n32 process_mrelease sys_process_mrelease diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index ac653d08b1ea..1ca7bc337932 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -361,3 +361,5 @@ 444 n64 landlock_create_ruleset sys_landlock_create_ruleset 445 n64 landlock_add_rule sys_landlock_add_rule 446 n64 landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 n64 process_mrelease sys_process_mrelease diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 253f2cd70b6b..fd3a9df60ec2 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -434,3 +434,5 @@ 444 o32 landlock_create_ruleset sys_landlock_create_ruleset 445 o32 landlock_add_rule sys_landlock_add_rule 446 o32 landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 o32 process_mrelease sys_process_mrelease diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index e26187b9ab87..040df1b7a589 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -444,3 +444,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index aef2a290e71a..d8ebd7d37c0f 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -526,3 +526,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 64d51ab5a8b4..57233ace30cb 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -449,3 +449,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease sys_process_mrelease diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index e0a70be77d84..2f6e95eb4690 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -449,3 +449,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 603f5a821502..42fc2906215d 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -492,3 +492,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common 
landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index ce763a12311c..661a03bcfbd1 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -452,3 +452,4 @@ 445 i386 landlock_add_rule sys_landlock_add_rule 446 i386 landlock_restrict_self sys_landlock_restrict_self 447 i386 memfd_secret sys_memfd_secret +448 i386 process_mrelease sys_process_mrelease diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index f6b57799c1ea..807b6a1de8e8 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -369,6 +369,7 @@ 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self 447 common memfd_secret sys_memfd_secret +448 common process_mrelease sys_process_mrelease # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 235d67d6ceb4..f4384951f393 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -417,3 +417,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 69c9a7010081..00bc170a50f0 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -915,6 +915,7 @@ asmlinkage long sys_mincore(unsigned long start, size_t len, asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior); asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec, size_t vlen, int behavior, unsigned int flags); +asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags); asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long flags); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index a9d6fcd95f42..14c8fe863c6d 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -877,9 +877,11 @@ __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self) #define __NR_memfd_secret 447 __SYSCALL(__NR_memfd_secret, sys_memfd_secret) #endif +#define __NR_process_mrelease 448 +__SYSCALL(__NR_process_mrelease, sys_process_mrelease) #undef __NR_syscalls -#define __NR_syscalls 448 +#define __NR_syscalls 449 /* * 32 bit systems traditionally used different diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 30971b1dd4a9..18a9c2cde767 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -289,6 +289,7 @@ COND_SYSCALL(munlockall); COND_SYSCALL(mincore); COND_SYSCALL(madvise); COND_SYSCALL(process_madvise); +COND_SYSCALL(process_mrelease); COND_SYSCALL(remap_file_pages); COND_SYSCALL(mbind); COND_SYSCALL_COMPAT(mbind); -- cgit v1.2.3-71-gd317 From c7c5e6ff533fe1f9afef7d2fa46678987a1335a7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 Sep 2021 15:03:43 -0700 Subject: fq_codel: reject silly quantum parameters syzbot found that forcing a big quantum attribute would crash hosts fast, 
essentially using this: tc qd replace dev eth0 root fq_codel quantum 4294967295 This is because fq_codel_dequeue() would have to loop ~2^31 times in : if (flow->deficit <= 0) { flow->deficit += q->quantum; list_move_tail(&flow->flowchain, &q->old_flows); goto begin; } SFQ max quantum is 2^19 (half a megabyte) Lets adopt a max quantum of one megabyte for FQ_CODEL. Fixes: 4b549a2ef4be ("fq_codel: Fair Queue Codel AQM") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 2 ++ net/sched/sch_fq_codel.c | 12 ++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 79a699f106b1..ec88590b3198 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -827,6 +827,8 @@ struct tc_codel_xstats { /* FQ_CODEL */ +#define FQ_CODEL_QUANTUM_MAX (1 << 20) + enum { TCA_FQ_CODEL_UNSPEC, TCA_FQ_CODEL_TARGET, diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index c4afdd026f51..bb0cd6d3d2c2 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -369,6 +369,7 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, { struct fq_codel_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_FQ_CODEL_MAX + 1]; + u32 quantum = 0; int err; if (!opt) @@ -386,6 +387,13 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, q->flows_cnt > 65536) return -EINVAL; } + if (tb[TCA_FQ_CODEL_QUANTUM]) { + quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM])); + if (quantum > FQ_CODEL_QUANTUM_MAX) { + NL_SET_ERR_MSG(extack, "Invalid quantum"); + return -EINVAL; + } + } sch_tree_lock(sch); if (tb[TCA_FQ_CODEL_TARGET]) { @@ -412,8 +420,8 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_FQ_CODEL_ECN]) q->cparams.ecn = !!nla_get_u32(tb[TCA_FQ_CODEL_ECN]); - if (tb[TCA_FQ_CODEL_QUANTUM]) - q->quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM])); + if (quantum) + q->quantum = quantum; if (tb[TCA_FQ_CODEL_DROP_BATCH_SIZE]) q->drop_batch_size = max(1U, nla_get_u32(tb[TCA_FQ_CODEL_DROP_BATCH_SIZE])); -- cgit v1.2.3-71-gd317 From d5a8680dfab0547a4ecd708b1fe9de48598a6757 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 27 Jul 2021 10:53:51 +0530 Subject: uapi: virtio_ids: Sync ids with specification This synchronizes the virtio ids with the latest list from virtio specification. Signed-off-by: Viresh Kumar Link: https://lore.kernel.org/r/61b27e3bc61fb0c9f067001e95cfafc5d37d414a.1627362340.git.viresh.kumar@linaro.org Signed-off-by: Michael S. 
Tsirkin --- include/uapi/linux/virtio_ids.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index 70a8057ad4bb..3c8e11820fdb 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -54,8 +54,20 @@ #define VIRTIO_ID_SOUND 25 /* virtio sound */ #define VIRTIO_ID_FS 26 /* virtio filesystem */ #define VIRTIO_ID_PMEM 27 /* virtio pmem */ +#define VIRTIO_ID_RPMB 28 /* virtio rpmb */ #define VIRTIO_ID_MAC80211_HWSIM 29 /* virtio mac80211-hwsim */ +#define VIRTIO_ID_VIDEO_ENCODER 30 /* virtio video encoder */ +#define VIRTIO_ID_VIDEO_DECODER 31 /* virtio video decoder */ +#define VIRTIO_ID_SCMI 32 /* virtio scmi */ +#define VIRTIO_ID_NITRO_SEC_MOD 33 /* virtio nitro secure module*/ +#define VIRTIO_ID_I2C_ADAPTER 34 /* virtio i2c adapter */ +#define VIRTIO_ID_WATCHDOG 35 /* virtio watchdog */ +#define VIRTIO_ID_CAN 36 /* virtio can */ +#define VIRTIO_ID_DMABUF 37 /* virtio dmabuf */ +#define VIRTIO_ID_PARAM_SERV 38 /* virtio parameter server */ +#define VIRTIO_ID_AUDIO_POLICY 39 /* virtio audio policy */ #define VIRTIO_ID_BT 40 /* virtio bluetooth */ +#define VIRTIO_ID_GPIO 41 /* virtio gpio */ /* * Virtio Transitional IDs -- cgit v1.2.3-71-gd317 From 9af8f1061646e8e22b66413bedf7b3e2ab3d69e5 Mon Sep 17 00:00:00 2001 From: Arseny Krasnov Date: Fri, 3 Sep 2021 15:31:06 +0300 Subject: virtio/vsock: rename 'EOR' to 'EOM' bit. This current implemented bit is used to mark end of messages ('EOM' - end of message), not records('EOR' - end of record). Also rename 'record' to 'message' in implementation as it is different things. Signed-off-by: Arseny Krasnov Reviewed-by: Stefano Garzarella Link: https://lore.kernel.org/r/20210903123109.3273053-1-arseny.krasnov@kaspersky.com Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vsock.c | 12 ++++++------ include/uapi/linux/virtio_vsock.h | 2 +- net/vmw_vsock/virtio_transport_common.c | 14 +++++++------- 3 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include/uapi') diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index f249622ef11b..feaf650affbe 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -178,15 +178,15 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock, * small rx buffers, headers of packets in rx queue are * created dynamically and are initialized with header * of current packet(except length). But in case of - * SOCK_SEQPACKET, we also must clear record delimeter - * bit(VIRTIO_VSOCK_SEQ_EOR). Otherwise, instead of one - * packet with delimeter(which marks end of record), + * SOCK_SEQPACKET, we also must clear message delimeter + * bit(VIRTIO_VSOCK_SEQ_EOM). Otherwise, instead of one + * packet with delimeter(which marks end of message), * there will be sequence of packets with delimeter * bit set. After initialized header will be copied to * rx buffer, this bit will be restored. 
*/ - if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR) { - pkt->hdr.flags &= ~cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR); + if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM) { + pkt->hdr.flags &= ~cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM); restore_flag = true; } } @@ -225,7 +225,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock, */ if (pkt->off < pkt->len) { if (restore_flag) - pkt->hdr.flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR); + pkt->hdr.flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM); /* We are queueing the same virtio_vsock_pkt to handle * the remaining bytes, and we want to deliver it diff --git a/include/uapi/linux/virtio_vsock.h b/include/uapi/linux/virtio_vsock.h index 3dd3555b2740..8485b004a5f8 100644 --- a/include/uapi/linux/virtio_vsock.h +++ b/include/uapi/linux/virtio_vsock.h @@ -97,7 +97,7 @@ enum virtio_vsock_shutdown { /* VIRTIO_VSOCK_OP_RW flags values */ enum virtio_vsock_rw { - VIRTIO_VSOCK_SEQ_EOR = 1, + VIRTIO_VSOCK_SEQ_EOM = 1, }; #endif /* _UAPI_LINUX_VIRTIO_VSOCK_H */ diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 081e7ae93cb1..4d5a93beceb0 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -77,7 +77,7 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info, if (msg_data_left(info->msg) == 0 && info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) - pkt->hdr.flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR); + pkt->hdr.flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM); } trace_virtio_transport_alloc_pkt(src_cid, src_port, @@ -457,7 +457,7 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, dequeued_len += pkt_len; } - if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR) { + if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM) { msg_ready = true; vvs->msg_count--; } @@ -1029,7 +1029,7 @@ virtio_transport_recv_enqueue(struct vsock_sock *vsk, goto out; } - if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR) + if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM) vvs->msg_count++; /* Try to copy small packets into the buffer of last packet queued, @@ -1044,12 +1044,12 @@ virtio_transport_recv_enqueue(struct vsock_sock *vsk, /* If there is space in the last packet queued, we copy the * new packet in its buffer. We avoid this if the last packet - * queued has VIRTIO_VSOCK_SEQ_EOR set, because this is - * delimiter of SEQPACKET record, so 'pkt' is the first packet - * of a new record. + * queued has VIRTIO_VSOCK_SEQ_EOM set, because this is + * delimiter of SEQPACKET message, so 'pkt' is the first packet + * of a new message. */ if ((pkt->len <= last_pkt->buf_len - last_pkt->len) && - !(le32_to_cpu(last_pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR)) { + !(le32_to_cpu(last_pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM)) { memcpy(last_pkt->buf + last_pkt->len, pkt->buf, pkt->len); last_pkt->len += pkt->len; -- cgit v1.2.3-71-gd317 From 41116599a0731f4cd451e9d191d879ab45e31945 Mon Sep 17 00:00:00 2001 From: Arseny Krasnov Date: Fri, 3 Sep 2021 15:32:23 +0300 Subject: virtio/vsock: add 'VIRTIO_VSOCK_SEQ_EOR' bit. This bit is used to handle POSIX MSG_EOR flag passed from userspace in 'send*()' system calls. It marks end of each record and is visible to receiver using 'recvmsg()' system call. Signed-off-by: Arseny Krasnov Reviewed-by: Stefano Garzarella Link: https://lore.kernel.org/r/20210903123225.3273425-1-arseny.krasnov@kaspersky.com Signed-off-by: Michael S. 
Tsirkin --- include/uapi/linux/virtio_vsock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_vsock.h b/include/uapi/linux/virtio_vsock.h index 8485b004a5f8..64738838bee5 100644 --- a/include/uapi/linux/virtio_vsock.h +++ b/include/uapi/linux/virtio_vsock.h @@ -98,6 +98,7 @@ enum virtio_vsock_shutdown { /* VIRTIO_VSOCK_OP_RW flags values */ enum virtio_vsock_rw { VIRTIO_VSOCK_SEQ_EOM = 1, + VIRTIO_VSOCK_SEQ_EOR = 2, }; #endif /* _UAPI_LINUX_VIRTIO_VSOCK_H */ -- cgit v1.2.3-71-gd317

From c8a6153b6c59d95c0e091f053f6f180952ade91e Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Tue, 31 Aug 2021 18:36:33 +0800 Subject: vduse: Introduce VDUSE - vDPA Device in Userspace

This VDUSE driver enables implementing software-emulated vDPA devices in userspace. The vDPA device is created by ioctl(VDUSE_CREATE_DEV) on /dev/vduse/control. Then a char device interface (/dev/vduse/$NAME) is exported to userspace for device emulation.

In order to make the device emulation more secure, the device's control path is handled in the kernel. A message mechanism is introduced to forward some dataplane-related control messages to userspace.

In the data path, the DMA buffer is mapped into the userspace address space in different ways depending on the vDPA bus to which the vDPA device is attached. In the virtio-vdpa case, the MMU-based software IOTLB is used to achieve that. In the vhost-vdpa case, the DMA buffer resides in a userspace memory region which can be shared with the VDUSE userspace process by transferring the shmfd.

For more details on VDUSE design and usage, please see the follow-on Documentation commit.

NB(mst): when merging this with b542e383d8c0 ("eventfd: Make signal recursion protection a task bit") replace eventfd_signal_count with eventfd_signal_allowed, and drop the previous ("eventfd: Export eventfd_wake_count to modules").

Signed-off-by: Xie Yongji Acked-by: Jason Wang Link: https://lore.kernel.org/r/20210831103634.33-13-xieyongji@bytedance.com Signed-off-by: Michael S. Tsirkin --- Documentation/userspace-api/ioctl/ioctl-number.rst | 1 + drivers/vdpa/Kconfig | 10 + drivers/vdpa/Makefile | 1 + drivers/vdpa/vdpa_user/Makefile | 5 + drivers/vdpa/vdpa_user/vduse_dev.c | 1646 ++++++++++++++++++++ include/uapi/linux/vduse.h | 306 ++++ 6 files changed, 1969 insertions(+) create mode 100644 drivers/vdpa/vdpa_user/Makefile create mode 100644 drivers/vdpa/vdpa_user/vduse_dev.c create mode 100644 include/uapi/linux/vduse.h (limited to 'include/uapi') diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index 1409e40e6345..293ca3aef358 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -300,6 +300,7 @@ Code Seq# Include File Comments 'z' 10-4F drivers/s390/crypto/zcrypt_api.h conflict! '|' 00-7F linux/media.h 0x80 00-1F linux/fb.h +0x81 00-1F linux/vduse.h 0x89 00-06 arch/x86/include/asm/sockios.h 0x89 0B-DF linux/sockios.h 0x89 E0-EF linux/sockios.h SIOCPROTOPRIVATE range diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig index e48e2b10ca36..3d91982d8371 100644 --- a/drivers/vdpa/Kconfig +++ b/drivers/vdpa/Kconfig @@ -33,6 +33,16 @@ config VDPA_SIM_BLOCK vDPA block device simulator which terminates IO request in a memory buffer.
+config VDPA_USER + tristate "VDUSE (vDPA Device in Userspace) support" + depends on EVENTFD && MMU && HAS_DMA + select DMA_OPS + select VHOST_IOTLB + select IOMMU_IOVA + help + With VDUSE it is possible to emulate a vDPA Device + in a userspace program. + config IFCVF tristate "Intel IFC VF vDPA driver" depends on PCI_MSI diff --git a/drivers/vdpa/Makefile b/drivers/vdpa/Makefile index 67fe7f3d6943..f02ebed33f19 100644 --- a/drivers/vdpa/Makefile +++ b/drivers/vdpa/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_VDPA) += vdpa.o obj-$(CONFIG_VDPA_SIM) += vdpa_sim/ +obj-$(CONFIG_VDPA_USER) += vdpa_user/ obj-$(CONFIG_IFCVF) += ifcvf/ obj-$(CONFIG_MLX5_VDPA) += mlx5/ obj-$(CONFIG_VP_VDPA) += virtio_pci/ diff --git a/drivers/vdpa/vdpa_user/Makefile b/drivers/vdpa/vdpa_user/Makefile new file mode 100644 index 000000000000..260e0b26af99 --- /dev/null +++ b/drivers/vdpa/vdpa_user/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 + +vduse-y := vduse_dev.o iova_domain.o + +obj-$(CONFIG_VDPA_USER) += vduse.o diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c new file mode 100644 index 000000000000..b9b2659408e6 --- /dev/null +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -0,0 +1,1646 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VDUSE: vDPA Device in Userspace + * + * Copyright (C) 2020-2021 Bytedance Inc. and/or its affiliates. All rights reserved. + * + * Author: Xie Yongji + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "iova_domain.h" + +#define DRV_AUTHOR "Yongji Xie " +#define DRV_DESC "vDPA Device in Userspace" +#define DRV_LICENSE "GPL v2" + +#define VDUSE_DEV_MAX (1U << MINORBITS) +#define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024) +#define VDUSE_IOVA_SIZE (128 * 1024 * 1024) +#define VDUSE_MSG_DEFAULT_TIMEOUT 30 + +struct vduse_virtqueue { + u16 index; + u16 num_max; + u32 num; + u64 desc_addr; + u64 driver_addr; + u64 device_addr; + struct vdpa_vq_state state; + bool ready; + bool kicked; + spinlock_t kick_lock; + spinlock_t irq_lock; + struct eventfd_ctx *kickfd; + struct vdpa_callback cb; + struct work_struct inject; + struct work_struct kick; +}; + +struct vduse_dev; + +struct vduse_vdpa { + struct vdpa_device vdpa; + struct vduse_dev *dev; +}; + +struct vduse_dev { + struct vduse_vdpa *vdev; + struct device *dev; + struct vduse_virtqueue *vqs; + struct vduse_iova_domain *domain; + char *name; + struct mutex lock; + spinlock_t msg_lock; + u64 msg_unique; + u32 msg_timeout; + wait_queue_head_t waitq; + struct list_head send_list; + struct list_head recv_list; + struct vdpa_callback config_cb; + struct work_struct inject; + spinlock_t irq_lock; + int minor; + bool broken; + bool connected; + u64 api_version; + u64 device_features; + u64 driver_features; + u32 device_id; + u32 vendor_id; + u32 generation; + u32 config_size; + void *config; + u8 status; + u32 vq_num; + u32 vq_align; +}; + +struct vduse_dev_msg { + struct vduse_dev_request req; + struct vduse_dev_response resp; + struct list_head list; + wait_queue_head_t waitq; + bool completed; +}; + +struct vduse_control { + u64 api_version; +}; + +static DEFINE_MUTEX(vduse_lock); +static DEFINE_IDR(vduse_idr); + +static dev_t vduse_major; +static struct class *vduse_class; +static struct cdev vduse_ctrl_cdev; +static struct cdev vduse_cdev; +static struct workqueue_struct *vduse_irq_wq; + +static u32 
allowed_device_id[] = { + VIRTIO_ID_BLOCK, +}; + +static inline struct vduse_dev *vdpa_to_vduse(struct vdpa_device *vdpa) +{ + struct vduse_vdpa *vdev = container_of(vdpa, struct vduse_vdpa, vdpa); + + return vdev->dev; +} + +static inline struct vduse_dev *dev_to_vduse(struct device *dev) +{ + struct vdpa_device *vdpa = dev_to_vdpa(dev); + + return vdpa_to_vduse(vdpa); +} + +static struct vduse_dev_msg *vduse_find_msg(struct list_head *head, + uint32_t request_id) +{ + struct vduse_dev_msg *msg; + + list_for_each_entry(msg, head, list) { + if (msg->req.request_id == request_id) { + list_del(&msg->list); + return msg; + } + } + + return NULL; +} + +static struct vduse_dev_msg *vduse_dequeue_msg(struct list_head *head) +{ + struct vduse_dev_msg *msg = NULL; + + if (!list_empty(head)) { + msg = list_first_entry(head, struct vduse_dev_msg, list); + list_del(&msg->list); + } + + return msg; +} + +static void vduse_enqueue_msg(struct list_head *head, + struct vduse_dev_msg *msg) +{ + list_add_tail(&msg->list, head); +} + +static void vduse_dev_broken(struct vduse_dev *dev) +{ + struct vduse_dev_msg *msg, *tmp; + + if (unlikely(dev->broken)) + return; + + list_splice_init(&dev->recv_list, &dev->send_list); + list_for_each_entry_safe(msg, tmp, &dev->send_list, list) { + list_del(&msg->list); + msg->completed = 1; + msg->resp.result = VDUSE_REQ_RESULT_FAILED; + wake_up(&msg->waitq); + } + dev->broken = true; + wake_up(&dev->waitq); +} + +static int vduse_dev_msg_sync(struct vduse_dev *dev, + struct vduse_dev_msg *msg) +{ + int ret; + + if (unlikely(dev->broken)) + return -EIO; + + init_waitqueue_head(&msg->waitq); + spin_lock(&dev->msg_lock); + if (unlikely(dev->broken)) { + spin_unlock(&dev->msg_lock); + return -EIO; + } + msg->req.request_id = dev->msg_unique++; + vduse_enqueue_msg(&dev->send_list, msg); + wake_up(&dev->waitq); + spin_unlock(&dev->msg_lock); + if (dev->msg_timeout) + ret = wait_event_killable_timeout(msg->waitq, msg->completed, + (long)dev->msg_timeout * HZ); + else + ret = wait_event_killable(msg->waitq, msg->completed); + + spin_lock(&dev->msg_lock); + if (!msg->completed) { + list_del(&msg->list); + msg->resp.result = VDUSE_REQ_RESULT_FAILED; + /* Mark the device as malfunction when there is a timeout */ + if (!ret) + vduse_dev_broken(dev); + } + ret = (msg->resp.result == VDUSE_REQ_RESULT_OK) ? 
0 : -EIO; + spin_unlock(&dev->msg_lock); + + return ret; +} + +static int vduse_dev_get_vq_state_packed(struct vduse_dev *dev, + struct vduse_virtqueue *vq, + struct vdpa_vq_state_packed *packed) +{ + struct vduse_dev_msg msg = { 0 }; + int ret; + + msg.req.type = VDUSE_GET_VQ_STATE; + msg.req.vq_state.index = vq->index; + + ret = vduse_dev_msg_sync(dev, &msg); + if (ret) + return ret; + + packed->last_avail_counter = + msg.resp.vq_state.packed.last_avail_counter & 0x0001; + packed->last_avail_idx = + msg.resp.vq_state.packed.last_avail_idx & 0x7FFF; + packed->last_used_counter = + msg.resp.vq_state.packed.last_used_counter & 0x0001; + packed->last_used_idx = + msg.resp.vq_state.packed.last_used_idx & 0x7FFF; + + return 0; +} + +static int vduse_dev_get_vq_state_split(struct vduse_dev *dev, + struct vduse_virtqueue *vq, + struct vdpa_vq_state_split *split) +{ + struct vduse_dev_msg msg = { 0 }; + int ret; + + msg.req.type = VDUSE_GET_VQ_STATE; + msg.req.vq_state.index = vq->index; + + ret = vduse_dev_msg_sync(dev, &msg); + if (ret) + return ret; + + split->avail_index = msg.resp.vq_state.split.avail_index; + + return 0; +} + +static int vduse_dev_set_status(struct vduse_dev *dev, u8 status) +{ + struct vduse_dev_msg msg = { 0 }; + + msg.req.type = VDUSE_SET_STATUS; + msg.req.s.status = status; + + return vduse_dev_msg_sync(dev, &msg); +} + +static int vduse_dev_update_iotlb(struct vduse_dev *dev, + u64 start, u64 last) +{ + struct vduse_dev_msg msg = { 0 }; + + if (last < start) + return -EINVAL; + + msg.req.type = VDUSE_UPDATE_IOTLB; + msg.req.iova.start = start; + msg.req.iova.last = last; + + return vduse_dev_msg_sync(dev, &msg); +} + +static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + struct vduse_dev *dev = file->private_data; + struct vduse_dev_msg *msg; + int size = sizeof(struct vduse_dev_request); + ssize_t ret; + + if (iov_iter_count(to) < size) + return -EINVAL; + + spin_lock(&dev->msg_lock); + while (1) { + msg = vduse_dequeue_msg(&dev->send_list); + if (msg) + break; + + ret = -EAGAIN; + if (file->f_flags & O_NONBLOCK) + goto unlock; + + spin_unlock(&dev->msg_lock); + ret = wait_event_interruptible_exclusive(dev->waitq, + !list_empty(&dev->send_list)); + if (ret) + return ret; + + spin_lock(&dev->msg_lock); + } + spin_unlock(&dev->msg_lock); + ret = copy_to_iter(&msg->req, size, to); + spin_lock(&dev->msg_lock); + if (ret != size) { + ret = -EFAULT; + vduse_enqueue_msg(&dev->send_list, msg); + goto unlock; + } + vduse_enqueue_msg(&dev->recv_list, msg); +unlock: + spin_unlock(&dev->msg_lock); + + return ret; +} + +static bool is_mem_zero(const char *ptr, int size) +{ + int i; + + for (i = 0; i < size; i++) { + if (ptr[i]) + return false; + } + return true; +} + +static ssize_t vduse_dev_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct vduse_dev *dev = file->private_data; + struct vduse_dev_response resp; + struct vduse_dev_msg *msg; + size_t ret; + + ret = copy_from_iter(&resp, sizeof(resp), from); + if (ret != sizeof(resp)) + return -EINVAL; + + if (!is_mem_zero((const char *)resp.reserved, sizeof(resp.reserved))) + return -EINVAL; + + spin_lock(&dev->msg_lock); + msg = vduse_find_msg(&dev->recv_list, resp.request_id); + if (!msg) { + ret = -ENOENT; + goto unlock; + } + + memcpy(&msg->resp, &resp, sizeof(resp)); + msg->completed = 1; + wake_up(&msg->waitq); +unlock: + spin_unlock(&dev->msg_lock); + + return ret; +} + +static __poll_t vduse_dev_poll(struct 
file *file, poll_table *wait) +{ + struct vduse_dev *dev = file->private_data; + __poll_t mask = 0; + + poll_wait(file, &dev->waitq, wait); + + spin_lock(&dev->msg_lock); + + if (unlikely(dev->broken)) + mask |= EPOLLERR; + if (!list_empty(&dev->send_list)) + mask |= EPOLLIN | EPOLLRDNORM; + if (!list_empty(&dev->recv_list)) + mask |= EPOLLOUT | EPOLLWRNORM; + + spin_unlock(&dev->msg_lock); + + return mask; +} + +static void vduse_dev_reset(struct vduse_dev *dev) +{ + int i; + struct vduse_iova_domain *domain = dev->domain; + + /* The coherent mappings are handled in vduse_dev_free_coherent() */ + if (domain->bounce_map) + vduse_domain_reset_bounce_map(domain); + + dev->status = 0; + dev->driver_features = 0; + dev->generation++; + spin_lock(&dev->irq_lock); + dev->config_cb.callback = NULL; + dev->config_cb.private = NULL; + spin_unlock(&dev->irq_lock); + flush_work(&dev->inject); + + for (i = 0; i < dev->vq_num; i++) { + struct vduse_virtqueue *vq = &dev->vqs[i]; + + vq->ready = false; + vq->desc_addr = 0; + vq->driver_addr = 0; + vq->device_addr = 0; + vq->num = 0; + memset(&vq->state, 0, sizeof(vq->state)); + + spin_lock(&vq->kick_lock); + vq->kicked = false; + if (vq->kickfd) + eventfd_ctx_put(vq->kickfd); + vq->kickfd = NULL; + spin_unlock(&vq->kick_lock); + + spin_lock(&vq->irq_lock); + vq->cb.callback = NULL; + vq->cb.private = NULL; + spin_unlock(&vq->irq_lock); + flush_work(&vq->inject); + flush_work(&vq->kick); + } +} + +static int vduse_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 idx, + u64 desc_area, u64 driver_area, + u64 device_area) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + struct vduse_virtqueue *vq = &dev->vqs[idx]; + + vq->desc_addr = desc_area; + vq->driver_addr = driver_area; + vq->device_addr = device_area; + + return 0; +} + +static void vduse_vq_kick(struct vduse_virtqueue *vq) +{ + spin_lock(&vq->kick_lock); + if (!vq->ready) + goto unlock; + + if (vq->kickfd) + eventfd_signal(vq->kickfd, 1); + else + vq->kicked = true; +unlock: + spin_unlock(&vq->kick_lock); +} + +static void vduse_vq_kick_work(struct work_struct *work) +{ + struct vduse_virtqueue *vq = container_of(work, + struct vduse_virtqueue, kick); + + vduse_vq_kick(vq); +} + +static void vduse_vdpa_kick_vq(struct vdpa_device *vdpa, u16 idx) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + struct vduse_virtqueue *vq = &dev->vqs[idx]; + + /* + * TODO: after applying b542e383d8c0 ("eventfd: Make signal recursion protection a task bit") + * replace eventfd_signal_count with eventfd_signal_allowed, and + * drop the previous ("eventfd: Export eventfd_wake_count to modules") + * */ + if (eventfd_signal_count()) { + schedule_work(&vq->kick); + return; + } + vduse_vq_kick(vq); +} + +static void vduse_vdpa_set_vq_cb(struct vdpa_device *vdpa, u16 idx, + struct vdpa_callback *cb) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + struct vduse_virtqueue *vq = &dev->vqs[idx]; + + spin_lock(&vq->irq_lock); + vq->cb.callback = cb->callback; + vq->cb.private = cb->private; + spin_unlock(&vq->irq_lock); +} + +static void vduse_vdpa_set_vq_num(struct vdpa_device *vdpa, u16 idx, u32 num) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + struct vduse_virtqueue *vq = &dev->vqs[idx]; + + vq->num = num; +} + +static void vduse_vdpa_set_vq_ready(struct vdpa_device *vdpa, + u16 idx, bool ready) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + struct vduse_virtqueue *vq = &dev->vqs[idx]; + + vq->ready = ready; +} + +static bool vduse_vdpa_get_vq_ready(struct vdpa_device *vdpa, u16 idx) +{ + struct vduse_dev 
*dev = vdpa_to_vduse(vdpa); + struct vduse_virtqueue *vq = &dev->vqs[idx]; + + return vq->ready; +} + +static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx, + const struct vdpa_vq_state *state) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + struct vduse_virtqueue *vq = &dev->vqs[idx]; + + if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) { + vq->state.packed.last_avail_counter = + state->packed.last_avail_counter; + vq->state.packed.last_avail_idx = state->packed.last_avail_idx; + vq->state.packed.last_used_counter = + state->packed.last_used_counter; + vq->state.packed.last_used_idx = state->packed.last_used_idx; + } else + vq->state.split.avail_index = state->split.avail_index; + + return 0; +} + +static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx, + struct vdpa_vq_state *state) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + struct vduse_virtqueue *vq = &dev->vqs[idx]; + + if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) + return vduse_dev_get_vq_state_packed(dev, vq, &state->packed); + + return vduse_dev_get_vq_state_split(dev, vq, &state->split); +} + +static u32 vduse_vdpa_get_vq_align(struct vdpa_device *vdpa) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + return dev->vq_align; +} + +static u64 vduse_vdpa_get_features(struct vdpa_device *vdpa) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + return dev->device_features; +} + +static int vduse_vdpa_set_features(struct vdpa_device *vdpa, u64 features) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + dev->driver_features = features; + return 0; +} + +static void vduse_vdpa_set_config_cb(struct vdpa_device *vdpa, + struct vdpa_callback *cb) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + spin_lock(&dev->irq_lock); + dev->config_cb.callback = cb->callback; + dev->config_cb.private = cb->private; + spin_unlock(&dev->irq_lock); +} + +static u16 vduse_vdpa_get_vq_num_max(struct vdpa_device *vdpa) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + u16 num_max = 0; + int i; + + for (i = 0; i < dev->vq_num; i++) + if (num_max < dev->vqs[i].num_max) + num_max = dev->vqs[i].num_max; + + return num_max; +} + +static u32 vduse_vdpa_get_device_id(struct vdpa_device *vdpa) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + return dev->device_id; +} + +static u32 vduse_vdpa_get_vendor_id(struct vdpa_device *vdpa) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + return dev->vendor_id; +} + +static u8 vduse_vdpa_get_status(struct vdpa_device *vdpa) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + return dev->status; +} + +static void vduse_vdpa_set_status(struct vdpa_device *vdpa, u8 status) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + if (vduse_dev_set_status(dev, status)) + return; + + dev->status = status; +} + +static size_t vduse_vdpa_get_config_size(struct vdpa_device *vdpa) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + return dev->config_size; +} + +static void vduse_vdpa_get_config(struct vdpa_device *vdpa, unsigned int offset, + void *buf, unsigned int len) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + if (len > dev->config_size - offset) + return; + + memcpy(buf, dev->config + offset, len); +} + +static void vduse_vdpa_set_config(struct vdpa_device *vdpa, unsigned int offset, + const void *buf, unsigned int len) +{ + /* Now we only support read-only configuration space */ +} + +static int vduse_vdpa_reset(struct vdpa_device *vdpa) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + if (vduse_dev_set_status(dev, 0)) + 
return -EIO; + + vduse_dev_reset(dev); + + return 0; +} + +static u32 vduse_vdpa_get_generation(struct vdpa_device *vdpa) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + return dev->generation; +} + +static int vduse_vdpa_set_map(struct vdpa_device *vdpa, + struct vhost_iotlb *iotlb) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + int ret; + + ret = vduse_domain_set_map(dev->domain, iotlb); + if (ret) + return ret; + + ret = vduse_dev_update_iotlb(dev, 0ULL, ULLONG_MAX); + if (ret) { + vduse_domain_clear_map(dev->domain, iotlb); + return ret; + } + + return 0; +} + +static void vduse_vdpa_free(struct vdpa_device *vdpa) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + dev->vdev = NULL; +} + +static const struct vdpa_config_ops vduse_vdpa_config_ops = { + .set_vq_address = vduse_vdpa_set_vq_address, + .kick_vq = vduse_vdpa_kick_vq, + .set_vq_cb = vduse_vdpa_set_vq_cb, + .set_vq_num = vduse_vdpa_set_vq_num, + .set_vq_ready = vduse_vdpa_set_vq_ready, + .get_vq_ready = vduse_vdpa_get_vq_ready, + .set_vq_state = vduse_vdpa_set_vq_state, + .get_vq_state = vduse_vdpa_get_vq_state, + .get_vq_align = vduse_vdpa_get_vq_align, + .get_features = vduse_vdpa_get_features, + .set_features = vduse_vdpa_set_features, + .set_config_cb = vduse_vdpa_set_config_cb, + .get_vq_num_max = vduse_vdpa_get_vq_num_max, + .get_device_id = vduse_vdpa_get_device_id, + .get_vendor_id = vduse_vdpa_get_vendor_id, + .get_status = vduse_vdpa_get_status, + .set_status = vduse_vdpa_set_status, + .get_config_size = vduse_vdpa_get_config_size, + .get_config = vduse_vdpa_get_config, + .set_config = vduse_vdpa_set_config, + .get_generation = vduse_vdpa_get_generation, + .reset = vduse_vdpa_reset, + .set_map = vduse_vdpa_set_map, + .free = vduse_vdpa_free, +}; + +static dma_addr_t vduse_dev_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + struct vduse_dev *vdev = dev_to_vduse(dev); + struct vduse_iova_domain *domain = vdev->domain; + + return vduse_domain_map_page(domain, page, offset, size, dir, attrs); +} + +static void vduse_dev_unmap_page(struct device *dev, dma_addr_t dma_addr, + size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + struct vduse_dev *vdev = dev_to_vduse(dev); + struct vduse_iova_domain *domain = vdev->domain; + + return vduse_domain_unmap_page(domain, dma_addr, size, dir, attrs); +} + +static void *vduse_dev_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_addr, gfp_t flag, + unsigned long attrs) +{ + struct vduse_dev *vdev = dev_to_vduse(dev); + struct vduse_iova_domain *domain = vdev->domain; + unsigned long iova; + void *addr; + + *dma_addr = DMA_MAPPING_ERROR; + addr = vduse_domain_alloc_coherent(domain, size, + (dma_addr_t *)&iova, flag, attrs); + if (!addr) + return NULL; + + *dma_addr = (dma_addr_t)iova; + + return addr; +} + +static void vduse_dev_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_addr, + unsigned long attrs) +{ + struct vduse_dev *vdev = dev_to_vduse(dev); + struct vduse_iova_domain *domain = vdev->domain; + + vduse_domain_free_coherent(domain, size, vaddr, dma_addr, attrs); +} + +static size_t vduse_dev_max_mapping_size(struct device *dev) +{ + struct vduse_dev *vdev = dev_to_vduse(dev); + struct vduse_iova_domain *domain = vdev->domain; + + return domain->bounce_size; +} + +static const struct dma_map_ops vduse_dev_dma_ops = { + .map_page = vduse_dev_map_page, + .unmap_page = vduse_dev_unmap_page, + .alloc = 
vduse_dev_alloc_coherent, + .free = vduse_dev_free_coherent, + .max_mapping_size = vduse_dev_max_mapping_size, +}; + +static unsigned int perm_to_file_flags(u8 perm) +{ + unsigned int flags = 0; + + switch (perm) { + case VDUSE_ACCESS_WO: + flags |= O_WRONLY; + break; + case VDUSE_ACCESS_RO: + flags |= O_RDONLY; + break; + case VDUSE_ACCESS_RW: + flags |= O_RDWR; + break; + default: + WARN(1, "invalidate vhost IOTLB permission\n"); + break; + } + + return flags; +} + +static int vduse_kickfd_setup(struct vduse_dev *dev, + struct vduse_vq_eventfd *eventfd) +{ + struct eventfd_ctx *ctx = NULL; + struct vduse_virtqueue *vq; + u32 index; + + if (eventfd->index >= dev->vq_num) + return -EINVAL; + + index = array_index_nospec(eventfd->index, dev->vq_num); + vq = &dev->vqs[index]; + if (eventfd->fd >= 0) { + ctx = eventfd_ctx_fdget(eventfd->fd); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + } else if (eventfd->fd != VDUSE_EVENTFD_DEASSIGN) + return 0; + + spin_lock(&vq->kick_lock); + if (vq->kickfd) + eventfd_ctx_put(vq->kickfd); + vq->kickfd = ctx; + if (vq->ready && vq->kicked && vq->kickfd) { + eventfd_signal(vq->kickfd, 1); + vq->kicked = false; + } + spin_unlock(&vq->kick_lock); + + return 0; +} + +static bool vduse_dev_is_ready(struct vduse_dev *dev) +{ + int i; + + for (i = 0; i < dev->vq_num; i++) + if (!dev->vqs[i].num_max) + return false; + + return true; +} + +static void vduse_dev_irq_inject(struct work_struct *work) +{ + struct vduse_dev *dev = container_of(work, struct vduse_dev, inject); + + spin_lock_irq(&dev->irq_lock); + if (dev->config_cb.callback) + dev->config_cb.callback(dev->config_cb.private); + spin_unlock_irq(&dev->irq_lock); +} + +static void vduse_vq_irq_inject(struct work_struct *work) +{ + struct vduse_virtqueue *vq = container_of(work, + struct vduse_virtqueue, inject); + + spin_lock_irq(&vq->irq_lock); + if (vq->ready && vq->cb.callback) + vq->cb.callback(vq->cb.private); + spin_unlock_irq(&vq->irq_lock); +} + +static long vduse_dev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct vduse_dev *dev = file->private_data; + void __user *argp = (void __user *)arg; + int ret; + + if (unlikely(dev->broken)) + return -EPERM; + + switch (cmd) { + case VDUSE_IOTLB_GET_FD: { + struct vduse_iotlb_entry entry; + struct vhost_iotlb_map *map; + struct vdpa_map_file *map_file; + struct vduse_iova_domain *domain = dev->domain; + struct file *f = NULL; + + ret = -EFAULT; + if (copy_from_user(&entry, argp, sizeof(entry))) + break; + + ret = -EINVAL; + if (entry.start > entry.last) + break; + + spin_lock(&domain->iotlb_lock); + map = vhost_iotlb_itree_first(domain->iotlb, + entry.start, entry.last); + if (map) { + map_file = (struct vdpa_map_file *)map->opaque; + f = get_file(map_file->file); + entry.offset = map_file->offset; + entry.start = map->start; + entry.last = map->last; + entry.perm = map->perm; + } + spin_unlock(&domain->iotlb_lock); + ret = -EINVAL; + if (!f) + break; + + ret = -EFAULT; + if (copy_to_user(argp, &entry, sizeof(entry))) { + fput(f); + break; + } + ret = receive_fd(f, perm_to_file_flags(entry.perm)); + fput(f); + break; + } + case VDUSE_DEV_GET_FEATURES: + /* + * Just mirror what driver wrote here. + * The driver is expected to check FEATURE_OK later. 
+ */ + ret = put_user(dev->driver_features, (u64 __user *)argp); + break; + case VDUSE_DEV_SET_CONFIG: { + struct vduse_config_data config; + unsigned long size = offsetof(struct vduse_config_data, + buffer); + + ret = -EFAULT; + if (copy_from_user(&config, argp, size)) + break; + + ret = -EINVAL; + if (config.length == 0 || + config.length > dev->config_size - config.offset) + break; + + ret = -EFAULT; + if (copy_from_user(dev->config + config.offset, argp + size, + config.length)) + break; + + ret = 0; + break; + } + case VDUSE_DEV_INJECT_CONFIG_IRQ: + ret = 0; + queue_work(vduse_irq_wq, &dev->inject); + break; + case VDUSE_VQ_SETUP: { + struct vduse_vq_config config; + u32 index; + + ret = -EFAULT; + if (copy_from_user(&config, argp, sizeof(config))) + break; + + ret = -EINVAL; + if (config.index >= dev->vq_num) + break; + + if (!is_mem_zero((const char *)config.reserved, + sizeof(config.reserved))) + break; + + index = array_index_nospec(config.index, dev->vq_num); + dev->vqs[index].num_max = config.max_size; + ret = 0; + break; + } + case VDUSE_VQ_GET_INFO: { + struct vduse_vq_info vq_info; + struct vduse_virtqueue *vq; + u32 index; + + ret = -EFAULT; + if (copy_from_user(&vq_info, argp, sizeof(vq_info))) + break; + + ret = -EINVAL; + if (vq_info.index >= dev->vq_num) + break; + + index = array_index_nospec(vq_info.index, dev->vq_num); + vq = &dev->vqs[index]; + vq_info.desc_addr = vq->desc_addr; + vq_info.driver_addr = vq->driver_addr; + vq_info.device_addr = vq->device_addr; + vq_info.num = vq->num; + + if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) { + vq_info.packed.last_avail_counter = + vq->state.packed.last_avail_counter; + vq_info.packed.last_avail_idx = + vq->state.packed.last_avail_idx; + vq_info.packed.last_used_counter = + vq->state.packed.last_used_counter; + vq_info.packed.last_used_idx = + vq->state.packed.last_used_idx; + } else + vq_info.split.avail_index = + vq->state.split.avail_index; + + vq_info.ready = vq->ready; + + ret = -EFAULT; + if (copy_to_user(argp, &vq_info, sizeof(vq_info))) + break; + + ret = 0; + break; + } + case VDUSE_VQ_SETUP_KICKFD: { + struct vduse_vq_eventfd eventfd; + + ret = -EFAULT; + if (copy_from_user(&eventfd, argp, sizeof(eventfd))) + break; + + ret = vduse_kickfd_setup(dev, &eventfd); + break; + } + case VDUSE_VQ_INJECT_IRQ: { + u32 index; + + ret = -EFAULT; + if (get_user(index, (u32 __user *)argp)) + break; + + ret = -EINVAL; + if (index >= dev->vq_num) + break; + + ret = 0; + index = array_index_nospec(index, dev->vq_num); + queue_work(vduse_irq_wq, &dev->vqs[index].inject); + break; + } + default: + ret = -ENOIOCTLCMD; + break; + } + + return ret; +} + +static int vduse_dev_release(struct inode *inode, struct file *file) +{ + struct vduse_dev *dev = file->private_data; + + spin_lock(&dev->msg_lock); + /* Make sure the inflight messages can processed after reconncection */ + list_splice_init(&dev->recv_list, &dev->send_list); + spin_unlock(&dev->msg_lock); + dev->connected = false; + + return 0; +} + +static struct vduse_dev *vduse_dev_get_from_minor(int minor) +{ + struct vduse_dev *dev; + + mutex_lock(&vduse_lock); + dev = idr_find(&vduse_idr, minor); + mutex_unlock(&vduse_lock); + + return dev; +} + +static int vduse_dev_open(struct inode *inode, struct file *file) +{ + int ret; + struct vduse_dev *dev = vduse_dev_get_from_minor(iminor(inode)); + + if (!dev) + return -ENODEV; + + ret = -EBUSY; + mutex_lock(&dev->lock); + if (dev->connected) + goto unlock; + + ret = 0; + dev->connected = true; + file->private_data = dev; 
+unlock: + mutex_unlock(&dev->lock); + + return ret; +} + +static const struct file_operations vduse_dev_fops = { + .owner = THIS_MODULE, + .open = vduse_dev_open, + .release = vduse_dev_release, + .read_iter = vduse_dev_read_iter, + .write_iter = vduse_dev_write_iter, + .poll = vduse_dev_poll, + .unlocked_ioctl = vduse_dev_ioctl, + .compat_ioctl = compat_ptr_ioctl, + .llseek = noop_llseek, +}; + +static struct vduse_dev *vduse_dev_create(void) +{ + struct vduse_dev *dev = kzalloc(sizeof(*dev), GFP_KERNEL); + + if (!dev) + return NULL; + + mutex_init(&dev->lock); + spin_lock_init(&dev->msg_lock); + INIT_LIST_HEAD(&dev->send_list); + INIT_LIST_HEAD(&dev->recv_list); + spin_lock_init(&dev->irq_lock); + + INIT_WORK(&dev->inject, vduse_dev_irq_inject); + init_waitqueue_head(&dev->waitq); + + return dev; +} + +static void vduse_dev_destroy(struct vduse_dev *dev) +{ + kfree(dev); +} + +static struct vduse_dev *vduse_find_dev(const char *name) +{ + struct vduse_dev *dev; + int id; + + idr_for_each_entry(&vduse_idr, dev, id) + if (!strcmp(dev->name, name)) + return dev; + + return NULL; +} + +static int vduse_destroy_dev(char *name) +{ + struct vduse_dev *dev = vduse_find_dev(name); + + if (!dev) + return -EINVAL; + + mutex_lock(&dev->lock); + if (dev->vdev || dev->connected) { + mutex_unlock(&dev->lock); + return -EBUSY; + } + dev->connected = true; + mutex_unlock(&dev->lock); + + vduse_dev_reset(dev); + device_destroy(vduse_class, MKDEV(MAJOR(vduse_major), dev->minor)); + idr_remove(&vduse_idr, dev->minor); + kvfree(dev->config); + kfree(dev->vqs); + vduse_domain_destroy(dev->domain); + kfree(dev->name); + vduse_dev_destroy(dev); + module_put(THIS_MODULE); + + return 0; +} + +static bool device_is_allowed(u32 device_id) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(allowed_device_id); i++) + if (allowed_device_id[i] == device_id) + return true; + + return false; +} + +static bool features_is_valid(u64 features) +{ + if (!(features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) + return false; + + /* Now we only support read-only configuration space */ + if (features & (1ULL << VIRTIO_BLK_F_CONFIG_WCE)) + return false; + + return true; +} + +static bool vduse_validate_config(struct vduse_dev_config *config) +{ + if (!is_mem_zero((const char *)config->reserved, + sizeof(config->reserved))) + return false; + + if (config->vq_align > PAGE_SIZE) + return false; + + if (config->config_size > PAGE_SIZE) + return false; + + if (!device_is_allowed(config->device_id)) + return false; + + if (!features_is_valid(config->features)) + return false; + + return true; +} + +static ssize_t msg_timeout_show(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct vduse_dev *dev = dev_get_drvdata(device); + + return sysfs_emit(buf, "%u\n", dev->msg_timeout); +} + +static ssize_t msg_timeout_store(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct vduse_dev *dev = dev_get_drvdata(device); + int ret; + + ret = kstrtouint(buf, 10, &dev->msg_timeout); + if (ret < 0) + return ret; + + return count; +} + +static DEVICE_ATTR_RW(msg_timeout); + +static struct attribute *vduse_dev_attrs[] = { + &dev_attr_msg_timeout.attr, + NULL +}; + +ATTRIBUTE_GROUPS(vduse_dev); + +static int vduse_create_dev(struct vduse_dev_config *config, + void *config_buf, u64 api_version) +{ + int i, ret; + struct vduse_dev *dev; + + ret = -EEXIST; + if (vduse_find_dev(config->name)) + goto err; + + ret = -ENOMEM; + dev = vduse_dev_create(); + if (!dev) + goto err; + + dev->api_version = 
api_version; + dev->device_features = config->features; + dev->device_id = config->device_id; + dev->vendor_id = config->vendor_id; + dev->name = kstrdup(config->name, GFP_KERNEL); + if (!dev->name) + goto err_str; + + dev->domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1, + VDUSE_BOUNCE_SIZE); + if (!dev->domain) + goto err_domain; + + dev->config = config_buf; + dev->config_size = config->config_size; + dev->vq_align = config->vq_align; + dev->vq_num = config->vq_num; + dev->vqs = kcalloc(dev->vq_num, sizeof(*dev->vqs), GFP_KERNEL); + if (!dev->vqs) + goto err_vqs; + + for (i = 0; i < dev->vq_num; i++) { + dev->vqs[i].index = i; + INIT_WORK(&dev->vqs[i].inject, vduse_vq_irq_inject); + INIT_WORK(&dev->vqs[i].kick, vduse_vq_kick_work); + spin_lock_init(&dev->vqs[i].kick_lock); + spin_lock_init(&dev->vqs[i].irq_lock); + } + + ret = idr_alloc(&vduse_idr, dev, 1, VDUSE_DEV_MAX, GFP_KERNEL); + if (ret < 0) + goto err_idr; + + dev->minor = ret; + dev->msg_timeout = VDUSE_MSG_DEFAULT_TIMEOUT; + dev->dev = device_create(vduse_class, NULL, + MKDEV(MAJOR(vduse_major), dev->minor), + dev, "%s", config->name); + if (IS_ERR(dev->dev)) { + ret = PTR_ERR(dev->dev); + goto err_dev; + } + __module_get(THIS_MODULE); + + return 0; +err_dev: + idr_remove(&vduse_idr, dev->minor); +err_idr: + kfree(dev->vqs); +err_vqs: + vduse_domain_destroy(dev->domain); +err_domain: + kfree(dev->name); +err_str: + vduse_dev_destroy(dev); +err: + kvfree(config_buf); + return ret; +} + +static long vduse_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + int ret; + void __user *argp = (void __user *)arg; + struct vduse_control *control = file->private_data; + + mutex_lock(&vduse_lock); + switch (cmd) { + case VDUSE_GET_API_VERSION: + ret = put_user(control->api_version, (u64 __user *)argp); + break; + case VDUSE_SET_API_VERSION: { + u64 api_version; + + ret = -EFAULT; + if (get_user(api_version, (u64 __user *)argp)) + break; + + ret = -EINVAL; + if (api_version > VDUSE_API_VERSION) + break; + + ret = 0; + control->api_version = api_version; + break; + } + case VDUSE_CREATE_DEV: { + struct vduse_dev_config config; + unsigned long size = offsetof(struct vduse_dev_config, config); + void *buf; + + ret = -EFAULT; + if (copy_from_user(&config, argp, size)) + break; + + ret = -EINVAL; + if (vduse_validate_config(&config) == false) + break; + + buf = vmemdup_user(argp + size, config.config_size); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + break; + } + config.name[VDUSE_NAME_MAX - 1] = '\0'; + ret = vduse_create_dev(&config, buf, control->api_version); + break; + } + case VDUSE_DESTROY_DEV: { + char name[VDUSE_NAME_MAX]; + + ret = -EFAULT; + if (copy_from_user(name, argp, VDUSE_NAME_MAX)) + break; + + name[VDUSE_NAME_MAX - 1] = '\0'; + ret = vduse_destroy_dev(name); + break; + } + default: + ret = -EINVAL; + break; + } + mutex_unlock(&vduse_lock); + + return ret; +} + +static int vduse_release(struct inode *inode, struct file *file) +{ + struct vduse_control *control = file->private_data; + + kfree(control); + return 0; +} + +static int vduse_open(struct inode *inode, struct file *file) +{ + struct vduse_control *control; + + control = kmalloc(sizeof(struct vduse_control), GFP_KERNEL); + if (!control) + return -ENOMEM; + + control->api_version = VDUSE_API_VERSION; + file->private_data = control; + + return 0; +} + +static const struct file_operations vduse_ctrl_fops = { + .owner = THIS_MODULE, + .open = vduse_open, + .release = vduse_release, + .unlocked_ioctl = vduse_ioctl, + .compat_ioctl = compat_ptr_ioctl, + 
.llseek = noop_llseek, +}; + +static char *vduse_devnode(struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "vduse/%s", dev_name(dev)); +} + +static void vduse_mgmtdev_release(struct device *dev) +{ +} + +static struct device vduse_mgmtdev = { + .init_name = "vduse", + .release = vduse_mgmtdev_release, +}; + +static struct vdpa_mgmt_dev mgmt_dev; + +static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name) +{ + struct vduse_vdpa *vdev; + int ret; + + if (dev->vdev) + return -EEXIST; + + vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev, + &vduse_vdpa_config_ops, name, true); + if (IS_ERR(vdev)) + return PTR_ERR(vdev); + + dev->vdev = vdev; + vdev->dev = dev; + vdev->vdpa.dev.dma_mask = &vdev->vdpa.dev.coherent_dma_mask; + ret = dma_set_mask_and_coherent(&vdev->vdpa.dev, DMA_BIT_MASK(64)); + if (ret) { + put_device(&vdev->vdpa.dev); + return ret; + } + set_dma_ops(&vdev->vdpa.dev, &vduse_dev_dma_ops); + vdev->vdpa.dma_dev = &vdev->vdpa.dev; + vdev->vdpa.mdev = &mgmt_dev; + + return 0; +} + +static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name) +{ + struct vduse_dev *dev; + int ret; + + mutex_lock(&vduse_lock); + dev = vduse_find_dev(name); + if (!dev || !vduse_dev_is_ready(dev)) { + mutex_unlock(&vduse_lock); + return -EINVAL; + } + ret = vduse_dev_init_vdpa(dev, name); + mutex_unlock(&vduse_lock); + if (ret) + return ret; + + ret = _vdpa_register_device(&dev->vdev->vdpa, dev->vq_num); + if (ret) { + put_device(&dev->vdev->vdpa.dev); + return ret; + } + + return 0; +} + +static void vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev) +{ + _vdpa_unregister_device(dev); +} + +static const struct vdpa_mgmtdev_ops vdpa_dev_mgmtdev_ops = { + .dev_add = vdpa_dev_add, + .dev_del = vdpa_dev_del, +}; + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static struct vdpa_mgmt_dev mgmt_dev = { + .device = &vduse_mgmtdev, + .id_table = id_table, + .ops = &vdpa_dev_mgmtdev_ops, +}; + +static int vduse_mgmtdev_init(void) +{ + int ret; + + ret = device_register(&vduse_mgmtdev); + if (ret) + return ret; + + ret = vdpa_mgmtdev_register(&mgmt_dev); + if (ret) + goto err; + + return 0; +err: + device_unregister(&vduse_mgmtdev); + return ret; +} + +static void vduse_mgmtdev_exit(void) +{ + vdpa_mgmtdev_unregister(&mgmt_dev); + device_unregister(&vduse_mgmtdev); +} + +static int vduse_init(void) +{ + int ret; + struct device *dev; + + vduse_class = class_create(THIS_MODULE, "vduse"); + if (IS_ERR(vduse_class)) + return PTR_ERR(vduse_class); + + vduse_class->devnode = vduse_devnode; + vduse_class->dev_groups = vduse_dev_groups; + + ret = alloc_chrdev_region(&vduse_major, 0, VDUSE_DEV_MAX, "vduse"); + if (ret) + goto err_chardev_region; + + /* /dev/vduse/control */ + cdev_init(&vduse_ctrl_cdev, &vduse_ctrl_fops); + vduse_ctrl_cdev.owner = THIS_MODULE; + ret = cdev_add(&vduse_ctrl_cdev, vduse_major, 1); + if (ret) + goto err_ctrl_cdev; + + dev = device_create(vduse_class, NULL, vduse_major, NULL, "control"); + if (IS_ERR(dev)) { + ret = PTR_ERR(dev); + goto err_device; + } + + /* /dev/vduse/$DEVICE */ + cdev_init(&vduse_cdev, &vduse_dev_fops); + vduse_cdev.owner = THIS_MODULE; + ret = cdev_add(&vduse_cdev, MKDEV(MAJOR(vduse_major), 1), + VDUSE_DEV_MAX - 1); + if (ret) + goto err_cdev; + + vduse_irq_wq = alloc_workqueue("vduse-irq", + WQ_HIGHPRI | WQ_SYSFS | WQ_UNBOUND, 0); + if (!vduse_irq_wq) + goto err_wq; + + ret = vduse_domain_init(); + if (ret) + goto err_domain; + + ret = 
vduse_mgmtdev_init(); + if (ret) + goto err_mgmtdev; + + return 0; +err_mgmtdev: + vduse_domain_exit(); +err_domain: + destroy_workqueue(vduse_irq_wq); +err_wq: + cdev_del(&vduse_cdev); +err_cdev: + device_destroy(vduse_class, vduse_major); +err_device: + cdev_del(&vduse_ctrl_cdev); +err_ctrl_cdev: + unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX); +err_chardev_region: + class_destroy(vduse_class); + return ret; +} +module_init(vduse_init); + +static void vduse_exit(void) +{ + vduse_mgmtdev_exit(); + vduse_domain_exit(); + destroy_workqueue(vduse_irq_wq); + cdev_del(&vduse_cdev); + device_destroy(vduse_class, vduse_major); + cdev_del(&vduse_ctrl_cdev); + unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX); + class_destroy(vduse_class); +} +module_exit(vduse_exit); + +MODULE_LICENSE(DRV_LICENSE); +MODULE_AUTHOR(DRV_AUTHOR); +MODULE_DESCRIPTION(DRV_DESC); diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h new file mode 100644 index 000000000000..7cfe1c1280c0 --- /dev/null +++ b/include/uapi/linux/vduse.h @@ -0,0 +1,306 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_VDUSE_H_ +#define _UAPI_VDUSE_H_ + +#include + +#define VDUSE_BASE 0x81 + +/* The ioctls for control device (/dev/vduse/control) */ + +#define VDUSE_API_VERSION 0 + +/* + * Get the version of VDUSE API that kernel supported (VDUSE_API_VERSION). + * This is used for future extension. + */ +#define VDUSE_GET_API_VERSION _IOR(VDUSE_BASE, 0x00, __u64) + +/* Set the version of VDUSE API that userspace supported. */ +#define VDUSE_SET_API_VERSION _IOW(VDUSE_BASE, 0x01, __u64) + +/** + * struct vduse_dev_config - basic configuration of a VDUSE device + * @name: VDUSE device name, needs to be NUL terminated + * @vendor_id: virtio vendor id + * @device_id: virtio device id + * @features: virtio features + * @vq_num: the number of virtqueues + * @vq_align: the allocation alignment of virtqueue's metadata + * @reserved: for future use, needs to be initialized to zero + * @config_size: the size of the configuration space + * @config: the buffer of the configuration space + * + * Structure used by VDUSE_CREATE_DEV ioctl to create VDUSE device. + */ +struct vduse_dev_config { +#define VDUSE_NAME_MAX 256 + char name[VDUSE_NAME_MAX]; + __u32 vendor_id; + __u32 device_id; + __u64 features; + __u32 vq_num; + __u32 vq_align; + __u32 reserved[13]; + __u32 config_size; + __u8 config[]; +}; + +/* Create a VDUSE device which is represented by a char device (/dev/vduse/$NAME) */ +#define VDUSE_CREATE_DEV _IOW(VDUSE_BASE, 0x02, struct vduse_dev_config) + +/* + * Destroy a VDUSE device. Make sure there are no more references + * to the char device (/dev/vduse/$NAME). + */ +#define VDUSE_DESTROY_DEV _IOW(VDUSE_BASE, 0x03, char[VDUSE_NAME_MAX]) + +/* The ioctls for VDUSE device (/dev/vduse/$NAME) */ + +/** + * struct vduse_iotlb_entry - entry of IOTLB to describe one IOVA region [start, last] + * @offset: the mmap offset on returned file descriptor + * @start: start of the IOVA region + * @last: last of the IOVA region + * @perm: access permission of the IOVA region + * + * Structure used by VDUSE_IOTLB_GET_FD ioctl to find an overlapped IOVA region. + */ +struct vduse_iotlb_entry { + __u64 offset; + __u64 start; + __u64 last; +#define VDUSE_ACCESS_RO 0x1 +#define VDUSE_ACCESS_WO 0x2 +#define VDUSE_ACCESS_RW 0x3 + __u8 perm; +}; + +/* + * Find the first IOVA region that overlaps with the range [start, last] + * and return the corresponding file descriptor. 
Return -EINVAL means the + * IOVA region doesn't exist. Caller should set start and last fields. + */ +#define VDUSE_IOTLB_GET_FD _IOWR(VDUSE_BASE, 0x10, struct vduse_iotlb_entry) + +/* + * Get the negotiated virtio features. It's a subset of the features in + * struct vduse_dev_config which can be accepted by virtio driver. It's + * only valid after FEATURES_OK status bit is set. + */ +#define VDUSE_DEV_GET_FEATURES _IOR(VDUSE_BASE, 0x11, __u64) + +/** + * struct vduse_config_data - data used to update configuration space + * @offset: the offset from the beginning of configuration space + * @length: the length to write to configuration space + * @buffer: the buffer used to write from + * + * Structure used by VDUSE_DEV_SET_CONFIG ioctl to update device + * configuration space. + */ +struct vduse_config_data { + __u32 offset; + __u32 length; + __u8 buffer[]; +}; + +/* Set device configuration space */ +#define VDUSE_DEV_SET_CONFIG _IOW(VDUSE_BASE, 0x12, struct vduse_config_data) + +/* + * Inject a config interrupt. It's usually used to notify virtio driver + * that device configuration space has changed. + */ +#define VDUSE_DEV_INJECT_CONFIG_IRQ _IO(VDUSE_BASE, 0x13) + +/** + * struct vduse_vq_config - basic configuration of a virtqueue + * @index: virtqueue index + * @max_size: the max size of virtqueue + * @reserved: for future use, needs to be initialized to zero + * + * Structure used by VDUSE_VQ_SETUP ioctl to setup a virtqueue. + */ +struct vduse_vq_config { + __u32 index; + __u16 max_size; + __u16 reserved[13]; +}; + +/* + * Setup the specified virtqueue. Make sure all virtqueues have been + * configured before the device is attached to vDPA bus. + */ +#define VDUSE_VQ_SETUP _IOW(VDUSE_BASE, 0x14, struct vduse_vq_config) + +/** + * struct vduse_vq_state_split - split virtqueue state + * @avail_index: available index + */ +struct vduse_vq_state_split { + __u16 avail_index; +}; + +/** + * struct vduse_vq_state_packed - packed virtqueue state + * @last_avail_counter: last driver ring wrap counter observed by device + * @last_avail_idx: device available index + * @last_used_counter: device ring wrap counter + * @last_used_idx: used index + */ +struct vduse_vq_state_packed { + __u16 last_avail_counter; + __u16 last_avail_idx; + __u16 last_used_counter; + __u16 last_used_idx; +}; + +/** + * struct vduse_vq_info - information of a virtqueue + * @index: virtqueue index + * @num: the size of virtqueue + * @desc_addr: address of desc area + * @driver_addr: address of driver area + * @device_addr: address of device area + * @split: split virtqueue state + * @packed: packed virtqueue state + * @ready: ready status of virtqueue + * + * Structure used by VDUSE_VQ_GET_INFO ioctl to get virtqueue's information. + */ +struct vduse_vq_info { + __u32 index; + __u32 num; + __u64 desc_addr; + __u64 driver_addr; + __u64 device_addr; + union { + struct vduse_vq_state_split split; + struct vduse_vq_state_packed packed; + }; + __u8 ready; +}; + +/* Get the specified virtqueue's information. Caller should set index field. */ +#define VDUSE_VQ_GET_INFO _IOWR(VDUSE_BASE, 0x15, struct vduse_vq_info) + +/** + * struct vduse_vq_eventfd - eventfd configuration for a virtqueue + * @index: virtqueue index + * @fd: eventfd, -1 means de-assigning the eventfd + * + * Structure used by VDUSE_VQ_SETUP_KICKFD ioctl to setup kick eventfd. + */ +struct vduse_vq_eventfd { + __u32 index; +#define VDUSE_EVENTFD_DEASSIGN -1 + int fd; +}; + +/* + * Setup kick eventfd for specified virtqueue. 
The kick eventfd is used + * by VDUSE kernel module to notify userspace to consume the avail vring. + */ +#define VDUSE_VQ_SETUP_KICKFD _IOW(VDUSE_BASE, 0x16, struct vduse_vq_eventfd) + +/* + * Inject an interrupt for specific virtqueue. It's used to notify virtio driver + * to consume the used vring. + */ +#define VDUSE_VQ_INJECT_IRQ _IOW(VDUSE_BASE, 0x17, __u32) + +/* The control messages definition for read(2)/write(2) on /dev/vduse/$NAME */ + +/** + * enum vduse_req_type - request type + * @VDUSE_GET_VQ_STATE: get the state for specified virtqueue from userspace + * @VDUSE_SET_STATUS: set the device status + * @VDUSE_UPDATE_IOTLB: Notify userspace to update the memory mapping for + * specified IOVA range via VDUSE_IOTLB_GET_FD ioctl + */ +enum vduse_req_type { + VDUSE_GET_VQ_STATE, + VDUSE_SET_STATUS, + VDUSE_UPDATE_IOTLB, +}; + +/** + * struct vduse_vq_state - virtqueue state + * @index: virtqueue index + * @split: split virtqueue state + * @packed: packed virtqueue state + */ +struct vduse_vq_state { + __u32 index; + union { + struct vduse_vq_state_split split; + struct vduse_vq_state_packed packed; + }; +}; + +/** + * struct vduse_dev_status - device status + * @status: device status + */ +struct vduse_dev_status { + __u8 status; +}; + +/** + * struct vduse_iova_range - IOVA range [start, last] + * @start: start of the IOVA range + * @last: last of the IOVA range + */ +struct vduse_iova_range { + __u64 start; + __u64 last; +}; + +/** + * struct vduse_dev_request - control request + * @type: request type + * @request_id: request id + * @reserved: for future use + * @vq_state: virtqueue state, only index field is available + * @s: device status + * @iova: IOVA range for updating + * @padding: padding + * + * Structure used by read(2) on /dev/vduse/$NAME. + */ +struct vduse_dev_request { + __u32 type; + __u32 request_id; + __u32 reserved[4]; + union { + struct vduse_vq_state vq_state; + struct vduse_dev_status s; + struct vduse_iova_range iova; + __u32 padding[32]; + }; +}; + +/** + * struct vduse_dev_response - response to control request + * @request_id: corresponding request id + * @result: the result of request + * @reserved: for future use, needs to be initialized to zero + * @vq_state: virtqueue state + * @padding: padding + * + * Structure used by write(2) on /dev/vduse/$NAME. 
+ */ +struct vduse_dev_response { + __u32 request_id; +#define VDUSE_REQ_RESULT_OK 0x00 +#define VDUSE_REQ_RESULT_FAILED 0x01 + __u32 result; + __u32 reserved[4]; + union { + struct vduse_vq_state vq_state; + __u32 padding[32]; + }; +}; + +#endif /* _UAPI_VDUSE_H_ */ -- cgit v1.2.3-71-gd317 From 9d1b3afd73047d4dd30e3636412c9f9b5def2b14 Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Fri, 3 Sep 2021 19:20:56 -0700 Subject: cxl/uapi: Fix defined but not used warnings Fix unused-const-variable warnings emitted by gcc when cxlmem.h is used by pretty much all files except pci.c Signed-off-by: Ben Widawsky Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/163072205652.2250120.16833548560832424468.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- include/uapi/linux/cxl_mem.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/cxl_mem.h b/include/uapi/linux/cxl_mem.h index f6e8a005b113..8d206f27bb6d 100644 --- a/include/uapi/linux/cxl_mem.h +++ b/include/uapi/linux/cxl_mem.h @@ -50,7 +50,7 @@ enum { CXL_CMDS }; #define ___C(a, b) { b } static const struct { const char *name; -} cxl_command_names[] = { CXL_CMDS }; +} cxl_command_names[] __attribute__((__unused__)) = { CXL_CMDS }; /* * Here's how this actually breaks out: -- cgit v1.2.3-71-gd317 From 59ab844eed9c6b01d32dcb27b57accc23771b324 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Sep 2021 15:18:25 -0700 Subject: compat: remove some compat entry points These are all handled correctly when calling the native system call entry point, so remove the special cases. Link: https://lkml.kernel.org/r/20210727144859.4150043-6-arnd@kernel.org Signed-off-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Eric Biederman Cc: Feng Tang Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. 
Bottomley" Cc: Michael Ellerman Cc: Paul Mackerras Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/unistd32.h | 10 ++++----- arch/mips/kernel/syscalls/syscall_n32.tbl | 10 ++++----- arch/mips/kernel/syscalls/syscall_o32.tbl | 10 ++++----- arch/parisc/kernel/syscalls/syscall.tbl | 8 +++---- arch/powerpc/kernel/syscalls/syscall.tbl | 10 ++++----- arch/s390/kernel/syscalls/syscall.tbl | 10 ++++----- arch/sparc/kernel/syscalls/syscall.tbl | 10 ++++----- arch/x86/entry/syscalls/syscall_32.tbl | 4 ++-- arch/x86/entry/syscalls/syscall_64.tbl | 2 +- include/linux/compat.h | 20 ----------------- include/uapi/asm-generic/unistd.h | 10 ++++----- kernel/sys_ni.c | 5 ----- mm/mempolicy.c | 37 ------------------------------- mm/migrate.c | 13 ----------- 14 files changed, 42 insertions(+), 117 deletions(-) (limited to 'include/uapi') diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 4e99e4b912ef..844f6ae58662 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -649,11 +649,11 @@ __SYSCALL(__NR_inotify_add_watch, sys_inotify_add_watch) #define __NR_inotify_rm_watch 318 __SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch) #define __NR_mbind 319 -__SYSCALL(__NR_mbind, compat_sys_mbind) +__SYSCALL(__NR_mbind, sys_mbind) #define __NR_get_mempolicy 320 -__SYSCALL(__NR_get_mempolicy, compat_sys_get_mempolicy) +__SYSCALL(__NR_get_mempolicy, sys_get_mempolicy) #define __NR_set_mempolicy 321 -__SYSCALL(__NR_set_mempolicy, compat_sys_set_mempolicy) +__SYSCALL(__NR_set_mempolicy, sys_set_mempolicy) #define __NR_openat 322 __SYSCALL(__NR_openat, compat_sys_openat) #define __NR_mkdirat 323 @@ -699,7 +699,7 @@ __SYSCALL(__NR_tee, sys_tee) #define __NR_vmsplice 343 __SYSCALL(__NR_vmsplice, sys_vmsplice) #define __NR_move_pages 344 -__SYSCALL(__NR_move_pages, compat_sys_move_pages) +__SYSCALL(__NR_move_pages, sys_move_pages) #define __NR_getcpu 345 __SYSCALL(__NR_getcpu, sys_getcpu) #define __NR_epoll_pwait 346 @@ -811,7 +811,7 @@ __SYSCALL(__NR_rseq, sys_rseq) #define __NR_io_pgetevents 399 __SYSCALL(__NR_io_pgetevents, compat_sys_io_pgetevents) #define __NR_migrate_pages 400 -__SYSCALL(__NR_migrate_pages, compat_sys_migrate_pages) +__SYSCALL(__NR_migrate_pages, sys_migrate_pages) #define __NR_kexec_file_load 401 __SYSCALL(__NR_kexec_file_load, sys_kexec_file_load) /* 402 is unused */ diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 56c8d3cf42ed..70e32de2bcaa 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -239,9 +239,9 @@ 228 n32 clock_nanosleep sys_clock_nanosleep_time32 229 n32 tgkill sys_tgkill 230 n32 utimes sys_utimes_time32 -231 n32 mbind compat_sys_mbind -232 n32 get_mempolicy compat_sys_get_mempolicy -233 n32 set_mempolicy compat_sys_set_mempolicy +231 n32 mbind sys_mbind +232 n32 get_mempolicy sys_get_mempolicy +233 n32 set_mempolicy sys_set_mempolicy 234 n32 mq_open compat_sys_mq_open 235 n32 mq_unlink sys_mq_unlink 236 n32 mq_timedsend sys_mq_timedsend_time32 @@ -258,7 +258,7 @@ 247 n32 inotify_init sys_inotify_init 248 n32 inotify_add_watch sys_inotify_add_watch 249 n32 inotify_rm_watch sys_inotify_rm_watch -250 n32 migrate_pages compat_sys_migrate_pages +250 n32 migrate_pages sys_migrate_pages 251 n32 openat sys_openat 252 n32 mkdirat sys_mkdirat 253 n32 mknodat sys_mknodat @@ -279,7 +279,7 @@ 268 n32 
sync_file_range sys_sync_file_range 269 n32 tee sys_tee 270 n32 vmsplice sys_vmsplice -271 n32 move_pages compat_sys_move_pages +271 n32 move_pages sys_move_pages 272 n32 set_robust_list compat_sys_set_robust_list 273 n32 get_robust_list compat_sys_get_robust_list 274 n32 kexec_load compat_sys_kexec_load diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 201237fd0f43..a61c35edaa74 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -279,9 +279,9 @@ 265 o32 clock_nanosleep sys_clock_nanosleep_time32 266 o32 tgkill sys_tgkill 267 o32 utimes sys_utimes_time32 -268 o32 mbind sys_mbind compat_sys_mbind -269 o32 get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy -270 o32 set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy +268 o32 mbind sys_mbind +269 o32 get_mempolicy sys_get_mempolicy +270 o32 set_mempolicy sys_set_mempolicy 271 o32 mq_open sys_mq_open compat_sys_mq_open 272 o32 mq_unlink sys_mq_unlink 273 o32 mq_timedsend sys_mq_timedsend_time32 @@ -298,7 +298,7 @@ 284 o32 inotify_init sys_inotify_init 285 o32 inotify_add_watch sys_inotify_add_watch 286 o32 inotify_rm_watch sys_inotify_rm_watch -287 o32 migrate_pages sys_migrate_pages compat_sys_migrate_pages +287 o32 migrate_pages sys_migrate_pages 288 o32 openat sys_openat compat_sys_openat 289 o32 mkdirat sys_mkdirat 290 o32 mknodat sys_mknodat @@ -319,7 +319,7 @@ 305 o32 sync_file_range sys_sync_file_range sys32_sync_file_range 306 o32 tee sys_tee 307 o32 vmsplice sys_vmsplice -308 o32 move_pages sys_move_pages compat_sys_move_pages +308 o32 move_pages sys_move_pages 309 o32 set_robust_list sys_set_robust_list compat_sys_set_robust_list 310 o32 get_robust_list sys_get_robust_list compat_sys_get_robust_list 311 o32 kexec_load sys_kexec_load compat_sys_kexec_load diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index 0bf854b70612..bf751e0732b7 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -292,9 +292,9 @@ 258 32 clock_nanosleep sys_clock_nanosleep_time32 258 64 clock_nanosleep sys_clock_nanosleep 259 common tgkill sys_tgkill -260 common mbind sys_mbind compat_sys_mbind -261 common get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy -262 common set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy +260 common mbind sys_mbind +261 common get_mempolicy sys_get_mempolicy +262 common set_mempolicy sys_set_mempolicy # 263 was vserver 264 common add_key sys_add_key 265 common request_key sys_request_key @@ -331,7 +331,7 @@ 292 64 sync_file_range sys_sync_file_range 293 common tee sys_tee 294 common vmsplice sys_vmsplice -295 common move_pages sys_move_pages compat_sys_move_pages +295 common move_pages sys_move_pages 296 common getcpu sys_getcpu 297 common epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait 298 common statfs64 sys_statfs64 compat_sys_statfs64 diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 29b55e2e035c..7bef917cc84e 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -330,10 +330,10 @@ 256 64 sys_debug_setcontext sys_ni_syscall 256 spu sys_debug_setcontext sys_ni_syscall # 257 reserved for vserver -258 nospu migrate_pages sys_migrate_pages compat_sys_migrate_pages -259 nospu mbind sys_mbind compat_sys_mbind -260 nospu get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy -261 nospu set_mempolicy 
sys_set_mempolicy compat_sys_set_mempolicy +258 nospu migrate_pages sys_migrate_pages +259 nospu mbind sys_mbind +260 nospu get_mempolicy sys_get_mempolicy +261 nospu set_mempolicy sys_set_mempolicy 262 nospu mq_open sys_mq_open compat_sys_mq_open 263 nospu mq_unlink sys_mq_unlink 264 32 mq_timedsend sys_mq_timedsend_time32 @@ -381,7 +381,7 @@ 298 common faccessat sys_faccessat 299 common get_robust_list sys_get_robust_list compat_sys_get_robust_list 300 common set_robust_list sys_set_robust_list compat_sys_set_robust_list -301 common move_pages sys_move_pages compat_sys_move_pages +301 common move_pages sys_move_pages 302 common getcpu sys_getcpu 303 nospu epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait 304 32 utimensat sys_utimensat_time32 diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index aa9d68b8ee14..df5261e5cfe1 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -274,9 +274,9 @@ 265 common statfs64 sys_statfs64 compat_sys_statfs64 266 common fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 267 common remap_file_pages sys_remap_file_pages sys_remap_file_pages -268 common mbind sys_mbind compat_sys_mbind -269 common get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy -270 common set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy +268 common mbind sys_mbind sys_mbind +269 common get_mempolicy sys_get_mempolicy sys_get_mempolicy +270 common set_mempolicy sys_set_mempolicy sys_set_mempolicy 271 common mq_open sys_mq_open compat_sys_mq_open 272 common mq_unlink sys_mq_unlink sys_mq_unlink 273 common mq_timedsend sys_mq_timedsend sys_mq_timedsend_time32 @@ -293,7 +293,7 @@ 284 common inotify_init sys_inotify_init sys_inotify_init 285 common inotify_add_watch sys_inotify_add_watch sys_inotify_add_watch 286 common inotify_rm_watch sys_inotify_rm_watch sys_inotify_rm_watch -287 common migrate_pages sys_migrate_pages compat_sys_migrate_pages +287 common migrate_pages sys_migrate_pages sys_migrate_pages 288 common openat sys_openat compat_sys_openat 289 common mkdirat sys_mkdirat sys_mkdirat 290 common mknodat sys_mknodat sys_mknodat @@ -317,7 +317,7 @@ 307 common sync_file_range sys_sync_file_range compat_sys_s390_sync_file_range 308 common tee sys_tee sys_tee 309 common vmsplice sys_vmsplice sys_vmsplice -310 common move_pages sys_move_pages compat_sys_move_pages +310 common move_pages sys_move_pages sys_move_pages 311 common getcpu sys_getcpu sys_getcpu 312 common epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait 313 common utimes sys_utimes sys_utimes_time32 diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 7893104718c2..c37764dc764d 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -365,12 +365,12 @@ 299 common unshare sys_unshare 300 common set_robust_list sys_set_robust_list compat_sys_set_robust_list 301 common get_robust_list sys_get_robust_list compat_sys_get_robust_list -302 common migrate_pages sys_migrate_pages compat_sys_migrate_pages -303 common mbind sys_mbind compat_sys_mbind -304 common get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy -305 common set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy +302 common migrate_pages sys_migrate_pages +303 common mbind sys_mbind +304 common get_mempolicy sys_get_mempolicy +305 common set_mempolicy sys_set_mempolicy 306 common kexec_load sys_kexec_load compat_sys_kexec_load -307 common move_pages sys_move_pages 
compat_sys_move_pages +307 common move_pages sys_move_pages 308 common getcpu sys_getcpu 309 common epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait 310 32 utimensat sys_utimensat_time32 diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 61f18b72552b..960a021d543e 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -286,7 +286,7 @@ 272 i386 fadvise64_64 sys_ia32_fadvise64_64 273 i386 vserver 274 i386 mbind sys_mbind -275 i386 get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy +275 i386 get_mempolicy sys_get_mempolicy 276 i386 set_mempolicy sys_set_mempolicy 277 i386 mq_open sys_mq_open compat_sys_mq_open 278 i386 mq_unlink sys_mq_unlink @@ -328,7 +328,7 @@ 314 i386 sync_file_range sys_ia32_sync_file_range 315 i386 tee sys_tee 316 i386 vmsplice sys_vmsplice -317 i386 move_pages sys_move_pages compat_sys_move_pages +317 i386 move_pages sys_move_pages 318 i386 getcpu sys_getcpu 319 i386 epoll_pwait sys_epoll_pwait 320 i386 utimensat sys_utimensat_time32 diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 807b6a1de8e8..18b5500ea8bf 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -398,7 +398,7 @@ 530 x32 set_robust_list compat_sys_set_robust_list 531 x32 get_robust_list compat_sys_get_robust_list 532 x32 vmsplice sys_vmsplice -533 x32 move_pages compat_sys_move_pages +533 x32 move_pages sys_move_pages 534 x32 preadv compat_sys_preadv64 535 x32 pwritev compat_sys_pwritev64 536 x32 rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo diff --git a/include/linux/compat.h b/include/linux/compat.h index 3a2ac5afee30..2d42cebd1fb8 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -799,26 +799,6 @@ asmlinkage long compat_sys_execve(const char __user *filename, const compat_uptr /* mm/fadvise.c: No generic prototype for fadvise64_64 */ /* mm/, CONFIG_MMU only */ -asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, - compat_ulong_t mode, - compat_ulong_t __user *nmask, - compat_ulong_t maxnode, compat_ulong_t flags); -asmlinkage long compat_sys_get_mempolicy(int __user *policy, - compat_ulong_t __user *nmask, - compat_ulong_t maxnode, - compat_ulong_t addr, - compat_ulong_t flags); -asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask, - compat_ulong_t maxnode); -asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, - compat_ulong_t maxnode, const compat_ulong_t __user *old_nodes, - const compat_ulong_t __user *new_nodes); -asmlinkage long compat_sys_move_pages(pid_t pid, compat_ulong_t nr_pages, - __u32 __user *pages, - const int __user *nodes, - int __user *status, - int flags); - asmlinkage long compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig, struct compat_siginfo __user *uinfo); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 14c8fe863c6d..1c5fb86d455a 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -673,15 +673,15 @@ __SYSCALL(__NR_madvise, sys_madvise) #define __NR_remap_file_pages 234 __SYSCALL(__NR_remap_file_pages, sys_remap_file_pages) #define __NR_mbind 235 -__SC_COMP(__NR_mbind, sys_mbind, compat_sys_mbind) +__SYSCALL(__NR_mbind, sys_mbind) #define __NR_get_mempolicy 236 -__SC_COMP(__NR_get_mempolicy, sys_get_mempolicy, compat_sys_get_mempolicy) +__SYSCALL(__NR_get_mempolicy, sys_get_mempolicy) #define 
__NR_set_mempolicy 237 -__SC_COMP(__NR_set_mempolicy, sys_set_mempolicy, compat_sys_set_mempolicy) +__SYSCALL(__NR_set_mempolicy, sys_set_mempolicy) #define __NR_migrate_pages 238 -__SC_COMP(__NR_migrate_pages, sys_migrate_pages, compat_sys_migrate_pages) +__SYSCALL(__NR_migrate_pages, sys_migrate_pages) #define __NR_move_pages 239 -__SC_COMP(__NR_move_pages, sys_move_pages, compat_sys_move_pages) +__SYSCALL(__NR_move_pages, sys_move_pages) #endif #define __NR_rt_tgsigqueueinfo 240 diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 64578adfe115..f43d89d92860 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -292,15 +292,10 @@ COND_SYSCALL(process_madvise); COND_SYSCALL(process_mrelease); COND_SYSCALL(remap_file_pages); COND_SYSCALL(mbind); -COND_SYSCALL_COMPAT(mbind); COND_SYSCALL(get_mempolicy); -COND_SYSCALL_COMPAT(get_mempolicy); COND_SYSCALL(set_mempolicy); -COND_SYSCALL_COMPAT(set_mempolicy); COND_SYSCALL(migrate_pages); -COND_SYSCALL_COMPAT(migrate_pages); COND_SYSCALL(move_pages); -COND_SYSCALL_COMPAT(move_pages); COND_SYSCALL(perf_event_open); COND_SYSCALL(accept4); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index eb95578f5997..8d14240896a8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1649,43 +1649,6 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags); } -#ifdef CONFIG_COMPAT - -COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, - compat_ulong_t __user *, nmask, - compat_ulong_t, maxnode, - compat_ulong_t, addr, compat_ulong_t, flags) -{ - return kernel_get_mempolicy(policy, (unsigned long __user *)nmask, - maxnode, addr, flags); -} - -COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask, - compat_ulong_t, maxnode) -{ - return kernel_set_mempolicy(mode, (unsigned long __user *)nmask, maxnode); -} - -COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, - compat_ulong_t, mode, compat_ulong_t __user *, nmask, - compat_ulong_t, maxnode, compat_ulong_t, flags) -{ - return kernel_mbind(start, len, mode, (unsigned long __user *)nmask, - maxnode, flags); -} - -COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid, - compat_ulong_t, maxnode, - const compat_ulong_t __user *, old_nodes, - const compat_ulong_t __user *, new_nodes) -{ - return kernel_migrate_pages(pid, maxnode, - (const unsigned long __user *)old_nodes, - (const unsigned long __user *)new_nodes); -} - -#endif /* CONFIG_COMPAT */ - bool vma_migratable(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_IO | VM_PFNMAP)) diff --git a/mm/migrate.c b/mm/migrate.c index 2bc494875cea..a6a7743ee98f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2047,19 +2047,6 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags); } -#ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, - compat_uptr_t __user *, pages, - const int __user *, nodes, - int __user *, status, - int, flags) -{ - return kernel_move_pages(pid, nr_pages, - (const void __user *__user *)pages, - nodes, status, flags); -} -#endif /* CONFIG_COMPAT */ - #ifdef CONFIG_NUMA_BALANCING /* * Returns true if this is a safe migration target node for misplaced NUMA -- cgit v1.2.3-71-gd317 From dd47c104533dedb90434a3f142e94a671ac623a6 Mon Sep 17 00:00:00 2001 From: Eugene Syromiatnikov Date: Mon, 13 Sep 2021 17:44:15 +0200 Subject: io-wq: provide IO_WQ_* constants for IORING_REGISTER_IOWQ_MAX_WORKERS arg items The items 
passed in the array pointed by the arg parameter of IORING_REGISTER_IOWQ_MAX_WORKERS io_uring_register operation carry certain semantics: they refer to different io-wq worker categories; provide IO_WQ_* constants in the UAPI, so these categories can be referenced in the user space code. Suggested-by: Jens Axboe Complements: 2e480058ddc21ec5 ("io-wq: provide a way to limit max number of workers") Signed-off-by: Eugene Syromiatnikov Link: https://lore.kernel.org/r/20210913154415.GA12890@asgard.redhat.com Signed-off-by: Jens Axboe --- fs/io-wq.c | 5 +++++ include/uapi/linux/io_uring.h | 8 +++++++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/fs/io-wq.c b/fs/io-wq.c index 3d4460df845c..c2e0e8e80949 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "io-wq.h" @@ -1281,6 +1282,10 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count) { int i, node, prev = 0; + BUILD_BUG_ON((int) IO_WQ_ACCT_BOUND != (int) IO_WQ_BOUND); + BUILD_BUG_ON((int) IO_WQ_ACCT_UNBOUND != (int) IO_WQ_UNBOUND); + BUILD_BUG_ON((int) IO_WQ_ACCT_NR != 2); + for (i = 0; i < 2; i++) { if (new_count[i] > task_rlimit(current, RLIMIT_NPROC)) new_count[i] = task_rlimit(current, RLIMIT_NPROC); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 59ef35154e3d..b270a07b285e 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -317,13 +317,19 @@ enum { IORING_REGISTER_IOWQ_AFF = 17, IORING_UNREGISTER_IOWQ_AFF = 18, - /* set/get max number of workers */ + /* set/get max number of io-wq workers */ IORING_REGISTER_IOWQ_MAX_WORKERS = 19, /* this goes last */ IORING_REGISTER_LAST }; +/* io-wq worker categories */ +enum { + IO_WQ_BOUND, + IO_WQ_UNBOUND, +}; + /* deprecated, see struct io_uring_rsrc_update */ struct io_uring_files_update { __u32 offset; -- cgit v1.2.3-71-gd317
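
For reference, a minimal userspace sketch (not part of any patch above) of how the IO_WQ_BOUND/IO_WQ_UNBOUND indices introduced by the last commit are intended to be used with IORING_REGISTER_IOWQ_MAX_WORKERS. It assumes "ring_fd" is an io_uring instance already created with io_uring_setup(), and a libc new enough to expose __NR_io_uring_register; since glibc has no io_uring_register() wrapper, the raw syscall is used and error handling is kept to the essentials.

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

/* Cap the io-wq worker counts for an existing ring; returns 0 on success. */
static int set_iowq_limits(int ring_fd, unsigned int bounded, unsigned int unbounded)
{
	unsigned int counts[2];

	counts[IO_WQ_BOUND]   = bounded;	/* limit for bounded work (e.g. regular file I/O) */
	counts[IO_WQ_UNBOUND] = unbounded;	/* limit for unbounded work (e.g. network I/O) */

	/*
	 * The register opcode takes an array of two counts, indexed by the
	 * IO_WQ_* constants; on success the kernel reports the previous
	 * limits back through the same array.
	 */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_IOWQ_MAX_WORKERS, counts, 2);
}

Using the named indices rather than bare 0/1 is the point of the uapi addition: callers no longer have to know the internal io-wq accounting order to fill the array correctly.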