From 882227626459ce7a24191f13c6d9749a00992059 Mon Sep 17 00:00:00 2001
From: Scott Branden <scott.branden@broadcom.com>
Date: Wed, 20 Jan 2021 09:58:15 -0800
Subject: bcm-vk: add bcm_vk UAPI

Add user space api for bcm-vk driver.

Provide ioctl api to load images and issue reset command to card.
FW status registers in PCI BAR space also defined as part
of API so that user space is able to interpret these memory locations
as needed via direct PCIe access.

Acked-by: Olof Johansson <olof@lixom.net>
Signed-off-by: Scott Branden <scott.branden@broadcom.com>
Link: https://lore.kernel.org/r/20210120175827.14820-2-scott.branden@broadcom.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/linux/misc/bcm_vk.h | 84 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100644 include/uapi/linux/misc/bcm_vk.h

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/misc/bcm_vk.h b/include/uapi/linux/misc/bcm_vk.h
new file mode 100644
index 000000000000..ec28e0bd46a9
--- /dev/null
+++ b/include/uapi/linux/misc/bcm_vk.h
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Copyright 2018-2020 Broadcom.
+ */
+
+#ifndef __UAPI_LINUX_MISC_BCM_VK_H
+#define __UAPI_LINUX_MISC_BCM_VK_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+#define BCM_VK_MAX_FILENAME 64
+
+struct vk_image {
+	__u32 type; /* Type of image */
+#define VK_IMAGE_TYPE_BOOT1 1 /* 1st stage (load to SRAM) */
+#define VK_IMAGE_TYPE_BOOT2 2 /* 2nd stage (load to DDR) */
+	__u8 filename[BCM_VK_MAX_FILENAME]; /* Filename of image */
+};
+
+struct vk_reset {
+	__u32 arg1;
+	__u32 arg2;
+};
+
+#define VK_MAGIC		0x5e
+
+/* Load image to Valkyrie */
+#define VK_IOCTL_LOAD_IMAGE	_IOW(VK_MAGIC, 0x2, struct vk_image)
+
+/* Send Reset to Valkyrie */
+#define VK_IOCTL_RESET		_IOW(VK_MAGIC, 0x4, struct vk_reset)
+
+/*
+ * Firmware Status accessed directly via BAR space
+ */
+#define VK_BAR_FWSTS			0x41c
+#define VK_BAR_COP_FWSTS		0x428
+/* VK_FWSTS definitions */
+#define VK_FWSTS_RELOCATION_ENTRY	(1UL << 0)
+#define VK_FWSTS_RELOCATION_EXIT	(1UL << 1)
+#define VK_FWSTS_INIT_START		(1UL << 2)
+#define VK_FWSTS_ARCH_INIT_DONE		(1UL << 3)
+#define VK_FWSTS_PRE_KNL1_INIT_DONE	(1UL << 4)
+#define VK_FWSTS_PRE_KNL2_INIT_DONE	(1UL << 5)
+#define VK_FWSTS_POST_KNL_INIT_DONE	(1UL << 6)
+#define VK_FWSTS_INIT_DONE		(1UL << 7)
+#define VK_FWSTS_APP_INIT_START		(1UL << 8)
+#define VK_FWSTS_APP_INIT_DONE		(1UL << 9)
+#define VK_FWSTS_MASK			0xffffffff
+#define VK_FWSTS_READY			(VK_FWSTS_INIT_START | \
+					 VK_FWSTS_ARCH_INIT_DONE | \
+					 VK_FWSTS_PRE_KNL1_INIT_DONE | \
+					 VK_FWSTS_PRE_KNL2_INIT_DONE | \
+					 VK_FWSTS_POST_KNL_INIT_DONE | \
+					 VK_FWSTS_INIT_DONE | \
+					 VK_FWSTS_APP_INIT_START | \
+					 VK_FWSTS_APP_INIT_DONE)
+/* Deinit */
+#define VK_FWSTS_APP_DEINIT_START	(1UL << 23)
+#define VK_FWSTS_APP_DEINIT_DONE	(1UL << 24)
+#define VK_FWSTS_DRV_DEINIT_START	(1UL << 25)
+#define VK_FWSTS_DRV_DEINIT_DONE	(1UL << 26)
+#define VK_FWSTS_RESET_DONE		(1UL << 27)
+#define VK_FWSTS_DEINIT_TRIGGERED	(VK_FWSTS_APP_DEINIT_START | \
+					 VK_FWSTS_APP_DEINIT_DONE  | \
+					 VK_FWSTS_DRV_DEINIT_START | \
+					 VK_FWSTS_DRV_DEINIT_DONE)
+/* Last nibble for reboot reason */
+#define VK_FWSTS_RESET_REASON_SHIFT	28
+#define VK_FWSTS_RESET_REASON_MASK	(0xf << VK_FWSTS_RESET_REASON_SHIFT)
+#define VK_FWSTS_RESET_SYS_PWRUP	(0x0 << VK_FWSTS_RESET_REASON_SHIFT)
+#define VK_FWSTS_RESET_MBOX_DB		(0x1 << VK_FWSTS_RESET_REASON_SHIFT)
+#define VK_FWSTS_RESET_M7_WDOG		(0x2 << VK_FWSTS_RESET_REASON_SHIFT)
+#define VK_FWSTS_RESET_TEMP		(0x3 << VK_FWSTS_RESET_REASON_SHIFT)
+#define VK_FWSTS_RESET_PCI_FLR		(0x4 << VK_FWSTS_RESET_REASON_SHIFT)
+#define VK_FWSTS_RESET_PCI_HOT		(0x5 << VK_FWSTS_RESET_REASON_SHIFT)
+#define VK_FWSTS_RESET_PCI_WARM		(0x6 << VK_FWSTS_RESET_REASON_SHIFT)
+#define VK_FWSTS_RESET_PCI_COLD		(0x7 << VK_FWSTS_RESET_REASON_SHIFT)
+#define VK_FWSTS_RESET_L1		(0x8 << VK_FWSTS_RESET_REASON_SHIFT)
+#define VK_FWSTS_RESET_L0		(0x9 << VK_FWSTS_RESET_REASON_SHIFT)
+#define VK_FWSTS_RESET_UNKNOWN		(0xf << VK_FWSTS_RESET_REASON_SHIFT)
+
+#endif /* __UAPI_LINUX_MISC_BCM_VK_H */
-- 
cgit v1.2.3-71-gd317


From 8544717cdacc2f33f0f53a3b34c5125b37e13ce9 Mon Sep 17 00:00:00 2001
From: Ioana Ciornei <ioana.ciornei@nxp.com>
Date: Thu, 14 Jan 2021 19:07:48 +0200
Subject: bus: fsl-mc: move fsl_mc_command struct in a uapi header

Define "struct fsl_mc_command" as a structure that can cross the
user/kernel boundary.

Acked-by: Laurentiu Tudor <laurentiu.tudor@nxp.com>
Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
Link: https://lore.kernel.org/r/20210114170752.2927915-2-ciorneiioana@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 MAINTAINERS                 |  1 +
 include/linux/fsl/mc.h      |  8 +-------
 include/uapi/linux/fsl_mc.h | 25 +++++++++++++++++++++++++
 3 files changed, 27 insertions(+), 7 deletions(-)
 create mode 100644 include/uapi/linux/fsl_mc.h

(limited to 'include/uapi/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 23273a59da6f..128455811980 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14652,6 +14652,7 @@ S:	Maintained
 F:	Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt
 F:	Documentation/networking/device_drivers/ethernet/freescale/dpaa2/overview.rst
 F:	drivers/bus/fsl-mc/
+F:	include/uapi/linux/fsl_mc.h
 
 QT1010 MEDIA DRIVER
 M:	Antti Palosaari <crope@iki.fi>
diff --git a/include/linux/fsl/mc.h b/include/linux/fsl/mc.h
index db244874e834..63b56aba925a 100644
--- a/include/linux/fsl/mc.h
+++ b/include/linux/fsl/mc.h
@@ -13,6 +13,7 @@
 #include <linux/device.h>
 #include <linux/mod_devicetable.h>
 #include <linux/interrupt.h>
+#include <uapi/linux/fsl_mc.h>
 
 #define FSL_MC_VENDOR_FREESCALE	0x1957
 
@@ -209,8 +210,6 @@ struct fsl_mc_device {
 #define to_fsl_mc_device(_dev) \
 	container_of(_dev, struct fsl_mc_device, dev)
 
-#define MC_CMD_NUM_OF_PARAMS	7
-
 struct mc_cmd_header {
 	u8 src_id;
 	u8 flags_hw;
@@ -220,11 +219,6 @@ struct mc_cmd_header {
 	__le16 cmd_id;
 };
 
-struct fsl_mc_command {
-	__le64 header;
-	__le64 params[MC_CMD_NUM_OF_PARAMS];
-};
-
 enum mc_cmd_status {
 	MC_CMD_STATUS_OK = 0x0, /* Completed successfully */
 	MC_CMD_STATUS_READY = 0x1, /* Ready to be processed */
diff --git a/include/uapi/linux/fsl_mc.h b/include/uapi/linux/fsl_mc.h
new file mode 100644
index 000000000000..cf56d46f052e
--- /dev/null
+++ b/include/uapi/linux/fsl_mc.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Management Complex (MC) userspace public interface
+ *
+ * Copyright 2021 NXP
+ *
+ */
+#ifndef _UAPI_FSL_MC_H_
+#define _UAPI_FSL_MC_H_
+
+#include <linux/types.h>
+
+#define MC_CMD_NUM_OF_PARAMS	7
+
+/**
+ * struct fsl_mc_command - Management Complex (MC) command structure
+ * @header: MC command header
+ * @params: MC command parameters
+ */
+struct fsl_mc_command {
+	__le64 header;
+	__le64 params[MC_CMD_NUM_OF_PARAMS];
+};
+
+#endif /* _UAPI_FSL_MC_H_ */
-- 
cgit v1.2.3-71-gd317


From 2cf1e703f066cfa82eb5a358ae84c29fe15a3b3a Mon Sep 17 00:00:00 2001
From: Ioana Ciornei <ioana.ciornei@nxp.com>
Date: Thu, 14 Jan 2021 19:07:50 +0200
Subject: bus: fsl-mc: add fsl-mc userspace support

Adding userspace support for the MC (Management Complex) means exporting
an ioctl capable device file representing the root resource container.

This new functionality in the fsl-mc bus driver intends to provide
userspace applications an interface to interact with the MC firmware.

Commands that are composed in userspace are sent to the MC firmware
through the FSL_MC_SEND_MC_COMMAND ioctl.  By default the implicit MC
I/O portal is used for this operation, but if the implicit one is busy,
a dynamic portal is allocated and then freed upon execution.

The command received through the ioctl interface is checked against a
known whitelist of accepted MC commands. Commands that attempt a change
in hardware configuration will need CAP_NET_ADMIN, while commands used
in debugging do not need it.

Acked-by: Laurentiu Tudor <laurentiu.tudor@nxp.com>
Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
Link: https://lore.kernel.org/r/20210114170752.2927915-4-ciorneiioana@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/userspace-api/ioctl/ioctl-number.rst |   1 +
 drivers/bus/fsl-mc/Kconfig                         |   7 +
 drivers/bus/fsl-mc/Makefile                        |   3 +
 drivers/bus/fsl-mc/dprc-driver.c                   |  12 +
 drivers/bus/fsl-mc/fsl-mc-private.h                |  39 ++
 drivers/bus/fsl-mc/fsl-mc-uapi.c                   | 547 +++++++++++++++++++++
 include/uapi/linux/fsl_mc.h                        |   9 +
 7 files changed, 618 insertions(+)
 create mode 100644 drivers/bus/fsl-mc/fsl-mc-uapi.c

(limited to 'include/uapi/linux')

diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index a4c75a28c839..19d730f9d136 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -180,6 +180,7 @@ Code  Seq#    Include File                                           Comments
 'R'   00-1F  linux/random.h                                          conflict!
 'R'   01     linux/rfkill.h                                          conflict!
 'R'   C0-DF  net/bluetooth/rfcomm.h
+'R'   E0     uapi/linux/fsl_mc.h
 'S'   all    linux/cdrom.h                                           conflict!
 'S'   80-81  scsi/scsi_ioctl.h                                       conflict!
 'S'   82-FF  scsi/scsi.h                                             conflict!
diff --git a/drivers/bus/fsl-mc/Kconfig b/drivers/bus/fsl-mc/Kconfig
index c23c77c9b705..b1fd55901c50 100644
--- a/drivers/bus/fsl-mc/Kconfig
+++ b/drivers/bus/fsl-mc/Kconfig
@@ -14,3 +14,10 @@ config FSL_MC_BUS
 	  architecture.  The fsl-mc bus driver handles discovery of
 	  DPAA2 objects (which are represented as Linux devices) and
 	  binding objects to drivers.
+
+config FSL_MC_UAPI_SUPPORT
+	bool "Management Complex (MC) userspace support"
+	depends on FSL_MC_BUS
+	help
+	  Provides userspace support for interrogating, creating, destroying or
+	  configuring DPAA2 objects exported by the Management Complex.
diff --git a/drivers/bus/fsl-mc/Makefile b/drivers/bus/fsl-mc/Makefile
index 3c518c7e8374..4ae292a30e53 100644
--- a/drivers/bus/fsl-mc/Makefile
+++ b/drivers/bus/fsl-mc/Makefile
@@ -16,3 +16,6 @@ mc-bus-driver-objs := fsl-mc-bus.o \
 		      fsl-mc-allocator.o \
 		      fsl-mc-msi.o \
 		      dpmcp.o
+
+# MC userspace support
+obj-$(CONFIG_FSL_MC_UAPI_SUPPORT) += fsl-mc-uapi.o
diff --git a/drivers/bus/fsl-mc/dprc-driver.c b/drivers/bus/fsl-mc/dprc-driver.c
index 68488a7ad0d6..ccec375095f2 100644
--- a/drivers/bus/fsl-mc/dprc-driver.c
+++ b/drivers/bus/fsl-mc/dprc-driver.c
@@ -603,6 +603,7 @@ int dprc_setup(struct fsl_mc_device *mc_dev)
 	struct irq_domain *mc_msi_domain;
 	bool mc_io_created = false;
 	bool msi_domain_set = false;
+	bool uapi_created = false;
 	u16 major_ver, minor_ver;
 	size_t region_size;
 	int error;
@@ -635,6 +636,11 @@ int dprc_setup(struct fsl_mc_device *mc_dev)
 			return error;
 
 		mc_io_created = true;
+	} else {
+		error = fsl_mc_uapi_create_device_file(mc_bus);
+		if (error < 0)
+			return -EPROBE_DEFER;
+		uapi_created = true;
 	}
 
 	mc_msi_domain = fsl_mc_find_msi_domain(&mc_dev->dev);
@@ -692,6 +698,9 @@ error_cleanup_msi_domain:
 		mc_dev->mc_io = NULL;
 	}
 
+	if (uapi_created)
+		fsl_mc_uapi_remove_device_file(mc_bus);
+
 	return error;
 }
 EXPORT_SYMBOL_GPL(dprc_setup);
@@ -763,6 +772,7 @@ static void dprc_teardown_irq(struct fsl_mc_device *mc_dev)
 
 int dprc_cleanup(struct fsl_mc_device *mc_dev)
 {
+	struct fsl_mc_bus *mc_bus = to_fsl_mc_bus(mc_dev);
 	int error;
 
 	/* this function should be called only for DPRCs, it
@@ -793,6 +803,8 @@ int dprc_cleanup(struct fsl_mc_device *mc_dev)
 	if (!fsl_mc_is_root_dprc(&mc_dev->dev)) {
 		fsl_destroy_mc_io(mc_dev->mc_io);
 		mc_dev->mc_io = NULL;
+	} else {
+		fsl_mc_uapi_remove_device_file(mc_bus);
 	}
 
 	return 0;
diff --git a/drivers/bus/fsl-mc/fsl-mc-private.h b/drivers/bus/fsl-mc/fsl-mc-private.h
index be686c363121..6293a24de456 100644
--- a/drivers/bus/fsl-mc/fsl-mc-private.h
+++ b/drivers/bus/fsl-mc/fsl-mc-private.h
@@ -10,6 +10,8 @@
 
 #include <linux/fsl/mc.h>
 #include <linux/mutex.h>
+#include <linux/ioctl.h>
+#include <linux/miscdevice.h>
 
 /*
  * Data Path Management Complex (DPMNG) General API
@@ -542,6 +544,22 @@ struct fsl_mc_resource_pool {
 	struct fsl_mc_bus *mc_bus;
 };
 
+/**
+ * struct fsl_mc_uapi - information associated with a device file
+ * @misc: struct miscdevice linked to the root dprc
+ * @device: newly created device in /dev
+ * @mutex: mutex lock to serialize the open/release operations
+ * @local_instance_in_use: local MC I/O instance in use or not
+ * @static_mc_io: pointer to the static MC I/O object
+ */
+struct fsl_mc_uapi {
+	struct miscdevice misc;
+	struct device *device;
+	struct mutex mutex; /* serialize open/release operations */
+	u32 local_instance_in_use;
+	struct fsl_mc_io *static_mc_io;
+};
+
 /**
  * struct fsl_mc_bus - logical bus that corresponds to a physical DPRC
  * @mc_dev: fsl-mc device for the bus device itself.
@@ -551,6 +569,7 @@ struct fsl_mc_resource_pool {
  * @irq_resources: Pointer to array of IRQ objects for the IRQ pool
  * @scan_mutex: Serializes bus scanning
  * @dprc_attr: DPRC attributes
+ * @uapi_misc: struct that abstracts the interaction with userspace
  */
 struct fsl_mc_bus {
 	struct fsl_mc_device mc_dev;
@@ -558,6 +577,7 @@ struct fsl_mc_bus {
 	struct fsl_mc_device_irq *irq_resources;
 	struct mutex scan_mutex;    /* serializes bus scanning */
 	struct dprc_attributes dprc_attr;
+	struct fsl_mc_uapi uapi_misc;
 };
 
 #define to_fsl_mc_bus(_mc_dev) \
@@ -614,4 +634,23 @@ struct fsl_mc_device *fsl_mc_device_lookup(struct fsl_mc_obj_desc *obj_desc,
 
 u16 mc_cmd_hdr_read_cmdid(struct fsl_mc_command *cmd);
 
+#ifdef CONFIG_FSL_MC_UAPI_SUPPORT
+
+int fsl_mc_uapi_create_device_file(struct fsl_mc_bus *mc_bus);
+
+void fsl_mc_uapi_remove_device_file(struct fsl_mc_bus *mc_bus);
+
+#else
+
+static inline int fsl_mc_uapi_create_device_file(struct fsl_mc_bus *mc_bus)
+{
+	return 0;
+}
+
+static inline void fsl_mc_uapi_remove_device_file(struct fsl_mc_bus *mc_bus)
+{
+}
+
+#endif
+
 #endif /* _FSL_MC_PRIVATE_H_ */
diff --git a/drivers/bus/fsl-mc/fsl-mc-uapi.c b/drivers/bus/fsl-mc/fsl-mc-uapi.c
new file mode 100644
index 000000000000..eeb988c9f4bb
--- /dev/null
+++ b/drivers/bus/fsl-mc/fsl-mc-uapi.c
@@ -0,0 +1,547 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Management Complex (MC) userspace support
+ *
+ * Copyright 2021 NXP
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/miscdevice.h>
+
+#include "fsl-mc-private.h"
+
+struct uapi_priv_data {
+	struct fsl_mc_uapi *uapi;
+	struct fsl_mc_io *mc_io;
+};
+
+struct fsl_mc_cmd_desc {
+	u16 cmdid_value;
+	u16 cmdid_mask;
+	int size;
+	bool token;
+	int flags;
+};
+
+#define FSL_MC_CHECK_MODULE_ID		BIT(0)
+#define FSL_MC_CAP_NET_ADMIN_NEEDED	BIT(1)
+
+enum fsl_mc_cmd_index {
+	DPDBG_DUMP = 0,
+	DPDBG_SET,
+	DPRC_GET_CONTAINER_ID,
+	DPRC_CREATE_CONT,
+	DPRC_DESTROY_CONT,
+	DPRC_ASSIGN,
+	DPRC_UNASSIGN,
+	DPRC_GET_OBJ_COUNT,
+	DPRC_GET_OBJ,
+	DPRC_GET_RES_COUNT,
+	DPRC_GET_RES_IDS,
+	DPRC_SET_OBJ_LABEL,
+	DPRC_SET_LOCKED,
+	DPRC_CONNECT,
+	DPRC_DISCONNECT,
+	DPRC_GET_POOL,
+	DPRC_GET_POOL_COUNT,
+	DPRC_GET_CONNECTION,
+	DPCI_GET_LINK_STATE,
+	DPCI_GET_PEER_ATTR,
+	DPAIOP_GET_SL_VERSION,
+	DPAIOP_GET_STATE,
+	DPMNG_GET_VERSION,
+	DPSECI_GET_TX_QUEUE,
+	DPMAC_GET_COUNTER,
+	DPMAC_GET_MAC_ADDR,
+	DPNI_SET_PRIM_MAC,
+	DPNI_GET_PRIM_MAC,
+	DPNI_GET_STATISTICS,
+	DPNI_GET_LINK_STATE,
+	GET_ATTR,
+	GET_IRQ_MASK,
+	GET_IRQ_STATUS,
+	CLOSE,
+	OPEN,
+	GET_API_VERSION,
+	DESTROY,
+	CREATE,
+};
+
+static struct fsl_mc_cmd_desc fsl_mc_accepted_cmds[] = {
+	[DPDBG_DUMP] = {
+		.cmdid_value = 0x1300,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 28,
+	},
+	[DPDBG_SET] = {
+		.cmdid_value = 0x1400,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 28,
+	},
+	[DPRC_GET_CONTAINER_ID] = {
+		.cmdid_value = 0x8300,
+		.cmdid_mask = 0xFFF0,
+		.token = false,
+		.size = 8,
+	},
+	[DPRC_CREATE_CONT] = {
+		.cmdid_value = 0x1510,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 40,
+		.flags = FSL_MC_CAP_NET_ADMIN_NEEDED,
+	},
+	[DPRC_DESTROY_CONT] = {
+		.cmdid_value = 0x1520,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 12,
+		.flags = FSL_MC_CAP_NET_ADMIN_NEEDED,
+	},
+	[DPRC_ASSIGN] = {
+		.cmdid_value = 0x1570,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 40,
+		.flags = FSL_MC_CAP_NET_ADMIN_NEEDED,
+	},
+	[DPRC_UNASSIGN] = {
+		.cmdid_value = 0x1580,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 40,
+		.flags = FSL_MC_CAP_NET_ADMIN_NEEDED,
+	},
+	[DPRC_GET_OBJ_COUNT] = {
+		.cmdid_value = 0x1590,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 16,
+	},
+	[DPRC_GET_OBJ] = {
+		.cmdid_value = 0x15A0,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 12,
+	},
+	[DPRC_GET_RES_COUNT] = {
+		.cmdid_value = 0x15B0,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 32,
+	},
+	[DPRC_GET_RES_IDS] = {
+		.cmdid_value = 0x15C0,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 40,
+	},
+	[DPRC_SET_OBJ_LABEL] = {
+		.cmdid_value = 0x1610,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 48,
+		.flags = FSL_MC_CAP_NET_ADMIN_NEEDED,
+	},
+	[DPRC_SET_LOCKED] = {
+		.cmdid_value = 0x16B0,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 16,
+		.flags = FSL_MC_CAP_NET_ADMIN_NEEDED,
+	},
+	[DPRC_CONNECT] = {
+		.cmdid_value = 0x1670,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 56,
+		.flags = FSL_MC_CAP_NET_ADMIN_NEEDED,
+	},
+	[DPRC_DISCONNECT] = {
+		.cmdid_value = 0x1680,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 32,
+		.flags = FSL_MC_CAP_NET_ADMIN_NEEDED,
+	},
+	[DPRC_GET_POOL] = {
+		.cmdid_value = 0x1690,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 12,
+	},
+	[DPRC_GET_POOL_COUNT] = {
+		.cmdid_value = 0x16A0,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 8,
+	},
+	[DPRC_GET_CONNECTION] = {
+		.cmdid_value = 0x16C0,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 32,
+	},
+
+	[DPCI_GET_LINK_STATE] = {
+		.cmdid_value = 0x0E10,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 8,
+	},
+	[DPCI_GET_PEER_ATTR] = {
+		.cmdid_value = 0x0E20,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 8,
+	},
+	[DPAIOP_GET_SL_VERSION] = {
+		.cmdid_value = 0x2820,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 8,
+	},
+	[DPAIOP_GET_STATE] = {
+		.cmdid_value = 0x2830,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 8,
+	},
+	[DPMNG_GET_VERSION] = {
+		.cmdid_value = 0x8310,
+		.cmdid_mask = 0xFFF0,
+		.token = false,
+		.size = 8,
+	},
+	[DPSECI_GET_TX_QUEUE] = {
+		.cmdid_value = 0x1970,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 14,
+	},
+	[DPMAC_GET_COUNTER] = {
+		.cmdid_value = 0x0c40,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 9,
+	},
+	[DPMAC_GET_MAC_ADDR] = {
+		.cmdid_value = 0x0c50,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 8,
+	},
+	[DPNI_SET_PRIM_MAC] = {
+		.cmdid_value = 0x2240,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 16,
+		.flags = FSL_MC_CAP_NET_ADMIN_NEEDED,
+	},
+	[DPNI_GET_PRIM_MAC] = {
+		.cmdid_value = 0x2250,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 8,
+	},
+	[DPNI_GET_STATISTICS] = {
+		.cmdid_value = 0x25D0,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 10,
+	},
+	[DPNI_GET_LINK_STATE] = {
+		.cmdid_value = 0x2150,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 8,
+	},
+	[GET_ATTR] = {
+		.cmdid_value = 0x0040,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 8,
+	},
+	[GET_IRQ_MASK] = {
+		.cmdid_value = 0x0150,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 13,
+	},
+	[GET_IRQ_STATUS] = {
+		.cmdid_value = 0x0160,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 13,
+	},
+	[CLOSE] = {
+		.cmdid_value = 0x8000,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 8,
+	},
+
+	/* Common commands amongst all types of objects. Must be checked last. */
+	[OPEN] = {
+		.cmdid_value = 0x8000,
+		.cmdid_mask = 0xFC00,
+		.token = false,
+		.size = 12,
+		.flags = FSL_MC_CHECK_MODULE_ID,
+	},
+	[GET_API_VERSION] = {
+		.cmdid_value = 0xA000,
+		.cmdid_mask = 0xFC00,
+		.token = false,
+		.size = 8,
+		.flags = FSL_MC_CHECK_MODULE_ID,
+	},
+	[DESTROY] = {
+		.cmdid_value = 0x9800,
+		.cmdid_mask = 0xFC00,
+		.token = true,
+		.size = 12,
+		.flags = FSL_MC_CHECK_MODULE_ID | FSL_MC_CAP_NET_ADMIN_NEEDED,
+	},
+	[CREATE] = {
+		.cmdid_value = 0x9000,
+		.cmdid_mask = 0xFC00,
+		.token = true,
+		.size = 64,
+		.flags = FSL_MC_CHECK_MODULE_ID | FSL_MC_CAP_NET_ADMIN_NEEDED,
+	},
+};
+
+#define FSL_MC_NUM_ACCEPTED_CMDS ARRAY_SIZE(fsl_mc_accepted_cmds)
+
+#define FSL_MC_MAX_MODULE_ID 0x10
+
+static int fsl_mc_command_check(struct fsl_mc_device *mc_dev,
+				struct fsl_mc_command *mc_cmd)
+{
+	struct fsl_mc_cmd_desc *desc = NULL;
+	int mc_cmd_max_size, i;
+	bool token_provided;
+	u16 cmdid, module_id;
+	char *mc_cmd_end;
+	char sum = 0;
+
+	/* Check if this is an accepted MC command */
+	cmdid = mc_cmd_hdr_read_cmdid(mc_cmd);
+	for (i = 0; i < FSL_MC_NUM_ACCEPTED_CMDS; i++) {
+		desc = &fsl_mc_accepted_cmds[i];
+		if ((cmdid & desc->cmdid_mask) == desc->cmdid_value)
+			break;
+	}
+	if (!desc) {
+		dev_err(&mc_dev->dev, "MC command 0x%04x: cmdid not accepted\n", cmdid);
+		return -EACCES;
+	}
+
+	/* Check if the size of the command is honored. Anything beyond the
+	 * last valid byte of the command should be zeroed.
+	 */
+	mc_cmd_max_size = sizeof(*mc_cmd);
+	mc_cmd_end = ((char *)mc_cmd) + desc->size;
+	for (i = desc->size; i < mc_cmd_max_size; i++)
+		sum |= *mc_cmd_end++;
+	if (sum) {
+		dev_err(&mc_dev->dev, "MC command 0x%04x: garbage beyond max size of %d bytes!\n",
+			cmdid, desc->size);
+		return -EACCES;
+	}
+
+	/* Some MC commands request a token to be passed so that object
+	 * identification is possible. Check if the token passed in the command
+	 * is as expected.
+	 */
+	token_provided = mc_cmd_hdr_read_token(mc_cmd) ? true : false;
+	if (token_provided != desc->token) {
+		dev_err(&mc_dev->dev, "MC command 0x%04x: token 0x%04x is invalid!\n",
+			cmdid, mc_cmd_hdr_read_token(mc_cmd));
+		return -EACCES;
+	}
+
+	/* If needed, check if the module ID passed is valid */
+	if (desc->flags & FSL_MC_CHECK_MODULE_ID) {
+		/* The module ID is represented by bits [4:9] from the cmdid */
+		module_id = (cmdid & GENMASK(9, 4)) >> 4;
+		if (module_id == 0 || module_id > FSL_MC_MAX_MODULE_ID) {
+			dev_err(&mc_dev->dev, "MC command 0x%04x: unknown module ID 0x%x\n",
+				cmdid, module_id);
+			return -EACCES;
+		}
+	}
+
+	/* Some commands alter how hardware resources are managed. For these
+	 * commands, check for CAP_NET_ADMIN.
+	 */
+	if (desc->flags & FSL_MC_CAP_NET_ADMIN_NEEDED) {
+		if (!capable(CAP_NET_ADMIN)) {
+			dev_err(&mc_dev->dev, "MC command 0x%04x: needs CAP_NET_ADMIN!\n",
+				cmdid);
+			return -EPERM;
+		}
+	}
+
+	return 0;
+}
+
+static int fsl_mc_uapi_send_command(struct fsl_mc_device *mc_dev, unsigned long arg,
+				    struct fsl_mc_io *mc_io)
+{
+	struct fsl_mc_command mc_cmd;
+	int error;
+
+	error = copy_from_user(&mc_cmd, (void __user *)arg, sizeof(mc_cmd));
+	if (error)
+		return -EFAULT;
+
+	error = fsl_mc_command_check(mc_dev, &mc_cmd);
+	if (error)
+		return error;
+
+	error = mc_send_command(mc_io, &mc_cmd);
+	if (error)
+		return error;
+
+	error = copy_to_user((void __user *)arg, &mc_cmd, sizeof(mc_cmd));
+	if (error)
+		return -EFAULT;
+
+	return 0;
+}
+
+static int fsl_mc_uapi_dev_open(struct inode *inode, struct file *filep)
+{
+	struct fsl_mc_device *root_mc_device;
+	struct uapi_priv_data *priv_data;
+	struct fsl_mc_io *dynamic_mc_io;
+	struct fsl_mc_uapi *mc_uapi;
+	struct fsl_mc_bus *mc_bus;
+	int error;
+
+	priv_data = kzalloc(sizeof(*priv_data), GFP_KERNEL);
+	if (!priv_data)
+		return -ENOMEM;
+
+	mc_uapi = container_of(filep->private_data, struct fsl_mc_uapi, misc);
+	mc_bus = container_of(mc_uapi, struct fsl_mc_bus, uapi_misc);
+	root_mc_device = &mc_bus->mc_dev;
+
+	mutex_lock(&mc_uapi->mutex);
+
+	if (!mc_uapi->local_instance_in_use) {
+		priv_data->mc_io = mc_uapi->static_mc_io;
+		mc_uapi->local_instance_in_use = 1;
+	} else {
+		error = fsl_mc_portal_allocate(root_mc_device, 0,
+					       &dynamic_mc_io);
+		if (error) {
+			dev_dbg(&root_mc_device->dev,
+				"Could not allocate MC portal\n");
+			goto error_portal_allocate;
+		}
+
+		priv_data->mc_io = dynamic_mc_io;
+	}
+	priv_data->uapi = mc_uapi;
+	filep->private_data = priv_data;
+
+	mutex_unlock(&mc_uapi->mutex);
+
+	return 0;
+
+error_portal_allocate:
+	mutex_unlock(&mc_uapi->mutex);
+	kfree(priv_data);
+
+	return error;
+}
+
+static int fsl_mc_uapi_dev_release(struct inode *inode, struct file *filep)
+{
+	struct uapi_priv_data *priv_data;
+	struct fsl_mc_uapi *mc_uapi;
+	struct fsl_mc_io *mc_io;
+
+	priv_data = filep->private_data;
+	mc_uapi = priv_data->uapi;
+	mc_io = priv_data->mc_io;
+
+	mutex_lock(&mc_uapi->mutex);
+
+	if (mc_io == mc_uapi->static_mc_io)
+		mc_uapi->local_instance_in_use = 0;
+	else
+		fsl_mc_portal_free(mc_io);
+
+	kfree(filep->private_data);
+	filep->private_data =  NULL;
+
+	mutex_unlock(&mc_uapi->mutex);
+
+	return 0;
+}
+
+static long fsl_mc_uapi_dev_ioctl(struct file *file,
+				  unsigned int cmd,
+				  unsigned long arg)
+{
+	struct uapi_priv_data *priv_data = file->private_data;
+	struct fsl_mc_device *root_mc_device;
+	struct fsl_mc_bus *mc_bus;
+	int error;
+
+	mc_bus = container_of(priv_data->uapi, struct fsl_mc_bus, uapi_misc);
+	root_mc_device = &mc_bus->mc_dev;
+
+	switch (cmd) {
+	case FSL_MC_SEND_MC_COMMAND:
+		error = fsl_mc_uapi_send_command(root_mc_device, arg, priv_data->mc_io);
+		break;
+	default:
+		dev_dbg(&root_mc_device->dev, "unexpected ioctl call number\n");
+		error = -EINVAL;
+	}
+
+	return error;
+}
+
+static const struct file_operations fsl_mc_uapi_dev_fops = {
+	.owner = THIS_MODULE,
+	.open = fsl_mc_uapi_dev_open,
+	.release = fsl_mc_uapi_dev_release,
+	.unlocked_ioctl = fsl_mc_uapi_dev_ioctl,
+};
+
+int fsl_mc_uapi_create_device_file(struct fsl_mc_bus *mc_bus)
+{
+	struct fsl_mc_device *mc_dev = &mc_bus->mc_dev;
+	struct fsl_mc_uapi *mc_uapi = &mc_bus->uapi_misc;
+	int error;
+
+	mc_uapi->misc.minor = MISC_DYNAMIC_MINOR;
+	mc_uapi->misc.name = dev_name(&mc_dev->dev);
+	mc_uapi->misc.fops = &fsl_mc_uapi_dev_fops;
+
+	error = misc_register(&mc_uapi->misc);
+	if (error)
+		return error;
+
+	mc_uapi->static_mc_io = mc_bus->mc_dev.mc_io;
+
+	mutex_init(&mc_uapi->mutex);
+
+	return 0;
+}
+
+void fsl_mc_uapi_remove_device_file(struct fsl_mc_bus *mc_bus)
+{
+	misc_deregister(&mc_bus->uapi_misc.misc);
+}
diff --git a/include/uapi/linux/fsl_mc.h b/include/uapi/linux/fsl_mc.h
index cf56d46f052e..e57451570033 100644
--- a/include/uapi/linux/fsl_mc.h
+++ b/include/uapi/linux/fsl_mc.h
@@ -16,10 +16,19 @@
  * struct fsl_mc_command - Management Complex (MC) command structure
  * @header: MC command header
  * @params: MC command parameters
+ *
+ * Used by FSL_MC_SEND_MC_COMMAND
  */
 struct fsl_mc_command {
 	__le64 header;
 	__le64 params[MC_CMD_NUM_OF_PARAMS];
 };
 
+#define FSL_MC_SEND_CMD_IOCTL_TYPE	'R'
+#define FSL_MC_SEND_CMD_IOCTL_SEQ	0xE0
+
+#define FSL_MC_SEND_MC_COMMAND \
+	_IOWR(FSL_MC_SEND_CMD_IOCTL_TYPE, FSL_MC_SEND_CMD_IOCTL_SEQ, \
+	struct fsl_mc_command)
+
 #endif /* _UAPI_FSL_MC_H_ */
-- 
cgit v1.2.3-71-gd317


From c98fe7c2a2038af2bc24f039ccdb5a65c22ba11a Mon Sep 17 00:00:00 2001
From: Steve Sistare <steven.sistare@oracle.com>
Date: Fri, 29 Jan 2021 08:54:04 -0800
Subject: vfio: option to unmap all

For the UNMAP_DMA ioctl, delete all mappings if VFIO_DMA_UNMAP_FLAG_ALL
is set.  Define the corresponding VFIO_UNMAP_ALL extension.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 include/uapi/linux/vfio.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index d1812777139f..78462599e5ed 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -46,6 +46,9 @@
  */
 #define VFIO_NOIOMMU_IOMMU		8
 
+/* Supports VFIO_DMA_UNMAP_FLAG_ALL */
+#define VFIO_UNMAP_ALL			9
+
 /*
  * The IOCTL interface is designed for extensibility by embedding the
  * structure length (argsz) and flags into structures passed between
@@ -1102,6 +1105,7 @@ struct vfio_bitmap {
  * field.  No guarantee is made to the user that arbitrary unmaps of iova
  * or size different from those used in the original mapping call will
  * succeed.
+ *
  * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP should be set to get the dirty bitmap
  * before unmapping IO virtual addresses. When this flag is set, the user must
  * provide a struct vfio_bitmap in data[]. User must provide zero-allocated
@@ -1111,11 +1115,15 @@ struct vfio_bitmap {
  * indicates that the page at that offset from iova is dirty. A Bitmap of the
  * pages in the range of unmapped size is returned in the user-provided
  * vfio_bitmap.data.
+ *
+ * If flags & VFIO_DMA_UNMAP_FLAG_ALL, unmap all addresses.  iova and size
+ * must be 0.  This cannot be combined with the get-dirty-bitmap flag.
  */
 struct vfio_iommu_type1_dma_unmap {
 	__u32	argsz;
 	__u32	flags;
 #define VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP (1 << 0)
+#define VFIO_DMA_UNMAP_FLAG_ALL		     (1 << 1)
 	__u64	iova;				/* IO virtual address */
 	__u64	size;				/* Size of mapping (bytes) */
 	__u8    data[];
-- 
cgit v1.2.3-71-gd317


From 441e8106a238920f28e2e79cff462044551f60e2 Mon Sep 17 00:00:00 2001
From: Steve Sistare <steven.sistare@oracle.com>
Date: Fri, 29 Jan 2021 08:54:07 -0800
Subject: vfio: interfaces to update vaddr

Define interfaces that allow the underlying memory object of an iova
range to be mapped to a new host virtual address in the host process:

  - VFIO_DMA_UNMAP_FLAG_VADDR for VFIO_IOMMU_UNMAP_DMA
  - VFIO_DMA_MAP_FLAG_VADDR flag for VFIO_IOMMU_MAP_DMA
  - VFIO_UPDATE_VADDR extension for VFIO_CHECK_EXTENSION

Unmap vaddr invalidates the host virtual address in an iova range, and
blocks vfio translation of host virtual addresses.  DMA to already-mapped
pages continues.  Map vaddr updates the base VA and resumes translation.
See comments in uapi/linux/vfio.h for more details.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 include/uapi/linux/vfio.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 78462599e5ed..8ce36c1d53ca 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -49,6 +49,9 @@
 /* Supports VFIO_DMA_UNMAP_FLAG_ALL */
 #define VFIO_UNMAP_ALL			9
 
+/* Supports the vaddr flag for DMA map and unmap */
+#define VFIO_UPDATE_VADDR		10
+
 /*
  * The IOCTL interface is designed for extensibility by embedding the
  * structure length (argsz) and flags into structures passed between
@@ -1077,12 +1080,22 @@ struct vfio_iommu_type1_info_dma_avail {
  *
  * Map process virtual addresses to IO virtual addresses using the
  * provided struct vfio_dma_map. Caller sets argsz. READ &/ WRITE required.
+ *
+ * If flags & VFIO_DMA_MAP_FLAG_VADDR, update the base vaddr for iova, and
+ * unblock translation of host virtual addresses in the iova range.  The vaddr
+ * must have previously been invalidated with VFIO_DMA_UNMAP_FLAG_VADDR.  To
+ * maintain memory consistency within the user application, the updated vaddr
+ * must address the same memory object as originally mapped.  Failure to do so
+ * will result in user memory corruption and/or device misbehavior.  iova and
+ * size must match those in the original MAP_DMA call.  Protection is not
+ * changed, and the READ & WRITE flags must be 0.
  */
 struct vfio_iommu_type1_dma_map {
 	__u32	argsz;
 	__u32	flags;
 #define VFIO_DMA_MAP_FLAG_READ (1 << 0)		/* readable from device */
 #define VFIO_DMA_MAP_FLAG_WRITE (1 << 1)	/* writable from device */
+#define VFIO_DMA_MAP_FLAG_VADDR (1 << 2)
 	__u64	vaddr;				/* Process virtual address */
 	__u64	iova;				/* IO virtual address */
 	__u64	size;				/* Size of mapping (bytes) */
@@ -1118,12 +1131,18 @@ struct vfio_bitmap {
  *
  * If flags & VFIO_DMA_UNMAP_FLAG_ALL, unmap all addresses.  iova and size
  * must be 0.  This cannot be combined with the get-dirty-bitmap flag.
+ *
+ * If flags & VFIO_DMA_UNMAP_FLAG_VADDR, do not unmap, but invalidate host
+ * virtual addresses in the iova range.  Tasks that attempt to translate an
+ * iova's vaddr will block.  DMA to already-mapped pages continues.  This
+ * cannot be combined with the get-dirty-bitmap flag.
  */
 struct vfio_iommu_type1_dma_unmap {
 	__u32	argsz;
 	__u32	flags;
 #define VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP (1 << 0)
 #define VFIO_DMA_UNMAP_FLAG_ALL		     (1 << 1)
+#define VFIO_DMA_UNMAP_FLAG_VADDR	     (1 << 2)
 	__u64	iova;				/* IO virtual address */
 	__u64	size;				/* Size of mapping (bytes) */
 	__u8    data[];
-- 
cgit v1.2.3-71-gd317


From 9c5137aedd112f78a968bdd2325de2ea06df46c0 Mon Sep 17 00:00:00 2001
From: Shuo Liu <shuo.a.liu@intel.com>
Date: Sun, 7 Feb 2021 11:10:28 +0800
Subject: virt: acrn: Introduce VM management interfaces

The VM management interfaces expose several VM operations to ACRN
userspace via ioctls. For example, creating VM, starting VM, destroying
VM and so on.

The ACRN Hypervisor needs to exchange data with the ACRN userspace
during the VM operations. HSM provides VM operation ioctls to the ACRN
userspace and communicates with the ACRN Hypervisor for VM operations
via hypercalls.

HSM maintains a list of User VM. Each User VM will be bound to an
existing file descriptor of /dev/acrn_hsm. The User VM will be
destroyed when the file descriptor is closed.

Cc: Zhi Wang <zhi.a.wang@intel.com>
Cc: Zhenyu Wang <zhenyuw@linux.intel.com>
Cc: Yu Wang <yu1.wang@intel.com>
Cc: Reinette Chatre <reinette.chatre@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Zhi Wang <zhi.a.wang@intel.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Shuo Liu <shuo.a.liu@intel.com>
Link: https://lore.kernel.org/r/20210207031040.49576-7-shuo.a.liu@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/userspace-api/ioctl/ioctl-number.rst |  1 +
 MAINTAINERS                                        |  1 +
 drivers/virt/acrn/Makefile                         |  2 +-
 drivers/virt/acrn/acrn_drv.h                       | 21 +++++-
 drivers/virt/acrn/hsm.c                            | 76 ++++++++++++++++++++-
 drivers/virt/acrn/hypercall.h                      | 78 ++++++++++++++++++++++
 drivers/virt/acrn/vm.c                             | 68 +++++++++++++++++++
 include/uapi/linux/acrn.h                          | 58 ++++++++++++++++
 8 files changed, 301 insertions(+), 4 deletions(-)
 create mode 100644 drivers/virt/acrn/hypercall.h
 create mode 100644 drivers/virt/acrn/vm.c
 create mode 100644 include/uapi/linux/acrn.h

(limited to 'include/uapi/linux')

diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index 19d730f9d136..76c27dbae96b 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -320,6 +320,7 @@ Code  Seq#    Include File                                           Comments
 0xA0  all    linux/sdp/sdp.h                                         Industrial Device Project
                                                                      <mailto:kenji@bitgate.com>
 0xA1  0      linux/vtpm_proxy.h                                      TPM Emulator Proxy Driver
+0xA2  all    uapi/linux/acrn.h                                       ACRN hypervisor
 0xA3  80-8F                                                          Port ACL  in development:
                                                                      <mailto:tlewis@mindspring.com>
 0xA3  90-9F  linux/dtlk.h
diff --git a/MAINTAINERS b/MAINTAINERS
index 4ac781da514f..6b5990a28b44 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -443,6 +443,7 @@ S:	Supported
 W:	https://projectacrn.org
 F:	Documentation/virt/acrn/
 F:	drivers/virt/acrn/
+F:	include/uapi/linux/acrn.h
 
 AD1889 ALSA SOUND DRIVER
 L:	linux-parisc@vger.kernel.org
diff --git a/drivers/virt/acrn/Makefile b/drivers/virt/acrn/Makefile
index 6920ed798aaf..cf8b4ed5e74e 100644
--- a/drivers/virt/acrn/Makefile
+++ b/drivers/virt/acrn/Makefile
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_ACRN_HSM)	:= acrn.o
-acrn-y := hsm.o
+acrn-y := hsm.o vm.o
diff --git a/drivers/virt/acrn/acrn_drv.h b/drivers/virt/acrn/acrn_drv.h
index 29eedd696327..e5aba86cad8c 100644
--- a/drivers/virt/acrn/acrn_drv.h
+++ b/drivers/virt/acrn/acrn_drv.h
@@ -3,16 +3,35 @@
 #ifndef __ACRN_HSM_DRV_H
 #define __ACRN_HSM_DRV_H
 
+#include <linux/acrn.h>
+#include <linux/dev_printk.h>
+#include <linux/miscdevice.h>
 #include <linux/types.h>
 
+#include "hypercall.h"
+
+extern struct miscdevice acrn_dev;
+
 #define ACRN_INVALID_VMID (0xffffU)
 
+#define ACRN_VM_FLAG_DESTROYED		0U
 /**
  * struct acrn_vm - Properties of ACRN User VM.
+ * @list:	Entry within global list of all VMs
  * @vmid:	User VM ID
+ * @vcpu_num:	Number of virtual CPUs in the VM
+ * @flags:	Flags (ACRN_VM_FLAG_*) of the VM. This is VM flag management
+ *		in HSM which is different from the &acrn_vm_creation.vm_flag.
  */
 struct acrn_vm {
-	u16	vmid;
+	struct list_head	list;
+	u16			vmid;
+	int			vcpu_num;
+	unsigned long		flags;
 };
 
+struct acrn_vm *acrn_vm_create(struct acrn_vm *vm,
+			       struct acrn_vm_creation *vm_param);
+int acrn_vm_destroy(struct acrn_vm *vm);
+
 #endif /* __ACRN_HSM_DRV_H */
diff --git a/drivers/virt/acrn/hsm.c b/drivers/virt/acrn/hsm.c
index a8dcb250649d..5fd933471683 100644
--- a/drivers/virt/acrn/hsm.c
+++ b/drivers/virt/acrn/hsm.c
@@ -9,7 +9,6 @@
  *	Yakui Zhao <yakui.zhao@intel.com>
  */
 
-#include <linux/miscdevice.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/slab.h>
@@ -38,10 +37,82 @@ static int acrn_dev_open(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+/*
+ * HSM relies on hypercall layer of the ACRN hypervisor to do the
+ * sanity check against the input parameters.
+ */
+static long acrn_dev_ioctl(struct file *filp, unsigned int cmd,
+			   unsigned long ioctl_param)
+{
+	struct acrn_vm *vm = filp->private_data;
+	struct acrn_vm_creation *vm_param;
+	int ret = 0;
+
+	if (vm->vmid == ACRN_INVALID_VMID && cmd != ACRN_IOCTL_CREATE_VM) {
+		dev_dbg(acrn_dev.this_device,
+			"ioctl 0x%x: Invalid VM state!\n", cmd);
+		return -EINVAL;
+	}
+
+	switch (cmd) {
+	case ACRN_IOCTL_CREATE_VM:
+		vm_param = memdup_user((void __user *)ioctl_param,
+				       sizeof(struct acrn_vm_creation));
+		if (IS_ERR(vm_param))
+			return PTR_ERR(vm_param);
+
+		if ((vm_param->reserved0 | vm_param->reserved1) != 0)
+			return -EINVAL;
+
+		vm = acrn_vm_create(vm, vm_param);
+		if (!vm) {
+			ret = -EINVAL;
+			kfree(vm_param);
+			break;
+		}
+
+		if (copy_to_user((void __user *)ioctl_param, vm_param,
+				 sizeof(struct acrn_vm_creation))) {
+			acrn_vm_destroy(vm);
+			ret = -EFAULT;
+		}
+
+		kfree(vm_param);
+		break;
+	case ACRN_IOCTL_START_VM:
+		ret = hcall_start_vm(vm->vmid);
+		if (ret < 0)
+			dev_dbg(acrn_dev.this_device,
+				"Failed to start VM %u!\n", vm->vmid);
+		break;
+	case ACRN_IOCTL_PAUSE_VM:
+		ret = hcall_pause_vm(vm->vmid);
+		if (ret < 0)
+			dev_dbg(acrn_dev.this_device,
+				"Failed to pause VM %u!\n", vm->vmid);
+		break;
+	case ACRN_IOCTL_RESET_VM:
+		ret = hcall_reset_vm(vm->vmid);
+		if (ret < 0)
+			dev_dbg(acrn_dev.this_device,
+				"Failed to restart VM %u!\n", vm->vmid);
+		break;
+	case ACRN_IOCTL_DESTROY_VM:
+		ret = acrn_vm_destroy(vm);
+		break;
+	default:
+		dev_dbg(acrn_dev.this_device, "Unknown IOCTL 0x%x!\n", cmd);
+		ret = -ENOTTY;
+	}
+
+	return ret;
+}
+
 static int acrn_dev_release(struct inode *inode, struct file *filp)
 {
 	struct acrn_vm *vm = filp->private_data;
 
+	acrn_vm_destroy(vm);
 	kfree(vm);
 	return 0;
 }
@@ -50,9 +121,10 @@ static const struct file_operations acrn_fops = {
 	.owner		= THIS_MODULE,
 	.open		= acrn_dev_open,
 	.release	= acrn_dev_release,
+	.unlocked_ioctl = acrn_dev_ioctl,
 };
 
-static struct miscdevice acrn_dev = {
+struct miscdevice acrn_dev = {
 	.minor	= MISC_DYNAMIC_MINOR,
 	.name	= "acrn_hsm",
 	.fops	= &acrn_fops,
diff --git a/drivers/virt/acrn/hypercall.h b/drivers/virt/acrn/hypercall.h
new file mode 100644
index 000000000000..426b66cadb1f
--- /dev/null
+++ b/drivers/virt/acrn/hypercall.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * ACRN HSM: hypercalls of ACRN Hypervisor
+ */
+#ifndef __ACRN_HSM_HYPERCALL_H
+#define __ACRN_HSM_HYPERCALL_H
+#include <asm/acrn.h>
+
+/*
+ * Hypercall IDs of the ACRN Hypervisor
+ */
+#define _HC_ID(x, y) (((x) << 24) | (y))
+
+#define HC_ID 0x80UL
+
+#define HC_ID_VM_BASE			0x10UL
+#define HC_CREATE_VM			_HC_ID(HC_ID, HC_ID_VM_BASE + 0x00)
+#define HC_DESTROY_VM			_HC_ID(HC_ID, HC_ID_VM_BASE + 0x01)
+#define HC_START_VM			_HC_ID(HC_ID, HC_ID_VM_BASE + 0x02)
+#define HC_PAUSE_VM			_HC_ID(HC_ID, HC_ID_VM_BASE + 0x03)
+#define HC_RESET_VM			_HC_ID(HC_ID, HC_ID_VM_BASE + 0x05)
+
+/**
+ * hcall_create_vm() - Create a User VM
+ * @vminfo:	Service VM GPA of info of User VM creation
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static inline long hcall_create_vm(u64 vminfo)
+{
+	return acrn_hypercall1(HC_CREATE_VM, vminfo);
+}
+
+/**
+ * hcall_start_vm() - Start a User VM
+ * @vmid:	User VM ID
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static inline long hcall_start_vm(u64 vmid)
+{
+	return acrn_hypercall1(HC_START_VM, vmid);
+}
+
+/**
+ * hcall_pause_vm() - Pause a User VM
+ * @vmid:	User VM ID
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static inline long hcall_pause_vm(u64 vmid)
+{
+	return acrn_hypercall1(HC_PAUSE_VM, vmid);
+}
+
+/**
+ * hcall_destroy_vm() - Destroy a User VM
+ * @vmid:	User VM ID
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static inline long hcall_destroy_vm(u64 vmid)
+{
+	return acrn_hypercall1(HC_DESTROY_VM, vmid);
+}
+
+/**
+ * hcall_reset_vm() - Reset a User VM
+ * @vmid:	User VM ID
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static inline long hcall_reset_vm(u64 vmid)
+{
+	return acrn_hypercall1(HC_RESET_VM, vmid);
+}
+
+#endif /* __ACRN_HSM_HYPERCALL_H */
diff --git a/drivers/virt/acrn/vm.c b/drivers/virt/acrn/vm.c
new file mode 100644
index 000000000000..3f667ac8ac1e
--- /dev/null
+++ b/drivers/virt/acrn/vm.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ACRN_HSM: Virtual Machine management
+ *
+ * Copyright (C) 2020 Intel Corporation. All rights reserved.
+ *
+ * Authors:
+ *	Jason Chen CJ <jason.cj.chen@intel.com>
+ *	Yakui Zhao <yakui.zhao@intel.com>
+ */
+#include <linux/io.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+#include "acrn_drv.h"
+
+/* List of VMs */
+static LIST_HEAD(acrn_vm_list);
+/* To protect acrn_vm_list */
+static DEFINE_MUTEX(acrn_vm_list_lock);
+
+struct acrn_vm *acrn_vm_create(struct acrn_vm *vm,
+			       struct acrn_vm_creation *vm_param)
+{
+	int ret;
+
+	ret = hcall_create_vm(virt_to_phys(vm_param));
+	if (ret < 0 || vm_param->vmid == ACRN_INVALID_VMID) {
+		dev_err(acrn_dev.this_device,
+			"Failed to create VM! Error: %d\n", ret);
+		return NULL;
+	}
+
+	vm->vmid = vm_param->vmid;
+	vm->vcpu_num = vm_param->vcpu_num;
+
+	mutex_lock(&acrn_vm_list_lock);
+	list_add(&vm->list, &acrn_vm_list);
+	mutex_unlock(&acrn_vm_list_lock);
+
+	dev_dbg(acrn_dev.this_device, "VM %u created.\n", vm->vmid);
+	return vm;
+}
+
+int acrn_vm_destroy(struct acrn_vm *vm)
+{
+	int ret;
+
+	if (vm->vmid == ACRN_INVALID_VMID ||
+	    test_and_set_bit(ACRN_VM_FLAG_DESTROYED, &vm->flags))
+		return 0;
+
+	/* Remove from global VM list */
+	mutex_lock(&acrn_vm_list_lock);
+	list_del_init(&vm->list);
+	mutex_unlock(&acrn_vm_list_lock);
+
+	ret = hcall_destroy_vm(vm->vmid);
+	if (ret < 0) {
+		dev_err(acrn_dev.this_device,
+			"Failed to destroy VM %u\n", vm->vmid);
+		clear_bit(ACRN_VM_FLAG_DESTROYED, &vm->flags);
+		return ret;
+	}
+	dev_dbg(acrn_dev.this_device, "VM %u destroyed.\n", vm->vmid);
+	vm->vmid = ACRN_INVALID_VMID;
+	return 0;
+}
diff --git a/include/uapi/linux/acrn.h b/include/uapi/linux/acrn.h
new file mode 100644
index 000000000000..32521168075f
--- /dev/null
+++ b/include/uapi/linux/acrn.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Userspace interface for /dev/acrn_hsm - ACRN Hypervisor Service Module
+ *
+ * This file can be used by applications that need to communicate with the HSM
+ * via the ioctl interface.
+ *
+ * Copyright (C) 2021 Intel Corporation. All rights reserved.
+ */
+
+#ifndef _UAPI_ACRN_H
+#define _UAPI_ACRN_H
+
+#include <linux/types.h>
+#include <linux/uuid.h>
+
+/**
+ * struct acrn_vm_creation - Info to create a User VM
+ * @vmid:		User VM ID returned from the hypervisor
+ * @reserved0:		Reserved and must be 0
+ * @vcpu_num:		Number of vCPU in the VM. Return from hypervisor.
+ * @reserved1:		Reserved and must be 0
+ * @uuid:		UUID of the VM. Pass to hypervisor directly.
+ * @vm_flag:		Flag of the VM creating. Pass to hypervisor directly.
+ * @ioreq_buf:		Service VM GPA of I/O request buffer. Pass to
+ *			hypervisor directly.
+ * @cpu_affinity:	CPU affinity of the VM. Pass to hypervisor directly.
+ * 			It's a bitmap which indicates CPUs used by the VM.
+ */
+struct acrn_vm_creation {
+	__u16	vmid;
+	__u16	reserved0;
+	__u16	vcpu_num;
+	__u16	reserved1;
+	guid_t	uuid;
+	__u64	vm_flag;
+	__u64	ioreq_buf;
+	__u64	cpu_affinity;
+};
+
+/* The ioctl type, documented in ioctl-number.rst */
+#define ACRN_IOCTL_TYPE			0xA2
+
+/*
+ * Common IOCTL IDs definition for ACRN userspace
+ */
+#define ACRN_IOCTL_CREATE_VM		\
+	_IOWR(ACRN_IOCTL_TYPE, 0x10, struct acrn_vm_creation)
+#define ACRN_IOCTL_DESTROY_VM		\
+	_IO(ACRN_IOCTL_TYPE, 0x11)
+#define ACRN_IOCTL_START_VM		\
+	_IO(ACRN_IOCTL_TYPE, 0x12)
+#define ACRN_IOCTL_PAUSE_VM		\
+	_IO(ACRN_IOCTL_TYPE, 0x13)
+#define ACRN_IOCTL_RESET_VM		\
+	_IO(ACRN_IOCTL_TYPE, 0x15)
+
+#endif /* _UAPI_ACRN_H */
-- 
cgit v1.2.3-71-gd317


From 2ad2aaee1bc9568d0c146463483d2c926ef20055 Mon Sep 17 00:00:00 2001
From: Shuo Liu <shuo.a.liu@intel.com>
Date: Sun, 7 Feb 2021 11:10:29 +0800
Subject: virt: acrn: Introduce an ioctl to set vCPU registers state

A virtual CPU of User VM has different context due to the different
registers state. ACRN userspace needs to set the virtual CPU
registers state (e.g. giving a initial registers state to a virtual
BSP of a User VM).

HSM provides an ioctl ACRN_IOCTL_SET_VCPU_REGS to do the virtual CPU
registers state setting. The ioctl passes the registers state from ACRN
userspace to the hypervisor directly.

Cc: Zhi Wang <zhi.a.wang@intel.com>
Cc: Zhenyu Wang <zhenyuw@linux.intel.com>
Cc: Yu Wang <yu1.wang@intel.com>
Cc: Reinette Chatre <reinette.chatre@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Zhi Wang <zhi.a.wang@intel.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Shuo Liu <shuo.a.liu@intel.com>
Link: https://lore.kernel.org/r/20210207031040.49576-8-shuo.a.liu@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/virt/acrn/hsm.c       |  34 +++++++++++-
 drivers/virt/acrn/hypercall.h |  13 +++++
 include/uapi/linux/acrn.h     | 119 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 165 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/virt/acrn/hsm.c b/drivers/virt/acrn/hsm.c
index 5fd933471683..ee5cc7413239 100644
--- a/drivers/virt/acrn/hsm.c
+++ b/drivers/virt/acrn/hsm.c
@@ -9,6 +9,7 @@
  *	Yakui Zhao <yakui.zhao@intel.com>
  */
 
+#include <linux/io.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/slab.h>
@@ -46,7 +47,8 @@ static long acrn_dev_ioctl(struct file *filp, unsigned int cmd,
 {
 	struct acrn_vm *vm = filp->private_data;
 	struct acrn_vm_creation *vm_param;
-	int ret = 0;
+	struct acrn_vcpu_regs *cpu_regs;
+	int i, ret = 0;
 
 	if (vm->vmid == ACRN_INVALID_VMID && cmd != ACRN_IOCTL_CREATE_VM) {
 		dev_dbg(acrn_dev.this_device,
@@ -100,6 +102,36 @@ static long acrn_dev_ioctl(struct file *filp, unsigned int cmd,
 	case ACRN_IOCTL_DESTROY_VM:
 		ret = acrn_vm_destroy(vm);
 		break;
+	case ACRN_IOCTL_SET_VCPU_REGS:
+		cpu_regs = memdup_user((void __user *)ioctl_param,
+				       sizeof(struct acrn_vcpu_regs));
+		if (IS_ERR(cpu_regs))
+			return PTR_ERR(cpu_regs);
+
+		for (i = 0; i < ARRAY_SIZE(cpu_regs->reserved); i++)
+			if (cpu_regs->reserved[i])
+				return -EINVAL;
+
+		for (i = 0; i < ARRAY_SIZE(cpu_regs->vcpu_regs.reserved_32); i++)
+			if (cpu_regs->vcpu_regs.reserved_32[i])
+				return -EINVAL;
+
+		for (i = 0; i < ARRAY_SIZE(cpu_regs->vcpu_regs.reserved_64); i++)
+			if (cpu_regs->vcpu_regs.reserved_64[i])
+				return -EINVAL;
+
+		for (i = 0; i < ARRAY_SIZE(cpu_regs->vcpu_regs.gdt.reserved); i++)
+			if (cpu_regs->vcpu_regs.gdt.reserved[i] |
+			    cpu_regs->vcpu_regs.idt.reserved[i])
+				return -EINVAL;
+
+		ret = hcall_set_vcpu_regs(vm->vmid, virt_to_phys(cpu_regs));
+		if (ret < 0)
+			dev_dbg(acrn_dev.this_device,
+				"Failed to set regs state of VM%u!\n",
+				vm->vmid);
+		kfree(cpu_regs);
+		break;
 	default:
 		dev_dbg(acrn_dev.this_device, "Unknown IOCTL 0x%x!\n", cmd);
 		ret = -ENOTTY;
diff --git a/drivers/virt/acrn/hypercall.h b/drivers/virt/acrn/hypercall.h
index 426b66cadb1f..f29cfae08862 100644
--- a/drivers/virt/acrn/hypercall.h
+++ b/drivers/virt/acrn/hypercall.h
@@ -19,6 +19,7 @@
 #define HC_START_VM			_HC_ID(HC_ID, HC_ID_VM_BASE + 0x02)
 #define HC_PAUSE_VM			_HC_ID(HC_ID, HC_ID_VM_BASE + 0x03)
 #define HC_RESET_VM			_HC_ID(HC_ID, HC_ID_VM_BASE + 0x05)
+#define HC_SET_VCPU_REGS		_HC_ID(HC_ID, HC_ID_VM_BASE + 0x06)
 
 /**
  * hcall_create_vm() - Create a User VM
@@ -75,4 +76,16 @@ static inline long hcall_reset_vm(u64 vmid)
 	return acrn_hypercall1(HC_RESET_VM, vmid);
 }
 
+/**
+ * hcall_set_vcpu_regs() - Set up registers of virtual CPU of a User VM
+ * @vmid:	User VM ID
+ * @regs_state:	Service VM GPA of registers state
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static inline long hcall_set_vcpu_regs(u64 vmid, u64 regs_state)
+{
+	return acrn_hypercall2(HC_SET_VCPU_REGS, vmid, regs_state);
+}
+
 #endif /* __ACRN_HSM_HYPERCALL_H */
diff --git a/include/uapi/linux/acrn.h b/include/uapi/linux/acrn.h
index 32521168075f..775c58bad026 100644
--- a/include/uapi/linux/acrn.h
+++ b/include/uapi/linux/acrn.h
@@ -38,6 +38,123 @@ struct acrn_vm_creation {
 	__u64	cpu_affinity;
 };
 
+/**
+ * struct acrn_gp_regs - General registers of a User VM
+ * @rax:	Value of register RAX
+ * @rcx:	Value of register RCX
+ * @rdx:	Value of register RDX
+ * @rbx:	Value of register RBX
+ * @rsp:	Value of register RSP
+ * @rbp:	Value of register RBP
+ * @rsi:	Value of register RSI
+ * @rdi:	Value of register RDI
+ * @r8:		Value of register R8
+ * @r9:		Value of register R9
+ * @r10:	Value of register R10
+ * @r11:	Value of register R11
+ * @r12:	Value of register R12
+ * @r13:	Value of register R13
+ * @r14:	Value of register R14
+ * @r15:	Value of register R15
+ */
+struct acrn_gp_regs {
+	__le64	rax;
+	__le64	rcx;
+	__le64	rdx;
+	__le64	rbx;
+	__le64	rsp;
+	__le64	rbp;
+	__le64	rsi;
+	__le64	rdi;
+	__le64	r8;
+	__le64	r9;
+	__le64	r10;
+	__le64	r11;
+	__le64	r12;
+	__le64	r13;
+	__le64	r14;
+	__le64	r15;
+};
+
+/**
+ * struct acrn_descriptor_ptr - Segment descriptor table of a User VM.
+ * @limit:	Limit field.
+ * @base:	Base field.
+ * @reserved:	Reserved and must be 0.
+ */
+struct acrn_descriptor_ptr {
+	__le16	limit;
+	__le64	base;
+	__le16	reserved[3];
+} __attribute__ ((__packed__));
+
+/**
+ * struct acrn_regs - Registers structure of a User VM
+ * @gprs:		General registers
+ * @gdt:		Global Descriptor Table
+ * @idt:		Interrupt Descriptor Table
+ * @rip:		Value of register RIP
+ * @cs_base:		Base of code segment selector
+ * @cr0:		Value of register CR0
+ * @cr4:		Value of register CR4
+ * @cr3:		Value of register CR3
+ * @ia32_efer:		Value of IA32_EFER MSR
+ * @rflags:		Value of regsiter RFLAGS
+ * @reserved_64:	Reserved and must be 0
+ * @cs_ar:		Attribute field of code segment selector
+ * @cs_limit:		Limit field of code segment selector
+ * @reserved_32:	Reserved and must be 0
+ * @cs_sel:		Value of code segment selector
+ * @ss_sel:		Value of stack segment selector
+ * @ds_sel:		Value of data segment selector
+ * @es_sel:		Value of extra segment selector
+ * @fs_sel:		Value of FS selector
+ * @gs_sel:		Value of GS selector
+ * @ldt_sel:		Value of LDT descriptor selector
+ * @tr_sel:		Value of TSS descriptor selector
+ */
+struct acrn_regs {
+	struct acrn_gp_regs		gprs;
+	struct acrn_descriptor_ptr	gdt;
+	struct acrn_descriptor_ptr	idt;
+
+	__le64				rip;
+	__le64				cs_base;
+	__le64				cr0;
+	__le64				cr4;
+	__le64				cr3;
+	__le64				ia32_efer;
+	__le64				rflags;
+	__le64				reserved_64[4];
+
+	__le32				cs_ar;
+	__le32				cs_limit;
+	__le32				reserved_32[3];
+
+	__le16				cs_sel;
+	__le16				ss_sel;
+	__le16				ds_sel;
+	__le16				es_sel;
+	__le16				fs_sel;
+	__le16				gs_sel;
+	__le16				ldt_sel;
+	__le16				tr_sel;
+};
+
+/**
+ * struct acrn_vcpu_regs - Info of vCPU registers state
+ * @vcpu_id:	vCPU ID
+ * @reserved:	Reserved and must be 0
+ * @vcpu_regs:	vCPU registers state
+ *
+ * This structure will be passed to hypervisor directly.
+ */
+struct acrn_vcpu_regs {
+	__u16			vcpu_id;
+	__u16			reserved[3];
+	struct acrn_regs	vcpu_regs;
+};
+
 /* The ioctl type, documented in ioctl-number.rst */
 #define ACRN_IOCTL_TYPE			0xA2
 
@@ -54,5 +171,7 @@ struct acrn_vm_creation {
 	_IO(ACRN_IOCTL_TYPE, 0x13)
 #define ACRN_IOCTL_RESET_VM		\
 	_IO(ACRN_IOCTL_TYPE, 0x15)
+#define ACRN_IOCTL_SET_VCPU_REGS	\
+	_IOW(ACRN_IOCTL_TYPE, 0x16, struct acrn_vcpu_regs)
 
 #endif /* _UAPI_ACRN_H */
-- 
cgit v1.2.3-71-gd317


From 88f537d5e8ddc89c2622f4a2bc1eb28455e8339c Mon Sep 17 00:00:00 2001
From: Shuo Liu <shuo.a.liu@intel.com>
Date: Sun, 7 Feb 2021 11:10:30 +0800
Subject: virt: acrn: Introduce EPT mapping management

The HSM provides hypervisor services to the ACRN userspace. While
launching a User VM, ACRN userspace needs to allocate memory and request
the ACRN Hypervisor to set up the EPT mapping for the VM.

A mapping cache is introduced for accelerating the translation between
the Service VM kernel virtual address and User VM physical address.

>From the perspective of the hypervisor, the types of GPA of User VM can be
listed as following:
   1) RAM region, which is used by User VM as system ram.
   2) MMIO region, which is recognized by User VM as MMIO. MMIO region is
      used to be utilized for devices emulation.

Generally, User VM RAM regions mapping is set up before VM started and
is released in the User VM destruction. MMIO regions mapping may be set
and unset dynamically during User VM running.

To achieve this, ioctls ACRN_IOCTL_SET_MEMSEG and ACRN_IOCTL_UNSET_MEMSEG
are introduced in HSM.

Cc: Zhi Wang <zhi.a.wang@intel.com>
Cc: Zhenyu Wang <zhenyuw@linux.intel.com>
Cc: Yu Wang <yu1.wang@intel.com>
Cc: Reinette Chatre <reinette.chatre@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Zhi Wang <zhi.a.wang@intel.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Shuo Liu <shuo.a.liu@intel.com>
Link: https://lore.kernel.org/r/20210207031040.49576-9-shuo.a.liu@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/virt/acrn/Makefile    |   2 +-
 drivers/virt/acrn/acrn_drv.h  |  96 +++++++++++--
 drivers/virt/acrn/hsm.c       |  15 +++
 drivers/virt/acrn/hypercall.h |  14 ++
 drivers/virt/acrn/mm.c        | 306 ++++++++++++++++++++++++++++++++++++++++++
 drivers/virt/acrn/vm.c        |   4 +
 include/uapi/linux/acrn.h     |  49 +++++++
 7 files changed, 476 insertions(+), 10 deletions(-)
 create mode 100644 drivers/virt/acrn/mm.c

(limited to 'include/uapi/linux')

diff --git a/drivers/virt/acrn/Makefile b/drivers/virt/acrn/Makefile
index cf8b4ed5e74e..38bc44b6edcd 100644
--- a/drivers/virt/acrn/Makefile
+++ b/drivers/virt/acrn/Makefile
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_ACRN_HSM)	:= acrn.o
-acrn-y := hsm.o vm.o
+acrn-y := hsm.o vm.o mm.o
diff --git a/drivers/virt/acrn/acrn_drv.h b/drivers/virt/acrn/acrn_drv.h
index e5aba86cad8c..e47a45280eea 100644
--- a/drivers/virt/acrn/acrn_drv.h
+++ b/drivers/virt/acrn/acrn_drv.h
@@ -12,26 +12,104 @@
 
 extern struct miscdevice acrn_dev;
 
+#define ACRN_MEM_MAPPING_MAX	256
+
+#define ACRN_MEM_REGION_ADD	0
+#define ACRN_MEM_REGION_DEL	2
+/**
+ * struct vm_memory_region_op - Hypervisor memory operation
+ * @type:		Operation type (ACRN_MEM_REGION_*)
+ * @attr:		Memory attribute (ACRN_MEM_TYPE_* | ACRN_MEM_ACCESS_*)
+ * @user_vm_pa:		Physical address of User VM to be mapped.
+ * @service_vm_pa:	Physical address of Service VM to be mapped.
+ * @size:		Size of this region.
+ *
+ * Structure containing needed information that is provided to ACRN Hypervisor
+ * to manage the EPT mappings of a single memory region of the User VM. Several
+ * &struct vm_memory_region_op can be batched to ACRN Hypervisor, see &struct
+ * vm_memory_region_batch.
+ */
+struct vm_memory_region_op {
+	u32	type;
+	u32	attr;
+	u64	user_vm_pa;
+	u64	service_vm_pa;
+	u64	size;
+};
+
+/**
+ * struct vm_memory_region_batch - A batch of vm_memory_region_op.
+ * @vmid:		A User VM ID.
+ * @reserved:		Reserved.
+ * @regions_num:	The number of vm_memory_region_op.
+ * @regions_gpa:	Physical address of a vm_memory_region_op array.
+ *
+ * HC_VM_SET_MEMORY_REGIONS uses this structure to manage EPT mappings of
+ * multiple memory regions of a User VM. A &struct vm_memory_region_batch
+ * contains multiple &struct vm_memory_region_op for batch processing in the
+ * ACRN Hypervisor.
+ */
+struct vm_memory_region_batch {
+	u16	vmid;
+	u16	reserved[3];
+	u32	regions_num;
+	u64	regions_gpa;
+};
+
+/**
+ * struct vm_memory_mapping - Memory map between a User VM and the Service VM
+ * @pages:		Pages in Service VM kernel.
+ * @npages:		Number of pages.
+ * @service_vm_va:	Virtual address in Service VM kernel.
+ * @user_vm_pa:		Physical address in User VM.
+ * @size:		Size of this memory region.
+ *
+ * HSM maintains memory mappings between a User VM GPA and the Service VM
+ * kernel VA for accelerating the User VM GPA translation.
+ */
+struct vm_memory_mapping {
+	struct page	**pages;
+	int		npages;
+	void		*service_vm_va;
+	u64		user_vm_pa;
+	size_t		size;
+};
+
 #define ACRN_INVALID_VMID (0xffffU)
 
 #define ACRN_VM_FLAG_DESTROYED		0U
 /**
  * struct acrn_vm - Properties of ACRN User VM.
- * @list:	Entry within global list of all VMs
- * @vmid:	User VM ID
- * @vcpu_num:	Number of virtual CPUs in the VM
- * @flags:	Flags (ACRN_VM_FLAG_*) of the VM. This is VM flag management
- *		in HSM which is different from the &acrn_vm_creation.vm_flag.
+ * @list:			Entry within global list of all VMs.
+ * @vmid:			User VM ID.
+ * @vcpu_num:			Number of virtual CPUs in the VM.
+ * @flags:			Flags (ACRN_VM_FLAG_*) of the VM. This is VM
+ *				flag management in HSM which is different
+ *				from the &acrn_vm_creation.vm_flag.
+ * @regions_mapping_lock:	Lock to protect &acrn_vm.regions_mapping and
+ *				&acrn_vm.regions_mapping_count.
+ * @regions_mapping:		Memory mappings of this VM.
+ * @regions_mapping_count:	Number of memory mapping of this VM.
  */
 struct acrn_vm {
-	struct list_head	list;
-	u16			vmid;
-	int			vcpu_num;
-	unsigned long		flags;
+	struct list_head		list;
+	u16				vmid;
+	int				vcpu_num;
+	unsigned long			flags;
+	struct mutex			regions_mapping_lock;
+	struct vm_memory_mapping	regions_mapping[ACRN_MEM_MAPPING_MAX];
+	int				regions_mapping_count;
 };
 
 struct acrn_vm *acrn_vm_create(struct acrn_vm *vm,
 			       struct acrn_vm_creation *vm_param);
 int acrn_vm_destroy(struct acrn_vm *vm);
+int acrn_mm_region_add(struct acrn_vm *vm, u64 user_gpa, u64 service_gpa,
+		       u64 size, u32 mem_type, u32 mem_access_right);
+int acrn_mm_region_del(struct acrn_vm *vm, u64 user_gpa, u64 size);
+int acrn_vm_memseg_map(struct acrn_vm *vm, struct acrn_vm_memmap *memmap);
+int acrn_vm_memseg_unmap(struct acrn_vm *vm, struct acrn_vm_memmap *memmap);
+int acrn_vm_ram_map(struct acrn_vm *vm, struct acrn_vm_memmap *memmap);
+void acrn_vm_all_ram_unmap(struct acrn_vm *vm);
 
 #endif /* __ACRN_HSM_DRV_H */
diff --git a/drivers/virt/acrn/hsm.c b/drivers/virt/acrn/hsm.c
index ee5cc7413239..2c40d3dc5e94 100644
--- a/drivers/virt/acrn/hsm.c
+++ b/drivers/virt/acrn/hsm.c
@@ -48,6 +48,7 @@ static long acrn_dev_ioctl(struct file *filp, unsigned int cmd,
 	struct acrn_vm *vm = filp->private_data;
 	struct acrn_vm_creation *vm_param;
 	struct acrn_vcpu_regs *cpu_regs;
+	struct acrn_vm_memmap memmap;
 	int i, ret = 0;
 
 	if (vm->vmid == ACRN_INVALID_VMID && cmd != ACRN_IOCTL_CREATE_VM) {
@@ -132,6 +133,20 @@ static long acrn_dev_ioctl(struct file *filp, unsigned int cmd,
 				vm->vmid);
 		kfree(cpu_regs);
 		break;
+	case ACRN_IOCTL_SET_MEMSEG:
+		if (copy_from_user(&memmap, (void __user *)ioctl_param,
+				   sizeof(memmap)))
+			return -EFAULT;
+
+		ret = acrn_vm_memseg_map(vm, &memmap);
+		break;
+	case ACRN_IOCTL_UNSET_MEMSEG:
+		if (copy_from_user(&memmap, (void __user *)ioctl_param,
+				   sizeof(memmap)))
+			return -EFAULT;
+
+		ret = acrn_vm_memseg_unmap(vm, &memmap);
+		break;
 	default:
 		dev_dbg(acrn_dev.this_device, "Unknown IOCTL 0x%x!\n", cmd);
 		ret = -ENOTTY;
diff --git a/drivers/virt/acrn/hypercall.h b/drivers/virt/acrn/hypercall.h
index f29cfae08862..a1a70a071713 100644
--- a/drivers/virt/acrn/hypercall.h
+++ b/drivers/virt/acrn/hypercall.h
@@ -21,6 +21,9 @@
 #define HC_RESET_VM			_HC_ID(HC_ID, HC_ID_VM_BASE + 0x05)
 #define HC_SET_VCPU_REGS		_HC_ID(HC_ID, HC_ID_VM_BASE + 0x06)
 
+#define HC_ID_MEM_BASE			0x40UL
+#define HC_VM_SET_MEMORY_REGIONS	_HC_ID(HC_ID, HC_ID_MEM_BASE + 0x02)
+
 /**
  * hcall_create_vm() - Create a User VM
  * @vminfo:	Service VM GPA of info of User VM creation
@@ -88,4 +91,15 @@ static inline long hcall_set_vcpu_regs(u64 vmid, u64 regs_state)
 	return acrn_hypercall2(HC_SET_VCPU_REGS, vmid, regs_state);
 }
 
+/**
+ * hcall_set_memory_regions() - Inform the hypervisor to set up EPT mappings
+ * @regions_pa:	Service VM GPA of &struct vm_memory_region_batch
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static inline long hcall_set_memory_regions(u64 regions_pa)
+{
+	return acrn_hypercall1(HC_VM_SET_MEMORY_REGIONS, regions_pa);
+}
+
 #endif /* __ACRN_HSM_HYPERCALL_H */
diff --git a/drivers/virt/acrn/mm.c b/drivers/virt/acrn/mm.c
new file mode 100644
index 000000000000..c4f2e15c8a2b
--- /dev/null
+++ b/drivers/virt/acrn/mm.c
@@ -0,0 +1,306 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ACRN: Memory mapping management
+ *
+ * Copyright (C) 2020 Intel Corporation. All rights reserved.
+ *
+ * Authors:
+ *	Fei Li <lei1.li@intel.com>
+ *	Shuo Liu <shuo.a.liu@intel.com>
+ */
+
+#include <linux/io.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+#include "acrn_drv.h"
+
+static int modify_region(struct acrn_vm *vm, struct vm_memory_region_op *region)
+{
+	struct vm_memory_region_batch *regions;
+	int ret;
+
+	regions = kzalloc(sizeof(*regions), GFP_KERNEL);
+	if (!regions)
+		return -ENOMEM;
+
+	regions->vmid = vm->vmid;
+	regions->regions_num = 1;
+	regions->regions_gpa = virt_to_phys(region);
+
+	ret = hcall_set_memory_regions(virt_to_phys(regions));
+	if (ret < 0)
+		dev_dbg(acrn_dev.this_device,
+			"Failed to set memory region for VM[%u]!\n", vm->vmid);
+
+	kfree(regions);
+	return ret;
+}
+
+/**
+ * acrn_mm_region_add() - Set up the EPT mapping of a memory region.
+ * @vm:			User VM.
+ * @user_gpa:		A GPA of User VM.
+ * @service_gpa:	A GPA of Service VM.
+ * @size:		Size of the region.
+ * @mem_type:		Combination of ACRN_MEM_TYPE_*.
+ * @mem_access_right:	Combination of ACRN_MEM_ACCESS_*.
+ *
+ * Return: 0 on success, <0 on error.
+ */
+int acrn_mm_region_add(struct acrn_vm *vm, u64 user_gpa, u64 service_gpa,
+		       u64 size, u32 mem_type, u32 mem_access_right)
+{
+	struct vm_memory_region_op *region;
+	int ret = 0;
+
+	region = kzalloc(sizeof(*region), GFP_KERNEL);
+	if (!region)
+		return -ENOMEM;
+
+	region->type = ACRN_MEM_REGION_ADD;
+	region->user_vm_pa = user_gpa;
+	region->service_vm_pa = service_gpa;
+	region->size = size;
+	region->attr = ((mem_type & ACRN_MEM_TYPE_MASK) |
+			(mem_access_right & ACRN_MEM_ACCESS_RIGHT_MASK));
+	ret = modify_region(vm, region);
+
+	dev_dbg(acrn_dev.this_device,
+		"%s: user-GPA[%pK] service-GPA[%pK] size[0x%llx].\n",
+		__func__, (void *)user_gpa, (void *)service_gpa, size);
+	kfree(region);
+	return ret;
+}
+
+/**
+ * acrn_mm_region_del() - Del the EPT mapping of a memory region.
+ * @vm:		User VM.
+ * @user_gpa:	A GPA of the User VM.
+ * @size:	Size of the region.
+ *
+ * Return: 0 on success, <0 for error.
+ */
+int acrn_mm_region_del(struct acrn_vm *vm, u64 user_gpa, u64 size)
+{
+	struct vm_memory_region_op *region;
+	int ret = 0;
+
+	region = kzalloc(sizeof(*region), GFP_KERNEL);
+	if (!region)
+		return -ENOMEM;
+
+	region->type = ACRN_MEM_REGION_DEL;
+	region->user_vm_pa = user_gpa;
+	region->service_vm_pa = 0UL;
+	region->size = size;
+	region->attr = 0U;
+
+	ret = modify_region(vm, region);
+
+	dev_dbg(acrn_dev.this_device, "%s: user-GPA[%pK] size[0x%llx].\n",
+		__func__, (void *)user_gpa, size);
+	kfree(region);
+	return ret;
+}
+
+int acrn_vm_memseg_map(struct acrn_vm *vm, struct acrn_vm_memmap *memmap)
+{
+	int ret;
+
+	if (memmap->type == ACRN_MEMMAP_RAM)
+		return acrn_vm_ram_map(vm, memmap);
+
+	if (memmap->type != ACRN_MEMMAP_MMIO) {
+		dev_dbg(acrn_dev.this_device,
+			"Invalid memmap type: %u\n", memmap->type);
+		return -EINVAL;
+	}
+
+	ret = acrn_mm_region_add(vm, memmap->user_vm_pa,
+				 memmap->service_vm_pa, memmap->len,
+				 ACRN_MEM_TYPE_UC, memmap->attr);
+	if (ret < 0)
+		dev_dbg(acrn_dev.this_device,
+			"Add memory region failed, VM[%u]!\n", vm->vmid);
+
+	return ret;
+}
+
+int acrn_vm_memseg_unmap(struct acrn_vm *vm, struct acrn_vm_memmap *memmap)
+{
+	int ret;
+
+	if (memmap->type != ACRN_MEMMAP_MMIO) {
+		dev_dbg(acrn_dev.this_device,
+			"Invalid memmap type: %u\n", memmap->type);
+		return -EINVAL;
+	}
+
+	ret = acrn_mm_region_del(vm, memmap->user_vm_pa, memmap->len);
+	if (ret < 0)
+		dev_dbg(acrn_dev.this_device,
+			"Del memory region failed, VM[%u]!\n", vm->vmid);
+
+	return ret;
+}
+
+/**
+ * acrn_vm_ram_map() - Create a RAM EPT mapping of User VM.
+ * @vm:		The User VM pointer
+ * @memmap:	Info of the EPT mapping
+ *
+ * Return: 0 on success, <0 for error.
+ */
+int acrn_vm_ram_map(struct acrn_vm *vm, struct acrn_vm_memmap *memmap)
+{
+	struct vm_memory_region_batch *regions_info;
+	int nr_pages, i = 0, order, nr_regions = 0;
+	struct vm_memory_mapping *region_mapping;
+	struct vm_memory_region_op *vm_region;
+	struct page **pages = NULL, *page;
+	void *remap_vaddr;
+	int ret, pinned;
+	u64 user_vm_pa;
+
+	if (!vm || !memmap)
+		return -EINVAL;
+
+	/* Get the page number of the map region */
+	nr_pages = memmap->len >> PAGE_SHIFT;
+	pages = vzalloc(nr_pages * sizeof(struct page *));
+	if (!pages)
+		return -ENOMEM;
+
+	/* Lock the pages of user memory map region */
+	pinned = pin_user_pages_fast(memmap->vma_base,
+				     nr_pages, FOLL_WRITE | FOLL_LONGTERM,
+				     pages);
+	if (pinned < 0) {
+		ret = pinned;
+		goto free_pages;
+	} else if (pinned != nr_pages) {
+		ret = -EFAULT;
+		goto put_pages;
+	}
+
+	/* Create a kernel map for the map region */
+	remap_vaddr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
+	if (!remap_vaddr) {
+		ret = -ENOMEM;
+		goto put_pages;
+	}
+
+	/* Record Service VM va <-> User VM pa mapping */
+	mutex_lock(&vm->regions_mapping_lock);
+	region_mapping = &vm->regions_mapping[vm->regions_mapping_count];
+	if (vm->regions_mapping_count < ACRN_MEM_MAPPING_MAX) {
+		region_mapping->pages = pages;
+		region_mapping->npages = nr_pages;
+		region_mapping->size = memmap->len;
+		region_mapping->service_vm_va = remap_vaddr;
+		region_mapping->user_vm_pa = memmap->user_vm_pa;
+		vm->regions_mapping_count++;
+	} else {
+		dev_warn(acrn_dev.this_device,
+			"Run out of memory mapping slots!\n");
+		ret = -ENOMEM;
+		mutex_unlock(&vm->regions_mapping_lock);
+		goto unmap_no_count;
+	}
+	mutex_unlock(&vm->regions_mapping_lock);
+
+	/* Calculate count of vm_memory_region_op */
+	while (i < nr_pages) {
+		page = pages[i];
+		VM_BUG_ON_PAGE(PageTail(page), page);
+		order = compound_order(page);
+		nr_regions++;
+		i += 1 << order;
+	}
+
+	/* Prepare the vm_memory_region_batch */
+	regions_info = kzalloc(sizeof(*regions_info) +
+			       sizeof(*vm_region) * nr_regions,
+			       GFP_KERNEL);
+	if (!regions_info) {
+		ret = -ENOMEM;
+		goto unmap_kernel_map;
+	}
+
+	/* Fill each vm_memory_region_op */
+	vm_region = (struct vm_memory_region_op *)(regions_info + 1);
+	regions_info->vmid = vm->vmid;
+	regions_info->regions_num = nr_regions;
+	regions_info->regions_gpa = virt_to_phys(vm_region);
+	user_vm_pa = memmap->user_vm_pa;
+	i = 0;
+	while (i < nr_pages) {
+		u32 region_size;
+
+		page = pages[i];
+		VM_BUG_ON_PAGE(PageTail(page), page);
+		order = compound_order(page);
+		region_size = PAGE_SIZE << order;
+		vm_region->type = ACRN_MEM_REGION_ADD;
+		vm_region->user_vm_pa = user_vm_pa;
+		vm_region->service_vm_pa = page_to_phys(page);
+		vm_region->size = region_size;
+		vm_region->attr = (ACRN_MEM_TYPE_WB & ACRN_MEM_TYPE_MASK) |
+				  (memmap->attr & ACRN_MEM_ACCESS_RIGHT_MASK);
+
+		vm_region++;
+		user_vm_pa += region_size;
+		i += 1 << order;
+	}
+
+	/* Inform the ACRN Hypervisor to set up EPT mappings */
+	ret = hcall_set_memory_regions(virt_to_phys(regions_info));
+	if (ret < 0) {
+		dev_dbg(acrn_dev.this_device,
+			"Failed to set regions, VM[%u]!\n", vm->vmid);
+		goto unset_region;
+	}
+	kfree(regions_info);
+
+	dev_dbg(acrn_dev.this_device,
+		"%s: VM[%u] service-GVA[%pK] user-GPA[%pK] size[0x%llx]\n",
+		__func__, vm->vmid,
+		remap_vaddr, (void *)memmap->user_vm_pa, memmap->len);
+	return ret;
+
+unset_region:
+	kfree(regions_info);
+unmap_kernel_map:
+	mutex_lock(&vm->regions_mapping_lock);
+	vm->regions_mapping_count--;
+	mutex_unlock(&vm->regions_mapping_lock);
+unmap_no_count:
+	vunmap(remap_vaddr);
+put_pages:
+	for (i = 0; i < pinned; i++)
+		unpin_user_page(pages[i]);
+free_pages:
+	vfree(pages);
+	return ret;
+}
+
+/**
+ * acrn_vm_all_ram_unmap() - Destroy a RAM EPT mapping of User VM.
+ * @vm:	The User VM
+ */
+void acrn_vm_all_ram_unmap(struct acrn_vm *vm)
+{
+	struct vm_memory_mapping *region_mapping;
+	int i, j;
+
+	mutex_lock(&vm->regions_mapping_lock);
+	for (i = 0; i < vm->regions_mapping_count; i++) {
+		region_mapping = &vm->regions_mapping[i];
+		vunmap(region_mapping->service_vm_va);
+		for (j = 0; j < region_mapping->npages; j++)
+			unpin_user_page(region_mapping->pages[j]);
+		vfree(region_mapping->pages);
+	}
+	mutex_unlock(&vm->regions_mapping_lock);
+}
diff --git a/drivers/virt/acrn/vm.c b/drivers/virt/acrn/vm.c
index 3f667ac8ac1e..ff5df7acb551 100644
--- a/drivers/virt/acrn/vm.c
+++ b/drivers/virt/acrn/vm.c
@@ -31,6 +31,7 @@ struct acrn_vm *acrn_vm_create(struct acrn_vm *vm,
 		return NULL;
 	}
 
+	mutex_init(&vm->regions_mapping_lock);
 	vm->vmid = vm_param->vmid;
 	vm->vcpu_num = vm_param->vcpu_num;
 
@@ -62,6 +63,9 @@ int acrn_vm_destroy(struct acrn_vm *vm)
 		clear_bit(ACRN_VM_FLAG_DESTROYED, &vm->flags);
 		return ret;
 	}
+
+	acrn_vm_all_ram_unmap(vm);
+
 	dev_dbg(acrn_dev.this_device, "VM %u destroyed.\n", vm->vmid);
 	vm->vmid = ACRN_INVALID_VMID;
 	return 0;
diff --git a/include/uapi/linux/acrn.h b/include/uapi/linux/acrn.h
index 775c58bad026..ec4c61e7c170 100644
--- a/include/uapi/linux/acrn.h
+++ b/include/uapi/linux/acrn.h
@@ -155,6 +155,50 @@ struct acrn_vcpu_regs {
 	struct acrn_regs	vcpu_regs;
 };
 
+#define	ACRN_MEM_ACCESS_RIGHT_MASK	0x00000007U
+#define	ACRN_MEM_ACCESS_READ		0x00000001U
+#define	ACRN_MEM_ACCESS_WRITE		0x00000002U
+#define	ACRN_MEM_ACCESS_EXEC		0x00000004U
+#define	ACRN_MEM_ACCESS_RWX		(ACRN_MEM_ACCESS_READ  | \
+					 ACRN_MEM_ACCESS_WRITE | \
+					 ACRN_MEM_ACCESS_EXEC)
+
+#define	ACRN_MEM_TYPE_MASK		0x000007C0U
+#define	ACRN_MEM_TYPE_WB		0x00000040U
+#define	ACRN_MEM_TYPE_WT		0x00000080U
+#define	ACRN_MEM_TYPE_UC		0x00000100U
+#define	ACRN_MEM_TYPE_WC		0x00000200U
+#define	ACRN_MEM_TYPE_WP		0x00000400U
+
+/* Memory mapping types */
+#define	ACRN_MEMMAP_RAM			0
+#define	ACRN_MEMMAP_MMIO		1
+
+/**
+ * struct acrn_vm_memmap - A EPT memory mapping info for a User VM.
+ * @type:		Type of the memory mapping (ACRM_MEMMAP_*).
+ *			Pass to hypervisor directly.
+ * @attr:		Attribute of the memory mapping.
+ *			Pass to hypervisor directly.
+ * @user_vm_pa:		Physical address of User VM.
+ *			Pass to hypervisor directly.
+ * @service_vm_pa:	Physical address of Service VM.
+ *			Pass to hypervisor directly.
+ * @vma_base:		VMA address of Service VM. Pass to hypervisor directly.
+ * @len:		Length of the memory mapping.
+ *			Pass to hypervisor directly.
+ */
+struct acrn_vm_memmap {
+	__u32	type;
+	__u32	attr;
+	__u64	user_vm_pa;
+	union {
+		__u64	service_vm_pa;
+		__u64	vma_base;
+	};
+	__u64	len;
+};
+
 /* The ioctl type, documented in ioctl-number.rst */
 #define ACRN_IOCTL_TYPE			0xA2
 
@@ -174,4 +218,9 @@ struct acrn_vcpu_regs {
 #define ACRN_IOCTL_SET_VCPU_REGS	\
 	_IOW(ACRN_IOCTL_TYPE, 0x16, struct acrn_vcpu_regs)
 
+#define ACRN_IOCTL_SET_MEMSEG		\
+	_IOW(ACRN_IOCTL_TYPE, 0x41, struct acrn_vm_memmap)
+#define ACRN_IOCTL_UNSET_MEMSEG		\
+	_IOW(ACRN_IOCTL_TYPE, 0x42, struct acrn_vm_memmap)
+
 #endif /* _UAPI_ACRN_H */
-- 
cgit v1.2.3-71-gd317


From 72f293de3ff40b57db573c1bf623f494f3446f74 Mon Sep 17 00:00:00 2001
From: Shuo Liu <shuo.a.liu@intel.com>
Date: Sun, 7 Feb 2021 11:10:31 +0800
Subject: virt: acrn: Introduce I/O request management

An I/O request of a User VM, which is constructed by the hypervisor, is
distributed by the ACRN Hypervisor Service Module to an I/O client
corresponding to the address range of the I/O request.

For each User VM, there is a shared 4-KByte memory region used for I/O
requests communication between the hypervisor and Service VM. An I/O
request is a 256-byte structure buffer, which is 'struct
acrn_io_request', that is filled by an I/O handler of the hypervisor
when a trapped I/O access happens in a User VM. ACRN userspace in the
Service VM first allocates a 4-KByte page and passes the GPA (Guest
Physical Address) of the buffer to the hypervisor. The buffer is used as
an array of 16 I/O request slots with each I/O request slot being 256
bytes. This array is indexed by vCPU ID.

An I/O client, which is 'struct acrn_ioreq_client', is responsible for
handling User VM I/O requests whose accessed GPA falls in a certain
range. Multiple I/O clients can be associated with each User VM. There
is a special client associated with each User VM, called the default
client, that handles all I/O requests that do not fit into the range of
any other I/O clients. The ACRN userspace acts as the default client for
each User VM.

The state transitions of a ACRN I/O request are as follows.

   FREE -> PENDING -> PROCESSING -> COMPLETE -> FREE -> ...

FREE: this I/O request slot is empty
PENDING: a valid I/O request is pending in this slot
PROCESSING: the I/O request is being processed
COMPLETE: the I/O request has been processed

An I/O request in COMPLETE or FREE state is owned by the hypervisor. HSM
and ACRN userspace are in charge of processing the others.

The processing flow of I/O requests are listed as following:

a) The I/O handler of the hypervisor will fill an I/O request with
   PENDING state when a trapped I/O access happens in a User VM.
b) The hypervisor makes an upcall, which is a notification interrupt, to
   the Service VM.
c) The upcall handler schedules a worker to dispatch I/O requests.
d) The worker looks for the PENDING I/O requests, assigns them to
   different registered clients based on the address of the I/O accesses,
   updates their state to PROCESSING, and notifies the corresponding
   client to handle.
e) The notified client handles the assigned I/O requests.
f) The HSM updates I/O requests states to COMPLETE and notifies the
   hypervisor of the completion via hypercalls.

Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Zhi Wang <zhi.a.wang@intel.com>
Cc: Zhenyu Wang <zhenyuw@linux.intel.com>
Cc: Yu Wang <yu1.wang@intel.com>
Cc: Reinette Chatre <reinette.chatre@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Zhi Wang <zhi.a.wang@intel.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Acked-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Shuo Liu <shuo.a.liu@intel.com>
Link: https://lore.kernel.org/r/20210207031040.49576-10-shuo.a.liu@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/virt/acrn/Makefile    |   2 +-
 drivers/virt/acrn/acrn_drv.h  |  82 +++++++
 drivers/virt/acrn/hsm.c       |  43 +++-
 drivers/virt/acrn/hypercall.h |  28 +++
 drivers/virt/acrn/ioreq.c     | 521 ++++++++++++++++++++++++++++++++++++++++++
 drivers/virt/acrn/vm.c        |  27 ++-
 include/uapi/linux/acrn.h     | 150 ++++++++++++
 7 files changed, 843 insertions(+), 10 deletions(-)
 create mode 100644 drivers/virt/acrn/ioreq.c

(limited to 'include/uapi/linux')

diff --git a/drivers/virt/acrn/Makefile b/drivers/virt/acrn/Makefile
index 38bc44b6edcd..21721cbf6a80 100644
--- a/drivers/virt/acrn/Makefile
+++ b/drivers/virt/acrn/Makefile
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_ACRN_HSM)	:= acrn.o
-acrn-y := hsm.o vm.o mm.o
+acrn-y := hsm.o vm.o mm.o ioreq.o
diff --git a/drivers/virt/acrn/acrn_drv.h b/drivers/virt/acrn/acrn_drv.h
index e47a45280eea..68bd8db6c8be 100644
--- a/drivers/virt/acrn/acrn_drv.h
+++ b/drivers/virt/acrn/acrn_drv.h
@@ -12,10 +12,15 @@
 
 extern struct miscdevice acrn_dev;
 
+#define ACRN_NAME_LEN		16
 #define ACRN_MEM_MAPPING_MAX	256
 
 #define ACRN_MEM_REGION_ADD	0
 #define ACRN_MEM_REGION_DEL	2
+
+struct acrn_vm;
+struct acrn_ioreq_client;
+
 /**
  * struct vm_memory_region_op - Hypervisor memory operation
  * @type:		Operation type (ACRN_MEM_REGION_*)
@@ -75,9 +80,63 @@ struct vm_memory_mapping {
 	size_t		size;
 };
 
+/**
+ * struct acrn_ioreq_buffer - Data for setting the ioreq buffer of User VM
+ * @ioreq_buf:	The GPA of the IO request shared buffer of a VM
+ *
+ * The parameter for the HC_SET_IOREQ_BUFFER hypercall used to set up
+ * the shared I/O request buffer between Service VM and ACRN hypervisor.
+ */
+struct acrn_ioreq_buffer {
+	u64	ioreq_buf;
+};
+
+struct acrn_ioreq_range {
+	struct list_head	list;
+	u32			type;
+	u64			start;
+	u64			end;
+};
+
+#define ACRN_IOREQ_CLIENT_DESTROYING	0U
+typedef	int (*ioreq_handler_t)(struct acrn_ioreq_client *client,
+			       struct acrn_io_request *req);
+/**
+ * struct acrn_ioreq_client - Structure of I/O client.
+ * @name:	Client name
+ * @vm:		The VM that the client belongs to
+ * @list:	List node for this acrn_ioreq_client
+ * @is_default:	If this client is the default one
+ * @flags:	Flags (ACRN_IOREQ_CLIENT_*)
+ * @range_list:	I/O ranges
+ * @range_lock:	Lock to protect range_list
+ * @ioreqs_map:	The pending I/O requests bitmap.
+ * @handler:	I/O requests handler of this client
+ * @thread:	The thread which executes the handler
+ * @wq:		The wait queue for the handler thread parking
+ * @priv:	Data for the thread
+ */
+struct acrn_ioreq_client {
+	char			name[ACRN_NAME_LEN];
+	struct acrn_vm		*vm;
+	struct list_head	list;
+	bool			is_default;
+	unsigned long		flags;
+	struct list_head	range_list;
+	rwlock_t		range_lock;
+	DECLARE_BITMAP(ioreqs_map, ACRN_IO_REQUEST_MAX);
+	ioreq_handler_t		handler;
+	struct task_struct	*thread;
+	wait_queue_head_t	wq;
+	void			*priv;
+};
+
 #define ACRN_INVALID_VMID (0xffffU)
 
 #define ACRN_VM_FLAG_DESTROYED		0U
+#define ACRN_VM_FLAG_CLEARING_IOREQ	1U
+extern struct list_head acrn_vm_list;
+extern rwlock_t acrn_vm_list_lock;
 /**
  * struct acrn_vm - Properties of ACRN User VM.
  * @list:			Entry within global list of all VMs.
@@ -90,6 +149,11 @@ struct vm_memory_mapping {
  *				&acrn_vm.regions_mapping_count.
  * @regions_mapping:		Memory mappings of this VM.
  * @regions_mapping_count:	Number of memory mapping of this VM.
+ * @ioreq_clients_lock:		Lock to protect ioreq_clients and default_client
+ * @ioreq_clients:		The I/O request clients list of this VM
+ * @default_client:		The default I/O request client
+ * @ioreq_buf:			I/O request shared buffer
+ * @ioreq_page:			The page of the I/O request shared buffer
  */
 struct acrn_vm {
 	struct list_head		list;
@@ -99,6 +163,11 @@ struct acrn_vm {
 	struct mutex			regions_mapping_lock;
 	struct vm_memory_mapping	regions_mapping[ACRN_MEM_MAPPING_MAX];
 	int				regions_mapping_count;
+	spinlock_t			ioreq_clients_lock;
+	struct list_head		ioreq_clients;
+	struct acrn_ioreq_client	*default_client;
+	struct acrn_io_request_buffer	*ioreq_buf;
+	struct page			*ioreq_page;
 };
 
 struct acrn_vm *acrn_vm_create(struct acrn_vm *vm,
@@ -112,4 +181,17 @@ int acrn_vm_memseg_unmap(struct acrn_vm *vm, struct acrn_vm_memmap *memmap);
 int acrn_vm_ram_map(struct acrn_vm *vm, struct acrn_vm_memmap *memmap);
 void acrn_vm_all_ram_unmap(struct acrn_vm *vm);
 
+int acrn_ioreq_init(struct acrn_vm *vm, u64 buf_vma);
+void acrn_ioreq_deinit(struct acrn_vm *vm);
+int acrn_ioreq_intr_setup(void);
+void acrn_ioreq_intr_remove(void);
+void acrn_ioreq_request_clear(struct acrn_vm *vm);
+int acrn_ioreq_client_wait(struct acrn_ioreq_client *client);
+int acrn_ioreq_request_default_complete(struct acrn_vm *vm, u16 vcpu);
+struct acrn_ioreq_client *acrn_ioreq_client_create(struct acrn_vm *vm,
+						   ioreq_handler_t handler,
+						   void *data, bool is_default,
+						   const char *name);
+void acrn_ioreq_client_destroy(struct acrn_ioreq_client *client);
+
 #endif /* __ACRN_HSM_DRV_H */
diff --git a/drivers/virt/acrn/hsm.c b/drivers/virt/acrn/hsm.c
index 2c40d3dc5e94..1cc0c612dc09 100644
--- a/drivers/virt/acrn/hsm.c
+++ b/drivers/virt/acrn/hsm.c
@@ -48,6 +48,7 @@ static long acrn_dev_ioctl(struct file *filp, unsigned int cmd,
 	struct acrn_vm *vm = filp->private_data;
 	struct acrn_vm_creation *vm_param;
 	struct acrn_vcpu_regs *cpu_regs;
+	struct acrn_ioreq_notify notify;
 	struct acrn_vm_memmap memmap;
 	int i, ret = 0;
 
@@ -147,6 +148,35 @@ static long acrn_dev_ioctl(struct file *filp, unsigned int cmd,
 
 		ret = acrn_vm_memseg_unmap(vm, &memmap);
 		break;
+	case ACRN_IOCTL_CREATE_IOREQ_CLIENT:
+		if (vm->default_client)
+			return -EEXIST;
+		if (!acrn_ioreq_client_create(vm, NULL, NULL, true, "acrndm"))
+			ret = -EINVAL;
+		break;
+	case ACRN_IOCTL_DESTROY_IOREQ_CLIENT:
+		if (vm->default_client)
+			acrn_ioreq_client_destroy(vm->default_client);
+		break;
+	case ACRN_IOCTL_ATTACH_IOREQ_CLIENT:
+		if (vm->default_client)
+			ret = acrn_ioreq_client_wait(vm->default_client);
+		else
+			ret = -ENODEV;
+		break;
+	case ACRN_IOCTL_NOTIFY_REQUEST_FINISH:
+		if (copy_from_user(&notify, (void __user *)ioctl_param,
+				   sizeof(struct acrn_ioreq_notify)))
+			return -EFAULT;
+
+		if (notify.reserved != 0)
+			return -EINVAL;
+
+		ret = acrn_ioreq_request_default_complete(vm, notify.vcpu);
+		break;
+	case ACRN_IOCTL_CLEAR_VM_IOREQ:
+		acrn_ioreq_request_clear(vm);
+		break;
 	default:
 		dev_dbg(acrn_dev.this_device, "Unknown IOCTL 0x%x!\n", cmd);
 		ret = -ENOTTY;
@@ -188,14 +218,23 @@ static int __init hsm_init(void)
 		return -EPERM;
 
 	ret = misc_register(&acrn_dev);
-	if (ret)
+	if (ret) {
 		pr_err("Create misc dev failed!\n");
+		return ret;
+	}
 
-	return ret;
+	ret = acrn_ioreq_intr_setup();
+	if (ret) {
+		pr_err("Setup I/O request handler failed!\n");
+		misc_deregister(&acrn_dev);
+		return ret;
+	}
+	return 0;
 }
 
 static void __exit hsm_exit(void)
 {
+	acrn_ioreq_intr_remove();
 	misc_deregister(&acrn_dev);
 }
 module_init(hsm_init);
diff --git a/drivers/virt/acrn/hypercall.h b/drivers/virt/acrn/hypercall.h
index a1a70a071713..5eba29e3ed38 100644
--- a/drivers/virt/acrn/hypercall.h
+++ b/drivers/virt/acrn/hypercall.h
@@ -21,6 +21,10 @@
 #define HC_RESET_VM			_HC_ID(HC_ID, HC_ID_VM_BASE + 0x05)
 #define HC_SET_VCPU_REGS		_HC_ID(HC_ID, HC_ID_VM_BASE + 0x06)
 
+#define HC_ID_IOREQ_BASE		0x30UL
+#define HC_SET_IOREQ_BUFFER		_HC_ID(HC_ID, HC_ID_IOREQ_BASE + 0x00)
+#define HC_NOTIFY_REQUEST_FINISH	_HC_ID(HC_ID, HC_ID_IOREQ_BASE + 0x01)
+
 #define HC_ID_MEM_BASE			0x40UL
 #define HC_VM_SET_MEMORY_REGIONS	_HC_ID(HC_ID, HC_ID_MEM_BASE + 0x02)
 
@@ -91,6 +95,30 @@ static inline long hcall_set_vcpu_regs(u64 vmid, u64 regs_state)
 	return acrn_hypercall2(HC_SET_VCPU_REGS, vmid, regs_state);
 }
 
+/**
+ * hcall_set_ioreq_buffer() - Set up the shared buffer for I/O Requests.
+ * @vmid:	User VM ID
+ * @buffer:	Service VM GPA of the shared buffer
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static inline long hcall_set_ioreq_buffer(u64 vmid, u64 buffer)
+{
+	return acrn_hypercall2(HC_SET_IOREQ_BUFFER, vmid, buffer);
+}
+
+/**
+ * hcall_notify_req_finish() - Notify ACRN Hypervisor of I/O request completion.
+ * @vmid:	User VM ID
+ * @vcpu:	The vCPU which initiated the I/O request
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static inline long hcall_notify_req_finish(u64 vmid, u64 vcpu)
+{
+	return acrn_hypercall2(HC_NOTIFY_REQUEST_FINISH, vmid, vcpu);
+}
+
 /**
  * hcall_set_memory_regions() - Inform the hypervisor to set up EPT mappings
  * @regions_pa:	Service VM GPA of &struct vm_memory_region_batch
diff --git a/drivers/virt/acrn/ioreq.c b/drivers/virt/acrn/ioreq.c
new file mode 100644
index 000000000000..51cb41ef7c72
--- /dev/null
+++ b/drivers/virt/acrn/ioreq.c
@@ -0,0 +1,521 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ACRN_HSM: Handle I/O requests
+ *
+ * Copyright (C) 2020 Intel Corporation. All rights reserved.
+ *
+ * Authors:
+ *	Jason Chen CJ <jason.cj.chen@intel.com>
+ *	Fengwei Yin <fengwei.yin@intel.com>
+ */
+
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/kthread.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+#include <asm/acrn.h>
+
+#include "acrn_drv.h"
+
+static void ioreq_pause(void);
+static void ioreq_resume(void);
+
+static void ioreq_dispatcher(struct work_struct *work);
+static struct workqueue_struct *ioreq_wq;
+static DECLARE_WORK(ioreq_work, ioreq_dispatcher);
+
+static inline bool has_pending_request(struct acrn_ioreq_client *client)
+{
+	return !bitmap_empty(client->ioreqs_map, ACRN_IO_REQUEST_MAX);
+}
+
+static inline bool is_destroying(struct acrn_ioreq_client *client)
+{
+	return test_bit(ACRN_IOREQ_CLIENT_DESTROYING, &client->flags);
+}
+
+static int ioreq_complete_request(struct acrn_vm *vm, u16 vcpu,
+				  struct acrn_io_request *acrn_req)
+{
+	bool polling_mode;
+	int ret = 0;
+
+	polling_mode = acrn_req->completion_polling;
+	/* Add barrier() to make sure the writes are done before completion */
+	smp_store_release(&acrn_req->processed, ACRN_IOREQ_STATE_COMPLETE);
+
+	/*
+	 * To fulfill the requirement of real-time in several industry
+	 * scenarios, like automotive, ACRN can run under the partition mode,
+	 * in which User VMs and Service VM are bound to dedicated CPU cores.
+	 * Polling mode of handling the I/O request is introduced to achieve a
+	 * faster I/O request handling. In polling mode, the hypervisor polls
+	 * I/O request's completion. Once an I/O request is marked as
+	 * ACRN_IOREQ_STATE_COMPLETE, hypervisor resumes from the polling point
+	 * to continue the I/O request flow. Thus, the completion notification
+	 * from HSM of I/O request is not needed.  Please note,
+	 * completion_polling needs to be read before the I/O request being
+	 * marked as ACRN_IOREQ_STATE_COMPLETE to avoid racing with the
+	 * hypervisor.
+	 */
+	if (!polling_mode) {
+		ret = hcall_notify_req_finish(vm->vmid, vcpu);
+		if (ret < 0)
+			dev_err(acrn_dev.this_device,
+				"Notify I/O request finished failed!\n");
+	}
+
+	return ret;
+}
+
+static int acrn_ioreq_complete_request(struct acrn_ioreq_client *client,
+				       u16 vcpu,
+				       struct acrn_io_request *acrn_req)
+{
+	int ret;
+
+	if (vcpu >= client->vm->vcpu_num)
+		return -EINVAL;
+
+	clear_bit(vcpu, client->ioreqs_map);
+	if (!acrn_req) {
+		acrn_req = (struct acrn_io_request *)client->vm->ioreq_buf;
+		acrn_req += vcpu;
+	}
+
+	ret = ioreq_complete_request(client->vm, vcpu, acrn_req);
+
+	return ret;
+}
+
+int acrn_ioreq_request_default_complete(struct acrn_vm *vm, u16 vcpu)
+{
+	int ret = 0;
+
+	spin_lock_bh(&vm->ioreq_clients_lock);
+	if (vm->default_client)
+		ret = acrn_ioreq_complete_request(vm->default_client,
+						  vcpu, NULL);
+	spin_unlock_bh(&vm->ioreq_clients_lock);
+
+	return ret;
+}
+
+/*
+ * ioreq_task() is the execution entity of handler thread of an I/O client.
+ * The handler callback of the I/O client is called within the handler thread.
+ */
+static int ioreq_task(void *data)
+{
+	struct acrn_ioreq_client *client = data;
+	struct acrn_io_request *req;
+	unsigned long *ioreqs_map;
+	int vcpu, ret;
+
+	/*
+	 * Lockless access to ioreqs_map is safe, because
+	 * 1) set_bit() and clear_bit() are atomic operations.
+	 * 2) I/O requests arrives serialized. The access flow of ioreqs_map is:
+	 *	set_bit() - in ioreq_work handler
+	 *	Handler callback handles corresponding I/O request
+	 *	clear_bit() - in handler thread (include ACRN userspace)
+	 *	Mark corresponding I/O request completed
+	 *	Loop again if a new I/O request occurs
+	 */
+	ioreqs_map = client->ioreqs_map;
+	while (!kthread_should_stop()) {
+		acrn_ioreq_client_wait(client);
+		while (has_pending_request(client)) {
+			vcpu = find_first_bit(ioreqs_map, client->vm->vcpu_num);
+			req = client->vm->ioreq_buf->req_slot + vcpu;
+			ret = client->handler(client, req);
+			if (ret < 0) {
+				dev_err(acrn_dev.this_device,
+					"IO handle failure: %d\n", ret);
+				break;
+			}
+			acrn_ioreq_complete_request(client, vcpu, req);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * For the non-default I/O clients, give them chance to complete the current
+ * I/O requests if there are any. For the default I/O client, it is safe to
+ * clear all pending I/O requests because the clearing request is from ACRN
+ * userspace.
+ */
+void acrn_ioreq_request_clear(struct acrn_vm *vm)
+{
+	struct acrn_ioreq_client *client;
+	bool has_pending = false;
+	unsigned long vcpu;
+	int retry = 10;
+
+	/*
+	 * IO requests of this VM will be completed directly in
+	 * acrn_ioreq_dispatch if ACRN_VM_FLAG_CLEARING_IOREQ flag is set.
+	 */
+	set_bit(ACRN_VM_FLAG_CLEARING_IOREQ, &vm->flags);
+
+	/*
+	 * acrn_ioreq_request_clear is only called in VM reset case. Simply
+	 * wait 100ms in total for the IO requests' completion.
+	 */
+	do {
+		spin_lock_bh(&vm->ioreq_clients_lock);
+		list_for_each_entry(client, &vm->ioreq_clients, list) {
+			has_pending = has_pending_request(client);
+			if (has_pending)
+				break;
+		}
+		spin_unlock_bh(&vm->ioreq_clients_lock);
+
+		if (has_pending)
+			schedule_timeout_interruptible(HZ / 100);
+	} while (has_pending && --retry > 0);
+	if (retry == 0)
+		dev_warn(acrn_dev.this_device,
+			 "%s cannot flush pending request!\n", client->name);
+
+	/* Clear all ioreqs belonging to the default client */
+	spin_lock_bh(&vm->ioreq_clients_lock);
+	client = vm->default_client;
+	if (client) {
+		vcpu = find_next_bit(client->ioreqs_map,
+				     ACRN_IO_REQUEST_MAX, 0);
+		while (vcpu < ACRN_IO_REQUEST_MAX) {
+			acrn_ioreq_complete_request(client, vcpu, NULL);
+			vcpu = find_next_bit(client->ioreqs_map,
+					     ACRN_IO_REQUEST_MAX, vcpu + 1);
+		}
+	}
+	spin_unlock_bh(&vm->ioreq_clients_lock);
+
+	/* Clear ACRN_VM_FLAG_CLEARING_IOREQ flag after the clearing */
+	clear_bit(ACRN_VM_FLAG_CLEARING_IOREQ, &vm->flags);
+}
+
+int acrn_ioreq_client_wait(struct acrn_ioreq_client *client)
+{
+	if (client->is_default) {
+		/*
+		 * In the default client, a user space thread waits on the
+		 * waitqueue. The is_destroying() check is used to notify user
+		 * space the client is going to be destroyed.
+		 */
+		wait_event_interruptible(client->wq,
+					 has_pending_request(client) ||
+					 is_destroying(client));
+		if (is_destroying(client))
+			return -ENODEV;
+	} else {
+		wait_event_interruptible(client->wq,
+					 has_pending_request(client) ||
+					 kthread_should_stop());
+	}
+
+	return 0;
+}
+
+static bool in_range(struct acrn_ioreq_range *range,
+		     struct acrn_io_request *req)
+{
+	bool ret = false;
+
+	if (range->type == req->type) {
+		switch (req->type) {
+		case ACRN_IOREQ_TYPE_MMIO:
+			if (req->reqs.mmio_request.address >= range->start &&
+			    (req->reqs.mmio_request.address +
+			     req->reqs.mmio_request.size - 1) <= range->end)
+				ret = true;
+			break;
+		case ACRN_IOREQ_TYPE_PORTIO:
+			if (req->reqs.pio_request.address >= range->start &&
+			    (req->reqs.pio_request.address +
+			     req->reqs.pio_request.size - 1) <= range->end)
+				ret = true;
+			break;
+		default:
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static struct acrn_ioreq_client *find_ioreq_client(struct acrn_vm *vm,
+						   struct acrn_io_request *req)
+{
+	struct acrn_ioreq_client *client, *found = NULL;
+	struct acrn_ioreq_range *range;
+
+	lockdep_assert_held(&vm->ioreq_clients_lock);
+
+	list_for_each_entry(client, &vm->ioreq_clients, list) {
+		read_lock_bh(&client->range_lock);
+		list_for_each_entry(range, &client->range_list, list) {
+			if (in_range(range, req)) {
+				found = client;
+				break;
+			}
+		}
+		read_unlock_bh(&client->range_lock);
+		if (found)
+			break;
+	}
+	return found ? found : vm->default_client;
+}
+
+/**
+ * acrn_ioreq_client_create() - Create an ioreq client
+ * @vm:		The VM that this client belongs to
+ * @handler:	The ioreq_handler of ioreq client acrn_hsm will create a kernel
+ *		thread and call the handler to handle I/O requests.
+ * @priv:	Private data for the handler
+ * @is_default:	If it is the default client
+ * @name:	The name of ioreq client
+ *
+ * Return: acrn_ioreq_client pointer on success, NULL on error
+ */
+struct acrn_ioreq_client *acrn_ioreq_client_create(struct acrn_vm *vm,
+						   ioreq_handler_t handler,
+						   void *priv, bool is_default,
+						   const char *name)
+{
+	struct acrn_ioreq_client *client;
+
+	if (!handler && !is_default) {
+		dev_dbg(acrn_dev.this_device,
+			"Cannot create non-default client w/o handler!\n");
+		return NULL;
+	}
+	client = kzalloc(sizeof(*client), GFP_KERNEL);
+	if (!client)
+		return NULL;
+
+	client->handler = handler;
+	client->vm = vm;
+	client->priv = priv;
+	client->is_default = is_default;
+	if (name)
+		strncpy(client->name, name, sizeof(client->name) - 1);
+	rwlock_init(&client->range_lock);
+	INIT_LIST_HEAD(&client->range_list);
+	init_waitqueue_head(&client->wq);
+
+	if (client->handler) {
+		client->thread = kthread_run(ioreq_task, client, "VM%u-%s",
+					     client->vm->vmid, client->name);
+		if (IS_ERR(client->thread)) {
+			kfree(client);
+			return NULL;
+		}
+	}
+
+	spin_lock_bh(&vm->ioreq_clients_lock);
+	if (is_default)
+		vm->default_client = client;
+	else
+		list_add(&client->list, &vm->ioreq_clients);
+	spin_unlock_bh(&vm->ioreq_clients_lock);
+
+	dev_dbg(acrn_dev.this_device, "Created ioreq client %s.\n", name);
+	return client;
+}
+
+/**
+ * acrn_ioreq_client_destroy() - Destroy an ioreq client
+ * @client:	The ioreq client
+ */
+void acrn_ioreq_client_destroy(struct acrn_ioreq_client *client)
+{
+	struct acrn_ioreq_range *range, *next;
+	struct acrn_vm *vm = client->vm;
+
+	dev_dbg(acrn_dev.this_device,
+		"Destroy ioreq client %s.\n", client->name);
+	ioreq_pause();
+	set_bit(ACRN_IOREQ_CLIENT_DESTROYING, &client->flags);
+	if (client->is_default)
+		wake_up_interruptible(&client->wq);
+	else
+		kthread_stop(client->thread);
+
+	spin_lock_bh(&vm->ioreq_clients_lock);
+	if (client->is_default)
+		vm->default_client = NULL;
+	else
+		list_del(&client->list);
+	spin_unlock_bh(&vm->ioreq_clients_lock);
+
+	write_lock_bh(&client->range_lock);
+	list_for_each_entry_safe(range, next, &client->range_list, list) {
+		list_del(&range->list);
+		kfree(range);
+	}
+	write_unlock_bh(&client->range_lock);
+	kfree(client);
+
+	ioreq_resume();
+}
+
+static int acrn_ioreq_dispatch(struct acrn_vm *vm)
+{
+	struct acrn_ioreq_client *client;
+	struct acrn_io_request *req;
+	int i;
+
+	for (i = 0; i < vm->vcpu_num; i++) {
+		req = vm->ioreq_buf->req_slot + i;
+
+		/* barrier the read of processed of acrn_io_request */
+		if (smp_load_acquire(&req->processed) ==
+				     ACRN_IOREQ_STATE_PENDING) {
+			/* Complete the IO request directly in clearing stage */
+			if (test_bit(ACRN_VM_FLAG_CLEARING_IOREQ, &vm->flags)) {
+				ioreq_complete_request(vm, i, req);
+				continue;
+			}
+
+			spin_lock_bh(&vm->ioreq_clients_lock);
+			client = find_ioreq_client(vm, req);
+			if (!client) {
+				dev_err(acrn_dev.this_device,
+					"Failed to find ioreq client!\n");
+				spin_unlock_bh(&vm->ioreq_clients_lock);
+				return -EINVAL;
+			}
+			if (!client->is_default)
+				req->kernel_handled = 1;
+			else
+				req->kernel_handled = 0;
+			/*
+			 * Add barrier() to make sure the writes are done
+			 * before setting ACRN_IOREQ_STATE_PROCESSING
+			 */
+			smp_store_release(&req->processed,
+					  ACRN_IOREQ_STATE_PROCESSING);
+			set_bit(i, client->ioreqs_map);
+			wake_up_interruptible(&client->wq);
+			spin_unlock_bh(&vm->ioreq_clients_lock);
+		}
+	}
+
+	return 0;
+}
+
+static void ioreq_dispatcher(struct work_struct *work)
+{
+	struct acrn_vm *vm;
+
+	read_lock(&acrn_vm_list_lock);
+	list_for_each_entry(vm, &acrn_vm_list, list) {
+		if (!vm->ioreq_buf)
+			break;
+		acrn_ioreq_dispatch(vm);
+	}
+	read_unlock(&acrn_vm_list_lock);
+}
+
+static void ioreq_intr_handler(void)
+{
+	queue_work(ioreq_wq, &ioreq_work);
+}
+
+static void ioreq_pause(void)
+{
+	/* Flush and unarm the handler to ensure no I/O requests pending */
+	acrn_remove_intr_handler();
+	drain_workqueue(ioreq_wq);
+}
+
+static void ioreq_resume(void)
+{
+	/* Schedule after enabling in case other clients miss interrupt */
+	acrn_setup_intr_handler(ioreq_intr_handler);
+	queue_work(ioreq_wq, &ioreq_work);
+}
+
+int acrn_ioreq_intr_setup(void)
+{
+	acrn_setup_intr_handler(ioreq_intr_handler);
+	ioreq_wq = alloc_workqueue("ioreq_wq",
+				   WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+	if (!ioreq_wq) {
+		dev_err(acrn_dev.this_device, "Failed to alloc workqueue!\n");
+		acrn_remove_intr_handler();
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+void acrn_ioreq_intr_remove(void)
+{
+	if (ioreq_wq)
+		destroy_workqueue(ioreq_wq);
+	acrn_remove_intr_handler();
+}
+
+int acrn_ioreq_init(struct acrn_vm *vm, u64 buf_vma)
+{
+	struct acrn_ioreq_buffer *set_buffer;
+	struct page *page;
+	int ret;
+
+	if (vm->ioreq_buf)
+		return -EEXIST;
+
+	set_buffer = kzalloc(sizeof(*set_buffer), GFP_KERNEL);
+	if (!set_buffer)
+		return -ENOMEM;
+
+	ret = pin_user_pages_fast(buf_vma, 1,
+				  FOLL_WRITE | FOLL_LONGTERM, &page);
+	if (unlikely(ret != 1) || !page) {
+		dev_err(acrn_dev.this_device, "Failed to pin ioreq page!\n");
+		ret = -EFAULT;
+		goto free_buf;
+	}
+
+	vm->ioreq_buf = page_address(page);
+	vm->ioreq_page = page;
+	set_buffer->ioreq_buf = page_to_phys(page);
+	ret = hcall_set_ioreq_buffer(vm->vmid, virt_to_phys(set_buffer));
+	if (ret < 0) {
+		dev_err(acrn_dev.this_device, "Failed to init ioreq buffer!\n");
+		unpin_user_page(page);
+		vm->ioreq_buf = NULL;
+		goto free_buf;
+	}
+
+	dev_dbg(acrn_dev.this_device,
+		"Init ioreq buffer %pK!\n", vm->ioreq_buf);
+	ret = 0;
+free_buf:
+	kfree(set_buffer);
+	return ret;
+}
+
+void acrn_ioreq_deinit(struct acrn_vm *vm)
+{
+	struct acrn_ioreq_client *client, *next;
+
+	dev_dbg(acrn_dev.this_device,
+		"Deinit ioreq buffer %pK!\n", vm->ioreq_buf);
+	/* Destroy all clients belonging to this VM */
+	list_for_each_entry_safe(client, next, &vm->ioreq_clients, list)
+		acrn_ioreq_client_destroy(client);
+	if (vm->default_client)
+		acrn_ioreq_client_destroy(vm->default_client);
+
+	if (vm->ioreq_buf && vm->ioreq_page) {
+		unpin_user_page(vm->ioreq_page);
+		vm->ioreq_buf = NULL;
+	}
+}
diff --git a/drivers/virt/acrn/vm.c b/drivers/virt/acrn/vm.c
index ff5df7acb551..2bc649c7c1d3 100644
--- a/drivers/virt/acrn/vm.c
+++ b/drivers/virt/acrn/vm.c
@@ -15,9 +15,12 @@
 #include "acrn_drv.h"
 
 /* List of VMs */
-static LIST_HEAD(acrn_vm_list);
-/* To protect acrn_vm_list */
-static DEFINE_MUTEX(acrn_vm_list_lock);
+LIST_HEAD(acrn_vm_list);
+/*
+ * acrn_vm_list is read in a worker thread which dispatch I/O requests and
+ * is wrote in VM creation ioctl. Use the rwlock mechanism to protect it.
+ */
+DEFINE_RWLOCK(acrn_vm_list_lock);
 
 struct acrn_vm *acrn_vm_create(struct acrn_vm *vm,
 			       struct acrn_vm_creation *vm_param)
@@ -32,12 +35,20 @@ struct acrn_vm *acrn_vm_create(struct acrn_vm *vm,
 	}
 
 	mutex_init(&vm->regions_mapping_lock);
+	INIT_LIST_HEAD(&vm->ioreq_clients);
+	spin_lock_init(&vm->ioreq_clients_lock);
 	vm->vmid = vm_param->vmid;
 	vm->vcpu_num = vm_param->vcpu_num;
 
-	mutex_lock(&acrn_vm_list_lock);
+	if (acrn_ioreq_init(vm, vm_param->ioreq_buf) < 0) {
+		hcall_destroy_vm(vm_param->vmid);
+		vm->vmid = ACRN_INVALID_VMID;
+		return NULL;
+	}
+
+	write_lock_bh(&acrn_vm_list_lock);
 	list_add(&vm->list, &acrn_vm_list);
-	mutex_unlock(&acrn_vm_list_lock);
+	write_unlock_bh(&acrn_vm_list_lock);
 
 	dev_dbg(acrn_dev.this_device, "VM %u created.\n", vm->vmid);
 	return vm;
@@ -52,9 +63,11 @@ int acrn_vm_destroy(struct acrn_vm *vm)
 		return 0;
 
 	/* Remove from global VM list */
-	mutex_lock(&acrn_vm_list_lock);
+	write_lock_bh(&acrn_vm_list_lock);
 	list_del_init(&vm->list);
-	mutex_unlock(&acrn_vm_list_lock);
+	write_unlock_bh(&acrn_vm_list_lock);
+
+	acrn_ioreq_deinit(vm);
 
 	ret = hcall_destroy_vm(vm->vmid);
 	if (ret < 0) {
diff --git a/include/uapi/linux/acrn.h b/include/uapi/linux/acrn.h
index ec4c61e7c170..b0f73ab5e4e8 100644
--- a/include/uapi/linux/acrn.h
+++ b/include/uapi/linux/acrn.h
@@ -14,6 +14,145 @@
 #include <linux/types.h>
 #include <linux/uuid.h>
 
+#define ACRN_IO_REQUEST_MAX		16
+
+#define ACRN_IOREQ_STATE_PENDING	0
+#define ACRN_IOREQ_STATE_COMPLETE	1
+#define ACRN_IOREQ_STATE_PROCESSING	2
+#define ACRN_IOREQ_STATE_FREE		3
+
+#define ACRN_IOREQ_TYPE_PORTIO		0
+#define ACRN_IOREQ_TYPE_MMIO		1
+
+#define ACRN_IOREQ_DIR_READ		0
+#define ACRN_IOREQ_DIR_WRITE		1
+
+/**
+ * struct acrn_mmio_request - Info of a MMIO I/O request
+ * @direction:	Access direction of this request (ACRN_IOREQ_DIR_*)
+ * @reserved:	Reserved for alignment and should be 0
+ * @address:	Access address of this MMIO I/O request
+ * @size:	Access size of this MMIO I/O request
+ * @value:	Read/write value of this MMIO I/O request
+ */
+struct acrn_mmio_request {
+	__u32	direction;
+	__u32	reserved;
+	__u64	address;
+	__u64	size;
+	__u64	value;
+};
+
+/**
+ * struct acrn_pio_request - Info of a PIO I/O request
+ * @direction:	Access direction of this request (ACRN_IOREQ_DIR_*)
+ * @reserved:	Reserved for alignment and should be 0
+ * @address:	Access address of this PIO I/O request
+ * @size:	Access size of this PIO I/O request
+ * @value:	Read/write value of this PIO I/O request
+ */
+struct acrn_pio_request {
+	__u32	direction;
+	__u32	reserved;
+	__u64	address;
+	__u64	size;
+	__u32	value;
+};
+
+/**
+ * struct acrn_io_request - 256-byte ACRN I/O request
+ * @type:		Type of this request (ACRN_IOREQ_TYPE_*).
+ * @completion_polling:	Polling flag. Hypervisor will poll completion of the
+ *			I/O request if this flag set.
+ * @reserved0:		Reserved fields.
+ * @reqs:		Union of different types of request. Byte offset: 64.
+ * @reqs.pio_request:	PIO request data of the I/O request.
+ * @reqs.mmio_request:	MMIO request data of the I/O request.
+ * @reqs.data:		Raw data of the I/O request.
+ * @reserved1:		Reserved fields.
+ * @kernel_handled:	Flag indicates this request need be handled in kernel.
+ * @processed:		The status of this request (ACRN_IOREQ_STATE_*).
+ *
+ * The state transitions of ACRN I/O request:
+ *
+ *    FREE -> PENDING -> PROCESSING -> COMPLETE -> FREE -> ...
+ *
+ * An I/O request in COMPLETE or FREE state is owned by the hypervisor. HSM and
+ * ACRN userspace are in charge of processing the others.
+ *
+ * On basis of the states illustrated above, a typical lifecycle of ACRN IO
+ * request would look like:
+ *
+ * Flow                 (assume the initial state is FREE)
+ * |
+ * |   Service VM vCPU 0     Service VM vCPU x      User vCPU y
+ * |
+ * |                                             hypervisor:
+ * |                                               fills in type, addr, etc.
+ * |                                               pauses the User VM vCPU y
+ * |                                               sets the state to PENDING (a)
+ * |                                               fires an upcall to Service VM
+ * |
+ * | HSM:
+ * |  scans for PENDING requests
+ * |  sets the states to PROCESSING (b)
+ * |  assigns the requests to clients (c)
+ * V
+ * |                     client:
+ * |                       scans for the assigned requests
+ * |                       handles the requests (d)
+ * |                     HSM:
+ * |                       sets states to COMPLETE
+ * |                       notifies the hypervisor
+ * |
+ * |                     hypervisor:
+ * |                       resumes User VM vCPU y (e)
+ * |
+ * |                                             hypervisor:
+ * |                                               post handling (f)
+ * V                                               sets states to FREE
+ *
+ * Note that the procedures (a) to (f) in the illustration above require to be
+ * strictly processed in the order.  One vCPU cannot trigger another request of
+ * I/O emulation before completing the previous one.
+ *
+ * Atomic and barriers are required when HSM and hypervisor accessing the state
+ * of &struct acrn_io_request.
+ *
+ */
+struct acrn_io_request {
+	__u32	type;
+	__u32	completion_polling;
+	__u32	reserved0[14];
+	union {
+		struct acrn_pio_request		pio_request;
+		struct acrn_mmio_request	mmio_request;
+		__u64				data[8];
+	} reqs;
+	__u32	reserved1;
+	__u32	kernel_handled;
+	__u32	processed;
+} __attribute__((aligned(256)));
+
+struct acrn_io_request_buffer {
+	union {
+		struct acrn_io_request	req_slot[ACRN_IO_REQUEST_MAX];
+		__u8			reserved[4096];
+	};
+};
+
+/**
+ * struct acrn_ioreq_notify - The structure of ioreq completion notification
+ * @vmid:	User VM ID
+ * @reserved:	Reserved and should be 0
+ * @vcpu:	vCPU ID
+ */
+struct acrn_ioreq_notify {
+	__u16	vmid;
+	__u16	reserved;
+	__u32	vcpu;
+};
+
 /**
  * struct acrn_vm_creation - Info to create a User VM
  * @vmid:		User VM ID returned from the hypervisor
@@ -218,6 +357,17 @@ struct acrn_vm_memmap {
 #define ACRN_IOCTL_SET_VCPU_REGS	\
 	_IOW(ACRN_IOCTL_TYPE, 0x16, struct acrn_vcpu_regs)
 
+#define ACRN_IOCTL_NOTIFY_REQUEST_FINISH \
+	_IOW(ACRN_IOCTL_TYPE, 0x31, struct acrn_ioreq_notify)
+#define ACRN_IOCTL_CREATE_IOREQ_CLIENT	\
+	_IO(ACRN_IOCTL_TYPE, 0x32)
+#define ACRN_IOCTL_ATTACH_IOREQ_CLIENT	\
+	_IO(ACRN_IOCTL_TYPE, 0x33)
+#define ACRN_IOCTL_DESTROY_IOREQ_CLIENT	\
+	_IO(ACRN_IOCTL_TYPE, 0x34)
+#define ACRN_IOCTL_CLEAR_VM_IOREQ	\
+	_IO(ACRN_IOCTL_TYPE, 0x35)
+
 #define ACRN_IOCTL_SET_MEMSEG		\
 	_IOW(ACRN_IOCTL_TYPE, 0x41, struct acrn_vm_memmap)
 #define ACRN_IOCTL_UNSET_MEMSEG		\
-- 
cgit v1.2.3-71-gd317


From 3c4c331667d4d9f1b5f3fdff9c4db36776da30ae Mon Sep 17 00:00:00 2001
From: Shuo Liu <shuo.a.liu@intel.com>
Date: Sun, 7 Feb 2021 11:10:32 +0800
Subject: virt: acrn: Introduce PCI configuration space PIO accesses combiner

A User VM can access its virtual PCI configuration spaces via port IO
approach, which has two following steps:
 1) writes address into port 0xCF8
 2) put/get data in/from port 0xCFC

To distribute a complete PCI configuration space access one time, HSM
need to combine such two accesses together.

Combine two paired PIO I/O requests into one PCI I/O request and
continue the I/O request distribution.

Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Shuo Liu <shuo.a.liu@intel.com>
Link: https://lore.kernel.org/r/20210207031040.49576-11-shuo.a.liu@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/virt/acrn/acrn_drv.h |  2 ++
 drivers/virt/acrn/ioreq.c    | 76 ++++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/acrn.h    | 27 ++++++++++++++++
 3 files changed, 105 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/virt/acrn/acrn_drv.h b/drivers/virt/acrn/acrn_drv.h
index 68bd8db6c8be..6c92a505fa20 100644
--- a/drivers/virt/acrn/acrn_drv.h
+++ b/drivers/virt/acrn/acrn_drv.h
@@ -154,6 +154,7 @@ extern rwlock_t acrn_vm_list_lock;
  * @default_client:		The default I/O request client
  * @ioreq_buf:			I/O request shared buffer
  * @ioreq_page:			The page of the I/O request shared buffer
+ * @pci_conf_addr:		Address of a PCI configuration access emulation
  */
 struct acrn_vm {
 	struct list_head		list;
@@ -168,6 +169,7 @@ struct acrn_vm {
 	struct acrn_ioreq_client	*default_client;
 	struct acrn_io_request_buffer	*ioreq_buf;
 	struct page			*ioreq_page;
+	u32				pci_conf_addr;
 };
 
 struct acrn_vm *acrn_vm_create(struct acrn_vm *vm,
diff --git a/drivers/virt/acrn/ioreq.c b/drivers/virt/acrn/ioreq.c
index 51cb41ef7c72..d19c05582d38 100644
--- a/drivers/virt/acrn/ioreq.c
+++ b/drivers/virt/acrn/ioreq.c
@@ -222,6 +222,80 @@ int acrn_ioreq_client_wait(struct acrn_ioreq_client *client)
 	return 0;
 }
 
+static bool is_cfg_addr(struct acrn_io_request *req)
+{
+	return ((req->type == ACRN_IOREQ_TYPE_PORTIO) &&
+		(req->reqs.pio_request.address == 0xcf8));
+}
+
+static bool is_cfg_data(struct acrn_io_request *req)
+{
+	return ((req->type == ACRN_IOREQ_TYPE_PORTIO) &&
+		((req->reqs.pio_request.address >= 0xcfc) &&
+		 (req->reqs.pio_request.address < (0xcfc + 4))));
+}
+
+/* The low 8-bit of supported pci_reg addr.*/
+#define PCI_LOWREG_MASK  0xFC
+/* The high 4-bit of supported pci_reg addr */
+#define PCI_HIGHREG_MASK 0xF00
+/* Max number of supported functions */
+#define PCI_FUNCMAX	7
+/* Max number of supported slots */
+#define PCI_SLOTMAX	31
+/* Max number of supported buses */
+#define PCI_BUSMAX	255
+#define CONF1_ENABLE	0x80000000UL
+/*
+ * A PCI configuration space access via PIO 0xCF8 and 0xCFC normally has two
+ * following steps:
+ *   1) writes address into 0xCF8 port
+ *   2) accesses data in/from 0xCFC
+ * This function combines such paired PCI configuration space I/O requests into
+ * one ACRN_IOREQ_TYPE_PCICFG type I/O request and continues the processing.
+ */
+static bool handle_cf8cfc(struct acrn_vm *vm,
+			  struct acrn_io_request *req, u16 vcpu)
+{
+	int offset, pci_cfg_addr, pci_reg;
+	bool is_handled = false;
+
+	if (is_cfg_addr(req)) {
+		WARN_ON(req->reqs.pio_request.size != 4);
+		if (req->reqs.pio_request.direction == ACRN_IOREQ_DIR_WRITE)
+			vm->pci_conf_addr = req->reqs.pio_request.value;
+		else
+			req->reqs.pio_request.value = vm->pci_conf_addr;
+		is_handled = true;
+	} else if (is_cfg_data(req)) {
+		if (!(vm->pci_conf_addr & CONF1_ENABLE)) {
+			if (req->reqs.pio_request.direction ==
+					ACRN_IOREQ_DIR_READ)
+				req->reqs.pio_request.value = 0xffffffff;
+			is_handled = true;
+		} else {
+			offset = req->reqs.pio_request.address - 0xcfc;
+
+			req->type = ACRN_IOREQ_TYPE_PCICFG;
+			pci_cfg_addr = vm->pci_conf_addr;
+			req->reqs.pci_request.bus =
+					(pci_cfg_addr >> 16) & PCI_BUSMAX;
+			req->reqs.pci_request.dev =
+					(pci_cfg_addr >> 11) & PCI_SLOTMAX;
+			req->reqs.pci_request.func =
+					(pci_cfg_addr >> 8) & PCI_FUNCMAX;
+			pci_reg = (pci_cfg_addr & PCI_LOWREG_MASK) +
+				   ((pci_cfg_addr >> 16) & PCI_HIGHREG_MASK);
+			req->reqs.pci_request.reg = pci_reg + offset;
+		}
+	}
+
+	if (is_handled)
+		ioreq_complete_request(vm, vcpu, req);
+
+	return is_handled;
+}
+
 static bool in_range(struct acrn_ioreq_range *range,
 		     struct acrn_io_request *req)
 {
@@ -382,6 +456,8 @@ static int acrn_ioreq_dispatch(struct acrn_vm *vm)
 				ioreq_complete_request(vm, i, req);
 				continue;
 			}
+			if (handle_cf8cfc(vm, req, i))
+				continue;
 
 			spin_lock_bh(&vm->ioreq_clients_lock);
 			client = find_ioreq_client(vm, req);
diff --git a/include/uapi/linux/acrn.h b/include/uapi/linux/acrn.h
index b0f73ab5e4e8..da40f7ad13d9 100644
--- a/include/uapi/linux/acrn.h
+++ b/include/uapi/linux/acrn.h
@@ -23,6 +23,7 @@
 
 #define ACRN_IOREQ_TYPE_PORTIO		0
 #define ACRN_IOREQ_TYPE_MMIO		1
+#define ACRN_IOREQ_TYPE_PCICFG		2
 
 #define ACRN_IOREQ_DIR_READ		0
 #define ACRN_IOREQ_DIR_WRITE		1
@@ -59,6 +60,30 @@ struct acrn_pio_request {
 	__u32	value;
 };
 
+/**
+ * struct acrn_pci_request - Info of a PCI I/O request
+ * @direction:	Access direction of this request (ACRN_IOREQ_DIR_*)
+ * @reserved:	Reserved for alignment and should be 0
+ * @size:	Access size of this PCI I/O request
+ * @value:	Read/write value of this PIO I/O request
+ * @bus:	PCI bus value of this PCI I/O request
+ * @dev:	PCI device value of this PCI I/O request
+ * @func:	PCI function value of this PCI I/O request
+ * @reg:	PCI config space offset of this PCI I/O request
+ *
+ * Need keep same header layout with &struct acrn_pio_request.
+ */
+struct acrn_pci_request {
+	__u32	direction;
+	__u32	reserved[3];
+	__u64	size;
+	__u32	value;
+	__u32	bus;
+	__u32	dev;
+	__u32	func;
+	__u32	reg;
+};
+
 /**
  * struct acrn_io_request - 256-byte ACRN I/O request
  * @type:		Type of this request (ACRN_IOREQ_TYPE_*).
@@ -67,6 +92,7 @@ struct acrn_pio_request {
  * @reserved0:		Reserved fields.
  * @reqs:		Union of different types of request. Byte offset: 64.
  * @reqs.pio_request:	PIO request data of the I/O request.
+ * @reqs.pci_request:	PCI configuration space request data of the I/O request.
  * @reqs.mmio_request:	MMIO request data of the I/O request.
  * @reqs.data:		Raw data of the I/O request.
  * @reserved1:		Reserved fields.
@@ -126,6 +152,7 @@ struct acrn_io_request {
 	__u32	reserved0[14];
 	union {
 		struct acrn_pio_request		pio_request;
+		struct acrn_pci_request		pci_request;
 		struct acrn_mmio_request	mmio_request;
 		__u64				data[8];
 	} reqs;
-- 
cgit v1.2.3-71-gd317


From ce011e1363a1fe43de0ca05abc394022ee4fefeb Mon Sep 17 00:00:00 2001
From: Shuo Liu <shuo.a.liu@intel.com>
Date: Sun, 7 Feb 2021 11:10:33 +0800
Subject: virt: acrn: Introduce interfaces for PCI device passthrough

PCI device passthrough enables an OS in a virtual machine to directly
access a PCI device in the host. It promises almost the native
performance, which is required in performance-critical scenarios of
ACRN.

HSM provides the following ioctls:
 - Assign - ACRN_IOCTL_ASSIGN_PCIDEV
   Pass data struct acrn_pcidev from userspace to the hypervisor, and
   inform the hypervisor to assign a PCI device to a User VM.

 - De-assign - ACRN_IOCTL_DEASSIGN_PCIDEV
   Pass data struct acrn_pcidev from userspace to the hypervisor, and
   inform the hypervisor to de-assign a PCI device from a User VM.

 - Set a interrupt of a passthrough device - ACRN_IOCTL_SET_PTDEV_INTR
   Pass data struct acrn_ptdev_irq from userspace to the hypervisor,
   and inform the hypervisor to map a INTx interrupt of passthrough
   device of User VM.

 - Reset passthrough device interrupt - ACRN_IOCTL_RESET_PTDEV_INTR
   Pass data struct acrn_ptdev_irq from userspace to the hypervisor,
   and inform the hypervisor to unmap a INTx interrupt of passthrough
   device of User VM.

Cc: Zhi Wang <zhi.a.wang@intel.com>
Cc: Zhenyu Wang <zhenyuw@linux.intel.com>
Cc: Yu Wang <yu1.wang@intel.com>
Cc: Reinette Chatre <reinette.chatre@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Zhi Wang <zhi.a.wang@intel.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Shuo Liu <shuo.a.liu@intel.com>
Link: https://lore.kernel.org/r/20210207031040.49576-12-shuo.a.liu@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/virt/acrn/hsm.c       | 50 +++++++++++++++++++++++++++++++++++
 drivers/virt/acrn/hypercall.h | 54 ++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/acrn.h     | 61 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 165 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/virt/acrn/hsm.c b/drivers/virt/acrn/hsm.c
index 1cc0c612dc09..94d70b1c1e5c 100644
--- a/drivers/virt/acrn/hsm.c
+++ b/drivers/virt/acrn/hsm.c
@@ -49,7 +49,9 @@ static long acrn_dev_ioctl(struct file *filp, unsigned int cmd,
 	struct acrn_vm_creation *vm_param;
 	struct acrn_vcpu_regs *cpu_regs;
 	struct acrn_ioreq_notify notify;
+	struct acrn_ptdev_irq *irq_info;
 	struct acrn_vm_memmap memmap;
+	struct acrn_pcidev *pcidev;
 	int i, ret = 0;
 
 	if (vm->vmid == ACRN_INVALID_VMID && cmd != ACRN_IOCTL_CREATE_VM) {
@@ -148,6 +150,54 @@ static long acrn_dev_ioctl(struct file *filp, unsigned int cmd,
 
 		ret = acrn_vm_memseg_unmap(vm, &memmap);
 		break;
+	case ACRN_IOCTL_ASSIGN_PCIDEV:
+		pcidev = memdup_user((void __user *)ioctl_param,
+				     sizeof(struct acrn_pcidev));
+		if (IS_ERR(pcidev))
+			return PTR_ERR(pcidev);
+
+		ret = hcall_assign_pcidev(vm->vmid, virt_to_phys(pcidev));
+		if (ret < 0)
+			dev_dbg(acrn_dev.this_device,
+				"Failed to assign pci device!\n");
+		kfree(pcidev);
+		break;
+	case ACRN_IOCTL_DEASSIGN_PCIDEV:
+		pcidev = memdup_user((void __user *)ioctl_param,
+				     sizeof(struct acrn_pcidev));
+		if (IS_ERR(pcidev))
+			return PTR_ERR(pcidev);
+
+		ret = hcall_deassign_pcidev(vm->vmid, virt_to_phys(pcidev));
+		if (ret < 0)
+			dev_dbg(acrn_dev.this_device,
+				"Failed to deassign pci device!\n");
+		kfree(pcidev);
+		break;
+	case ACRN_IOCTL_SET_PTDEV_INTR:
+		irq_info = memdup_user((void __user *)ioctl_param,
+				       sizeof(struct acrn_ptdev_irq));
+		if (IS_ERR(irq_info))
+			return PTR_ERR(irq_info);
+
+		ret = hcall_set_ptdev_intr(vm->vmid, virt_to_phys(irq_info));
+		if (ret < 0)
+			dev_dbg(acrn_dev.this_device,
+				"Failed to configure intr for ptdev!\n");
+		kfree(irq_info);
+		break;
+	case ACRN_IOCTL_RESET_PTDEV_INTR:
+		irq_info = memdup_user((void __user *)ioctl_param,
+				       sizeof(struct acrn_ptdev_irq));
+		if (IS_ERR(irq_info))
+			return PTR_ERR(irq_info);
+
+		ret = hcall_reset_ptdev_intr(vm->vmid, virt_to_phys(irq_info));
+		if (ret < 0)
+			dev_dbg(acrn_dev.this_device,
+				"Failed to reset intr for ptdev!\n");
+		kfree(irq_info);
+		break;
 	case ACRN_IOCTL_CREATE_IOREQ_CLIENT:
 		if (vm->default_client)
 			return -EEXIST;
diff --git a/drivers/virt/acrn/hypercall.h b/drivers/virt/acrn/hypercall.h
index 5eba29e3ed38..f448301832cf 100644
--- a/drivers/virt/acrn/hypercall.h
+++ b/drivers/virt/acrn/hypercall.h
@@ -28,6 +28,12 @@
 #define HC_ID_MEM_BASE			0x40UL
 #define HC_VM_SET_MEMORY_REGIONS	_HC_ID(HC_ID, HC_ID_MEM_BASE + 0x02)
 
+#define HC_ID_PCI_BASE			0x50UL
+#define HC_SET_PTDEV_INTR		_HC_ID(HC_ID, HC_ID_PCI_BASE + 0x03)
+#define HC_RESET_PTDEV_INTR		_HC_ID(HC_ID, HC_ID_PCI_BASE + 0x04)
+#define HC_ASSIGN_PCIDEV		_HC_ID(HC_ID, HC_ID_PCI_BASE + 0x05)
+#define HC_DEASSIGN_PCIDEV		_HC_ID(HC_ID, HC_ID_PCI_BASE + 0x06)
+
 /**
  * hcall_create_vm() - Create a User VM
  * @vminfo:	Service VM GPA of info of User VM creation
@@ -130,4 +136,52 @@ static inline long hcall_set_memory_regions(u64 regions_pa)
 	return acrn_hypercall1(HC_VM_SET_MEMORY_REGIONS, regions_pa);
 }
 
+/**
+ * hcall_assign_pcidev() - Assign a PCI device to a User VM
+ * @vmid:	User VM ID
+ * @addr:	Service VM GPA of the &struct acrn_pcidev
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static inline long hcall_assign_pcidev(u64 vmid, u64 addr)
+{
+	return acrn_hypercall2(HC_ASSIGN_PCIDEV, vmid, addr);
+}
+
+/**
+ * hcall_deassign_pcidev() - De-assign a PCI device from a User VM
+ * @vmid:	User VM ID
+ * @addr:	Service VM GPA of the &struct acrn_pcidev
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static inline long hcall_deassign_pcidev(u64 vmid, u64 addr)
+{
+	return acrn_hypercall2(HC_DEASSIGN_PCIDEV, vmid, addr);
+}
+
+/**
+ * hcall_set_ptdev_intr() - Configure an interrupt for an assigned PCI device.
+ * @vmid:	User VM ID
+ * @irq:	Service VM GPA of the &struct acrn_ptdev_irq
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static inline long hcall_set_ptdev_intr(u64 vmid, u64 irq)
+{
+	return acrn_hypercall2(HC_SET_PTDEV_INTR, vmid, irq);
+}
+
+/**
+ * hcall_reset_ptdev_intr() - Reset an interrupt for an assigned PCI device.
+ * @vmid:	User VM ID
+ * @irq:	Service VM GPA of the &struct acrn_ptdev_irq
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static inline long hcall_reset_ptdev_intr(u64 vmid, u64 irq)
+{
+	return acrn_hypercall2(HC_RESET_PTDEV_INTR, vmid, irq);
+}
+
 #endif /* __ACRN_HSM_HYPERCALL_H */
diff --git a/include/uapi/linux/acrn.h b/include/uapi/linux/acrn.h
index da40f7ad13d9..b25ca8c33b92 100644
--- a/include/uapi/linux/acrn.h
+++ b/include/uapi/linux/acrn.h
@@ -365,6 +365,58 @@ struct acrn_vm_memmap {
 	__u64	len;
 };
 
+/* Type of interrupt of a passthrough device */
+#define ACRN_PTDEV_IRQ_INTX	0
+#define ACRN_PTDEV_IRQ_MSI	1
+#define ACRN_PTDEV_IRQ_MSIX	2
+/**
+ * struct acrn_ptdev_irq - Interrupt data of a passthrough device.
+ * @type:		Type (ACRN_PTDEV_IRQ_*)
+ * @virt_bdf:		Virtual Bus/Device/Function
+ * @phys_bdf:		Physical Bus/Device/Function
+ * @intx:		Info of interrupt
+ * @intx.virt_pin:	Virtual IOAPIC pin
+ * @intx.phys_pin:	Physical IOAPIC pin
+ * @intx.is_pic_pin:	Is PIC pin or not
+ *
+ * This structure will be passed to hypervisor directly.
+ */
+struct acrn_ptdev_irq {
+	__u32	type;
+	__u16	virt_bdf;
+	__u16	phys_bdf;
+
+	struct {
+		__u32	virt_pin;
+		__u32	phys_pin;
+		__u32	is_pic_pin;
+	} intx;
+};
+
+/* Type of PCI device assignment */
+#define ACRN_PTDEV_QUIRK_ASSIGN	(1U << 0)
+
+#define ACRN_PCI_NUM_BARS	6
+/**
+ * struct acrn_pcidev - Info for assigning or de-assigning a PCI device
+ * @type:	Type of the assignment
+ * @virt_bdf:	Virtual Bus/Device/Function
+ * @phys_bdf:	Physical Bus/Device/Function
+ * @intr_line:	PCI interrupt line
+ * @intr_pin:	PCI interrupt pin
+ * @bar:	PCI BARs.
+ *
+ * This structure will be passed to hypervisor directly.
+ */
+struct acrn_pcidev {
+	__u32	type;
+	__u16	virt_bdf;
+	__u16	phys_bdf;
+	__u8	intr_line;
+	__u8	intr_pin;
+	__u32	bar[ACRN_PCI_NUM_BARS];
+};
+
 /* The ioctl type, documented in ioctl-number.rst */
 #define ACRN_IOCTL_TYPE			0xA2
 
@@ -400,4 +452,13 @@ struct acrn_vm_memmap {
 #define ACRN_IOCTL_UNSET_MEMSEG		\
 	_IOW(ACRN_IOCTL_TYPE, 0x42, struct acrn_vm_memmap)
 
+#define ACRN_IOCTL_SET_PTDEV_INTR	\
+	_IOW(ACRN_IOCTL_TYPE, 0x53, struct acrn_ptdev_irq)
+#define ACRN_IOCTL_RESET_PTDEV_INTR	\
+	_IOW(ACRN_IOCTL_TYPE, 0x54, struct acrn_ptdev_irq)
+#define ACRN_IOCTL_ASSIGN_PCIDEV	\
+	_IOW(ACRN_IOCTL_TYPE, 0x55, struct acrn_pcidev)
+#define ACRN_IOCTL_DEASSIGN_PCIDEV	\
+	_IOW(ACRN_IOCTL_TYPE, 0x56, struct acrn_pcidev)
+
 #endif /* _UAPI_ACRN_H */
-- 
cgit v1.2.3-71-gd317


From c7cf8d27244f2ccdde30c79eb6314c943bbeac28 Mon Sep 17 00:00:00 2001
From: Shuo Liu <shuo.a.liu@intel.com>
Date: Sun, 7 Feb 2021 11:10:34 +0800
Subject: virt: acrn: Introduce interrupt injection interfaces

ACRN userspace need to inject virtual interrupts into a User VM in
devices emulation.

HSM needs provide interfaces to do so.

Introduce following interrupt injection interfaces:

ioctl ACRN_IOCTL_SET_IRQLINE:
  Pass data from userspace to the hypervisor, and inform the hypervisor
  to inject a virtual IOAPIC GSI interrupt to a User VM.

ioctl ACRN_IOCTL_INJECT_MSI:
  Pass data struct acrn_msi_entry from userspace to the hypervisor, and
  inform the hypervisor to inject a virtual MSI to a User VM.

ioctl ACRN_IOCTL_VM_INTR_MONITOR:
  Set a 4-Kbyte aligned shared page for statistics information of
  interrupts of a User VM.

Cc: Zhi Wang <zhi.a.wang@intel.com>
Cc: Zhenyu Wang <zhenyuw@linux.intel.com>
Cc: Yu Wang <yu1.wang@intel.com>
Cc: Reinette Chatre <reinette.chatre@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Zhi Wang <zhi.a.wang@intel.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Shuo Liu <shuo.a.liu@intel.com>
Link: https://lore.kernel.org/r/20210207031040.49576-13-shuo.a.liu@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/virt/acrn/acrn_drv.h  |  4 ++++
 drivers/virt/acrn/hsm.c       | 40 ++++++++++++++++++++++++++++++++++++++++
 drivers/virt/acrn/hypercall.h | 41 +++++++++++++++++++++++++++++++++++++++++
 drivers/virt/acrn/vm.c        | 36 ++++++++++++++++++++++++++++++++++++
 include/uapi/linux/acrn.h     | 17 +++++++++++++++++
 5 files changed, 138 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/virt/acrn/acrn_drv.h b/drivers/virt/acrn/acrn_drv.h
index 6c92a505fa20..068f0be769f6 100644
--- a/drivers/virt/acrn/acrn_drv.h
+++ b/drivers/virt/acrn/acrn_drv.h
@@ -155,6 +155,7 @@ extern rwlock_t acrn_vm_list_lock;
  * @ioreq_buf:			I/O request shared buffer
  * @ioreq_page:			The page of the I/O request shared buffer
  * @pci_conf_addr:		Address of a PCI configuration access emulation
+ * @monitor_page:		Page of interrupt statistics of User VM
  */
 struct acrn_vm {
 	struct list_head		list;
@@ -170,6 +171,7 @@ struct acrn_vm {
 	struct acrn_io_request_buffer	*ioreq_buf;
 	struct page			*ioreq_page;
 	u32				pci_conf_addr;
+	struct page			*monitor_page;
 };
 
 struct acrn_vm *acrn_vm_create(struct acrn_vm *vm,
@@ -196,4 +198,6 @@ struct acrn_ioreq_client *acrn_ioreq_client_create(struct acrn_vm *vm,
 						   const char *name);
 void acrn_ioreq_client_destroy(struct acrn_ioreq_client *client);
 
+int acrn_msi_inject(struct acrn_vm *vm, u64 msi_addr, u64 msi_data);
+
 #endif /* __ACRN_HSM_DRV_H */
diff --git a/drivers/virt/acrn/hsm.c b/drivers/virt/acrn/hsm.c
index 94d70b1c1e5c..419271f32be8 100644
--- a/drivers/virt/acrn/hsm.c
+++ b/drivers/virt/acrn/hsm.c
@@ -51,7 +51,9 @@ static long acrn_dev_ioctl(struct file *filp, unsigned int cmd,
 	struct acrn_ioreq_notify notify;
 	struct acrn_ptdev_irq *irq_info;
 	struct acrn_vm_memmap memmap;
+	struct acrn_msi_entry *msi;
 	struct acrn_pcidev *pcidev;
+	struct page *page;
 	int i, ret = 0;
 
 	if (vm->vmid == ACRN_INVALID_VMID && cmd != ACRN_IOCTL_CREATE_VM) {
@@ -198,6 +200,44 @@ static long acrn_dev_ioctl(struct file *filp, unsigned int cmd,
 				"Failed to reset intr for ptdev!\n");
 		kfree(irq_info);
 		break;
+	case ACRN_IOCTL_SET_IRQLINE:
+		ret = hcall_set_irqline(vm->vmid, ioctl_param);
+		if (ret < 0)
+			dev_dbg(acrn_dev.this_device,
+				"Failed to set interrupt line!\n");
+		break;
+	case ACRN_IOCTL_INJECT_MSI:
+		msi = memdup_user((void __user *)ioctl_param,
+				  sizeof(struct acrn_msi_entry));
+		if (IS_ERR(msi))
+			return PTR_ERR(msi);
+
+		ret = hcall_inject_msi(vm->vmid, virt_to_phys(msi));
+		if (ret < 0)
+			dev_dbg(acrn_dev.this_device,
+				"Failed to inject MSI!\n");
+		kfree(msi);
+		break;
+	case ACRN_IOCTL_VM_INTR_MONITOR:
+		ret = pin_user_pages_fast(ioctl_param, 1,
+					  FOLL_WRITE | FOLL_LONGTERM, &page);
+		if (unlikely(ret != 1)) {
+			dev_dbg(acrn_dev.this_device,
+				"Failed to pin intr hdr buffer!\n");
+			return -EFAULT;
+		}
+
+		ret = hcall_vm_intr_monitor(vm->vmid, page_to_phys(page));
+		if (ret < 0) {
+			unpin_user_page(page);
+			dev_dbg(acrn_dev.this_device,
+				"Failed to monitor intr data!\n");
+			return ret;
+		}
+		if (vm->monitor_page)
+			unpin_user_page(vm->monitor_page);
+		vm->monitor_page = page;
+		break;
 	case ACRN_IOCTL_CREATE_IOREQ_CLIENT:
 		if (vm->default_client)
 			return -EEXIST;
diff --git a/drivers/virt/acrn/hypercall.h b/drivers/virt/acrn/hypercall.h
index f448301832cf..a8813397a3fe 100644
--- a/drivers/virt/acrn/hypercall.h
+++ b/drivers/virt/acrn/hypercall.h
@@ -21,6 +21,11 @@
 #define HC_RESET_VM			_HC_ID(HC_ID, HC_ID_VM_BASE + 0x05)
 #define HC_SET_VCPU_REGS		_HC_ID(HC_ID, HC_ID_VM_BASE + 0x06)
 
+#define HC_ID_IRQ_BASE			0x20UL
+#define HC_INJECT_MSI			_HC_ID(HC_ID, HC_ID_IRQ_BASE + 0x03)
+#define HC_VM_INTR_MONITOR		_HC_ID(HC_ID, HC_ID_IRQ_BASE + 0x04)
+#define HC_SET_IRQLINE			_HC_ID(HC_ID, HC_ID_IRQ_BASE + 0x05)
+
 #define HC_ID_IOREQ_BASE		0x30UL
 #define HC_SET_IOREQ_BUFFER		_HC_ID(HC_ID, HC_ID_IOREQ_BASE + 0x00)
 #define HC_NOTIFY_REQUEST_FINISH	_HC_ID(HC_ID, HC_ID_IOREQ_BASE + 0x01)
@@ -101,6 +106,42 @@ static inline long hcall_set_vcpu_regs(u64 vmid, u64 regs_state)
 	return acrn_hypercall2(HC_SET_VCPU_REGS, vmid, regs_state);
 }
 
+/**
+ * hcall_inject_msi() - Deliver a MSI interrupt to a User VM
+ * @vmid:	User VM ID
+ * @msi:	Service VM GPA of MSI message
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static inline long hcall_inject_msi(u64 vmid, u64 msi)
+{
+	return acrn_hypercall2(HC_INJECT_MSI, vmid, msi);
+}
+
+/**
+ * hcall_vm_intr_monitor() - Set a shared page for User VM interrupt statistics
+ * @vmid:	User VM ID
+ * @addr:	Service VM GPA of the shared page
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static inline long hcall_vm_intr_monitor(u64 vmid, u64 addr)
+{
+	return acrn_hypercall2(HC_VM_INTR_MONITOR, vmid, addr);
+}
+
+/**
+ * hcall_set_irqline() - Set or clear an interrupt line
+ * @vmid:	User VM ID
+ * @op:		Service VM GPA of interrupt line operations
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static inline long hcall_set_irqline(u64 vmid, u64 op)
+{
+	return acrn_hypercall2(HC_SET_IRQLINE, vmid, op);
+}
+
 /**
  * hcall_set_ioreq_buffer() - Set up the shared buffer for I/O Requests.
  * @vmid:	User VM ID
diff --git a/drivers/virt/acrn/vm.c b/drivers/virt/acrn/vm.c
index 2bc649c7c1d3..db597f27690a 100644
--- a/drivers/virt/acrn/vm.c
+++ b/drivers/virt/acrn/vm.c
@@ -68,6 +68,10 @@ int acrn_vm_destroy(struct acrn_vm *vm)
 	write_unlock_bh(&acrn_vm_list_lock);
 
 	acrn_ioreq_deinit(vm);
+	if (vm->monitor_page) {
+		put_page(vm->monitor_page);
+		vm->monitor_page = NULL;
+	}
 
 	ret = hcall_destroy_vm(vm->vmid);
 	if (ret < 0) {
@@ -83,3 +87,35 @@ int acrn_vm_destroy(struct acrn_vm *vm)
 	vm->vmid = ACRN_INVALID_VMID;
 	return 0;
 }
+
+/**
+ * acrn_inject_msi() - Inject a MSI interrupt into a User VM
+ * @vm:		User VM
+ * @msi_addr:	The MSI address
+ * @msi_data:	The MSI data
+ *
+ * Return: 0 on success, <0 on error
+ */
+int acrn_msi_inject(struct acrn_vm *vm, u64 msi_addr, u64 msi_data)
+{
+	struct acrn_msi_entry *msi;
+	int ret;
+
+	/* might be used in interrupt context, so use GFP_ATOMIC */
+	msi = kzalloc(sizeof(*msi), GFP_ATOMIC);
+	if (!msi)
+		return -ENOMEM;
+
+	/*
+	 * msi_addr: addr[19:12] with dest vcpu id
+	 * msi_data: data[7:0] with vector
+	 */
+	msi->msi_addr = msi_addr;
+	msi->msi_data = msi_data;
+	ret = hcall_inject_msi(vm->vmid, virt_to_phys(msi));
+	if (ret < 0)
+		dev_err(acrn_dev.this_device,
+			"Failed to inject MSI to VM %u!\n", vm->vmid);
+	kfree(msi);
+	return ret;
+}
diff --git a/include/uapi/linux/acrn.h b/include/uapi/linux/acrn.h
index b25ca8c33b92..b1c06b28ebdc 100644
--- a/include/uapi/linux/acrn.h
+++ b/include/uapi/linux/acrn.h
@@ -417,6 +417,16 @@ struct acrn_pcidev {
 	__u32	bar[ACRN_PCI_NUM_BARS];
 };
 
+/**
+ * struct acrn_msi_entry - Info for injecting a MSI interrupt to a VM
+ * @msi_addr:	MSI addr[19:12] with dest vCPU ID
+ * @msi_data:	MSI data[7:0] with vector
+ */
+struct acrn_msi_entry {
+	__u64	msi_addr;
+	__u64	msi_data;
+};
+
 /* The ioctl type, documented in ioctl-number.rst */
 #define ACRN_IOCTL_TYPE			0xA2
 
@@ -436,6 +446,13 @@ struct acrn_pcidev {
 #define ACRN_IOCTL_SET_VCPU_REGS	\
 	_IOW(ACRN_IOCTL_TYPE, 0x16, struct acrn_vcpu_regs)
 
+#define ACRN_IOCTL_INJECT_MSI		\
+	_IOW(ACRN_IOCTL_TYPE, 0x23, struct acrn_msi_entry)
+#define ACRN_IOCTL_VM_INTR_MONITOR	\
+	_IOW(ACRN_IOCTL_TYPE, 0x24, unsigned long)
+#define ACRN_IOCTL_SET_IRQLINE		\
+	_IOW(ACRN_IOCTL_TYPE, 0x25, __u64)
+
 #define ACRN_IOCTL_NOTIFY_REQUEST_FINISH \
 	_IOW(ACRN_IOCTL_TYPE, 0x31, struct acrn_ioreq_notify)
 #define ACRN_IOCTL_CREATE_IOREQ_CLIENT	\
-- 
cgit v1.2.3-71-gd317


From 3d679d5aec648f50e645702929890b9611998a0b Mon Sep 17 00:00:00 2001
From: Shuo Liu <shuo.a.liu@intel.com>
Date: Sun, 7 Feb 2021 11:10:35 +0800
Subject: virt: acrn: Introduce interfaces to query C-states and P-states
 allowed by hypervisor

The C-states and P-states data are used to support CPU power management.
The hypervisor controls C-states and P-states for a User VM.

ACRN userspace need to query the data from the hypervisor to build ACPI
tables for a User VM.

HSM provides ioctls for ACRN userspace to query C-states and P-states
data obtained from the hypervisor.

Cc: Zhi Wang <zhi.a.wang@intel.com>
Cc: Zhenyu Wang <zhenyuw@linux.intel.com>
Cc: Yu Wang <yu1.wang@intel.com>
Cc: Reinette Chatre <reinette.chatre@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Zhi Wang <zhi.a.wang@intel.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Shuo Liu <shuo.a.liu@intel.com>
Link: https://lore.kernel.org/r/20210207031040.49576-14-shuo.a.liu@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/virt/acrn/hsm.c       | 69 +++++++++++++++++++++++++++++++++++++++++++
 drivers/virt/acrn/hypercall.h | 12 ++++++++
 include/uapi/linux/acrn.h     | 55 ++++++++++++++++++++++++++++++++++
 3 files changed, 136 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/virt/acrn/hsm.c b/drivers/virt/acrn/hsm.c
index 419271f32be8..b7f2deddc3e7 100644
--- a/drivers/virt/acrn/hsm.c
+++ b/drivers/virt/acrn/hsm.c
@@ -38,6 +38,67 @@ static int acrn_dev_open(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+static int pmcmd_ioctl(u64 cmd, void __user *uptr)
+{
+	struct acrn_pstate_data *px_data;
+	struct acrn_cstate_data *cx_data;
+	u64 *pm_info;
+	int ret = 0;
+
+	switch (cmd & PMCMD_TYPE_MASK) {
+	case ACRN_PMCMD_GET_PX_CNT:
+	case ACRN_PMCMD_GET_CX_CNT:
+		pm_info = kmalloc(sizeof(u64), GFP_KERNEL);
+		if (!pm_info)
+			return -ENOMEM;
+
+		ret = hcall_get_cpu_state(cmd, virt_to_phys(pm_info));
+		if (ret < 0) {
+			kfree(pm_info);
+			break;
+		}
+
+		if (copy_to_user(uptr, pm_info, sizeof(u64)))
+			ret = -EFAULT;
+		kfree(pm_info);
+		break;
+	case ACRN_PMCMD_GET_PX_DATA:
+		px_data = kmalloc(sizeof(*px_data), GFP_KERNEL);
+		if (!px_data)
+			return -ENOMEM;
+
+		ret = hcall_get_cpu_state(cmd, virt_to_phys(px_data));
+		if (ret < 0) {
+			kfree(px_data);
+			break;
+		}
+
+		if (copy_to_user(uptr, px_data, sizeof(*px_data)))
+			ret = -EFAULT;
+		kfree(px_data);
+		break;
+	case ACRN_PMCMD_GET_CX_DATA:
+		cx_data = kmalloc(sizeof(*cx_data), GFP_KERNEL);
+		if (!cx_data)
+			return -ENOMEM;
+
+		ret = hcall_get_cpu_state(cmd, virt_to_phys(cx_data));
+		if (ret < 0) {
+			kfree(cx_data);
+			break;
+		}
+
+		if (copy_to_user(uptr, cx_data, sizeof(*cx_data)))
+			ret = -EFAULT;
+		kfree(cx_data);
+		break;
+	default:
+		break;
+	}
+
+	return ret;
+}
+
 /*
  * HSM relies on hypercall layer of the ACRN hypervisor to do the
  * sanity check against the input parameters.
@@ -54,6 +115,7 @@ static long acrn_dev_ioctl(struct file *filp, unsigned int cmd,
 	struct acrn_msi_entry *msi;
 	struct acrn_pcidev *pcidev;
 	struct page *page;
+	u64 cstate_cmd;
 	int i, ret = 0;
 
 	if (vm->vmid == ACRN_INVALID_VMID && cmd != ACRN_IOCTL_CREATE_VM) {
@@ -267,6 +329,13 @@ static long acrn_dev_ioctl(struct file *filp, unsigned int cmd,
 	case ACRN_IOCTL_CLEAR_VM_IOREQ:
 		acrn_ioreq_request_clear(vm);
 		break;
+	case ACRN_IOCTL_PM_GET_CPU_STATE:
+		if (copy_from_user(&cstate_cmd, (void *)ioctl_param,
+				   sizeof(cstate_cmd)))
+			return -EFAULT;
+
+		ret = pmcmd_ioctl(cstate_cmd, (void __user *)ioctl_param);
+		break;
 	default:
 		dev_dbg(acrn_dev.this_device, "Unknown IOCTL 0x%x!\n", cmd);
 		ret = -ENOTTY;
diff --git a/drivers/virt/acrn/hypercall.h b/drivers/virt/acrn/hypercall.h
index a8813397a3fe..e640632366f0 100644
--- a/drivers/virt/acrn/hypercall.h
+++ b/drivers/virt/acrn/hypercall.h
@@ -39,6 +39,9 @@
 #define HC_ASSIGN_PCIDEV		_HC_ID(HC_ID, HC_ID_PCI_BASE + 0x05)
 #define HC_DEASSIGN_PCIDEV		_HC_ID(HC_ID, HC_ID_PCI_BASE + 0x06)
 
+#define HC_ID_PM_BASE			0x80UL
+#define HC_PM_GET_CPU_STATE		_HC_ID(HC_ID, HC_ID_PM_BASE + 0x00)
+
 /**
  * hcall_create_vm() - Create a User VM
  * @vminfo:	Service VM GPA of info of User VM creation
@@ -225,4 +228,13 @@ static inline long hcall_reset_ptdev_intr(u64 vmid, u64 irq)
 	return acrn_hypercall2(HC_RESET_PTDEV_INTR, vmid, irq);
 }
 
+/*
+ * hcall_get_cpu_state() - Get P-states and C-states info from the hypervisor
+ * @state:	Service VM GPA of buffer of P-states and C-states
+ */
+static inline long hcall_get_cpu_state(u64 cmd, u64 state)
+{
+	return acrn_hypercall2(HC_PM_GET_CPU_STATE, cmd, state);
+}
+
 #endif /* __ACRN_HSM_HYPERCALL_H */
diff --git a/include/uapi/linux/acrn.h b/include/uapi/linux/acrn.h
index b1c06b28ebdc..e997d80a0cc7 100644
--- a/include/uapi/linux/acrn.h
+++ b/include/uapi/linux/acrn.h
@@ -427,6 +427,58 @@ struct acrn_msi_entry {
 	__u64	msi_data;
 };
 
+struct acrn_acpi_generic_address {
+	__u8	space_id;
+	__u8	bit_width;
+	__u8	bit_offset;
+	__u8	access_size;
+	__u64	address;
+} __attribute__ ((__packed__));
+
+/**
+ * struct acrn_cstate_data - A C state package defined in ACPI
+ * @cx_reg:	Register of the C state object
+ * @type:	Type of the C state object
+ * @latency:	The worst-case latency to enter and exit this C state
+ * @power:	The average power consumption when in this C state
+ */
+struct acrn_cstate_data {
+	struct acrn_acpi_generic_address	cx_reg;
+	__u8					type;
+	__u32					latency;
+	__u64					power;
+};
+
+/**
+ * struct acrn_pstate_data - A P state package defined in ACPI
+ * @core_frequency:	CPU frequency (in MHz).
+ * @power:		Power dissipation (in milliwatts).
+ * @transition_latency:	The worst-case latency in microseconds that CPU is
+ * 			unavailable during a transition from any P state to
+ * 			this P state.
+ * @bus_master_latency:	The worst-case latency in microseconds that Bus Masters
+ * 			are prevented from accessing memory during a transition
+ * 			from any P state to this P state.
+ * @control:		The value to be written to Performance Control Register
+ * @status:		Transition status.
+ */
+struct acrn_pstate_data {
+	__u64	core_frequency;
+	__u64	power;
+	__u64	transition_latency;
+	__u64	bus_master_latency;
+	__u64	control;
+	__u64	status;
+};
+
+#define PMCMD_TYPE_MASK		0x000000ff
+enum acrn_pm_cmd_type {
+	ACRN_PMCMD_GET_PX_CNT,
+	ACRN_PMCMD_GET_PX_DATA,
+	ACRN_PMCMD_GET_CX_CNT,
+	ACRN_PMCMD_GET_CX_DATA,
+};
+
 /* The ioctl type, documented in ioctl-number.rst */
 #define ACRN_IOCTL_TYPE			0xA2
 
@@ -478,4 +530,7 @@ struct acrn_msi_entry {
 #define ACRN_IOCTL_DEASSIGN_PCIDEV	\
 	_IOW(ACRN_IOCTL_TYPE, 0x56, struct acrn_pcidev)
 
+#define ACRN_IOCTL_PM_GET_CPU_STATE	\
+	_IOWR(ACRN_IOCTL_TYPE, 0x60, __u64)
+
 #endif /* _UAPI_ACRN_H */
-- 
cgit v1.2.3-71-gd317


From d8ad515156b66e7e79a6e4c814f997ee54eb47c7 Mon Sep 17 00:00:00 2001
From: Shuo Liu <shuo.a.liu@intel.com>
Date: Sun, 7 Feb 2021 11:10:37 +0800
Subject: virt: acrn: Introduce ioeventfd

ioeventfd is a mechanism to register PIO/MMIO regions to trigger an
eventfd signal when written to by a User VM. ACRN userspace can register
any arbitrary I/O address with a corresponding eventfd and then pass the
eventfd to a specific end-point of interest for handling.

Vhost is a kernel-level virtio server which uses eventfd for signalling.
To support vhost on ACRN, ioeventfd is introduced in HSM.

A new I/O client dedicated to ioeventfd is associated with a User VM
during VM creation. HSM provides ioctls to associate an I/O region with
a eventfd. The I/O client signals a eventfd once its corresponding I/O
region is matched with an I/O request.

Cc: Zhi Wang <zhi.a.wang@intel.com>
Cc: Zhenyu Wang <zhenyuw@linux.intel.com>
Cc: Yu Wang <yu1.wang@intel.com>
Cc: Reinette Chatre <reinette.chatre@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Zhi Wang <zhi.a.wang@intel.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Shuo Liu <shuo.a.liu@intel.com>
Link: https://lore.kernel.org/r/20210207031040.49576-16-shuo.a.liu@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/virt/acrn/Kconfig     |   1 +
 drivers/virt/acrn/Makefile    |   2 +-
 drivers/virt/acrn/acrn_drv.h  |  10 ++
 drivers/virt/acrn/hsm.c       |  11 ++
 drivers/virt/acrn/ioeventfd.c | 273 ++++++++++++++++++++++++++++++++++++++++++
 drivers/virt/acrn/vm.c        |   2 +
 include/uapi/linux/acrn.h     |  29 +++++
 7 files changed, 327 insertions(+), 1 deletion(-)
 create mode 100644 drivers/virt/acrn/ioeventfd.c

(limited to 'include/uapi/linux')

diff --git a/drivers/virt/acrn/Kconfig b/drivers/virt/acrn/Kconfig
index 36c80378c30c..3e1a61c9d8d8 100644
--- a/drivers/virt/acrn/Kconfig
+++ b/drivers/virt/acrn/Kconfig
@@ -2,6 +2,7 @@
 config ACRN_HSM
 	tristate "ACRN Hypervisor Service Module"
 	depends on ACRN_GUEST
+	select EVENTFD
 	help
 	  ACRN Hypervisor Service Module (HSM) is a kernel module which
 	  communicates with ACRN userspace through ioctls and talks to
diff --git a/drivers/virt/acrn/Makefile b/drivers/virt/acrn/Makefile
index 21721cbf6a80..755b583b32ca 100644
--- a/drivers/virt/acrn/Makefile
+++ b/drivers/virt/acrn/Makefile
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_ACRN_HSM)	:= acrn.o
-acrn-y := hsm.o vm.o mm.o ioreq.o
+acrn-y := hsm.o vm.o mm.o ioreq.o ioeventfd.o
diff --git a/drivers/virt/acrn/acrn_drv.h b/drivers/virt/acrn/acrn_drv.h
index 8a7d7721f505..e3f8190bd972 100644
--- a/drivers/virt/acrn/acrn_drv.h
+++ b/drivers/virt/acrn/acrn_drv.h
@@ -156,6 +156,9 @@ extern rwlock_t acrn_vm_list_lock;
  * @ioreq_page:			The page of the I/O request shared buffer
  * @pci_conf_addr:		Address of a PCI configuration access emulation
  * @monitor_page:		Page of interrupt statistics of User VM
+ * @ioeventfds_lock:		Lock to protect ioeventfds list
+ * @ioeventfds:			List to link all hsm_ioeventfd
+ * @ioeventfd_client:		I/O client for ioeventfds of the VM
  */
 struct acrn_vm {
 	struct list_head		list;
@@ -172,6 +175,9 @@ struct acrn_vm {
 	struct page			*ioreq_page;
 	u32				pci_conf_addr;
 	struct page			*monitor_page;
+	struct mutex			ioeventfds_lock;
+	struct list_head		ioeventfds;
+	struct acrn_ioreq_client	*ioeventfd_client;
 };
 
 struct acrn_vm *acrn_vm_create(struct acrn_vm *vm,
@@ -204,4 +210,8 @@ void acrn_ioreq_range_del(struct acrn_ioreq_client *client,
 
 int acrn_msi_inject(struct acrn_vm *vm, u64 msi_addr, u64 msi_data);
 
+int acrn_ioeventfd_init(struct acrn_vm *vm);
+int acrn_ioeventfd_config(struct acrn_vm *vm, struct acrn_ioeventfd *args);
+void acrn_ioeventfd_deinit(struct acrn_vm *vm);
+
 #endif /* __ACRN_HSM_DRV_H */
diff --git a/drivers/virt/acrn/hsm.c b/drivers/virt/acrn/hsm.c
index b7f2deddc3e7..0b4e88b0b6bc 100644
--- a/drivers/virt/acrn/hsm.c
+++ b/drivers/virt/acrn/hsm.c
@@ -111,6 +111,7 @@ static long acrn_dev_ioctl(struct file *filp, unsigned int cmd,
 	struct acrn_vcpu_regs *cpu_regs;
 	struct acrn_ioreq_notify notify;
 	struct acrn_ptdev_irq *irq_info;
+	struct acrn_ioeventfd ioeventfd;
 	struct acrn_vm_memmap memmap;
 	struct acrn_msi_entry *msi;
 	struct acrn_pcidev *pcidev;
@@ -336,6 +337,16 @@ static long acrn_dev_ioctl(struct file *filp, unsigned int cmd,
 
 		ret = pmcmd_ioctl(cstate_cmd, (void __user *)ioctl_param);
 		break;
+	case ACRN_IOCTL_IOEVENTFD:
+		if (copy_from_user(&ioeventfd, (void __user *)ioctl_param,
+				   sizeof(ioeventfd)))
+			return -EFAULT;
+
+		if (ioeventfd.reserved != 0)
+			return -EINVAL;
+
+		ret = acrn_ioeventfd_config(vm, &ioeventfd);
+		break;
 	default:
 		dev_dbg(acrn_dev.this_device, "Unknown IOCTL 0x%x!\n", cmd);
 		ret = -ENOTTY;
diff --git a/drivers/virt/acrn/ioeventfd.c b/drivers/virt/acrn/ioeventfd.c
new file mode 100644
index 000000000000..ac4037e9f947
--- /dev/null
+++ b/drivers/virt/acrn/ioeventfd.c
@@ -0,0 +1,273 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ACRN HSM eventfd - use eventfd objects to signal expected I/O requests
+ *
+ * Copyright (C) 2020 Intel Corporation. All rights reserved.
+ *
+ * Authors:
+ *	Shuo Liu <shuo.a.liu@intel.com>
+ *	Yakui Zhao <yakui.zhao@intel.com>
+ */
+
+#include <linux/eventfd.h>
+#include <linux/slab.h>
+
+#include "acrn_drv.h"
+
+/**
+ * struct hsm_ioeventfd - Properties of HSM ioeventfd
+ * @list:	Entry within &acrn_vm.ioeventfds of ioeventfds of a VM
+ * @eventfd:	Eventfd of the HSM ioeventfd
+ * @addr:	Address of I/O range
+ * @data:	Data for matching
+ * @length:	Length of I/O range
+ * @type:	Type of I/O range (ACRN_IOREQ_TYPE_MMIO/ACRN_IOREQ_TYPE_PORTIO)
+ * @wildcard:	Data matching or not
+ */
+struct hsm_ioeventfd {
+	struct list_head	list;
+	struct eventfd_ctx	*eventfd;
+	u64			addr;
+	u64			data;
+	int			length;
+	int			type;
+	bool			wildcard;
+};
+
+static inline int ioreq_type_from_flags(int flags)
+{
+	return flags & ACRN_IOEVENTFD_FLAG_PIO ?
+		       ACRN_IOREQ_TYPE_PORTIO : ACRN_IOREQ_TYPE_MMIO;
+}
+
+static void acrn_ioeventfd_shutdown(struct acrn_vm *vm, struct hsm_ioeventfd *p)
+{
+	lockdep_assert_held(&vm->ioeventfds_lock);
+
+	eventfd_ctx_put(p->eventfd);
+	list_del(&p->list);
+	kfree(p);
+}
+
+static bool hsm_ioeventfd_is_conflict(struct acrn_vm *vm,
+				      struct hsm_ioeventfd *ioeventfd)
+{
+	struct hsm_ioeventfd *p;
+
+	lockdep_assert_held(&vm->ioeventfds_lock);
+
+	/* Either one is wildcard, the data matching will be skipped. */
+	list_for_each_entry(p, &vm->ioeventfds, list)
+		if (p->eventfd == ioeventfd->eventfd &&
+		    p->addr == ioeventfd->addr &&
+		    p->type == ioeventfd->type &&
+		    (p->wildcard || ioeventfd->wildcard ||
+			p->data == ioeventfd->data))
+			return true;
+
+	return false;
+}
+
+/*
+ * Assign an eventfd to a VM and create a HSM ioeventfd associated with the
+ * eventfd. The properties of the HSM ioeventfd are built from a &struct
+ * acrn_ioeventfd.
+ */
+static int acrn_ioeventfd_assign(struct acrn_vm *vm,
+				 struct acrn_ioeventfd *args)
+{
+	struct eventfd_ctx *eventfd;
+	struct hsm_ioeventfd *p;
+	int ret;
+
+	/* Check for range overflow */
+	if (args->addr + args->len < args->addr)
+		return -EINVAL;
+
+	/*
+	 * Currently, acrn_ioeventfd is used to support vhost. 1,2,4,8 width
+	 * accesses can cover vhost's requirements.
+	 */
+	if (!(args->len == 1 || args->len == 2 ||
+	      args->len == 4 || args->len == 8))
+		return -EINVAL;
+
+	eventfd = eventfd_ctx_fdget(args->fd);
+	if (IS_ERR(eventfd))
+		return PTR_ERR(eventfd);
+
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	if (!p) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	INIT_LIST_HEAD(&p->list);
+	p->addr = args->addr;
+	p->length = args->len;
+	p->eventfd = eventfd;
+	p->type = ioreq_type_from_flags(args->flags);
+
+	/*
+	 * ACRN_IOEVENTFD_FLAG_DATAMATCH flag is set in virtio 1.0 support, the
+	 * writing of notification register of each virtqueue may trigger the
+	 * notification. There is no data matching requirement.
+	 */
+	if (args->flags & ACRN_IOEVENTFD_FLAG_DATAMATCH)
+		p->data = args->data;
+	else
+		p->wildcard = true;
+
+	mutex_lock(&vm->ioeventfds_lock);
+
+	if (hsm_ioeventfd_is_conflict(vm, p)) {
+		ret = -EEXIST;
+		goto unlock_fail;
+	}
+
+	/* register the I/O range into ioreq client */
+	ret = acrn_ioreq_range_add(vm->ioeventfd_client, p->type,
+				   p->addr, p->addr + p->length - 1);
+	if (ret < 0)
+		goto unlock_fail;
+
+	list_add_tail(&p->list, &vm->ioeventfds);
+	mutex_unlock(&vm->ioeventfds_lock);
+
+	return 0;
+
+unlock_fail:
+	mutex_unlock(&vm->ioeventfds_lock);
+	kfree(p);
+fail:
+	eventfd_ctx_put(eventfd);
+	return ret;
+}
+
+static int acrn_ioeventfd_deassign(struct acrn_vm *vm,
+				   struct acrn_ioeventfd *args)
+{
+	struct hsm_ioeventfd *p;
+	struct eventfd_ctx *eventfd;
+
+	eventfd = eventfd_ctx_fdget(args->fd);
+	if (IS_ERR(eventfd))
+		return PTR_ERR(eventfd);
+
+	mutex_lock(&vm->ioeventfds_lock);
+	list_for_each_entry(p, &vm->ioeventfds, list) {
+		if (p->eventfd != eventfd)
+			continue;
+
+		acrn_ioreq_range_del(vm->ioeventfd_client, p->type,
+				     p->addr, p->addr + p->length - 1);
+		acrn_ioeventfd_shutdown(vm, p);
+		break;
+	}
+	mutex_unlock(&vm->ioeventfds_lock);
+
+	eventfd_ctx_put(eventfd);
+	return 0;
+}
+
+static struct hsm_ioeventfd *hsm_ioeventfd_match(struct acrn_vm *vm, u64 addr,
+						 u64 data, int len, int type)
+{
+	struct hsm_ioeventfd *p = NULL;
+
+	lockdep_assert_held(&vm->ioeventfds_lock);
+
+	list_for_each_entry(p, &vm->ioeventfds, list) {
+		if (p->type == type && p->addr == addr && p->length >= len &&
+		    (p->wildcard || p->data == data))
+			return p;
+	}
+
+	return NULL;
+}
+
+static int acrn_ioeventfd_handler(struct acrn_ioreq_client *client,
+				  struct acrn_io_request *req)
+{
+	struct hsm_ioeventfd *p;
+	u64 addr, val;
+	int size;
+
+	if (req->type == ACRN_IOREQ_TYPE_MMIO) {
+		/*
+		 * I/O requests are dispatched by range check only, so a
+		 * acrn_ioreq_client need process both READ and WRITE accesses
+		 * of same range. READ accesses are safe to be ignored here
+		 * because virtio PCI devices write the notify registers for
+		 * notification.
+		 */
+		if (req->reqs.mmio_request.direction == ACRN_IOREQ_DIR_READ) {
+			/* reading does nothing and return 0 */
+			req->reqs.mmio_request.value = 0;
+			return 0;
+		}
+		addr = req->reqs.mmio_request.address;
+		size = req->reqs.mmio_request.size;
+		val = req->reqs.mmio_request.value;
+	} else {
+		if (req->reqs.pio_request.direction == ACRN_IOREQ_DIR_READ) {
+			/* reading does nothing and return 0 */
+			req->reqs.pio_request.value = 0;
+			return 0;
+		}
+		addr = req->reqs.pio_request.address;
+		size = req->reqs.pio_request.size;
+		val = req->reqs.pio_request.value;
+	}
+
+	mutex_lock(&client->vm->ioeventfds_lock);
+	p = hsm_ioeventfd_match(client->vm, addr, val, size, req->type);
+	if (p)
+		eventfd_signal(p->eventfd, 1);
+	mutex_unlock(&client->vm->ioeventfds_lock);
+
+	return 0;
+}
+
+int acrn_ioeventfd_config(struct acrn_vm *vm, struct acrn_ioeventfd *args)
+{
+	int ret;
+
+	if (args->flags & ACRN_IOEVENTFD_FLAG_DEASSIGN)
+		ret = acrn_ioeventfd_deassign(vm, args);
+	else
+		ret = acrn_ioeventfd_assign(vm, args);
+
+	return ret;
+}
+
+int acrn_ioeventfd_init(struct acrn_vm *vm)
+{
+	char name[ACRN_NAME_LEN];
+
+	mutex_init(&vm->ioeventfds_lock);
+	INIT_LIST_HEAD(&vm->ioeventfds);
+	snprintf(name, sizeof(name), "ioeventfd-%u", vm->vmid);
+	vm->ioeventfd_client = acrn_ioreq_client_create(vm,
+							acrn_ioeventfd_handler,
+							NULL, false, name);
+	if (!vm->ioeventfd_client) {
+		dev_err(acrn_dev.this_device, "Failed to create ioeventfd ioreq client!\n");
+		return -EINVAL;
+	}
+
+	dev_dbg(acrn_dev.this_device, "VM %u ioeventfd init.\n", vm->vmid);
+	return 0;
+}
+
+void acrn_ioeventfd_deinit(struct acrn_vm *vm)
+{
+	struct hsm_ioeventfd *p, *next;
+
+	dev_dbg(acrn_dev.this_device, "VM %u ioeventfd deinit.\n", vm->vmid);
+	acrn_ioreq_client_destroy(vm->ioeventfd_client);
+	mutex_lock(&vm->ioeventfds_lock);
+	list_for_each_entry_safe(p, next, &vm->ioeventfds, list)
+		acrn_ioeventfd_shutdown(vm, p);
+	mutex_unlock(&vm->ioeventfds_lock);
+}
diff --git a/drivers/virt/acrn/vm.c b/drivers/virt/acrn/vm.c
index db597f27690a..2b322d956a8c 100644
--- a/drivers/virt/acrn/vm.c
+++ b/drivers/virt/acrn/vm.c
@@ -50,6 +50,7 @@ struct acrn_vm *acrn_vm_create(struct acrn_vm *vm,
 	list_add(&vm->list, &acrn_vm_list);
 	write_unlock_bh(&acrn_vm_list_lock);
 
+	acrn_ioeventfd_init(vm);
 	dev_dbg(acrn_dev.this_device, "VM %u created.\n", vm->vmid);
 	return vm;
 }
@@ -67,6 +68,7 @@ int acrn_vm_destroy(struct acrn_vm *vm)
 	list_del_init(&vm->list);
 	write_unlock_bh(&acrn_vm_list_lock);
 
+	acrn_ioeventfd_deinit(vm);
 	acrn_ioreq_deinit(vm);
 	if (vm->monitor_page) {
 		put_page(vm->monitor_page);
diff --git a/include/uapi/linux/acrn.h b/include/uapi/linux/acrn.h
index e997d80a0cc7..132c5ff95967 100644
--- a/include/uapi/linux/acrn.h
+++ b/include/uapi/linux/acrn.h
@@ -479,6 +479,32 @@ enum acrn_pm_cmd_type {
 	ACRN_PMCMD_GET_CX_DATA,
 };
 
+#define ACRN_IOEVENTFD_FLAG_PIO		0x01
+#define ACRN_IOEVENTFD_FLAG_DATAMATCH	0x02
+#define ACRN_IOEVENTFD_FLAG_DEASSIGN	0x04
+/**
+ * struct acrn_ioeventfd - Data to operate a &struct hsm_ioeventfd
+ * @fd:		The fd of eventfd associated with a hsm_ioeventfd
+ * @flags:	Logical-OR of ACRN_IOEVENTFD_FLAG_*
+ * @addr:	The start address of IO range of ioeventfd
+ * @len:	The length of IO range of ioeventfd
+ * @reserved:	Reserved and should be 0
+ * @data:	Data for data matching
+ *
+ * Without flag ACRN_IOEVENTFD_FLAG_DEASSIGN, ioctl ACRN_IOCTL_IOEVENTFD
+ * creates a &struct hsm_ioeventfd with properties originated from &struct
+ * acrn_ioeventfd. With flag ACRN_IOEVENTFD_FLAG_DEASSIGN, ioctl
+ * ACRN_IOCTL_IOEVENTFD destroys the &struct hsm_ioeventfd matching the fd.
+ */
+struct acrn_ioeventfd {
+	__u32	fd;
+	__u32	flags;
+	__u64	addr;
+	__u32	len;
+	__u32	reserved;
+	__u64	data;
+};
+
 /* The ioctl type, documented in ioctl-number.rst */
 #define ACRN_IOCTL_TYPE			0xA2
 
@@ -533,4 +559,7 @@ enum acrn_pm_cmd_type {
 #define ACRN_IOCTL_PM_GET_CPU_STATE	\
 	_IOWR(ACRN_IOCTL_TYPE, 0x60, __u64)
 
+#define ACRN_IOCTL_IOEVENTFD		\
+	_IOW(ACRN_IOCTL_TYPE, 0x70, struct acrn_ioeventfd)
+
 #endif /* _UAPI_ACRN_H */
-- 
cgit v1.2.3-71-gd317


From aa3b483ff1d71c50b33db154048dff9a8f08ac71 Mon Sep 17 00:00:00 2001
From: Shuo Liu <shuo.a.liu@intel.com>
Date: Sun, 7 Feb 2021 11:10:38 +0800
Subject: virt: acrn: Introduce irqfd

irqfd is a mechanism to inject a specific interrupt to a User VM using a
decoupled eventfd mechanism.

Vhost is a kernel-level virtio server which uses eventfd for interrupt
injection. To support vhost on ACRN, irqfd is introduced in HSM.

HSM provides ioctls to associate a virtual Message Signaled Interrupt
(MSI) with an eventfd. The corresponding virtual MSI will be injected
into a User VM once the eventfd got signal.

Cc: Zhi Wang <zhi.a.wang@intel.com>
Cc: Zhenyu Wang <zhenyuw@linux.intel.com>
Cc: Yu Wang <yu1.wang@intel.com>
Cc: Reinette Chatre <reinette.chatre@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Zhi Wang <zhi.a.wang@intel.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Shuo Liu <shuo.a.liu@intel.com>
Link: https://lore.kernel.org/r/20210207031040.49576-17-shuo.a.liu@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/virt/acrn/Makefile   |   2 +-
 drivers/virt/acrn/acrn_drv.h |  10 ++
 drivers/virt/acrn/hsm.c      |   7 ++
 drivers/virt/acrn/irqfd.c    | 235 +++++++++++++++++++++++++++++++++++++++++++
 drivers/virt/acrn/vm.c       |   3 +
 include/uapi/linux/acrn.h    |  15 +++
 6 files changed, 271 insertions(+), 1 deletion(-)
 create mode 100644 drivers/virt/acrn/irqfd.c

(limited to 'include/uapi/linux')

diff --git a/drivers/virt/acrn/Makefile b/drivers/virt/acrn/Makefile
index 755b583b32ca..08ce641dcfa1 100644
--- a/drivers/virt/acrn/Makefile
+++ b/drivers/virt/acrn/Makefile
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_ACRN_HSM)	:= acrn.o
-acrn-y := hsm.o vm.o mm.o ioreq.o ioeventfd.o
+acrn-y := hsm.o vm.o mm.o ioreq.o ioeventfd.o irqfd.o
diff --git a/drivers/virt/acrn/acrn_drv.h b/drivers/virt/acrn/acrn_drv.h
index e3f8190bd972..1be54efa666c 100644
--- a/drivers/virt/acrn/acrn_drv.h
+++ b/drivers/virt/acrn/acrn_drv.h
@@ -159,6 +159,9 @@ extern rwlock_t acrn_vm_list_lock;
  * @ioeventfds_lock:		Lock to protect ioeventfds list
  * @ioeventfds:			List to link all hsm_ioeventfd
  * @ioeventfd_client:		I/O client for ioeventfds of the VM
+ * @irqfds_lock:		Lock to protect irqfds list
+ * @irqfds:			List to link all hsm_irqfd
+ * @irqfd_wq:			Workqueue for irqfd async shutdown
  */
 struct acrn_vm {
 	struct list_head		list;
@@ -178,6 +181,9 @@ struct acrn_vm {
 	struct mutex			ioeventfds_lock;
 	struct list_head		ioeventfds;
 	struct acrn_ioreq_client	*ioeventfd_client;
+	struct mutex			irqfds_lock;
+	struct list_head		irqfds;
+	struct workqueue_struct		*irqfd_wq;
 };
 
 struct acrn_vm *acrn_vm_create(struct acrn_vm *vm,
@@ -214,4 +220,8 @@ int acrn_ioeventfd_init(struct acrn_vm *vm);
 int acrn_ioeventfd_config(struct acrn_vm *vm, struct acrn_ioeventfd *args);
 void acrn_ioeventfd_deinit(struct acrn_vm *vm);
 
+int acrn_irqfd_init(struct acrn_vm *vm);
+int acrn_irqfd_config(struct acrn_vm *vm, struct acrn_irqfd *args);
+void acrn_irqfd_deinit(struct acrn_vm *vm);
+
 #endif /* __ACRN_HSM_DRV_H */
diff --git a/drivers/virt/acrn/hsm.c b/drivers/virt/acrn/hsm.c
index 0b4e88b0b6bc..93f6abe10fa6 100644
--- a/drivers/virt/acrn/hsm.c
+++ b/drivers/virt/acrn/hsm.c
@@ -115,6 +115,7 @@ static long acrn_dev_ioctl(struct file *filp, unsigned int cmd,
 	struct acrn_vm_memmap memmap;
 	struct acrn_msi_entry *msi;
 	struct acrn_pcidev *pcidev;
+	struct acrn_irqfd irqfd;
 	struct page *page;
 	u64 cstate_cmd;
 	int i, ret = 0;
@@ -347,6 +348,12 @@ static long acrn_dev_ioctl(struct file *filp, unsigned int cmd,
 
 		ret = acrn_ioeventfd_config(vm, &ioeventfd);
 		break;
+	case ACRN_IOCTL_IRQFD:
+		if (copy_from_user(&irqfd, (void __user *)ioctl_param,
+				   sizeof(irqfd)))
+			return -EFAULT;
+		ret = acrn_irqfd_config(vm, &irqfd);
+		break;
 	default:
 		dev_dbg(acrn_dev.this_device, "Unknown IOCTL 0x%x!\n", cmd);
 		ret = -ENOTTY;
diff --git a/drivers/virt/acrn/irqfd.c b/drivers/virt/acrn/irqfd.c
new file mode 100644
index 000000000000..a8766d528e29
--- /dev/null
+++ b/drivers/virt/acrn/irqfd.c
@@ -0,0 +1,235 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ACRN HSM irqfd: use eventfd objects to inject virtual interrupts
+ *
+ * Copyright (C) 2020 Intel Corporation. All rights reserved.
+ *
+ * Authors:
+ *	Shuo Liu <shuo.a.liu@intel.com>
+ *	Yakui Zhao <yakui.zhao@intel.com>
+ */
+
+#include <linux/eventfd.h>
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+
+#include "acrn_drv.h"
+
+static LIST_HEAD(acrn_irqfd_clients);
+static DEFINE_MUTEX(acrn_irqfds_mutex);
+
+/**
+ * struct hsm_irqfd - Properties of HSM irqfd
+ * @vm:		Associated VM pointer
+ * @wait:	Entry of wait-queue
+ * @shutdown:	Async shutdown work
+ * @eventfd:	Associated eventfd
+ * @list:	Entry within &acrn_vm.irqfds of irqfds of a VM
+ * @pt:		Structure for select/poll on the associated eventfd
+ * @msi:	MSI data
+ */
+struct hsm_irqfd {
+	struct acrn_vm		*vm;
+	wait_queue_entry_t	wait;
+	struct work_struct	shutdown;
+	struct eventfd_ctx	*eventfd;
+	struct list_head	list;
+	poll_table		pt;
+	struct acrn_msi_entry	msi;
+};
+
+static void acrn_irqfd_inject(struct hsm_irqfd *irqfd)
+{
+	struct acrn_vm *vm = irqfd->vm;
+
+	acrn_msi_inject(vm, irqfd->msi.msi_addr,
+			irqfd->msi.msi_data);
+}
+
+static void hsm_irqfd_shutdown(struct hsm_irqfd *irqfd)
+{
+	u64 cnt;
+
+	lockdep_assert_held(&irqfd->vm->irqfds_lock);
+
+	/* remove from wait queue */
+	list_del_init(&irqfd->list);
+	eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);
+	eventfd_ctx_put(irqfd->eventfd);
+	kfree(irqfd);
+}
+
+static void hsm_irqfd_shutdown_work(struct work_struct *work)
+{
+	struct hsm_irqfd *irqfd;
+	struct acrn_vm *vm;
+
+	irqfd = container_of(work, struct hsm_irqfd, shutdown);
+	vm = irqfd->vm;
+	mutex_lock(&vm->irqfds_lock);
+	if (!list_empty(&irqfd->list))
+		hsm_irqfd_shutdown(irqfd);
+	mutex_unlock(&vm->irqfds_lock);
+}
+
+/* Called with wqh->lock held and interrupts disabled */
+static int hsm_irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode,
+			    int sync, void *key)
+{
+	unsigned long poll_bits = (unsigned long)key;
+	struct hsm_irqfd *irqfd;
+	struct acrn_vm *vm;
+
+	irqfd = container_of(wait, struct hsm_irqfd, wait);
+	vm = irqfd->vm;
+	if (poll_bits & POLLIN)
+		/* An event has been signaled, inject an interrupt */
+		acrn_irqfd_inject(irqfd);
+
+	if (poll_bits & POLLHUP)
+		/* Do shutdown work in thread to hold wqh->lock */
+		queue_work(vm->irqfd_wq, &irqfd->shutdown);
+
+	return 0;
+}
+
+static void hsm_irqfd_poll_func(struct file *file, wait_queue_head_t *wqh,
+				poll_table *pt)
+{
+	struct hsm_irqfd *irqfd;
+
+	irqfd = container_of(pt, struct hsm_irqfd, pt);
+	add_wait_queue(wqh, &irqfd->wait);
+}
+
+/*
+ * Assign an eventfd to a VM and create a HSM irqfd associated with the
+ * eventfd. The properties of the HSM irqfd are built from a &struct
+ * acrn_irqfd.
+ */
+static int acrn_irqfd_assign(struct acrn_vm *vm, struct acrn_irqfd *args)
+{
+	struct eventfd_ctx *eventfd = NULL;
+	struct hsm_irqfd *irqfd, *tmp;
+	unsigned int events;
+	struct fd f;
+	int ret = 0;
+
+	irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
+	if (!irqfd)
+		return -ENOMEM;
+
+	irqfd->vm = vm;
+	memcpy(&irqfd->msi, &args->msi, sizeof(args->msi));
+	INIT_LIST_HEAD(&irqfd->list);
+	INIT_WORK(&irqfd->shutdown, hsm_irqfd_shutdown_work);
+
+	f = fdget(args->fd);
+	if (!f.file) {
+		ret = -EBADF;
+		goto out;
+	}
+
+	eventfd = eventfd_ctx_fileget(f.file);
+	if (IS_ERR(eventfd)) {
+		ret = PTR_ERR(eventfd);
+		goto fail;
+	}
+
+	irqfd->eventfd = eventfd;
+
+	/*
+	 * Install custom wake-up handling to be notified whenever underlying
+	 * eventfd is signaled.
+	 */
+	init_waitqueue_func_entry(&irqfd->wait, hsm_irqfd_wakeup);
+	init_poll_funcptr(&irqfd->pt, hsm_irqfd_poll_func);
+
+	mutex_lock(&vm->irqfds_lock);
+	list_for_each_entry(tmp, &vm->irqfds, list) {
+		if (irqfd->eventfd != tmp->eventfd)
+			continue;
+		ret = -EBUSY;
+		mutex_unlock(&vm->irqfds_lock);
+		goto fail;
+	}
+	list_add_tail(&irqfd->list, &vm->irqfds);
+	mutex_unlock(&vm->irqfds_lock);
+
+	/* Check the pending event in this stage */
+	events = f.file->f_op->poll(f.file, &irqfd->pt);
+
+	if (events & POLLIN)
+		acrn_irqfd_inject(irqfd);
+
+	fdput(f);
+	return 0;
+fail:
+	if (eventfd && !IS_ERR(eventfd))
+		eventfd_ctx_put(eventfd);
+
+	fdput(f);
+out:
+	kfree(irqfd);
+	return ret;
+}
+
+static int acrn_irqfd_deassign(struct acrn_vm *vm,
+			       struct acrn_irqfd *args)
+{
+	struct hsm_irqfd *irqfd, *tmp;
+	struct eventfd_ctx *eventfd;
+
+	eventfd = eventfd_ctx_fdget(args->fd);
+	if (IS_ERR(eventfd))
+		return PTR_ERR(eventfd);
+
+	mutex_lock(&vm->irqfds_lock);
+	list_for_each_entry_safe(irqfd, tmp, &vm->irqfds, list) {
+		if (irqfd->eventfd == eventfd) {
+			hsm_irqfd_shutdown(irqfd);
+			break;
+		}
+	}
+	mutex_unlock(&vm->irqfds_lock);
+	eventfd_ctx_put(eventfd);
+
+	return 0;
+}
+
+int acrn_irqfd_config(struct acrn_vm *vm, struct acrn_irqfd *args)
+{
+	int ret;
+
+	if (args->flags & ACRN_IRQFD_FLAG_DEASSIGN)
+		ret = acrn_irqfd_deassign(vm, args);
+	else
+		ret = acrn_irqfd_assign(vm, args);
+
+	return ret;
+}
+
+int acrn_irqfd_init(struct acrn_vm *vm)
+{
+	INIT_LIST_HEAD(&vm->irqfds);
+	mutex_init(&vm->irqfds_lock);
+	vm->irqfd_wq = alloc_workqueue("acrn_irqfd-%u", 0, 0, vm->vmid);
+	if (!vm->irqfd_wq)
+		return -ENOMEM;
+
+	dev_dbg(acrn_dev.this_device, "VM %u irqfd init.\n", vm->vmid);
+	return 0;
+}
+
+void acrn_irqfd_deinit(struct acrn_vm *vm)
+{
+	struct hsm_irqfd *irqfd, *next;
+
+	dev_dbg(acrn_dev.this_device, "VM %u irqfd deinit.\n", vm->vmid);
+	destroy_workqueue(vm->irqfd_wq);
+	mutex_lock(&vm->irqfds_lock);
+	list_for_each_entry_safe(irqfd, next, &vm->irqfds, list)
+		hsm_irqfd_shutdown(irqfd);
+	mutex_unlock(&vm->irqfds_lock);
+}
diff --git a/drivers/virt/acrn/vm.c b/drivers/virt/acrn/vm.c
index 2b322d956a8c..7804a2492ad7 100644
--- a/drivers/virt/acrn/vm.c
+++ b/drivers/virt/acrn/vm.c
@@ -51,6 +51,7 @@ struct acrn_vm *acrn_vm_create(struct acrn_vm *vm,
 	write_unlock_bh(&acrn_vm_list_lock);
 
 	acrn_ioeventfd_init(vm);
+	acrn_irqfd_init(vm);
 	dev_dbg(acrn_dev.this_device, "VM %u created.\n", vm->vmid);
 	return vm;
 }
@@ -69,7 +70,9 @@ int acrn_vm_destroy(struct acrn_vm *vm)
 	write_unlock_bh(&acrn_vm_list_lock);
 
 	acrn_ioeventfd_deinit(vm);
+	acrn_irqfd_deinit(vm);
 	acrn_ioreq_deinit(vm);
+
 	if (vm->monitor_page) {
 		put_page(vm->monitor_page);
 		vm->monitor_page = NULL;
diff --git a/include/uapi/linux/acrn.h b/include/uapi/linux/acrn.h
index 132c5ff95967..353b2a2e4536 100644
--- a/include/uapi/linux/acrn.h
+++ b/include/uapi/linux/acrn.h
@@ -505,6 +505,19 @@ struct acrn_ioeventfd {
 	__u64	data;
 };
 
+#define ACRN_IRQFD_FLAG_DEASSIGN	0x01
+/**
+ * struct acrn_irqfd - Data to operate a &struct hsm_irqfd
+ * @fd:		The fd of eventfd associated with a hsm_irqfd
+ * @flags:	Logical-OR of ACRN_IRQFD_FLAG_*
+ * @msi:	Info of MSI associated with the irqfd
+ */
+struct acrn_irqfd {
+	__s32			fd;
+	__u32			flags;
+	struct acrn_msi_entry	msi;
+};
+
 /* The ioctl type, documented in ioctl-number.rst */
 #define ACRN_IOCTL_TYPE			0xA2
 
@@ -561,5 +574,7 @@ struct acrn_ioeventfd {
 
 #define ACRN_IOCTL_IOEVENTFD		\
 	_IOW(ACRN_IOCTL_TYPE, 0x70, struct acrn_ioeventfd)
+#define ACRN_IOCTL_IRQFD		\
+	_IOW(ACRN_IOCTL_TYPE, 0x71, struct acrn_irqfd)
 
 #endif /* _UAPI_ACRN_H */
-- 
cgit v1.2.3-71-gd317


From 0566752c3e8681ec47fee37374cb38081d801e95 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Sun, 7 Feb 2021 14:05:43 +0100
Subject: uapi: map_to_7segment: Update example in documentation

The device_attribute .show() and .store() methods gained an extra
parameter in v2.6.13, but the example in the documentation for the
7-segment header file was never updated.  Add the missing parameters.

While at it, get rid of the (misspelled) deprecated symbolic
permissions, and switch to DEVICE_ATTR_RW(), which was introduced in
v3.11

Fixes: 54b6f35c99974e99 ("[PATCH] Driver core: change device_attribute callbacks")
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Link: https://lore.kernel.org/r/20210207130543.2128980-1-geert@linux-m68k.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/linux/map_to_7segment.h | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/map_to_7segment.h b/include/uapi/linux/map_to_7segment.h
index 13a06e5e966e..8b02088f96e3 100644
--- a/include/uapi/linux/map_to_7segment.h
+++ b/include/uapi/linux/map_to_7segment.h
@@ -45,17 +45,22 @@
  * In device drivers it is recommended, if required, to make the char map
  * accessible via the sysfs interface using the following scheme:
  *
- * static ssize_t show_map(struct device *dev, char *buf) {
+ * static ssize_t map_seg7_show(struct device *dev,
+ *				struct device_attribute *attr, char *buf)
+ * {
  *	memcpy(buf, &map_seg7, sizeof(map_seg7));
  *	return sizeof(map_seg7);
  * }
- * static ssize_t store_map(struct device *dev, const char *buf, size_t cnt) {
+ * static ssize_t map_seg7_store(struct device *dev,
+ *				 struct device_attribute *attr, const char *buf,
+ *				 size_t cnt)
+ * {
  *	if(cnt != sizeof(map_seg7))
  *		return -EINVAL;
  *	memcpy(&map_seg7, buf, cnt);
  *	return cnt;
  * }
- * static DEVICE_ATTR(map_seg7, PERMS_RW, show_map, store_map);
+ * static DEVICE_ATTR_RW(map_seg7);
  *
  * History:
  * 2005-05-31	RFC linux-kernel@vger.kernel.org
-- 
cgit v1.2.3-71-gd317


From 583fa5e71caeb79e04e477e9837e2f7fa53b71e4 Mon Sep 17 00:00:00 2001
From: Ben Widawsky <ben.widawsky@intel.com>
Date: Tue, 16 Feb 2021 20:09:53 -0800
Subject: cxl/mem: Add basic IOCTL interface

Add a straightforward IOCTL that provides a mechanism for userspace to
query the supported memory device commands. CXL commands as they appear
to userspace are described as part of the UAPI kerneldoc. The command
list returned via this IOCTL will contain the full set of commands that
the driver supports, however, some of those commands may not be
available for use by userspace.

Memory device commands first appear in the CXL 2.0 specification. They
are submitted through a mailbox mechanism specified in the CXL 2.0
specification.

The send command allows userspace to issue mailbox commands directly to
the hardware. The list of available commands to send are the output of
the query command. The driver verifies basic properties of the command
and possibly inspect the input (or output) payload to determine whether
or not the command is allowed (or might taint the kernel).

Reported-by: kernel test robot <lkp@intel.com> # bug in earlier revision
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Ben Widawsky <ben.widawsky@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com> (v2)
Cc: Al Viro <viro@zeniv.linux.org.uk>
Link: https://lore.kernel.org/r/20210217040958.1354670-5-ben.widawsky@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 .clang-format                                      |   1 +
 Documentation/driver-api/cxl/memory-devices.rst    |  12 +
 Documentation/userspace-api/ioctl/ioctl-number.rst |   1 +
 drivers/cxl/mem.c                                  | 283 ++++++++++++++++++++-
 include/uapi/linux/cxl_mem.h                       | 156 ++++++++++++
 5 files changed, 452 insertions(+), 1 deletion(-)
 create mode 100644 include/uapi/linux/cxl_mem.h

(limited to 'include/uapi/linux')

diff --git a/.clang-format b/.clang-format
index 10dc5a9a61b3..3f11c8901b43 100644
--- a/.clang-format
+++ b/.clang-format
@@ -109,6 +109,7 @@ ForEachMacros:
   - 'css_for_each_child'
   - 'css_for_each_descendant_post'
   - 'css_for_each_descendant_pre'
+  - 'cxl_for_each_cmd'
   - 'device_for_each_child_node'
   - 'dma_fence_chain_for_each'
   - 'do_for_each_ftrace_op'
diff --git a/Documentation/driver-api/cxl/memory-devices.rst b/Documentation/driver-api/cxl/memory-devices.rst
index 1fef2c0a167d..1bad466f9167 100644
--- a/Documentation/driver-api/cxl/memory-devices.rst
+++ b/Documentation/driver-api/cxl/memory-devices.rst
@@ -32,3 +32,15 @@ CXL Bus
 -------
 .. kernel-doc:: drivers/cxl/bus.c
    :doc: cxl bus
+
+External Interfaces
+===================
+
+CXL IOCTL Interface
+-------------------
+
+.. kernel-doc:: include/uapi/linux/cxl_mem.h
+   :doc: UAPI
+
+.. kernel-doc:: include/uapi/linux/cxl_mem.h
+   :internal:
diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index a4c75a28c839..6eb8e634664d 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -352,6 +352,7 @@ Code  Seq#    Include File                                           Comments
                                                                      <mailto:michael.klein@puffin.lb.shuttle.de>
 0xCC  00-0F  drivers/misc/ibmvmc.h                                   pseries VMC driver
 0xCD  01     linux/reiserfs_fs.h
+0xCE  01-02  uapi/linux/cxl_mem.h                                    Compute Express Link Memory Devices
 0xCF  02     fs/cifs/ioctl.c
 0xDB  00-0F  drivers/char/mwave/mwavepub.h
 0xDD  00-3F                                                          ZFCP device driver see drivers/s390/scsi/
diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
index 1c0195b07063..aa8f843fcca1 100644
--- a/drivers/cxl/mem.c
+++ b/drivers/cxl/mem.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /* Copyright(c) 2020 Intel Corporation. All rights reserved. */
+#include <uapi/linux/cxl_mem.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/cdev.h>
@@ -40,6 +41,7 @@
 #define CXL_MAILBOX_TIMEOUT_MS (2 * HZ)
 
 enum opcode {
+	CXL_MBOX_OP_INVALID		= 0x0000,
 	CXL_MBOX_OP_IDENTIFY		= 0x4000,
 	CXL_MBOX_OP_MAX			= 0x10000
 };
@@ -95,6 +97,49 @@ struct cxl_memdev {
 static int cxl_mem_major;
 static DEFINE_IDA(cxl_memdev_ida);
 
+/**
+ * struct cxl_mem_command - Driver representation of a memory device command
+ * @info: Command information as it exists for the UAPI
+ * @opcode: The actual bits used for the mailbox protocol
+ *
+ * The cxl_mem_command is the driver's internal representation of commands that
+ * are supported by the driver. Some of these commands may not be supported by
+ * the hardware. The driver will use @info to validate the fields passed in by
+ * the user then submit the @opcode to the hardware.
+ *
+ * See struct cxl_command_info.
+ */
+struct cxl_mem_command {
+	struct cxl_command_info info;
+	enum opcode opcode;
+};
+
+#define CXL_CMD(_id, sin, sout)                                                \
+	[CXL_MEM_COMMAND_ID_##_id] = {                                         \
+	.info =	{                                                              \
+			.id = CXL_MEM_COMMAND_ID_##_id,                        \
+			.size_in = sin,                                        \
+			.size_out = sout,                                      \
+		},                                                             \
+	.opcode = CXL_MBOX_OP_##_id,                                           \
+	}
+
+/*
+ * This table defines the supported mailbox commands for the driver. This table
+ * is made up of a UAPI structure. Non-negative values as parameters in the
+ * table will be validated against the user's input. For example, if size_in is
+ * 0, and the user passed in 1, it is an error.
+ */
+static struct cxl_mem_command mem_commands[] = {
+	CXL_CMD(IDENTIFY, 0, 0x43),
+};
+
+#define cxl_for_each_cmd(cmd)                                                  \
+	for ((cmd) = &mem_commands[0];                                         \
+	     ((cmd) - mem_commands) < ARRAY_SIZE(mem_commands); (cmd)++)
+
+#define cxl_cmd_count ARRAY_SIZE(mem_commands)
+
 static int cxl_mem_wait_for_doorbell(struct cxl_mem *cxlm)
 {
 	const unsigned long start = jiffies;
@@ -324,6 +369,242 @@ static void cxl_mem_mbox_put(struct cxl_mem *cxlm)
 	mutex_unlock(&cxlm->mbox_mutex);
 }
 
+/**
+ * handle_mailbox_cmd_from_user() - Dispatch a mailbox command for userspace.
+ * @cxlm: The CXL memory device to communicate with.
+ * @cmd: The validated command.
+ * @in_payload: Pointer to userspace's input payload.
+ * @out_payload: Pointer to userspace's output payload.
+ * @size_out: (Input) Max payload size to copy out.
+ *            (Output) Payload size hardware generated.
+ * @retval: Hardware generated return code from the operation.
+ *
+ * Return:
+ *  * %0	- Mailbox transaction succeeded. This implies the mailbox
+ *		  protocol completed successfully not that the operation itself
+ *		  was successful.
+ *  * %-ENOMEM  - Couldn't allocate a bounce buffer.
+ *  * %-EFAULT	- Something happened with copy_to/from_user.
+ *  * %-EINTR	- Mailbox acquisition interrupted.
+ *  * %-EXXX	- Transaction level failures.
+ *
+ * Creates the appropriate mailbox command and dispatches it on behalf of a
+ * userspace request. The input and output payloads are copied between
+ * userspace.
+ *
+ * See cxl_send_cmd().
+ */
+static int handle_mailbox_cmd_from_user(struct cxl_mem *cxlm,
+					const struct cxl_mem_command *cmd,
+					u64 in_payload, u64 out_payload,
+					s32 *size_out, u32 *retval)
+{
+	struct device *dev = &cxlm->pdev->dev;
+	struct mbox_cmd mbox_cmd = {
+		.opcode = cmd->opcode,
+		.size_in = cmd->info.size_in,
+		.size_out = cmd->info.size_out,
+	};
+	int rc;
+
+	if (cmd->info.size_out) {
+		mbox_cmd.payload_out = kvzalloc(cmd->info.size_out, GFP_KERNEL);
+		if (!mbox_cmd.payload_out)
+			return -ENOMEM;
+	}
+
+	if (cmd->info.size_in) {
+		mbox_cmd.payload_in = vmemdup_user(u64_to_user_ptr(in_payload),
+						   cmd->info.size_in);
+		if (IS_ERR(mbox_cmd.payload_in))
+			return PTR_ERR(mbox_cmd.payload_in);
+	}
+
+	rc = cxl_mem_mbox_get(cxlm);
+	if (rc)
+		goto out;
+
+	dev_dbg(dev,
+		"Submitting %s command for user\n"
+		"\topcode: %x\n"
+		"\tsize: %ub\n",
+		cxl_command_names[cmd->info.id].name, mbox_cmd.opcode,
+		cmd->info.size_in);
+
+	rc = __cxl_mem_mbox_send_cmd(cxlm, &mbox_cmd);
+	cxl_mem_mbox_put(cxlm);
+	if (rc)
+		goto out;
+
+	/*
+	 * @size_out contains the max size that's allowed to be written back out
+	 * to userspace. While the payload may have written more output than
+	 * this it will have to be ignored.
+	 */
+	if (mbox_cmd.size_out) {
+		dev_WARN_ONCE(dev, mbox_cmd.size_out > *size_out,
+			      "Invalid return size\n");
+		if (copy_to_user(u64_to_user_ptr(out_payload),
+				 mbox_cmd.payload_out, mbox_cmd.size_out)) {
+			rc = -EFAULT;
+			goto out;
+		}
+	}
+
+	*size_out = mbox_cmd.size_out;
+	*retval = mbox_cmd.return_code;
+
+out:
+	kvfree(mbox_cmd.payload_in);
+	kvfree(mbox_cmd.payload_out);
+	return rc;
+}
+
+/**
+ * cxl_validate_cmd_from_user() - Check fields for CXL_MEM_SEND_COMMAND.
+ * @cxlm: &struct cxl_mem device whose mailbox will be used.
+ * @send_cmd: &struct cxl_send_command copied in from userspace.
+ * @out_cmd: Sanitized and populated &struct cxl_mem_command.
+ *
+ * Return:
+ *  * %0	- @out_cmd is ready to send.
+ *  * %-ENOTTY	- Invalid command specified.
+ *  * %-EINVAL	- Reserved fields or invalid values were used.
+ *  * %-ENOMEM	- Input or output buffer wasn't sized properly.
+ *
+ * The result of this command is a fully validated command in @out_cmd that is
+ * safe to send to the hardware.
+ *
+ * See handle_mailbox_cmd_from_user()
+ */
+static int cxl_validate_cmd_from_user(struct cxl_mem *cxlm,
+				      const struct cxl_send_command *send_cmd,
+				      struct cxl_mem_command *out_cmd)
+{
+	const struct cxl_command_info *info;
+	struct cxl_mem_command *c;
+
+	if (send_cmd->id == 0 || send_cmd->id >= CXL_MEM_COMMAND_ID_MAX)
+		return -ENOTTY;
+
+	/*
+	 * The user can never specify an input payload larger than what hardware
+	 * supports, but output can be arbitrarily large (simply write out as
+	 * much data as the hardware provides).
+	 */
+	if (send_cmd->in.size > cxlm->payload_size)
+		return -EINVAL;
+
+	if (send_cmd->flags & ~CXL_MEM_COMMAND_FLAG_MASK)
+		return -EINVAL;
+
+	if (send_cmd->rsvd)
+		return -EINVAL;
+
+	if (send_cmd->in.rsvd || send_cmd->out.rsvd)
+		return -EINVAL;
+
+	/* Convert user's command into the internal representation */
+	c = &mem_commands[send_cmd->id];
+	info = &c->info;
+
+	/* Check the input buffer is the expected size */
+	if (info->size_in >= 0 && info->size_in != send_cmd->in.size)
+		return -ENOMEM;
+
+	/* Check the output buffer is at least large enough */
+	if (info->size_out >= 0 && send_cmd->out.size < info->size_out)
+		return -ENOMEM;
+
+	memcpy(out_cmd, c, sizeof(*c));
+	out_cmd->info.size_in = send_cmd->in.size;
+	/*
+	 * XXX: out_cmd->info.size_out will be controlled by the driver, and the
+	 * specified number of bytes @send_cmd->out.size will be copied back out
+	 * to userspace.
+	 */
+
+	return 0;
+}
+
+static int cxl_query_cmd(struct cxl_memdev *cxlmd,
+			 struct cxl_mem_query_commands __user *q)
+{
+	struct device *dev = &cxlmd->dev;
+	struct cxl_mem_command *cmd;
+	u32 n_commands;
+	int j = 0;
+
+	dev_dbg(dev, "Query IOCTL\n");
+
+	if (get_user(n_commands, &q->n_commands))
+		return -EFAULT;
+
+	/* returns the total number if 0 elements are requested. */
+	if (n_commands == 0)
+		return put_user(cxl_cmd_count, &q->n_commands);
+
+	/*
+	 * otherwise, return max(n_commands, total commands) cxl_command_info
+	 * structures.
+	 */
+	cxl_for_each_cmd(cmd) {
+		const struct cxl_command_info *info = &cmd->info;
+
+		if (copy_to_user(&q->commands[j++], info, sizeof(*info)))
+			return -EFAULT;
+
+		if (j == n_commands)
+			break;
+	}
+
+	return 0;
+}
+
+static int cxl_send_cmd(struct cxl_memdev *cxlmd,
+			struct cxl_send_command __user *s)
+{
+	struct cxl_mem *cxlm = cxlmd->cxlm;
+	struct device *dev = &cxlmd->dev;
+	struct cxl_send_command send;
+	struct cxl_mem_command c;
+	int rc;
+
+	dev_dbg(dev, "Send IOCTL\n");
+
+	if (copy_from_user(&send, s, sizeof(send)))
+		return -EFAULT;
+
+	rc = cxl_validate_cmd_from_user(cxlmd->cxlm, &send, &c);
+	if (rc)
+		return rc;
+
+	/* Prepare to handle a full payload for variable sized output */
+	if (c.info.size_out < 0)
+		c.info.size_out = cxlm->payload_size;
+
+	rc = handle_mailbox_cmd_from_user(cxlm, &c, send.in.payload,
+					  send.out.payload, &send.out.size,
+					  &send.retval);
+	if (rc)
+		return rc;
+
+	return copy_to_user(s, &send, sizeof(send));
+}
+
+static long __cxl_memdev_ioctl(struct cxl_memdev *cxlmd, unsigned int cmd,
+			       unsigned long arg)
+{
+	switch (cmd) {
+	case CXL_MEM_QUERY_COMMANDS:
+		return cxl_query_cmd(cxlmd, (void __user *)arg);
+	case CXL_MEM_SEND_COMMAND:
+		return cxl_send_cmd(cxlmd, (void __user *)arg);
+	default:
+		return -ENOTTY;
+	}
+}
+
 static long cxl_memdev_ioctl(struct file *file, unsigned int cmd,
 			     unsigned long arg)
 {
@@ -337,7 +618,7 @@ static long cxl_memdev_ioctl(struct file *file, unsigned int cmd,
 	if (!percpu_ref_tryget_live(&cxlmd->ops_active))
 		return -ENXIO;
 
-	/* TODO: ioctl body */
+	rc = __cxl_memdev_ioctl(cxlmd, cmd, arg);
 
 	percpu_ref_put(&cxlmd->ops_active);
 
diff --git a/include/uapi/linux/cxl_mem.h b/include/uapi/linux/cxl_mem.h
new file mode 100644
index 000000000000..887781dc3b6c
--- /dev/null
+++ b/include/uapi/linux/cxl_mem.h
@@ -0,0 +1,156 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * CXL IOCTLs for Memory Devices
+ */
+
+#ifndef _UAPI_CXL_MEM_H_
+#define _UAPI_CXL_MEM_H_
+
+#include <linux/types.h>
+
+/**
+ * DOC: UAPI
+ *
+ * Not all of all commands that the driver supports are always available for use
+ * by userspace. Userspace must check the results from the QUERY command in
+ * order to determine the live set of commands.
+ */
+
+#define CXL_MEM_QUERY_COMMANDS _IOR(0xCE, 1, struct cxl_mem_query_commands)
+#define CXL_MEM_SEND_COMMAND _IOWR(0xCE, 2, struct cxl_send_command)
+
+#define CXL_CMDS                                                          \
+	___C(INVALID, "Invalid Command"),                                 \
+	___C(IDENTIFY, "Identify Command"),                               \
+	___C(MAX, "invalid / last command")
+
+#define ___C(a, b) CXL_MEM_COMMAND_ID_##a
+enum { CXL_CMDS };
+
+#undef ___C
+#define ___C(a, b) { b }
+static const struct {
+	const char *name;
+} cxl_command_names[] = { CXL_CMDS };
+
+/*
+ * Here's how this actually breaks out:
+ * cxl_command_names[] = {
+ *	[CXL_MEM_COMMAND_ID_INVALID] = { "Invalid Command" },
+ *	[CXL_MEM_COMMAND_ID_IDENTIFY] = { "Identify Command" },
+ *	...
+ *	[CXL_MEM_COMMAND_ID_MAX] = { "invalid / last command" },
+ * };
+ */
+
+#undef ___C
+
+/**
+ * struct cxl_command_info - Command information returned from a query.
+ * @id: ID number for the command.
+ * @flags: Flags that specify command behavior.
+ * @size_in: Expected input size, or -1 if variable length.
+ * @size_out: Expected output size, or -1 if variable length.
+ *
+ * Represents a single command that is supported by both the driver and the
+ * hardware. This is returned as part of an array from the query ioctl. The
+ * following would be a command that takes a variable length input and returns 0
+ * bytes of output.
+ *
+ *  - @id = 10
+ *  - @flags = 0
+ *  - @size_in = -1
+ *  - @size_out = 0
+ *
+ * See struct cxl_mem_query_commands.
+ */
+struct cxl_command_info {
+	__u32 id;
+
+	__u32 flags;
+#define CXL_MEM_COMMAND_FLAG_MASK GENMASK(0, 0)
+
+	__s32 size_in;
+	__s32 size_out;
+};
+
+/**
+ * struct cxl_mem_query_commands - Query supported commands.
+ * @n_commands: In/out parameter. When @n_commands is > 0, the driver will
+ *		return min(num_support_commands, n_commands). When @n_commands
+ *		is 0, driver will return the number of total supported commands.
+ * @rsvd: Reserved for future use.
+ * @commands: Output array of supported commands. This array must be allocated
+ *            by userspace to be at least min(num_support_commands, @n_commands)
+ *
+ * Allow userspace to query the available commands supported by both the driver,
+ * and the hardware. Commands that aren't supported by either the driver, or the
+ * hardware are not returned in the query.
+ *
+ * Examples:
+ *
+ *  - { .n_commands = 0 } // Get number of supported commands
+ *  - { .n_commands = 15, .commands = buf } // Return first 15 (or less)
+ *    supported commands
+ *
+ *  See struct cxl_command_info.
+ */
+struct cxl_mem_query_commands {
+	/*
+	 * Input: Number of commands to return (space allocated by user)
+	 * Output: Number of commands supported by the driver/hardware
+	 *
+	 * If n_commands is 0, kernel will only return number of commands and
+	 * not try to populate commands[], thus allowing userspace to know how
+	 * much space to allocate
+	 */
+	__u32 n_commands;
+	__u32 rsvd;
+
+	struct cxl_command_info __user commands[]; /* out: supported commands */
+};
+
+/**
+ * struct cxl_send_command - Send a command to a memory device.
+ * @id: The command to send to the memory device. This must be one of the
+ *	commands returned by the query command.
+ * @flags: Flags for the command (input).
+ * @rsvd: Must be zero.
+ * @retval: Return value from the memory device (output).
+ * @in: Parameters associated with input payload.
+ * @in.size: Size of the payload to provide to the device (input).
+ * @in.rsvd: Must be zero.
+ * @in.payload: Pointer to memory for payload input, payload is little endian.
+ * @out: Parameters associated with output payload.
+ * @out.size: Size of the payload received from the device (input/output). This
+ *	      field is filled in by userspace to let the driver know how much
+ *	      space was allocated for output. It is populated by the driver to
+ *	      let userspace know how large the output payload actually was.
+ * @out.rsvd: Must be zero.
+ * @out.payload: Pointer to memory for payload output, payload is little endian.
+ *
+ * Mechanism for userspace to send a command to the hardware for processing. The
+ * driver will do basic validation on the command sizes. In some cases even the
+ * payload may be introspected. Userspace is required to allocate large enough
+ * buffers for size_out which can be variable length in certain situations.
+ */
+struct cxl_send_command {
+	__u32 id;
+	__u32 flags;
+	__u32 rsvd;
+	__u32 retval;
+
+	struct {
+		__s32 size;
+		__u32 rsvd;
+		__u64 payload;
+	} in;
+
+	struct {
+		__s32 size;
+		__u32 rsvd;
+		__u64 payload;
+	} out;
+};
+
+#endif
-- 
cgit v1.2.3-71-gd317


From 13237183c735f5cba4ae26bc782c613ae0d4e4d3 Mon Sep 17 00:00:00 2001
From: Ben Widawsky <ben.widawsky@intel.com>
Date: Tue, 16 Feb 2021 20:09:54 -0800
Subject: cxl/mem: Add a "RAW" send command

The CXL memory device send interface will have a number of supported
commands. The raw command is not such a command. Raw commands allow
userspace to send a specified opcode to the underlying hardware and
bypass all driver checks on the command. The primary use for this
command is to [begrudgingly] allow undocumented vendor specific hardware
commands.

While not the main motivation, it also allows prototyping new hardware
commands without a driver patch and rebuild.

While this all sounds very powerful it comes with a couple of caveats:
1. Bug reports using raw commands will not get the same level of
   attention as bug reports using supported commands (via taint).
2. Supported commands will be rejected by the RAW command.

With this comes new debugfs knob to allow full access to your toes with
your weapon of choice.

Signed-off-by: Ben Widawsky <ben.widawsky@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com> (v2)
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Ariel Sibley <Ariel.Sibley@microchip.com>
Link: https://lore.kernel.org/r/20210217040958.1354670-6-ben.widawsky@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/cxl/Kconfig          |  18 ++++++
 drivers/cxl/mem.c            | 132 +++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/cxl_mem.h |  12 +++-
 3 files changed, 161 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
index 9e80b311e928..97dc4d751651 100644
--- a/drivers/cxl/Kconfig
+++ b/drivers/cxl/Kconfig
@@ -32,4 +32,22 @@ config CXL_MEM
 	  Chapter 2.3 Type 3 CXL Device in the CXL 2.0 specification.
 
 	  If unsure say 'm'.
+
+config CXL_MEM_RAW_COMMANDS
+	bool "RAW Command Interface for Memory Devices"
+	depends on CXL_MEM
+	help
+	  Enable CXL RAW command interface.
+
+	  The CXL driver ioctl interface may assign a kernel ioctl command
+	  number for each specification defined opcode. At any given point in
+	  time the number of opcodes that the specification defines and a device
+	  may implement may exceed the kernel's set of associated ioctl function
+	  numbers. The mismatch is either by omission, specification is too new,
+	  or by design. When prototyping new hardware, or developing / debugging
+	  the driver it is useful to be able to submit any possible command to
+	  the hardware, even commands that may crash the kernel due to their
+	  potential impact to memory currently in use by the kernel.
+
+	  If developing CXL hardware or the driver say Y, otherwise say N.
 endif
diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
index aa8f843fcca1..5319412e245c 100644
--- a/drivers/cxl/mem.c
+++ b/drivers/cxl/mem.c
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /* Copyright(c) 2020 Intel Corporation. All rights reserved. */
 #include <uapi/linux/cxl_mem.h>
+#include <linux/security.h>
+#include <linux/debugfs.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/cdev.h>
@@ -42,7 +44,14 @@
 
 enum opcode {
 	CXL_MBOX_OP_INVALID		= 0x0000,
+	CXL_MBOX_OP_RAW			= CXL_MBOX_OP_INVALID,
+	CXL_MBOX_OP_ACTIVATE_FW		= 0x0202,
 	CXL_MBOX_OP_IDENTIFY		= 0x4000,
+	CXL_MBOX_OP_SET_PARTITION_INFO	= 0x4101,
+	CXL_MBOX_OP_SET_LSA		= 0x4103,
+	CXL_MBOX_OP_SET_SHUTDOWN_STATE	= 0x4204,
+	CXL_MBOX_OP_SCAN_MEDIA		= 0x4304,
+	CXL_MBOX_OP_GET_SCAN_MEDIA	= 0x4305,
 	CXL_MBOX_OP_MAX			= 0x10000
 };
 
@@ -96,6 +105,8 @@ struct cxl_memdev {
 
 static int cxl_mem_major;
 static DEFINE_IDA(cxl_memdev_ida);
+static struct dentry *cxl_debugfs;
+static bool cxl_raw_allow_all;
 
 /**
  * struct cxl_mem_command - Driver representation of a memory device command
@@ -132,6 +143,49 @@ struct cxl_mem_command {
  */
 static struct cxl_mem_command mem_commands[] = {
 	CXL_CMD(IDENTIFY, 0, 0x43),
+#ifdef CONFIG_CXL_MEM_RAW_COMMANDS
+	CXL_CMD(RAW, ~0, ~0),
+#endif
+};
+
+/*
+ * Commands that RAW doesn't permit. The rationale for each:
+ *
+ * CXL_MBOX_OP_ACTIVATE_FW: Firmware activation requires adjustment /
+ * coordination of transaction timeout values at the root bridge level.
+ *
+ * CXL_MBOX_OP_SET_PARTITION_INFO: The device memory map may change live
+ * and needs to be coordinated with HDM updates.
+ *
+ * CXL_MBOX_OP_SET_LSA: The label storage area may be cached by the
+ * driver and any writes from userspace invalidates those contents.
+ *
+ * CXL_MBOX_OP_SET_SHUTDOWN_STATE: Set shutdown state assumes no writes
+ * to the device after it is marked clean, userspace can not make that
+ * assertion.
+ *
+ * CXL_MBOX_OP_[GET_]SCAN_MEDIA: The kernel provides a native error list that
+ * is kept up to date with patrol notifications and error management.
+ */
+static u16 cxl_disabled_raw_commands[] = {
+	CXL_MBOX_OP_ACTIVATE_FW,
+	CXL_MBOX_OP_SET_PARTITION_INFO,
+	CXL_MBOX_OP_SET_LSA,
+	CXL_MBOX_OP_SET_SHUTDOWN_STATE,
+	CXL_MBOX_OP_SCAN_MEDIA,
+	CXL_MBOX_OP_GET_SCAN_MEDIA,
+};
+
+/*
+ * Command sets that RAW doesn't permit. All opcodes in this set are
+ * disabled because they pass plain text security payloads over the
+ * user/kernel boundary. This functionality is intended to be wrapped
+ * behind the keys ABI which allows for encrypted payloads in the UAPI
+ */
+static u8 security_command_sets[] = {
+	0x44, /* Sanitize */
+	0x45, /* Persistent Memory Data-at-rest Security */
+	0x46, /* Security Passthrough */
 };
 
 #define cxl_for_each_cmd(cmd)                                                  \
@@ -162,6 +216,16 @@ static int cxl_mem_wait_for_doorbell(struct cxl_mem *cxlm)
 	return 0;
 }
 
+static bool cxl_is_security_command(u16 opcode)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(security_command_sets); i++)
+		if (security_command_sets[i] == (opcode >> 8))
+			return true;
+	return false;
+}
+
 static void cxl_mem_mbox_timeout(struct cxl_mem *cxlm,
 				 struct mbox_cmd *mbox_cmd)
 {
@@ -431,6 +495,9 @@ static int handle_mailbox_cmd_from_user(struct cxl_mem *cxlm,
 		cxl_command_names[cmd->info.id].name, mbox_cmd.opcode,
 		cmd->info.size_in);
 
+	dev_WARN_ONCE(dev, cmd->info.id == CXL_MEM_COMMAND_ID_RAW,
+		      "raw command path used\n");
+
 	rc = __cxl_mem_mbox_send_cmd(cxlm, &mbox_cmd);
 	cxl_mem_mbox_put(cxlm);
 	if (rc)
@@ -460,6 +527,29 @@ out:
 	return rc;
 }
 
+static bool cxl_mem_raw_command_allowed(u16 opcode)
+{
+	int i;
+
+	if (!IS_ENABLED(CONFIG_CXL_MEM_RAW_COMMANDS))
+		return false;
+
+	if (security_locked_down(LOCKDOWN_NONE))
+		return false;
+
+	if (cxl_raw_allow_all)
+		return true;
+
+	if (cxl_is_security_command(opcode))
+		return false;
+
+	for (i = 0; i < ARRAY_SIZE(cxl_disabled_raw_commands); i++)
+		if (cxl_disabled_raw_commands[i] == opcode)
+			return false;
+
+	return true;
+}
+
 /**
  * cxl_validate_cmd_from_user() - Check fields for CXL_MEM_SEND_COMMAND.
  * @cxlm: &struct cxl_mem device whose mailbox will be used.
@@ -471,6 +561,7 @@ out:
  *  * %-ENOTTY	- Invalid command specified.
  *  * %-EINVAL	- Reserved fields or invalid values were used.
  *  * %-ENOMEM	- Input or output buffer wasn't sized properly.
+ *  * %-EPERM	- Attempted to use a protected command.
  *
  * The result of this command is a fully validated command in @out_cmd that is
  * safe to send to the hardware.
@@ -495,6 +586,40 @@ static int cxl_validate_cmd_from_user(struct cxl_mem *cxlm,
 	if (send_cmd->in.size > cxlm->payload_size)
 		return -EINVAL;
 
+	/*
+	 * Checks are bypassed for raw commands but a WARN/taint will occur
+	 * later in the callchain
+	 */
+	if (send_cmd->id == CXL_MEM_COMMAND_ID_RAW) {
+		const struct cxl_mem_command temp = {
+			.info = {
+				.id = CXL_MEM_COMMAND_ID_RAW,
+				.flags = 0,
+				.size_in = send_cmd->in.size,
+				.size_out = send_cmd->out.size,
+			},
+			.opcode = send_cmd->raw.opcode
+		};
+
+		if (send_cmd->raw.rsvd)
+			return -EINVAL;
+
+		/*
+		 * Unlike supported commands, the output size of RAW commands
+		 * gets passed along without further checking, so it must be
+		 * validated here.
+		 */
+		if (send_cmd->out.size > cxlm->payload_size)
+			return -EINVAL;
+
+		if (!cxl_mem_raw_command_allowed(send_cmd->raw.opcode))
+			return -EPERM;
+
+		memcpy(out_cmd, &temp, sizeof(temp));
+
+		return 0;
+	}
+
 	if (send_cmd->flags & ~CXL_MEM_COMMAND_FLAG_MASK)
 		return -EINVAL;
 
@@ -1166,6 +1291,7 @@ static struct pci_driver cxl_mem_driver = {
 
 static __init int cxl_mem_init(void)
 {
+	struct dentry *mbox_debugfs;
 	dev_t devt;
 	int rc;
 
@@ -1182,11 +1308,17 @@ static __init int cxl_mem_init(void)
 		return rc;
 	}
 
+	cxl_debugfs = debugfs_create_dir("cxl", NULL);
+	mbox_debugfs = debugfs_create_dir("mbox", cxl_debugfs);
+	debugfs_create_bool("raw_allow_all", 0600, mbox_debugfs,
+			    &cxl_raw_allow_all);
+
 	return 0;
 }
 
 static __exit void cxl_mem_exit(void)
 {
+	debugfs_remove_recursive(cxl_debugfs);
 	pci_unregister_driver(&cxl_mem_driver);
 	unregister_chrdev_region(MKDEV(cxl_mem_major, 0), CXL_MEM_MAX_DEVS);
 }
diff --git a/include/uapi/linux/cxl_mem.h b/include/uapi/linux/cxl_mem.h
index 887781dc3b6c..c316028730e7 100644
--- a/include/uapi/linux/cxl_mem.h
+++ b/include/uapi/linux/cxl_mem.h
@@ -22,6 +22,7 @@
 #define CXL_CMDS                                                          \
 	___C(INVALID, "Invalid Command"),                                 \
 	___C(IDENTIFY, "Identify Command"),                               \
+	___C(RAW, "Raw device command"),                                  \
 	___C(MAX, "invalid / last command")
 
 #define ___C(a, b) CXL_MEM_COMMAND_ID_##a
@@ -115,6 +116,9 @@ struct cxl_mem_query_commands {
  * @id: The command to send to the memory device. This must be one of the
  *	commands returned by the query command.
  * @flags: Flags for the command (input).
+ * @raw: Special fields for raw commands
+ * @raw.opcode: Opcode passed to hardware when using the RAW command.
+ * @raw.rsvd: Must be zero.
  * @rsvd: Must be zero.
  * @retval: Return value from the memory device (output).
  * @in: Parameters associated with input payload.
@@ -137,7 +141,13 @@ struct cxl_mem_query_commands {
 struct cxl_send_command {
 	__u32 id;
 	__u32 flags;
-	__u32 rsvd;
+	union {
+		struct {
+			__u16 opcode;
+			__u16 rsvd;
+		} raw;
+		__u32 rsvd;
+	};
 	__u32 retval;
 
 	struct {
-- 
cgit v1.2.3-71-gd317


From 472b1ce6e9d6396ab3f11fc5101c6b63b934a018 Mon Sep 17 00:00:00 2001
From: Ben Widawsky <ben.widawsky@intel.com>
Date: Tue, 16 Feb 2021 20:09:55 -0800
Subject: cxl/mem: Enable commands via CEL

CXL devices identified by the memory-device class code must implement
the Device Command Interface (described in 8.2.9 of the CXL 2.0 spec).
While the driver already maintains a list of commands it supports, there
is still a need to be able to distinguish between commands that the
driver knows about from commands that are optionally supported by the
hardware.

The Command Effects Log (CEL) is specified in the CXL 2.0 specification.
The CEL is one of two types of logs, the other being vendor specific.
They are distinguished in hardware/spec via UUID. The CEL is useful for
2 things:
1. Determine which optional commands are supported by the CXL device.
2. Enumerate any vendor specific commands

The CEL is used by the driver to determine which commands are available
in the hardware and therefore which commands userspace is allowed to
execute. The set of enabled commands might be a subset of commands which
are advertised in UAPI via CXL_MEM_SEND_COMMAND IOCTL.

With the CEL enabling comes a internal flag to indicate a base set of
commands that are enabled regardless of CEL. Such commands are required
for basic interaction with the hardware and thus can be useful in debug
cases, for example if the CEL is corrupted.

The implementation leaves the statically defined table of commands and
supplements it with a bitmap to determine commands that are enabled.
This organization was chosen for the following reasons:
- Smaller memory footprint. Doesn't need a table per device.
- Reduce memory allocation complexity.
- Fixed command IDs to opcode mapping for all devices makes development
  and debugging easier.
- Certain helpers are easily achievable, like cxl_for_each_cmd().

Signed-off-by: Ben Widawsky <ben.widawsky@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com> (v2)
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> (v3)
Link: https://lore.kernel.org/r/20210217040958.1354670-7-ben.widawsky@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/cxl/cxl.h            |   2 +
 drivers/cxl/mem.c            | 223 +++++++++++++++++++++++++++++++++++++++++--
 include/uapi/linux/cxl_mem.h |   1 +
 3 files changed, 219 insertions(+), 7 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 8fd4a177fe25..6f14838c2d25 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -69,6 +69,7 @@ struct cxl_memdev;
  *                (CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register)
  * @mbox_mutex: Mutex to synchronize mailbox access.
  * @firmware_version: Firmware version for the memory device.
+ * @enabled_commands: Hardware commands found enabled in CEL.
  * @pmem_range: Persistent memory capacity information.
  * @ram_range: Volatile memory capacity information.
  */
@@ -84,6 +85,7 @@ struct cxl_mem {
 	size_t payload_size;
 	struct mutex mbox_mutex; /* Protects device mailbox and firmware */
 	char firmware_version[0x10];
+	unsigned long *enabled_cmds;
 
 	struct range pmem_range;
 	struct range ram_range;
diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
index 5319412e245c..e31b3045e231 100644
--- a/drivers/cxl/mem.c
+++ b/drivers/cxl/mem.c
@@ -46,6 +46,8 @@ enum opcode {
 	CXL_MBOX_OP_INVALID		= 0x0000,
 	CXL_MBOX_OP_RAW			= CXL_MBOX_OP_INVALID,
 	CXL_MBOX_OP_ACTIVATE_FW		= 0x0202,
+	CXL_MBOX_OP_GET_SUPPORTED_LOGS	= 0x0400,
+	CXL_MBOX_OP_GET_LOG		= 0x0401,
 	CXL_MBOX_OP_IDENTIFY		= 0x4000,
 	CXL_MBOX_OP_SET_PARTITION_INFO	= 0x4101,
 	CXL_MBOX_OP_SET_LSA		= 0x4103,
@@ -108,10 +110,28 @@ static DEFINE_IDA(cxl_memdev_ida);
 static struct dentry *cxl_debugfs;
 static bool cxl_raw_allow_all;
 
+enum {
+	CEL_UUID,
+	VENDOR_DEBUG_UUID,
+};
+
+/* See CXL 2.0 Table 170. Get Log Input Payload */
+static const uuid_t log_uuid[] = {
+	[CEL_UUID] = UUID_INIT(0xda9c0b5, 0xbf41, 0x4b78, 0x8f, 0x79, 0x96,
+			       0xb1, 0x62, 0x3b, 0x3f, 0x17),
+	[VENDOR_DEBUG_UUID] = UUID_INIT(0xe1819d9, 0x11a9, 0x400c, 0x81, 0x1f,
+					0xd6, 0x07, 0x19, 0x40, 0x3d, 0x86),
+};
+
 /**
  * struct cxl_mem_command - Driver representation of a memory device command
  * @info: Command information as it exists for the UAPI
  * @opcode: The actual bits used for the mailbox protocol
+ * @flags: Set of flags effecting driver behavior.
+ *
+ *  * %CXL_CMD_FLAG_FORCE_ENABLE: In cases of error, commands with this flag
+ *    will be enabled by the driver regardless of what hardware may have
+ *    advertised.
  *
  * The cxl_mem_command is the driver's internal representation of commands that
  * are supported by the driver. Some of these commands may not be supported by
@@ -123,9 +143,12 @@ static bool cxl_raw_allow_all;
 struct cxl_mem_command {
 	struct cxl_command_info info;
 	enum opcode opcode;
+	u32 flags;
+#define CXL_CMD_FLAG_NONE 0
+#define CXL_CMD_FLAG_FORCE_ENABLE BIT(0)
 };
 
-#define CXL_CMD(_id, sin, sout)                                                \
+#define CXL_CMD(_id, sin, sout, _flags)                                        \
 	[CXL_MEM_COMMAND_ID_##_id] = {                                         \
 	.info =	{                                                              \
 			.id = CXL_MEM_COMMAND_ID_##_id,                        \
@@ -133,6 +156,7 @@ struct cxl_mem_command {
 			.size_out = sout,                                      \
 		},                                                             \
 	.opcode = CXL_MBOX_OP_##_id,                                           \
+	.flags = _flags,                                                       \
 	}
 
 /*
@@ -142,10 +166,11 @@ struct cxl_mem_command {
  * 0, and the user passed in 1, it is an error.
  */
 static struct cxl_mem_command mem_commands[] = {
-	CXL_CMD(IDENTIFY, 0, 0x43),
+	CXL_CMD(IDENTIFY, 0, 0x43, CXL_CMD_FLAG_FORCE_ENABLE),
 #ifdef CONFIG_CXL_MEM_RAW_COMMANDS
-	CXL_CMD(RAW, ~0, ~0),
+	CXL_CMD(RAW, ~0, ~0, 0),
 #endif
+	CXL_CMD(GET_SUPPORTED_LOGS, 0, ~0, CXL_CMD_FLAG_FORCE_ENABLE),
 };
 
 /*
@@ -633,6 +658,10 @@ static int cxl_validate_cmd_from_user(struct cxl_mem *cxlm,
 	c = &mem_commands[send_cmd->id];
 	info = &c->info;
 
+	/* Check that the command is enabled for hardware */
+	if (!test_bit(info->id, cxlm->enabled_cmds))
+		return -ENOTTY;
+
 	/* Check the input buffer is the expected size */
 	if (info->size_in >= 0 && info->size_in != send_cmd->in.size)
 		return -ENOMEM;
@@ -757,6 +786,17 @@ static const struct file_operations cxl_memdev_fops = {
 	.llseek = noop_llseek,
 };
 
+static inline struct cxl_mem_command *cxl_mem_find_command(u16 opcode)
+{
+	struct cxl_mem_command *c;
+
+	cxl_for_each_cmd(c)
+		if (c->opcode == opcode)
+			return c;
+
+	return NULL;
+}
+
 /**
  * cxl_mem_mbox_send_cmd() - Send a mailbox command to a memory device.
  * @cxlm: The CXL memory device to communicate with.
@@ -777,9 +817,7 @@ static const struct file_operations cxl_memdev_fops = {
  *
  * Mailbox commands may execute successfully yet the device itself reported an
  * error. While this distinction can be useful for commands from userspace, the
- * kernel will only be able to use results when both are successful. It's
- * expected that all callers of this function know exactly the size of the data
- * they will consume from the hardware.
+ * kernel will only be able to use results when both are successful.
  *
  * See __cxl_mem_mbox_send_cmd()
  */
@@ -787,6 +825,7 @@ static int cxl_mem_mbox_send_cmd(struct cxl_mem *cxlm, u16 opcode,
 				 void *in, size_t in_size,
 				 void *out, size_t out_size)
 {
+	const struct cxl_mem_command *cmd = cxl_mem_find_command(opcode);
 	struct mbox_cmd mbox_cmd = {
 		.opcode = opcode,
 		.payload_in = in,
@@ -812,7 +851,11 @@ static int cxl_mem_mbox_send_cmd(struct cxl_mem *cxlm, u16 opcode,
 	if (mbox_cmd.return_code != CXL_MBOX_SUCCESS)
 		return -ENXIO;
 
-	if (mbox_cmd.size_out != out_size)
+	/*
+	 * Variable sized commands can't be validated and so it's up to the
+	 * caller to do that if they wish.
+	 */
+	if (cmd->info.size_out >= 0 && mbox_cmd.size_out != out_size)
 		return -EIO;
 
 	return 0;
@@ -947,6 +990,14 @@ static struct cxl_mem *cxl_mem_create(struct pci_dev *pdev, u32 reg_lo,
 	mutex_init(&cxlm->mbox_mutex);
 	cxlm->pdev = pdev;
 	cxlm->regs = regs + offset;
+	cxlm->enabled_cmds =
+		devm_kmalloc_array(dev, BITS_TO_LONGS(cxl_cmd_count),
+				   sizeof(unsigned long),
+				   GFP_KERNEL | __GFP_ZERO);
+	if (!cxlm->enabled_cmds) {
+		dev_err(dev, "No memory available for bitmap\n");
+		return NULL;
+	}
 
 	dev_dbg(dev, "Mapped CXL Memory Device resource\n");
 	return cxlm;
@@ -1166,6 +1217,160 @@ err_ref:
 	return rc;
 }
 
+static int cxl_xfer_log(struct cxl_mem *cxlm, uuid_t *uuid, u32 size, u8 *out)
+{
+	u32 remaining = size;
+	u32 offset = 0;
+
+	while (remaining) {
+		u32 xfer_size = min_t(u32, remaining, cxlm->payload_size);
+		struct cxl_mbox_get_log {
+			uuid_t uuid;
+			__le32 offset;
+			__le32 length;
+		} __packed log = {
+			.uuid = *uuid,
+			.offset = cpu_to_le32(offset),
+			.length = cpu_to_le32(xfer_size)
+		};
+		int rc;
+
+		rc = cxl_mem_mbox_send_cmd(cxlm, CXL_MBOX_OP_GET_LOG, &log,
+					   sizeof(log), out, xfer_size);
+		if (rc < 0)
+			return rc;
+
+		out += xfer_size;
+		remaining -= xfer_size;
+		offset += xfer_size;
+	}
+
+	return 0;
+}
+
+/**
+ * cxl_walk_cel() - Walk through the Command Effects Log.
+ * @cxlm: Device.
+ * @size: Length of the Command Effects Log.
+ * @cel: CEL
+ *
+ * Iterate over each entry in the CEL and determine if the driver supports the
+ * command. If so, the command is enabled for the device and can be used later.
+ */
+static void cxl_walk_cel(struct cxl_mem *cxlm, size_t size, u8 *cel)
+{
+	struct cel_entry {
+		__le16 opcode;
+		__le16 effect;
+	} __packed * cel_entry;
+	const int cel_entries = size / sizeof(*cel_entry);
+	int i;
+
+	cel_entry = (struct cel_entry *)cel;
+
+	for (i = 0; i < cel_entries; i++) {
+		u16 opcode = le16_to_cpu(cel_entry[i].opcode);
+		struct cxl_mem_command *cmd = cxl_mem_find_command(opcode);
+
+		if (!cmd) {
+			dev_dbg(&cxlm->pdev->dev,
+				"Opcode 0x%04x unsupported by driver", opcode);
+			continue;
+		}
+
+		set_bit(cmd->info.id, cxlm->enabled_cmds);
+	}
+}
+
+struct cxl_mbox_get_supported_logs {
+	__le16 entries;
+	u8 rsvd[6];
+	struct gsl_entry {
+		uuid_t uuid;
+		__le32 size;
+	} __packed entry[];
+} __packed;
+
+static struct cxl_mbox_get_supported_logs *cxl_get_gsl(struct cxl_mem *cxlm)
+{
+	struct cxl_mbox_get_supported_logs *ret;
+	int rc;
+
+	ret = kvmalloc(cxlm->payload_size, GFP_KERNEL);
+	if (!ret)
+		return ERR_PTR(-ENOMEM);
+
+	rc = cxl_mem_mbox_send_cmd(cxlm, CXL_MBOX_OP_GET_SUPPORTED_LOGS, NULL,
+				   0, ret, cxlm->payload_size);
+	if (rc < 0) {
+		kvfree(ret);
+		return ERR_PTR(rc);
+	}
+
+	return ret;
+}
+
+/**
+ * cxl_mem_enumerate_cmds() - Enumerate commands for a device.
+ * @cxlm: The device.
+ *
+ * Returns 0 if enumerate completed successfully.
+ *
+ * CXL devices have optional support for certain commands. This function will
+ * determine the set of supported commands for the hardware and update the
+ * enabled_cmds bitmap in the @cxlm.
+ */
+static int cxl_mem_enumerate_cmds(struct cxl_mem *cxlm)
+{
+	struct cxl_mbox_get_supported_logs *gsl;
+	struct device *dev = &cxlm->pdev->dev;
+	struct cxl_mem_command *cmd;
+	int i, rc;
+
+	gsl = cxl_get_gsl(cxlm);
+	if (IS_ERR(gsl))
+		return PTR_ERR(gsl);
+
+	rc = -ENOENT;
+	for (i = 0; i < le16_to_cpu(gsl->entries); i++) {
+		u32 size = le32_to_cpu(gsl->entry[i].size);
+		uuid_t uuid = gsl->entry[i].uuid;
+		u8 *log;
+
+		dev_dbg(dev, "Found LOG type %pU of size %d", &uuid, size);
+
+		if (!uuid_equal(&uuid, &log_uuid[CEL_UUID]))
+			continue;
+
+		log = kvmalloc(size, GFP_KERNEL);
+		if (!log) {
+			rc = -ENOMEM;
+			goto out;
+		}
+
+		rc = cxl_xfer_log(cxlm, &uuid, size, log);
+		if (rc) {
+			kvfree(log);
+			goto out;
+		}
+
+		cxl_walk_cel(cxlm, size, log);
+		kvfree(log);
+
+		/* In case CEL was bogus, enable some default commands. */
+		cxl_for_each_cmd(cmd)
+			if (cmd->flags & CXL_CMD_FLAG_FORCE_ENABLE)
+				set_bit(cmd->info.id, cxlm->enabled_cmds);
+
+		/* Found the required CEL */
+		rc = 0;
+	}
+
+out:
+	kvfree(gsl);
+	return rc;
+}
+
 /**
  * cxl_mem_identify() - Send the IDENTIFY command to the device.
  * @cxlm: The device to identify.
@@ -1266,6 +1471,10 @@ static int cxl_mem_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (rc)
 		return rc;
 
+	rc = cxl_mem_enumerate_cmds(cxlm);
+	if (rc)
+		return rc;
+
 	rc = cxl_mem_identify(cxlm);
 	if (rc)
 		return rc;
diff --git a/include/uapi/linux/cxl_mem.h b/include/uapi/linux/cxl_mem.h
index c316028730e7..59227f82a4c1 100644
--- a/include/uapi/linux/cxl_mem.h
+++ b/include/uapi/linux/cxl_mem.h
@@ -23,6 +23,7 @@
 	___C(INVALID, "Invalid Command"),                                 \
 	___C(IDENTIFY, "Identify Command"),                               \
 	___C(RAW, "Raw device command"),                                  \
+	___C(GET_SUPPORTED_LOGS, "Get Supported Logs"),                   \
 	___C(MAX, "invalid / last command")
 
 #define ___C(a, b) CXL_MEM_COMMAND_ID_##a
-- 
cgit v1.2.3-71-gd317


From 57ee605b976c30a86613648935d255bbe704aeab Mon Sep 17 00:00:00 2001
From: Ben Widawsky <ben.widawsky@intel.com>
Date: Tue, 16 Feb 2021 20:09:56 -0800
Subject: cxl/mem: Add set of informational commands

Add initial set of formal commands beyond basic identify and command
enumeration.

Signed-off-by: Ben Widawsky <ben.widawsky@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> (v2)
Link: https://lore.kernel.org/r/20210217040958.1354670-8-ben.widawsky@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/cxl/mem.c            | 9 +++++++++
 include/uapi/linux/cxl_mem.h | 5 +++++
 2 files changed, 14 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
index e31b3045e231..6d7d3870b5da 100644
--- a/drivers/cxl/mem.c
+++ b/drivers/cxl/mem.c
@@ -45,12 +45,16 @@
 enum opcode {
 	CXL_MBOX_OP_INVALID		= 0x0000,
 	CXL_MBOX_OP_RAW			= CXL_MBOX_OP_INVALID,
+	CXL_MBOX_OP_GET_FW_INFO		= 0x0200,
 	CXL_MBOX_OP_ACTIVATE_FW		= 0x0202,
 	CXL_MBOX_OP_GET_SUPPORTED_LOGS	= 0x0400,
 	CXL_MBOX_OP_GET_LOG		= 0x0401,
 	CXL_MBOX_OP_IDENTIFY		= 0x4000,
+	CXL_MBOX_OP_GET_PARTITION_INFO	= 0x4100,
 	CXL_MBOX_OP_SET_PARTITION_INFO	= 0x4101,
+	CXL_MBOX_OP_GET_LSA		= 0x4102,
 	CXL_MBOX_OP_SET_LSA		= 0x4103,
+	CXL_MBOX_OP_GET_HEALTH_INFO	= 0x4200,
 	CXL_MBOX_OP_SET_SHUTDOWN_STATE	= 0x4204,
 	CXL_MBOX_OP_SCAN_MEDIA		= 0x4304,
 	CXL_MBOX_OP_GET_SCAN_MEDIA	= 0x4305,
@@ -171,6 +175,11 @@ static struct cxl_mem_command mem_commands[] = {
 	CXL_CMD(RAW, ~0, ~0, 0),
 #endif
 	CXL_CMD(GET_SUPPORTED_LOGS, 0, ~0, CXL_CMD_FLAG_FORCE_ENABLE),
+	CXL_CMD(GET_FW_INFO, 0, 0x50, 0),
+	CXL_CMD(GET_PARTITION_INFO, 0, 0x20, 0),
+	CXL_CMD(GET_LSA, 0x8, ~0, 0),
+	CXL_CMD(GET_HEALTH_INFO, 0, 0x12, 0),
+	CXL_CMD(GET_LOG, 0x18, ~0, CXL_CMD_FLAG_FORCE_ENABLE),
 };
 
 /*
diff --git a/include/uapi/linux/cxl_mem.h b/include/uapi/linux/cxl_mem.h
index 59227f82a4c1..3155382dfc9b 100644
--- a/include/uapi/linux/cxl_mem.h
+++ b/include/uapi/linux/cxl_mem.h
@@ -24,6 +24,11 @@
 	___C(IDENTIFY, "Identify Command"),                               \
 	___C(RAW, "Raw device command"),                                  \
 	___C(GET_SUPPORTED_LOGS, "Get Supported Logs"),                   \
+	___C(GET_FW_INFO, "Get FW Info"),                                 \
+	___C(GET_PARTITION_INFO, "Get Partition Information"),            \
+	___C(GET_LSA, "Get Label Storage Area"),                          \
+	___C(GET_HEALTH_INFO, "Get Health Info"),                         \
+	___C(GET_LOG, "Get Log"),                                         \
 	___C(MAX, "invalid / last command")
 
 #define ___C(a, b) CXL_MEM_COMMAND_ID_##a
-- 
cgit v1.2.3-71-gd317


From 33b347503f014ebf76257327cbc7001c6b721956 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Tue, 5 Jan 2021 12:32:00 +0200
Subject: vdpa: Define vdpa mgmt device, ops and a netlink interface

To add one or more VDPA devices, define a management device which
allows adding or removing vdpa device. A management device defines
set of callbacks to manage vdpa devices.

To begin with, it defines add and remove callbacks through which a user
defined vdpa device can be added or removed.

A unique management device is identified by its unique handle identified
by management device name and optionally the bus name.

Hence, introduce routine through which driver can register a
management device and its callback operations for adding and remove
a vdpa device.

Introduce vdpa netlink socket family so that user can query management
device and its attributes.

Example of show vdpa management device which allows creating vdpa device of
networking class (device id = 0x1) of virtio specification 1.1
section 5.1.1.

$ vdpa mgmtdev show
vdpasim_net:
  supported_classes:
    net

Example of showing vdpa management device in JSON format.

$ vdpa mgmtdev show -jp
{
    "show": {
        "vdpasim_net": {
            "supported_classes": [ "net" ]
        }
    }
}

Signed-off-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Eli Cohen <elic@nvidia.com>
Reviewed-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20210105103203.82508-4-parav@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

Including a bugfix:

vpda: correctly size vdpa_nl_policy

We need to ensure last entry of vdpa_nl_policy[]
is zero, otherwise out-of-bounds access is hurting us.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Cc: Parav Pandit <parav@nvidia.com>
Cc: Eli Cohen <elic@nvidia.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Link: https://lore.kernel.org/r/20210210134911.4119555-1-eric.dumazet@gmail.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vdpa/Kconfig      |   1 +
 drivers/vdpa/vdpa.c       | 213 +++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/vdpa.h      |  31 +++++++
 include/uapi/linux/vdpa.h |  31 +++++++
 4 files changed, 275 insertions(+), 1 deletion(-)
 create mode 100644 include/uapi/linux/vdpa.h

(limited to 'include/uapi/linux')

diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig
index 92a6396f8a73..ffd1e098bfd2 100644
--- a/drivers/vdpa/Kconfig
+++ b/drivers/vdpa/Kconfig
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 menuconfig VDPA
 	tristate "vDPA drivers"
+	depends on NET
 	help
 	  Enable this module to support vDPA device that uses a
 	  datapath which complies with virtio specifications with
diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
index 7414bbd9057c..586e66b7f671 100644
--- a/drivers/vdpa/vdpa.c
+++ b/drivers/vdpa/vdpa.c
@@ -11,11 +11,17 @@
 #include <linux/idr.h>
 #include <linux/slab.h>
 #include <linux/vdpa.h>
+#include <uapi/linux/vdpa.h>
+#include <net/genetlink.h>
+#include <linux/mod_devicetable.h>
 
+static LIST_HEAD(mdev_head);
 /* A global mutex that protects vdpa management device and device level operations. */
 static DEFINE_MUTEX(vdpa_dev_mutex);
 static DEFINE_IDA(vdpa_index_ida);
 
+static struct genl_family vdpa_nl_family;
+
 static int vdpa_dev_probe(struct device *d)
 {
 	struct vdpa_device *vdev = dev_to_vdpa(d);
@@ -195,13 +201,218 @@ void vdpa_unregister_driver(struct vdpa_driver *drv)
 }
 EXPORT_SYMBOL_GPL(vdpa_unregister_driver);
 
+/**
+ * vdpa_mgmtdev_register - register a vdpa management device
+ *
+ * @mdev: Pointer to vdpa management device
+ * vdpa_mgmtdev_register() register a vdpa management device which supports
+ * vdpa device management.
+ */
+int vdpa_mgmtdev_register(struct vdpa_mgmt_dev *mdev)
+{
+	if (!mdev->device || !mdev->ops || !mdev->ops->dev_add || !mdev->ops->dev_del)
+		return -EINVAL;
+
+	INIT_LIST_HEAD(&mdev->list);
+	mutex_lock(&vdpa_dev_mutex);
+	list_add_tail(&mdev->list, &mdev_head);
+	mutex_unlock(&vdpa_dev_mutex);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(vdpa_mgmtdev_register);
+
+void vdpa_mgmtdev_unregister(struct vdpa_mgmt_dev *mdev)
+{
+	mutex_lock(&vdpa_dev_mutex);
+	list_del(&mdev->list);
+	mutex_unlock(&vdpa_dev_mutex);
+}
+EXPORT_SYMBOL_GPL(vdpa_mgmtdev_unregister);
+
+static bool mgmtdev_handle_match(const struct vdpa_mgmt_dev *mdev,
+				 const char *busname, const char *devname)
+{
+	/* Bus name is optional for simulated management device, so ignore the
+	 * device with bus if bus attribute is provided.
+	 */
+	if ((busname && !mdev->device->bus) || (!busname && mdev->device->bus))
+		return false;
+
+	if (!busname && strcmp(dev_name(mdev->device), devname) == 0)
+		return true;
+
+	if (busname && (strcmp(mdev->device->bus->name, busname) == 0) &&
+	    (strcmp(dev_name(mdev->device), devname) == 0))
+		return true;
+
+	return false;
+}
+
+static struct vdpa_mgmt_dev *vdpa_mgmtdev_get_from_attr(struct nlattr **attrs)
+{
+	struct vdpa_mgmt_dev *mdev;
+	const char *busname = NULL;
+	const char *devname;
+
+	if (!attrs[VDPA_ATTR_MGMTDEV_DEV_NAME])
+		return ERR_PTR(-EINVAL);
+	devname = nla_data(attrs[VDPA_ATTR_MGMTDEV_DEV_NAME]);
+	if (attrs[VDPA_ATTR_MGMTDEV_BUS_NAME])
+		busname = nla_data(attrs[VDPA_ATTR_MGMTDEV_BUS_NAME]);
+
+	list_for_each_entry(mdev, &mdev_head, list) {
+		if (mgmtdev_handle_match(mdev, busname, devname))
+			return mdev;
+	}
+	return ERR_PTR(-ENODEV);
+}
+
+static int vdpa_nl_mgmtdev_handle_fill(struct sk_buff *msg, const struct vdpa_mgmt_dev *mdev)
+{
+	if (mdev->device->bus &&
+	    nla_put_string(msg, VDPA_ATTR_MGMTDEV_BUS_NAME, mdev->device->bus->name))
+		return -EMSGSIZE;
+	if (nla_put_string(msg, VDPA_ATTR_MGMTDEV_DEV_NAME, dev_name(mdev->device)))
+		return -EMSGSIZE;
+	return 0;
+}
+
+static int vdpa_mgmtdev_fill(const struct vdpa_mgmt_dev *mdev, struct sk_buff *msg,
+			     u32 portid, u32 seq, int flags)
+{
+	u64 supported_classes = 0;
+	void *hdr;
+	int i = 0;
+	int err;
+
+	hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, VDPA_CMD_MGMTDEV_NEW);
+	if (!hdr)
+		return -EMSGSIZE;
+	err = vdpa_nl_mgmtdev_handle_fill(msg, mdev);
+	if (err)
+		goto msg_err;
+
+	while (mdev->id_table[i].device) {
+		supported_classes |= BIT(mdev->id_table[i].device);
+		i++;
+	}
+
+	if (nla_put_u64_64bit(msg, VDPA_ATTR_MGMTDEV_SUPPORTED_CLASSES,
+			      supported_classes, VDPA_ATTR_UNSPEC)) {
+		err = -EMSGSIZE;
+		goto msg_err;
+	}
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+msg_err:
+	genlmsg_cancel(msg, hdr);
+	return err;
+}
+
+static int vdpa_nl_cmd_mgmtdev_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct vdpa_mgmt_dev *mdev;
+	struct sk_buff *msg;
+	int err;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	mutex_lock(&vdpa_dev_mutex);
+	mdev = vdpa_mgmtdev_get_from_attr(info->attrs);
+	if (IS_ERR(mdev)) {
+		mutex_unlock(&vdpa_dev_mutex);
+		NL_SET_ERR_MSG_MOD(info->extack, "Fail to find the specified mgmt device");
+		err = PTR_ERR(mdev);
+		goto out;
+	}
+
+	err = vdpa_mgmtdev_fill(mdev, msg, info->snd_portid, info->snd_seq, 0);
+	mutex_unlock(&vdpa_dev_mutex);
+	if (err)
+		goto out;
+	err = genlmsg_reply(msg, info);
+	return err;
+
+out:
+	nlmsg_free(msg);
+	return err;
+}
+
+static int
+vdpa_nl_cmd_mgmtdev_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb)
+{
+	struct vdpa_mgmt_dev *mdev;
+	int start = cb->args[0];
+	int idx = 0;
+	int err;
+
+	mutex_lock(&vdpa_dev_mutex);
+	list_for_each_entry(mdev, &mdev_head, list) {
+		if (idx < start) {
+			idx++;
+			continue;
+		}
+		err = vdpa_mgmtdev_fill(mdev, msg, NETLINK_CB(cb->skb).portid,
+					cb->nlh->nlmsg_seq, NLM_F_MULTI);
+		if (err)
+			goto out;
+		idx++;
+	}
+out:
+	mutex_unlock(&vdpa_dev_mutex);
+	cb->args[0] = idx;
+	return msg->len;
+}
+
+static const struct nla_policy vdpa_nl_policy[VDPA_ATTR_MAX + 1] = {
+	[VDPA_ATTR_MGMTDEV_BUS_NAME] = { .type = NLA_NUL_STRING },
+	[VDPA_ATTR_MGMTDEV_DEV_NAME] = { .type = NLA_STRING },
+};
+
+static const struct genl_ops vdpa_nl_ops[] = {
+	{
+		.cmd = VDPA_CMD_MGMTDEV_GET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = vdpa_nl_cmd_mgmtdev_get_doit,
+		.dumpit = vdpa_nl_cmd_mgmtdev_get_dumpit,
+	},
+};
+
+static struct genl_family vdpa_nl_family __ro_after_init = {
+	.name = VDPA_GENL_NAME,
+	.version = VDPA_GENL_VERSION,
+	.maxattr = VDPA_ATTR_MAX,
+	.policy = vdpa_nl_policy,
+	.netnsok = false,
+	.module = THIS_MODULE,
+	.ops = vdpa_nl_ops,
+	.n_ops = ARRAY_SIZE(vdpa_nl_ops),
+};
+
 static int vdpa_init(void)
 {
-	return bus_register(&vdpa_bus);
+	int err;
+
+	err = bus_register(&vdpa_bus);
+	if (err)
+		return err;
+	err = genl_register_family(&vdpa_nl_family);
+	if (err)
+		goto err;
+	return 0;
+
+err:
+	bus_unregister(&vdpa_bus);
+	return err;
 }
 
 static void __exit vdpa_exit(void)
 {
+	genl_unregister_family(&vdpa_nl_family);
 	bus_unregister(&vdpa_bus);
 	ida_destroy(&vdpa_index_ida);
 }
diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 5700baa22356..6b8b4222bca6 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -35,6 +35,8 @@ struct vdpa_vq_state {
 	u16	avail_index;
 };
 
+struct vdpa_mgmt_dev;
+
 /**
  * vDPA device - representation of a vDPA device
  * @dev: underlying device
@@ -335,4 +337,33 @@ static inline void vdpa_get_config(struct vdpa_device *vdev, unsigned offset,
 	ops->get_config(vdev, offset, buf, len);
 }
 
+/**
+ * vdpa_mgmtdev_ops - vdpa device ops
+ * @dev_add:	Add a vdpa device using alloc and register
+ *		@mdev: parent device to use for device addition
+ *		@name: name of the new vdpa device
+ *		Driver need to add a new device using _vdpa_register_device()
+ *		after fully initializing the vdpa device. Driver must return 0
+ *		on success or appropriate error code.
+ * @dev_del:	Remove a vdpa device using unregister
+ *		@mdev: parent device to use for device removal
+ *		@dev: vdpa device to remove
+ *		Driver need to remove the specified device by calling
+ *		_vdpa_unregister_device().
+ */
+struct vdpa_mgmtdev_ops {
+	int (*dev_add)(struct vdpa_mgmt_dev *mdev, const char *name);
+	void (*dev_del)(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev);
+};
+
+struct vdpa_mgmt_dev {
+	struct device *device;
+	const struct vdpa_mgmtdev_ops *ops;
+	const struct virtio_device_id *id_table; /* supported ids */
+	struct list_head list;
+};
+
+int vdpa_mgmtdev_register(struct vdpa_mgmt_dev *mdev);
+void vdpa_mgmtdev_unregister(struct vdpa_mgmt_dev *mdev);
+
 #endif /* _LINUX_VDPA_H */
diff --git a/include/uapi/linux/vdpa.h b/include/uapi/linux/vdpa.h
new file mode 100644
index 000000000000..d44d82e567b1
--- /dev/null
+++ b/include/uapi/linux/vdpa.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+/*
+ * vdpa device management interface
+ * Copyright (c) 2020 Mellanox Technologies Ltd. All rights reserved.
+ */
+
+#ifndef _UAPI_LINUX_VDPA_H_
+#define _UAPI_LINUX_VDPA_H_
+
+#define VDPA_GENL_NAME "vdpa"
+#define VDPA_GENL_VERSION 0x1
+
+enum vdpa_command {
+	VDPA_CMD_UNSPEC,
+	VDPA_CMD_MGMTDEV_NEW,
+	VDPA_CMD_MGMTDEV_GET,		/* can dump */
+};
+
+enum vdpa_attr {
+	VDPA_ATTR_UNSPEC,
+
+	/* bus name (optional) + dev name together make the parent device handle */
+	VDPA_ATTR_MGMTDEV_BUS_NAME,		/* string */
+	VDPA_ATTR_MGMTDEV_DEV_NAME,		/* string */
+	VDPA_ATTR_MGMTDEV_SUPPORTED_CLASSES,	/* u64 */
+
+	/* new attributes must be added above here */
+	VDPA_ATTR_MAX,
+};
+
+#endif
-- 
cgit v1.2.3-71-gd317


From 903f7bcaedb84ca47998e609015a34ddde93742e Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Tue, 5 Jan 2021 12:32:01 +0200
Subject: vdpa: Enable a user to add and delete a vdpa device

Add the ability to add and delete a vdpa device.

Examples:
Create a vdpa device of type network named "foo2" from
the management device vdpasim:

$ vdpa dev add mgmtdev vdpasim_net name foo2

Delete the vdpa device after its use:
$ vdpa dev del foo2

Signed-off-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Eli Cohen <elic@nvidia.com>
Reviewed-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20210105103203.82508-5-parav@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vdpa/vdpa.c       | 143 ++++++++++++++++++++++++++++++++++++++++++----
 include/linux/vdpa.h      |   6 ++
 include/uapi/linux/vdpa.h |   4 ++
 3 files changed, 143 insertions(+), 10 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
index 586e66b7f671..f1228189814f 100644
--- a/drivers/vdpa/vdpa.c
+++ b/drivers/vdpa/vdpa.c
@@ -136,6 +136,37 @@ static int vdpa_name_match(struct device *dev, const void *data)
 	return (strcmp(dev_name(&vdev->dev), data) == 0);
 }
 
+static int __vdpa_register_device(struct vdpa_device *vdev)
+{
+	struct device *dev;
+
+	lockdep_assert_held(&vdpa_dev_mutex);
+	dev = bus_find_device(&vdpa_bus, NULL, dev_name(&vdev->dev), vdpa_name_match);
+	if (dev) {
+		put_device(dev);
+		return -EEXIST;
+	}
+	return device_add(&vdev->dev);
+}
+
+/**
+ * _vdpa_register_device - register a vDPA device with vdpa lock held
+ * Caller must have a succeed call of vdpa_alloc_device() before.
+ * Caller must invoke this routine in the management device dev_add()
+ * callback after setting up valid mgmtdev for this vdpa device.
+ * @vdev: the vdpa device to be registered to vDPA bus
+ *
+ * Returns an error when fail to add device to vDPA bus
+ */
+int _vdpa_register_device(struct vdpa_device *vdev)
+{
+	if (!vdev->mdev)
+		return -EINVAL;
+
+	return __vdpa_register_device(vdev);
+}
+EXPORT_SYMBOL_GPL(_vdpa_register_device);
+
 /**
  * vdpa_register_device - register a vDPA device
  * Callers must have a succeed call of vdpa_alloc_device() before.
@@ -145,24 +176,29 @@ static int vdpa_name_match(struct device *dev, const void *data)
  */
 int vdpa_register_device(struct vdpa_device *vdev)
 {
-	struct device *dev;
 	int err;
 
 	mutex_lock(&vdpa_dev_mutex);
-	dev = bus_find_device(&vdpa_bus, NULL, dev_name(&vdev->dev), vdpa_name_match);
-	if (dev) {
-		put_device(dev);
-		err = -EEXIST;
-		goto name_err;
-	}
-
-	err = device_add(&vdev->dev);
-name_err:
+	err = __vdpa_register_device(vdev);
 	mutex_unlock(&vdpa_dev_mutex);
 	return err;
 }
 EXPORT_SYMBOL_GPL(vdpa_register_device);
 
+/**
+ * _vdpa_unregister_device - unregister a vDPA device
+ * Caller must invoke this routine as part of management device dev_del()
+ * callback.
+ * @vdev: the vdpa device to be unregisted from vDPA bus
+ */
+void _vdpa_unregister_device(struct vdpa_device *vdev)
+{
+	lockdep_assert_held(&vdpa_dev_mutex);
+	WARN_ON(!vdev->mdev);
+	device_unregister(&vdev->dev);
+}
+EXPORT_SYMBOL_GPL(_vdpa_unregister_device);
+
 /**
  * vdpa_unregister_device - unregister a vDPA device
  * @vdev: the vdpa device to be unregisted from vDPA bus
@@ -221,10 +257,25 @@ int vdpa_mgmtdev_register(struct vdpa_mgmt_dev *mdev)
 }
 EXPORT_SYMBOL_GPL(vdpa_mgmtdev_register);
 
+static int vdpa_match_remove(struct device *dev, void *data)
+{
+	struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev);
+	struct vdpa_mgmt_dev *mdev = vdev->mdev;
+
+	if (mdev == data)
+		mdev->ops->dev_del(mdev, vdev);
+	return 0;
+}
+
 void vdpa_mgmtdev_unregister(struct vdpa_mgmt_dev *mdev)
 {
 	mutex_lock(&vdpa_dev_mutex);
+
 	list_del(&mdev->list);
+
+	/* Filter out all the entries belong to this management device and delete it. */
+	bus_for_each_dev(&vdpa_bus, NULL, mdev, vdpa_match_remove);
+
 	mutex_unlock(&vdpa_dev_mutex);
 }
 EXPORT_SYMBOL_GPL(vdpa_mgmtdev_unregister);
@@ -368,9 +419,69 @@ out:
 	return msg->len;
 }
 
+static int vdpa_nl_cmd_dev_add_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct vdpa_mgmt_dev *mdev;
+	const char *name;
+	int err = 0;
+
+	if (!info->attrs[VDPA_ATTR_DEV_NAME])
+		return -EINVAL;
+
+	name = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]);
+
+	mutex_lock(&vdpa_dev_mutex);
+	mdev = vdpa_mgmtdev_get_from_attr(info->attrs);
+	if (IS_ERR(mdev)) {
+		NL_SET_ERR_MSG_MOD(info->extack, "Fail to find the specified management device");
+		err = PTR_ERR(mdev);
+		goto err;
+	}
+
+	err = mdev->ops->dev_add(mdev, name);
+err:
+	mutex_unlock(&vdpa_dev_mutex);
+	return err;
+}
+
+static int vdpa_nl_cmd_dev_del_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct vdpa_mgmt_dev *mdev;
+	struct vdpa_device *vdev;
+	struct device *dev;
+	const char *name;
+	int err = 0;
+
+	if (!info->attrs[VDPA_ATTR_DEV_NAME])
+		return -EINVAL;
+	name = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]);
+
+	mutex_lock(&vdpa_dev_mutex);
+	dev = bus_find_device(&vdpa_bus, NULL, name, vdpa_name_match);
+	if (!dev) {
+		NL_SET_ERR_MSG_MOD(info->extack, "device not found");
+		err = -ENODEV;
+		goto dev_err;
+	}
+	vdev = container_of(dev, struct vdpa_device, dev);
+	if (!vdev->mdev) {
+		NL_SET_ERR_MSG_MOD(info->extack, "Only user created device can be deleted by user");
+		err = -EINVAL;
+		goto mdev_err;
+	}
+	mdev = vdev->mdev;
+	mdev->ops->dev_del(mdev, vdev);
+mdev_err:
+	put_device(dev);
+dev_err:
+	mutex_unlock(&vdpa_dev_mutex);
+	return err;
+}
+
 static const struct nla_policy vdpa_nl_policy[VDPA_ATTR_MAX + 1] = {
 	[VDPA_ATTR_MGMTDEV_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[VDPA_ATTR_MGMTDEV_DEV_NAME] = { .type = NLA_STRING },
+	[VDPA_ATTR_DEV_NAME] = { .type = NLA_STRING },
 };
 
 static const struct genl_ops vdpa_nl_ops[] = {
@@ -380,6 +491,18 @@ static const struct genl_ops vdpa_nl_ops[] = {
 		.doit = vdpa_nl_cmd_mgmtdev_get_doit,
 		.dumpit = vdpa_nl_cmd_mgmtdev_get_dumpit,
 	},
+	{
+		.cmd = VDPA_CMD_DEV_NEW,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = vdpa_nl_cmd_dev_add_set_doit,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = VDPA_CMD_DEV_DEL,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = vdpa_nl_cmd_dev_del_set_doit,
+		.flags = GENL_ADMIN_PERM,
+	},
 };
 
 static struct genl_family vdpa_nl_family __ro_after_init = {
diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 6b8b4222bca6..4ab5494503a8 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -45,6 +45,8 @@ struct vdpa_mgmt_dev;
  * @index: device index
  * @features_valid: were features initialized? for legacy guests
  * @nvqs: maximum number of supported virtqueues
+ * @mdev: management device pointer; caller must setup when registering device as part
+ *	  of dev_add() mgmtdev ops callback before invoking _vdpa_register_device().
  */
 struct vdpa_device {
 	struct device dev;
@@ -53,6 +55,7 @@ struct vdpa_device {
 	unsigned int index;
 	bool features_valid;
 	int nvqs;
+	struct vdpa_mgmt_dev *mdev;
 };
 
 /**
@@ -260,6 +263,9 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent,
 int vdpa_register_device(struct vdpa_device *vdev);
 void vdpa_unregister_device(struct vdpa_device *vdev);
 
+int _vdpa_register_device(struct vdpa_device *vdev);
+void _vdpa_unregister_device(struct vdpa_device *vdev);
+
 /**
  * vdpa_driver - operations for a vDPA driver
  * @driver: underlying device driver
diff --git a/include/uapi/linux/vdpa.h b/include/uapi/linux/vdpa.h
index d44d82e567b1..bb4a1f00eb1c 100644
--- a/include/uapi/linux/vdpa.h
+++ b/include/uapi/linux/vdpa.h
@@ -14,6 +14,8 @@ enum vdpa_command {
 	VDPA_CMD_UNSPEC,
 	VDPA_CMD_MGMTDEV_NEW,
 	VDPA_CMD_MGMTDEV_GET,		/* can dump */
+	VDPA_CMD_DEV_NEW,
+	VDPA_CMD_DEV_DEL,
 };
 
 enum vdpa_attr {
@@ -24,6 +26,8 @@ enum vdpa_attr {
 	VDPA_ATTR_MGMTDEV_DEV_NAME,		/* string */
 	VDPA_ATTR_MGMTDEV_SUPPORTED_CLASSES,	/* u64 */
 
+	VDPA_ATTR_DEV_NAME,			/* string */
+
 	/* new attributes must be added above here */
 	VDPA_ATTR_MAX,
 };
-- 
cgit v1.2.3-71-gd317


From bc0d90ee021f1baecd6aaa010d787eb373aa74dd Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Tue, 5 Jan 2021 12:32:02 +0200
Subject: vdpa: Enable user to query vdpa device info

Enable user to query vdpa device information.

$ vdpa dev add mgmtdev vdpasim_net name foo2

Show the newly created vdpa device by its name:
$ vdpa dev show foo2
foo2: type network mgmtdev vdpasim_net vendor_id 0 max_vqs 2 max_vq_size 256

$ vdpa dev show foo2 -jp
{
    "dev": {
        "foo2": {
            "type": "network",
            "mgmtdev": "vdpasim_net",
            "vendor_id": 0,
            "max_vqs": 2,
            "max_vq_size": 256
        }
    }
}

Signed-off-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Eli Cohen <elic@nvidia.com>
Reviewed-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20210105103203.82508-6-parav@nvidia.com

Including a memory leak fix:

Link: https://lore.kernel.org/r/20210217060614.59561-1-parav@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vdpa/vdpa.c       | 131 ++++++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/vdpa.h |   5 ++
 2 files changed, 136 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
index f1228189814f..da67f07e24fd 100644
--- a/drivers/vdpa/vdpa.c
+++ b/drivers/vdpa/vdpa.c
@@ -478,6 +478,131 @@ dev_err:
 	return err;
 }
 
+static int
+vdpa_dev_fill(struct vdpa_device *vdev, struct sk_buff *msg, u32 portid, u32 seq,
+	      int flags, struct netlink_ext_ack *extack)
+{
+	u16 max_vq_size;
+	u32 device_id;
+	u32 vendor_id;
+	void *hdr;
+	int err;
+
+	hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, VDPA_CMD_DEV_NEW);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	err = vdpa_nl_mgmtdev_handle_fill(msg, vdev->mdev);
+	if (err)
+		goto msg_err;
+
+	device_id = vdev->config->get_device_id(vdev);
+	vendor_id = vdev->config->get_vendor_id(vdev);
+	max_vq_size = vdev->config->get_vq_num_max(vdev);
+
+	err = -EMSGSIZE;
+	if (nla_put_string(msg, VDPA_ATTR_DEV_NAME, dev_name(&vdev->dev)))
+		goto msg_err;
+	if (nla_put_u32(msg, VDPA_ATTR_DEV_ID, device_id))
+		goto msg_err;
+	if (nla_put_u32(msg, VDPA_ATTR_DEV_VENDOR_ID, vendor_id))
+		goto msg_err;
+	if (nla_put_u32(msg, VDPA_ATTR_DEV_MAX_VQS, vdev->nvqs))
+		goto msg_err;
+	if (nla_put_u16(msg, VDPA_ATTR_DEV_MAX_VQ_SIZE, max_vq_size))
+		goto msg_err;
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+msg_err:
+	genlmsg_cancel(msg, hdr);
+	return err;
+}
+
+static int vdpa_nl_cmd_dev_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct vdpa_device *vdev;
+	struct sk_buff *msg;
+	const char *devname;
+	struct device *dev;
+	int err;
+
+	if (!info->attrs[VDPA_ATTR_DEV_NAME])
+		return -EINVAL;
+	devname = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]);
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	mutex_lock(&vdpa_dev_mutex);
+	dev = bus_find_device(&vdpa_bus, NULL, devname, vdpa_name_match);
+	if (!dev) {
+		NL_SET_ERR_MSG_MOD(info->extack, "device not found");
+		err = -ENODEV;
+		goto err;
+	}
+	vdev = container_of(dev, struct vdpa_device, dev);
+	if (!vdev->mdev) {
+		err = -EINVAL;
+		goto mdev_err;
+	}
+	err = vdpa_dev_fill(vdev, msg, info->snd_portid, info->snd_seq, 0, info->extack);
+	if (!err)
+		err = genlmsg_reply(msg, info);
+mdev_err:
+	put_device(dev);
+err:
+	mutex_unlock(&vdpa_dev_mutex);
+	if (err)
+		nlmsg_free(msg);
+	return err;
+}
+
+struct vdpa_dev_dump_info {
+	struct sk_buff *msg;
+	struct netlink_callback *cb;
+	int start_idx;
+	int idx;
+};
+
+static int vdpa_dev_dump(struct device *dev, void *data)
+{
+	struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev);
+	struct vdpa_dev_dump_info *info = data;
+	int err;
+
+	if (!vdev->mdev)
+		return 0;
+	if (info->idx < info->start_idx) {
+		info->idx++;
+		return 0;
+	}
+	err = vdpa_dev_fill(vdev, info->msg, NETLINK_CB(info->cb->skb).portid,
+			    info->cb->nlh->nlmsg_seq, NLM_F_MULTI, info->cb->extack);
+	if (err)
+		return err;
+
+	info->idx++;
+	return 0;
+}
+
+static int vdpa_nl_cmd_dev_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb)
+{
+	struct vdpa_dev_dump_info info;
+
+	info.msg = msg;
+	info.cb = cb;
+	info.start_idx = cb->args[0];
+	info.idx = 0;
+
+	mutex_lock(&vdpa_dev_mutex);
+	bus_for_each_dev(&vdpa_bus, NULL, &info, vdpa_dev_dump);
+	mutex_unlock(&vdpa_dev_mutex);
+	cb->args[0] = info.idx;
+	return msg->len;
+}
+
 static const struct nla_policy vdpa_nl_policy[VDPA_ATTR_MAX + 1] = {
 	[VDPA_ATTR_MGMTDEV_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[VDPA_ATTR_MGMTDEV_DEV_NAME] = { .type = NLA_STRING },
@@ -503,6 +628,12 @@ static const struct genl_ops vdpa_nl_ops[] = {
 		.doit = vdpa_nl_cmd_dev_del_set_doit,
 		.flags = GENL_ADMIN_PERM,
 	},
+	{
+		.cmd = VDPA_CMD_DEV_GET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = vdpa_nl_cmd_dev_get_doit,
+		.dumpit = vdpa_nl_cmd_dev_get_dumpit,
+	},
 };
 
 static struct genl_family vdpa_nl_family __ro_after_init = {
diff --git a/include/uapi/linux/vdpa.h b/include/uapi/linux/vdpa.h
index bb4a1f00eb1c..66a41e4ec163 100644
--- a/include/uapi/linux/vdpa.h
+++ b/include/uapi/linux/vdpa.h
@@ -16,6 +16,7 @@ enum vdpa_command {
 	VDPA_CMD_MGMTDEV_GET,		/* can dump */
 	VDPA_CMD_DEV_NEW,
 	VDPA_CMD_DEV_DEL,
+	VDPA_CMD_DEV_GET,		/* can dump */
 };
 
 enum vdpa_attr {
@@ -27,6 +28,10 @@ enum vdpa_attr {
 	VDPA_ATTR_MGMTDEV_SUPPORTED_CLASSES,	/* u64 */
 
 	VDPA_ATTR_DEV_NAME,			/* string */
+	VDPA_ATTR_DEV_ID,			/* u32 */
+	VDPA_ATTR_DEV_VENDOR_ID,		/* u32 */
+	VDPA_ATTR_DEV_MAX_VQS,			/* u32 */
+	VDPA_ATTR_DEV_MAX_VQ_SIZE,		/* u16 */
 
 	/* new attributes must be added above here */
 	VDPA_ATTR_MAX,
-- 
cgit v1.2.3-71-gd317


From 1c0aa1fae1acb77c5f9917adb0e4cb4500b9f3a6 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 20 Feb 2021 11:55:28 -0700
Subject: io_uring: flag new native workers with IORING_FEAT_NATIVE_WORKERS

A few reasons to do this:

- The naming of the manager and worker have changed. That's a user visible
  change, so makes sense to flag it.

- Opening certain files that use ->signal (like /proc/self or /dev/tty)
  now works, and the flag tells the application upfront that this is the
  case.

- Related to the above, using signalfd will now work as well.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 2 +-
 include/uapi/linux/io_uring.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 378cf79e66c9..cf9a5fa1ad03 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -9467,7 +9467,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
 			IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
 			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
 			IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
-			IORING_FEAT_EXT_ARG;
+			IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS;
 
 	if (copy_to_user(params, p, sizeof(*p))) {
 		ret = -EFAULT;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index ac4e1738a9af..2514eb6b1cf2 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -262,6 +262,7 @@ struct io_uring_params {
 #define IORING_FEAT_POLL_32BITS 	(1U << 6)
 #define IORING_FEAT_SQPOLL_NONFIXED	(1U << 7)
 #define IORING_FEAT_EXT_ARG		(1U << 8)
+#define IORING_FEAT_NATIVE_WORKERS	(1U << 9)
 
 /*
  * io_uring_register(2) opcodes and arguments
-- 
cgit v1.2.3-71-gd317


From bda420b985054a3badafef23807c4b4fa38a3dff Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Wed, 24 Feb 2021 12:09:43 -0800
Subject: numa balancing: migrate on fault among multiple bound nodes

Now, NUMA balancing can only optimize the page placement among the NUMA
nodes if the default memory policy is used.  Because the memory policy
specified explicitly should take precedence.  But this seems too strict in
some situations.  For example, on a system with 4 NUMA nodes, if the
memory of an application is bound to the node 0 and 1, NUMA balancing can
potentially migrate the pages between the node 0 and 1 to reduce
cross-node accessing without breaking the explicit memory binding policy.

So in this patch, we add MPOL_F_NUMA_BALANCING mode flag to
set_mempolicy() when mode is MPOL_BIND.  With the flag specified, NUMA
balancing will be enabled within the thread to optimize the page placement
within the constrains of the specified memory binding policy.  With the
newly added flag, the NUMA balancing control mechanism becomes,

 - sysctl knob numa_balancing can enable/disable the NUMA balancing
   globally.

 - even if sysctl numa_balancing is enabled, the NUMA balancing will be
   disabled for the memory areas or applications with the explicit
   memory policy by default.

 - MPOL_F_NUMA_BALANCING can be used to enable the NUMA balancing for
   the applications when specifying the explicit memory policy
   (MPOL_BIND).

Various page placement optimization based on the NUMA balancing can be
done with these flags.  As the first step, in this patch, if the memory of
the application is bound to multiple nodes (MPOL_BIND), and in the hint
page fault handler the accessing node are in the policy nodemask, the page
will be tried to be migrated to the accessing node to reduce the
cross-node accessing.

If the newly added MPOL_F_NUMA_BALANCING flag is specified by an
application on an old kernel version without its support, set_mempolicy()
will return -1 and errno will be set to EINVAL.  The application can use
this behavior to run on both old and new kernel versions.

And if the MPOL_F_NUMA_BALANCING flag is specified for the mode other than
MPOL_BIND, set_mempolicy() will return -1 and errno will be set to EINVAL
as before.  Because we don't support optimization based on the NUMA
balancing for these modes.

In the previous version of the patch, we tried to reuse MPOL_MF_LAZY for
mbind().  But that flag is tied to MPOL_MF_MOVE.*, so it seems not a good
API/ABI for the purpose of the patch.

And because it's not clear whether it's necessary to enable NUMA balancing
for a specific memory area inside an application, so we only add the flag
at the thread level (set_mempolicy()) instead of the memory area level
(mbind()).  We can do that when it become necessary.

To test the patch, we run a test case as follows on a 4-node machine with
192 GB memory (48 GB per node).

1. Change pmbench memory accessing benchmark to call set_mempolicy()
   to bind its memory to node 1 and 3 and enable NUMA balancing.  Some
   related code snippets are as follows,

     #include <numaif.h>
     #include <numa.h>

	struct bitmask *bmp;
	int ret;

	bmp = numa_parse_nodestring("1,3");
	ret = set_mempolicy(MPOL_BIND | MPOL_F_NUMA_BALANCING,
			    bmp->maskp, bmp->size + 1);
	/* If MPOL_F_NUMA_BALANCING isn't supported, fall back to MPOL_BIND */
	if (ret < 0 && errno == EINVAL)
		ret = set_mempolicy(MPOL_BIND, bmp->maskp, bmp->size + 1);
	if (ret < 0) {
		perror("Failed to call set_mempolicy");
		exit(-1);
	}

2. Run a memory eater on node 3 to use 40 GB memory before running pmbench.

3. Run pmbench with 64 processes, the working-set size of each process
   is 640 MB, so the total working-set size is 64 * 640 MB = 40 GB.  The
   CPU and the memory (as in step 1.) of all pmbench processes is bound
   to node 1 and 3. So, after CPU usage is balanced, some pmbench
   processes run on the CPUs of the node 3 will access the memory of
   the node 1.

4. After the pmbench processes run for 100 seconds, kill the memory
   eater.  Now it's possible for some pmbench processes to migrate
   their pages from node 1 to node 3 to reduce cross-node accessing.

Test results show that, with the patch, the pages can be migrated from
node 1 to node 3 after killing the memory eater, and the pmbench score
can increase about 17.5%.

Link: https://lkml.kernel.org/r/20210120061235.148637-2-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/uapi/linux/mempolicy.h |  4 +++-
 mm/mempolicy.c                 | 16 ++++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 3354774af61e..8948467b3992 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -28,12 +28,14 @@ enum {
 /* Flags for set_mempolicy */
 #define MPOL_F_STATIC_NODES	(1 << 15)
 #define MPOL_F_RELATIVE_NODES	(1 << 14)
+#define MPOL_F_NUMA_BALANCING	(1 << 13) /* Optimize with NUMA balancing if possible */
 
 /*
  * MPOL_MODE_FLAGS is the union of all possible optional mode flags passed to
  * either set_mempolicy() or mbind().
  */
-#define MPOL_MODE_FLAGS	(MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES)
+#define MPOL_MODE_FLAGS							\
+	(MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES | MPOL_F_NUMA_BALANCING)
 
 /* Flags for get_mempolicy */
 #define MPOL_F_NODE	(1<<0)	/* return next IL mode instead of node mask */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 2c3a86502053..6961238c7ef5 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -875,6 +875,16 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 		goto out;
 	}
 
+	if (flags & MPOL_F_NUMA_BALANCING) {
+		if (new && new->mode == MPOL_BIND) {
+			new->flags |= (MPOL_F_MOF | MPOL_F_MORON);
+		} else {
+			ret = -EINVAL;
+			mpol_put(new);
+			goto out;
+		}
+	}
+
 	ret = mpol_set_nodemask(new, nodes, scratch);
 	if (ret) {
 		mpol_put(new);
@@ -2486,6 +2496,12 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 		break;
 
 	case MPOL_BIND:
+		/* Optimize placement among multiple nodes via NUMA balancing */
+		if (pol->flags & MPOL_F_MORON) {
+			if (node_isset(thisnid, pol->v.nodes))
+				break;
+			goto out;
+		}
 
 		/*
 		 * allows binding to multiple nodes.
-- 
cgit v1.2.3-71-gd317


From df54714f579a77662054132161612ce3da876b0d Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Thu, 25 Feb 2021 17:20:56 -0800
Subject: include/linux: remove repeated words

Drop the doubled word "for" in a comment. {firewire-cdev.h}
Drop the doubled word "in" in a comment. {input.h}
Drop the doubled word "a" in a comment. {mdev.h}
Drop the doubled word "the" in a comment. {ptrace.h}

Link: https://lkml.kernel.org/r/20210126232444.22861-1-rdunlap@infradead.org
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Stefan Richter <stefanr@s5r6.in-berlin.de>
Cc: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Cc: Kirti Wankhede <kwankhede@nvidia.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mdev.h               | 2 +-
 include/linux/ptrace.h             | 2 +-
 include/uapi/linux/firewire-cdev.h | 2 +-
 include/uapi/linux/input.h         | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/mdev.h b/include/linux/mdev.h
index 9004375c462e..27eb383cb95d 100644
--- a/include/linux/mdev.h
+++ b/include/linux/mdev.h
@@ -42,7 +42,7 @@ struct device *mdev_get_iommu_device(struct device *dev);
  *			@mdev: mdev_device structure on of mediated device
  *			      that is being created
  *			Returns integer: success (0) or error (< 0)
- * @remove:		Called to free resources in parent device's driver for a
+ * @remove:		Called to free resources in parent device's driver for
  *			a mediated device. It is mandatory to provide 'remove'
  *			ops.
  *			@mdev: mdev_device device structure which is being
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 2a9df80ea887..b5ebf6c01292 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -171,7 +171,7 @@ static inline void ptrace_event(int event, unsigned long message)
  *
  * Check whether @event is enabled and, if so, report @event and @pid
  * to the ptrace parent.  @pid is reported as the pid_t seen from the
- * the ptrace parent's pid namespace.
+ * ptrace parent's pid namespace.
  *
  * Called without locks.
  */
diff --git a/include/uapi/linux/firewire-cdev.h b/include/uapi/linux/firewire-cdev.h
index 7e5b5c10a49c..5effa9832802 100644
--- a/include/uapi/linux/firewire-cdev.h
+++ b/include/uapi/linux/firewire-cdev.h
@@ -844,7 +844,7 @@ struct fw_cdev_queue_iso {
  * struct fw_cdev_start_iso - Start an isochronous transmission or reception
  * @cycle:	Cycle in which to start I/O.  If @cycle is greater than or
  *		equal to 0, the I/O will start on that cycle.
- * @sync:	Determines the value to wait for for receive packets that have
+ * @sync:	Determines the value to wait for receive packets that have
  *		the %FW_CDEV_ISO_SYNC bit set
  * @tags:	Tag filter bit mask.  Only valid for isochronous reception.
  *		Determines the tag values for which packets will be accepted.
diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h
index 9a61c28ed3ae..ee3127461ee0 100644
--- a/include/uapi/linux/input.h
+++ b/include/uapi/linux/input.h
@@ -84,7 +84,7 @@ struct input_id {
  * in units per radian.
  * When INPUT_PROP_ACCELEROMETER is set the resolution changes.
  * The main axes (ABS_X, ABS_Y, ABS_Z) are then reported in
- * in units per g (units/g) and in units per degree per second
+ * units per g (units/g) and in units per degree per second
  * (units/deg/s) for rotational axes (ABS_RX, ABS_RY, ABS_RZ).
  */
 struct input_absinfo {
-- 
cgit v1.2.3-71-gd317


From 30b5c851af7991ad08abe90c1e7c31615fa98a1a Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Mon, 1 Mar 2021 12:53:09 +0000
Subject: KVM: x86/xen: Add support for vCPU runstate information

This is how Xen guests do steal time accounting. The hypervisor records
the amount of time spent in each of running/runnable/blocked/offline
states.

In the Xen accounting, a vCPU is still in state RUNSTATE_running while
in Xen for a hypercall or I/O trap, etc. Only if Xen explicitly schedules
does the state become RUNSTATE_blocked. In KVM this means that even when
the vCPU exits the kvm_run loop, the state remains RUNSTATE_running.

The VMM can explicitly set the vCPU to RUNSTATE_blocked by using the
KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT attribute, and can also use
KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST to retrospectively add a given
amount of time to the blocked state and subtract it from the running
state.

The state_entry_time corresponds to get_kvmclock_ns() at the time the
vCPU entered the current state, and the total times of all four states
should always add up to state_entry_time.

Co-developed-by: Joao Martins <joao.m.martins@oracle.com>
Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Message-Id: <20210301125309.874953-2-dwmw2@infradead.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/api.rst                     |  41 +++
 arch/x86/include/asm/kvm_host.h                    |   6 +
 arch/x86/kvm/x86.c                                 |  13 +-
 arch/x86/kvm/xen.c                                 | 286 +++++++++++++++++++++
 arch/x86/kvm/xen.h                                 |  40 ++-
 include/uapi/linux/kvm.h                           |  13 +
 .../testing/selftests/kvm/x86_64/xen_shinfo_test.c | 159 +++++++++++-
 7 files changed, 553 insertions(+), 5 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 359435d4e417..1a2b5210cdbf 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -4878,6 +4878,14 @@ see KVM_XEN_HVM_SET_ATTR above.
 	union {
 		__u64 gpa;
 		__u64 pad[4];
+		struct {
+			__u64 state;
+			__u64 state_entry_time;
+			__u64 time_running;
+			__u64 time_runnable;
+			__u64 time_blocked;
+			__u64 time_offline;
+		} runstate;
 	} u;
   };
 
@@ -4890,6 +4898,31 @@ KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO
   Sets the guest physical address of an additional pvclock structure
   for a given vCPU. This is typically used for guest vsyscall support.
 
+KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR
+  Sets the guest physical address of the vcpu_runstate_info for a given
+  vCPU. This is how a Xen guest tracks CPU state such as steal time.
+
+KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT
+  Sets the runstate (RUNSTATE_running/_runnable/_blocked/_offline) of
+  the given vCPU from the .u.runstate.state member of the structure.
+  KVM automatically accounts running and runnable time but blocked
+  and offline states are only entered explicitly.
+
+KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA
+  Sets all fields of the vCPU runstate data from the .u.runstate member
+  of the structure, including the current runstate. The state_entry_time
+  must equal the sum of the other four times.
+
+KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST
+  This *adds* the contents of the .u.runstate members of the structure
+  to the corresponding members of the given vCPU's runstate data, thus
+  permitting atomic adjustments to the runstate times. The adjustment
+  to the state_entry_time must equal the sum of the adjustments to the
+  other four times. The state field must be set to -1, or to a valid
+  runstate value (RUNSTATE_running, RUNSTATE_runnable, RUNSTATE_blocked
+  or RUNSTATE_offline) to set the current accounted state as of the
+  adjusted state_entry_time.
+
 4.130 KVM_XEN_VCPU_GET_ATTR
 ---------------------------
 
@@ -4902,6 +4935,9 @@ KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO
 Allows Xen vCPU attributes to be read. For the structure and types,
 see KVM_XEN_VCPU_SET_ATTR above.
 
+The KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST type may not be used
+with the KVM_XEN_VCPU_GET_ATTR ioctl.
+
 5. The kvm_run structure
 ========================
 
@@ -6666,6 +6702,7 @@ PVHVM guests. Valid flags are::
   #define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR	(1 << 0)
   #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL	(1 << 1)
   #define KVM_XEN_HVM_CONFIG_SHARED_INFO	(1 << 2)
+  #define KVM_XEN_HVM_CONFIG_RUNSTATE		(1 << 2)
 
 The KVM_XEN_HVM_CONFIG_HYPERCALL_MSR flag indicates that the KVM_XEN_HVM_CONFIG
 ioctl is available, for the guest to set its hypercall page.
@@ -6680,3 +6717,7 @@ KVM_XEN_HVM_SET_ATTR, KVM_XEN_HVM_GET_ATTR, KVM_XEN_VCPU_SET_ATTR and
 KVM_XEN_VCPU_GET_ATTR ioctls, as well as the delivery of exception vectors
 for event channel upcalls when the evtchn_upcall_pending field of a vcpu's
 vcpu_info is set.
+
+The KVM_XEN_HVM_CONFIG_RUNSTATE flag indicates that the runstate-related
+features KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR/_CURRENT/_DATA/_ADJUST are
+supported by the KVM_XEN_VCPU_SET_ATTR/KVM_XEN_VCPU_GET_ATTR ioctls.
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 85ccbd4b7c52..877a4025d8da 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -535,10 +535,16 @@ struct kvm_vcpu_hv {
 /* Xen HVM per vcpu emulation context */
 struct kvm_vcpu_xen {
 	u64 hypercall_rip;
+	u32 current_runstate;
 	bool vcpu_info_set;
 	bool vcpu_time_info_set;
+	bool runstate_set;
 	struct gfn_to_hva_cache vcpu_info_cache;
 	struct gfn_to_hva_cache vcpu_time_info_cache;
+	struct gfn_to_hva_cache runstate_cache;
+	u64 last_steal;
+	u64 runstate_entry_time;
+	u64 runstate_times[4];
 };
 
 struct kvm_vcpu_arch {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4a5ce57b0bb2..868213ca4f98 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2956,6 +2956,11 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
 	struct kvm_host_map map;
 	struct kvm_steal_time *st;
 
+	if (kvm_xen_msr_enabled(vcpu->kvm)) {
+		kvm_xen_runstate_set_running(vcpu);
+		return;
+	}
+
 	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
 		return;
 
@@ -3760,6 +3765,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
 		    KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
 		    KVM_XEN_HVM_CONFIG_SHARED_INFO;
+		if (sched_info_on())
+			r |= KVM_XEN_HVM_CONFIG_RUNSTATE;
 		break;
 #endif
 	case KVM_CAP_SYNC_REGS:
@@ -4039,7 +4046,11 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 	if (vcpu->preempted && !vcpu->arch.guest_state_protected)
 		vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
 
-	kvm_steal_time_set_preempted(vcpu);
+	if (kvm_xen_msr_enabled(vcpu->kvm))
+		kvm_xen_runstate_set_preempted(vcpu);
+	else
+		kvm_steal_time_set_preempted(vcpu);
+
 	static_call(kvm_x86_vcpu_put)(vcpu);
 	vcpu->arch.last_host_tsc = rdtsc();
 	/*
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 77b20ff09078..ae17250e1efe 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -11,9 +11,11 @@
 #include "hyperv.h"
 
 #include <linux/kvm_host.h>
+#include <linux/sched/stat.h>
 
 #include <trace/events/kvm.h>
 #include <xen/interface/xen.h>
+#include <xen/interface/vcpu.h>
 
 #include "trace.h"
 
@@ -61,6 +63,132 @@ out:
 	return ret;
 }
 
+static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
+{
+	struct kvm_vcpu_xen *vx = &v->arch.xen;
+	u64 now = get_kvmclock_ns(v->kvm);
+	u64 delta_ns = now - vx->runstate_entry_time;
+	u64 run_delay = current->sched_info.run_delay;
+
+	if (unlikely(!vx->runstate_entry_time))
+		vx->current_runstate = RUNSTATE_offline;
+
+	/*
+	 * Time waiting for the scheduler isn't "stolen" if the
+	 * vCPU wasn't running anyway.
+	 */
+	if (vx->current_runstate == RUNSTATE_running) {
+		u64 steal_ns = run_delay - vx->last_steal;
+
+		delta_ns -= steal_ns;
+
+		vx->runstate_times[RUNSTATE_runnable] += steal_ns;
+	}
+	vx->last_steal = run_delay;
+
+	vx->runstate_times[vx->current_runstate] += delta_ns;
+	vx->current_runstate = state;
+	vx->runstate_entry_time = now;
+}
+
+void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
+{
+	struct kvm_vcpu_xen *vx = &v->arch.xen;
+	uint64_t state_entry_time;
+	unsigned int offset;
+
+	kvm_xen_update_runstate(v, state);
+
+	if (!vx->runstate_set)
+		return;
+
+	BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
+
+	offset = offsetof(struct compat_vcpu_runstate_info, state_entry_time);
+#ifdef CONFIG_X86_64
+	/*
+	 * The only difference is alignment of uint64_t in 32-bit.
+	 * So the first field 'state' is accessed directly using
+	 * offsetof() (where its offset happens to be zero), while the
+	 * remaining fields which are all uint64_t, start at 'offset'
+	 * which we tweak here by adding 4.
+	 */
+	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
+		     offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4);
+	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
+		     offsetof(struct compat_vcpu_runstate_info, time) + 4);
+
+	if (v->kvm->arch.xen.long_mode)
+		offset = offsetof(struct vcpu_runstate_info, state_entry_time);
+#endif
+	/*
+	 * First write the updated state_entry_time at the appropriate
+	 * location determined by 'offset'.
+	 */
+	state_entry_time = vx->runstate_entry_time;
+	state_entry_time |= XEN_RUNSTATE_UPDATE;
+
+	BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->state_entry_time) !=
+		     sizeof(state_entry_time));
+	BUILD_BUG_ON(sizeof(((struct compat_vcpu_runstate_info *)0)->state_entry_time) !=
+		     sizeof(state_entry_time));
+
+	if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
+					  &state_entry_time, offset,
+					  sizeof(state_entry_time)))
+		return;
+	smp_wmb();
+
+	/*
+	 * Next, write the new runstate. This is in the *same* place
+	 * for 32-bit and 64-bit guests, asserted here for paranoia.
+	 */
+	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) !=
+		     offsetof(struct compat_vcpu_runstate_info, state));
+	BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->state) !=
+		     sizeof(vx->current_runstate));
+	BUILD_BUG_ON(sizeof(((struct compat_vcpu_runstate_info *)0)->state) !=
+		     sizeof(vx->current_runstate));
+
+	if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
+					  &vx->current_runstate,
+					  offsetof(struct vcpu_runstate_info, state),
+					  sizeof(vx->current_runstate)))
+		return;
+
+	/*
+	 * Write the actual runstate times immediately after the
+	 * runstate_entry_time.
+	 */
+	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
+		     offsetof(struct vcpu_runstate_info, time) - sizeof(u64));
+	BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) !=
+		     offsetof(struct compat_vcpu_runstate_info, time) - sizeof(u64));
+	BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->time) !=
+		     sizeof(((struct compat_vcpu_runstate_info *)0)->time));
+	BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->time) !=
+		     sizeof(vx->runstate_times));
+
+	if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
+					  &vx->runstate_times[0],
+					  offset + sizeof(u64),
+					  sizeof(vx->runstate_times)))
+		return;
+
+	smp_wmb();
+
+	/*
+	 * Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's
+	 * runstate_entry_time field.
+	 */
+
+	state_entry_time &= ~XEN_RUNSTATE_UPDATE;
+	if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
+					  &state_entry_time, offset,
+					  sizeof(state_entry_time)))
+		return;
+}
+
 int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 {
 	u8 rc = 0;
@@ -223,6 +351,121 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
 		}
 		break;
 
+	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
+		if (!sched_info_on()) {
+			r = -EOPNOTSUPP;
+			break;
+		}
+		if (data->u.gpa == GPA_INVALID) {
+			vcpu->arch.xen.runstate_set = false;
+			r = 0;
+			break;
+		}
+
+		r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
+					      &vcpu->arch.xen.runstate_cache,
+					      data->u.gpa,
+					      sizeof(struct vcpu_runstate_info));
+		if (!r) {
+			vcpu->arch.xen.runstate_set = true;
+		}
+		break;
+
+	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
+		if (!sched_info_on()) {
+			r = -EOPNOTSUPP;
+			break;
+		}
+		if (data->u.runstate.state > RUNSTATE_offline) {
+			r = -EINVAL;
+			break;
+		}
+
+		kvm_xen_update_runstate(vcpu, data->u.runstate.state);
+		r = 0;
+		break;
+
+	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA:
+		if (!sched_info_on()) {
+			r = -EOPNOTSUPP;
+			break;
+		}
+		if (data->u.runstate.state > RUNSTATE_offline) {
+			r = -EINVAL;
+			break;
+		}
+		if (data->u.runstate.state_entry_time !=
+		    (data->u.runstate.time_running +
+		     data->u.runstate.time_runnable +
+		     data->u.runstate.time_blocked +
+		     data->u.runstate.time_offline)) {
+			r = -EINVAL;
+			break;
+		}
+		if (get_kvmclock_ns(vcpu->kvm) <
+		    data->u.runstate.state_entry_time) {
+			r = -EINVAL;
+			break;
+		}
+
+		vcpu->arch.xen.current_runstate = data->u.runstate.state;
+		vcpu->arch.xen.runstate_entry_time =
+			data->u.runstate.state_entry_time;
+		vcpu->arch.xen.runstate_times[RUNSTATE_running] =
+			data->u.runstate.time_running;
+		vcpu->arch.xen.runstate_times[RUNSTATE_runnable] =
+			data->u.runstate.time_runnable;
+		vcpu->arch.xen.runstate_times[RUNSTATE_blocked] =
+			data->u.runstate.time_blocked;
+		vcpu->arch.xen.runstate_times[RUNSTATE_offline] =
+			data->u.runstate.time_offline;
+		vcpu->arch.xen.last_steal = current->sched_info.run_delay;
+		r = 0;
+		break;
+
+	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST:
+		if (!sched_info_on()) {
+			r = -EOPNOTSUPP;
+			break;
+		}
+		if (data->u.runstate.state > RUNSTATE_offline &&
+		    data->u.runstate.state != (u64)-1) {
+			r = -EINVAL;
+			break;
+		}
+		/* The adjustment must add up */
+		if (data->u.runstate.state_entry_time !=
+		    (data->u.runstate.time_running +
+		     data->u.runstate.time_runnable +
+		     data->u.runstate.time_blocked +
+		     data->u.runstate.time_offline)) {
+			r = -EINVAL;
+			break;
+		}
+
+		if (get_kvmclock_ns(vcpu->kvm) <
+		    (vcpu->arch.xen.runstate_entry_time +
+		     data->u.runstate.state_entry_time)) {
+			r = -EINVAL;
+			break;
+		}
+
+		vcpu->arch.xen.runstate_entry_time +=
+			data->u.runstate.state_entry_time;
+		vcpu->arch.xen.runstate_times[RUNSTATE_running] +=
+			data->u.runstate.time_running;
+		vcpu->arch.xen.runstate_times[RUNSTATE_runnable] +=
+			data->u.runstate.time_runnable;
+		vcpu->arch.xen.runstate_times[RUNSTATE_blocked] +=
+			data->u.runstate.time_blocked;
+		vcpu->arch.xen.runstate_times[RUNSTATE_offline] +=
+			data->u.runstate.time_offline;
+
+		if (data->u.runstate.state <= RUNSTATE_offline)
+			kvm_xen_update_runstate(vcpu, data->u.runstate.state);
+		r = 0;
+		break;
+
 	default:
 		break;
 	}
@@ -255,6 +498,49 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
 		r = 0;
 		break;
 
+	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
+		if (!sched_info_on()) {
+			r = -EOPNOTSUPP;
+			break;
+		}
+		if (vcpu->arch.xen.runstate_set) {
+			data->u.gpa = vcpu->arch.xen.runstate_cache.gpa;
+			r = 0;
+		}
+		break;
+
+	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
+		if (!sched_info_on()) {
+			r = -EOPNOTSUPP;
+			break;
+		}
+		data->u.runstate.state = vcpu->arch.xen.current_runstate;
+		r = 0;
+		break;
+
+	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA:
+		if (!sched_info_on()) {
+			r = -EOPNOTSUPP;
+			break;
+		}
+		data->u.runstate.state = vcpu->arch.xen.current_runstate;
+		data->u.runstate.state_entry_time =
+			vcpu->arch.xen.runstate_entry_time;
+		data->u.runstate.time_running =
+			vcpu->arch.xen.runstate_times[RUNSTATE_running];
+		data->u.runstate.time_runnable =
+			vcpu->arch.xen.runstate_times[RUNSTATE_runnable];
+		data->u.runstate.time_blocked =
+			vcpu->arch.xen.runstate_times[RUNSTATE_blocked];
+		data->u.runstate.time_offline =
+			vcpu->arch.xen.runstate_times[RUNSTATE_offline];
+		r = 0;
+		break;
+
+	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST:
+		r = -EINVAL;
+		break;
+
 	default:
 		break;
 	}
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
index 87eaf2be9549..463a7844a8ca 100644
--- a/arch/x86/kvm/xen.h
+++ b/arch/x86/kvm/xen.h
@@ -23,6 +23,12 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data);
 int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc);
 void kvm_xen_destroy_vm(struct kvm *kvm);
 
+static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
+{
+	return static_branch_unlikely(&kvm_xen_enabled.key) &&
+		kvm->arch.xen_hvm_config.msr;
+}
+
 static inline bool kvm_xen_hypercall_enabled(struct kvm *kvm)
 {
 	return static_branch_unlikely(&kvm_xen_enabled.key) &&
@@ -48,6 +54,11 @@ static inline void kvm_xen_destroy_vm(struct kvm *kvm)
 {
 }
 
+static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
+{
+	return false;
+}
+
 static inline bool kvm_xen_hypercall_enabled(struct kvm *kvm)
 {
 	return false;
@@ -61,10 +72,31 @@ static inline int kvm_xen_has_interrupt(struct kvm_vcpu *vcpu)
 
 int kvm_xen_hypercall(struct kvm_vcpu *vcpu);
 
-/* 32-bit compatibility definitions, also used natively in 32-bit build */
 #include <asm/pvclock-abi.h>
 #include <asm/xen/interface.h>
+#include <xen/interface/vcpu.h>
 
+void kvm_xen_update_runstate_guest(struct kvm_vcpu *vcpu, int state);
+
+static inline void kvm_xen_runstate_set_running(struct kvm_vcpu *vcpu)
+{
+	kvm_xen_update_runstate_guest(vcpu, RUNSTATE_running);
+}
+
+static inline void kvm_xen_runstate_set_preempted(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * If the vCPU wasn't preempted but took a normal exit for
+	 * some reason (hypercalls, I/O, etc.), that is accounted as
+	 * still RUNSTATE_running, as the VMM is still operating on
+	 * behalf of the vCPU. Only if the VMM does actually block
+	 * does it need to enter RUNSTATE_blocked.
+	 */
+	if (vcpu->preempted)
+		kvm_xen_update_runstate_guest(vcpu, RUNSTATE_runnable);
+}
+
+/* 32-bit compatibility definitions, also used natively in 32-bit build */
 struct compat_arch_vcpu_info {
 	unsigned int cr2;
 	unsigned int pad[5];
@@ -97,4 +129,10 @@ struct compat_shared_info {
 	struct compat_arch_shared_info arch;
 };
 
+struct compat_vcpu_runstate_info {
+    int state;
+    uint64_t state_entry_time;
+    uint64_t time[4];
+} __attribute__((packed));
+
 #endif /* __ARCH_X86_KVM_XEN_H__ */
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 8b281f722e5b..f6afee209620 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1154,6 +1154,7 @@ struct kvm_x86_mce {
 #define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR	(1 << 0)
 #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL	(1 << 1)
 #define KVM_XEN_HVM_CONFIG_SHARED_INFO		(1 << 2)
+#define KVM_XEN_HVM_CONFIG_RUNSTATE		(1 << 3)
 
 struct kvm_xen_hvm_config {
 	__u32 flags;
@@ -1621,12 +1622,24 @@ struct kvm_xen_vcpu_attr {
 	union {
 		__u64 gpa;
 		__u64 pad[8];
+		struct {
+			__u64 state;
+			__u64 state_entry_time;
+			__u64 time_running;
+			__u64 time_runnable;
+			__u64 time_blocked;
+			__u64 time_offline;
+		} runstate;
 	} u;
 };
 
 /* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */
 #define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO	0x0
 #define KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO	0x1
+#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR	0x2
+#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT	0x3
+#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA	0x4
+#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST	0x5
 
 /* Secure Encrypted Virtualization command */
 enum sev_cmd_id {
diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
index 9246ea310587..804ff5ff022d 100644
--- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
+++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
@@ -13,19 +13,27 @@
 
 #include <stdint.h>
 #include <time.h>
+#include <sched.h>
+#include <sys/syscall.h>
 
 #define VCPU_ID		5
 
+#define SHINFO_REGION_GVA	0xc0000000ULL
 #define SHINFO_REGION_GPA	0xc0000000ULL
 #define SHINFO_REGION_SLOT	10
 #define PAGE_SIZE		4096
 
 #define PVTIME_ADDR	(SHINFO_REGION_GPA + PAGE_SIZE)
+#define RUNSTATE_ADDR	(SHINFO_REGION_GPA + PAGE_SIZE + 0x20)
+
+#define RUNSTATE_VADDR	(SHINFO_REGION_GVA + PAGE_SIZE + 0x20)
 
 static struct kvm_vm *vm;
 
 #define XEN_HYPERCALL_MSR	0x40000000
 
+#define MIN_STEAL_TIME		50000
+
 struct pvclock_vcpu_time_info {
         u32   version;
         u32   pad0;
@@ -43,11 +51,67 @@ struct pvclock_wall_clock {
         u32   nsec;
 } __attribute__((__packed__));
 
+struct vcpu_runstate_info {
+    uint32_t state;
+    uint64_t state_entry_time;
+    uint64_t time[4];
+};
+
+#define RUNSTATE_running  0
+#define RUNSTATE_runnable 1
+#define RUNSTATE_blocked  2
+#define RUNSTATE_offline  3
+
 static void guest_code(void)
 {
+	struct vcpu_runstate_info *rs = (void *)RUNSTATE_VADDR;
+
+	/* Test having the host set runstates manually */
+	GUEST_SYNC(RUNSTATE_runnable);
+	GUEST_ASSERT(rs->time[RUNSTATE_runnable] != 0);
+	GUEST_ASSERT(rs->state == 0);
+
+	GUEST_SYNC(RUNSTATE_blocked);
+	GUEST_ASSERT(rs->time[RUNSTATE_blocked] != 0);
+	GUEST_ASSERT(rs->state == 0);
+
+	GUEST_SYNC(RUNSTATE_offline);
+	GUEST_ASSERT(rs->time[RUNSTATE_offline] != 0);
+	GUEST_ASSERT(rs->state == 0);
+
+	/* Test runstate time adjust */
+	GUEST_SYNC(4);
+	GUEST_ASSERT(rs->time[RUNSTATE_blocked] == 0x5a);
+	GUEST_ASSERT(rs->time[RUNSTATE_offline] == 0x6b6b);
+
+	/* Test runstate time set */
+	GUEST_SYNC(5);
+	GUEST_ASSERT(rs->state_entry_time >= 0x8000);
+	GUEST_ASSERT(rs->time[RUNSTATE_runnable] == 0);
+	GUEST_ASSERT(rs->time[RUNSTATE_blocked] == 0x6b6b);
+	GUEST_ASSERT(rs->time[RUNSTATE_offline] == 0x5a);
+
+	/* sched_yield() should result in some 'runnable' time */
+	GUEST_SYNC(6);
+	GUEST_ASSERT(rs->time[RUNSTATE_runnable] >= MIN_STEAL_TIME);
+
 	GUEST_DONE();
 }
 
+static long get_run_delay(void)
+{
+        char path[64];
+        long val[2];
+        FILE *fp;
+
+        sprintf(path, "/proc/%ld/schedstat", syscall(SYS_gettid));
+        fp = fopen(path, "r");
+        fscanf(fp, "%ld %ld ", &val[0], &val[1]);
+        fclose(fp);
+
+        return val[1];
+}
+
 static int cmp_timespec(struct timespec *a, struct timespec *b)
 {
 	if (a->tv_sec > b->tv_sec)
@@ -66,12 +130,14 @@ int main(int argc, char *argv[])
 {
 	struct timespec min_ts, max_ts, vm_ts;
 
-	if (!(kvm_check_cap(KVM_CAP_XEN_HVM) &
-	      KVM_XEN_HVM_CONFIG_SHARED_INFO) ) {
+	int xen_caps = kvm_check_cap(KVM_CAP_XEN_HVM);
+	if (!(xen_caps & KVM_XEN_HVM_CONFIG_SHARED_INFO) ) {
 		print_skip("KVM_XEN_HVM_CONFIG_SHARED_INFO not available");
 		exit(KSFT_SKIP);
 	}
 
+	bool do_runstate_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE);
+
 	clock_gettime(CLOCK_REALTIME, &min_ts);
 
 	vm = vm_create_default(VCPU_ID, 0, (void *) guest_code);
@@ -80,6 +146,7 @@ int main(int argc, char *argv[])
 	/* Map a region for the shared_info page */
 	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
 				    SHINFO_REGION_GPA, SHINFO_REGION_SLOT, 2, 0);
+	virt_map(vm, SHINFO_REGION_GVA, SHINFO_REGION_GPA, 2, 0);
 
 	struct kvm_xen_hvm_config hvmc = {
 		.flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
@@ -111,6 +178,17 @@ int main(int argc, char *argv[])
 	};
 	vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &pvclock);
 
+	if (do_runstate_tests) {
+		struct kvm_xen_vcpu_attr st = {
+			.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
+			.u.gpa = RUNSTATE_ADDR,
+		};
+		vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &st);
+	}
+
+	struct vcpu_runstate_info *rs = addr_gpa2hva(vm, RUNSTATE_ADDR);;
+	rs->state = 0x5a;
+
 	for (;;) {
 		volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID);
 		struct ucall uc;
@@ -126,8 +204,56 @@ int main(int argc, char *argv[])
 		case UCALL_ABORT:
 			TEST_FAIL("%s", (const char *)uc.args[0]);
 			/* NOT REACHED */
-		case UCALL_SYNC:
+		case UCALL_SYNC: {
+			struct kvm_xen_vcpu_attr rst;
+			long rundelay;
+
+			/* If no runstate support, bail out early */
+			if (!do_runstate_tests)
+				goto done;
+
+			TEST_ASSERT(rs->state_entry_time == rs->time[0] +
+				    rs->time[1] + rs->time[2] + rs->time[3],
+				    "runstate times don't add up");
+
+			switch (uc.args[1]) {
+			case RUNSTATE_running...RUNSTATE_offline:
+				rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT;
+				rst.u.runstate.state = uc.args[1];
+				vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &rst);
+				break;
+			case 4:
+				rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST;
+				memset(&rst.u, 0, sizeof(rst.u));
+				rst.u.runstate.state = (uint64_t)-1;
+				rst.u.runstate.time_blocked =
+					0x5a - rs->time[RUNSTATE_blocked];
+				rst.u.runstate.time_offline =
+					0x6b6b - rs->time[RUNSTATE_offline];
+				rst.u.runstate.time_runnable = -rst.u.runstate.time_blocked -
+					rst.u.runstate.time_offline;
+				vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &rst);
+				break;
+
+			case 5:
+				rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA;
+				memset(&rst.u, 0, sizeof(rst.u));
+				rst.u.runstate.state = RUNSTATE_running;
+				rst.u.runstate.state_entry_time = 0x6b6b + 0x5a;
+				rst.u.runstate.time_blocked = 0x6b6b;
+				rst.u.runstate.time_offline = 0x5a;
+				vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &rst);
+				break;
+			case 6:
+				/* Yield until scheduler delay exceeds target */
+				rundelay = get_run_delay() + MIN_STEAL_TIME;
+				do {
+					sched_yield();
+				} while (get_run_delay() < rundelay);
+				break;
+			}
 			break;
+		}
 		case UCALL_DONE:
 			goto done;
 		default:
@@ -162,6 +288,33 @@ int main(int argc, char *argv[])
 	TEST_ASSERT(ti2->version && !(ti2->version & 1),
 		    "Bad time_info version %x", ti->version);
 
+	if (do_runstate_tests) {
+		/*
+		 * Fetch runstate and check sanity. Strictly speaking in the
+		 * general case we might not expect the numbers to be identical
+		 * but in this case we know we aren't running the vCPU any more.
+		 */
+		struct kvm_xen_vcpu_attr rst = {
+			.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA,
+		};
+		vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_GET_ATTR, &rst);
+
+		TEST_ASSERT(rs->state == rst.u.runstate.state, "Runstate mismatch");
+		TEST_ASSERT(rs->state_entry_time == rst.u.runstate.state_entry_time,
+			    "State entry time mismatch");
+		TEST_ASSERT(rs->time[RUNSTATE_running] == rst.u.runstate.time_running,
+			    "Running time mismatch");
+		TEST_ASSERT(rs->time[RUNSTATE_runnable] == rst.u.runstate.time_runnable,
+			    "Runnable time mismatch");
+		TEST_ASSERT(rs->time[RUNSTATE_blocked] == rst.u.runstate.time_blocked,
+			    "Blocked time mismatch");
+		TEST_ASSERT(rs->time[RUNSTATE_offline] == rst.u.runstate.time_offline,
+			    "Offline time mismatch");
+
+		TEST_ASSERT(rs->state_entry_time == rs->time[0] +
+			    rs->time[1] + rs->time[2] + rs->time[3],
+			    "runstate times don't add up");
+	}
 	kvm_vm_free(vm);
 	return 0;
 }
-- 
cgit v1.2.3-71-gd317