cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

vfio.c (57129B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * VFIO core
      4 *
      5 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
      6 *     Author: Alex Williamson <alex.williamson@redhat.com>
      7 *
      8 * Derived from original vfio:
      9 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
     10 * Author: Tom Lyon, pugs@cisco.com
     11 */
     12
     13#include <linux/cdev.h>
     14#include <linux/compat.h>
     15#include <linux/device.h>
     16#include <linux/file.h>
     17#include <linux/anon_inodes.h>
     18#include <linux/fs.h>
     19#include <linux/idr.h>
     20#include <linux/iommu.h>
     21#include <linux/list.h>
     22#include <linux/miscdevice.h>
     23#include <linux/module.h>
     24#include <linux/mutex.h>
     25#include <linux/pci.h>
     26#include <linux/rwsem.h>
     27#include <linux/sched.h>
     28#include <linux/slab.h>
     29#include <linux/stat.h>
     30#include <linux/string.h>
     31#include <linux/uaccess.h>
     32#include <linux/vfio.h>
     33#include <linux/wait.h>
     34#include <linux/sched/signal.h>
     35#include "vfio.h"
     36
     37#define DRIVER_VERSION	"0.3"
     38#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
     39#define DRIVER_DESC	"VFIO - User Level meta-driver"
     40
     41static struct vfio {
     42	struct class			*class;
     43	struct list_head		iommu_drivers_list;
     44	struct mutex			iommu_drivers_lock;
     45	struct list_head		group_list;
     46	struct mutex			group_lock; /* locks group_list */
     47	struct ida			group_ida;
     48	dev_t				group_devt;
     49} vfio;
     50
     51struct vfio_iommu_driver {
     52	const struct vfio_iommu_driver_ops	*ops;
     53	struct list_head			vfio_next;
     54};
     55
     56struct vfio_container {
     57	struct kref			kref;
     58	struct list_head		group_list;
     59	struct rw_semaphore		group_lock;
     60	struct vfio_iommu_driver	*iommu_driver;
     61	void				*iommu_data;
     62	bool				noiommu;
     63};
     64
     65struct vfio_group {
     66	struct device 			dev;
     67	struct cdev			cdev;
     68	refcount_t			users;
     69	unsigned int			container_users;
     70	struct iommu_group		*iommu_group;
     71	struct vfio_container		*container;
     72	struct list_head		device_list;
     73	struct mutex			device_lock;
     74	struct list_head		vfio_next;
     75	struct list_head		container_next;
     76	enum vfio_group_type		type;
     77	unsigned int			dev_counter;
     78	struct rw_semaphore		group_rwsem;
     79	struct kvm			*kvm;
     80	struct file			*opened_file;
     81	struct blocking_notifier_head	notifier;
     82};
     83
     84#ifdef CONFIG_VFIO_NOIOMMU
     85static bool noiommu __read_mostly;
     86module_param_named(enable_unsafe_noiommu_mode,
     87		   noiommu, bool, S_IRUGO | S_IWUSR);
     88MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
     89#endif
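       /*
        * Usage sketch (assumes CONFIG_VFIO_NOIOMMU=y): the mode above is
        * normally enabled at load time, e.g. "modprobe vfio
        * enable_unsafe_noiommu_mode=1", or with vfio.enable_unsafe_noiommu_mode=1
        * on the kernel command line when vfio is built in.  Even then, opening
        * a noiommu group or device additionally requires CAP_SYS_RAWIO (see
        * vfio_group_fops_open() and vfio_device_assign_container() below).
        */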
     90
     91static DEFINE_XARRAY(vfio_device_set_xa);
     92static const struct file_operations vfio_group_fops;
     93
     94int vfio_assign_device_set(struct vfio_device *device, void *set_id)
     95{
     96	unsigned long idx = (unsigned long)set_id;
     97	struct vfio_device_set *new_dev_set;
     98	struct vfio_device_set *dev_set;
     99
    100	if (WARN_ON(!set_id))
    101		return -EINVAL;
    102
    103	/*
    104	 * Atomically acquire a singleton object in the xarray for this set_id
    105	 */
    106	xa_lock(&vfio_device_set_xa);
    107	dev_set = xa_load(&vfio_device_set_xa, idx);
    108	if (dev_set)
    109		goto found_get_ref;
    110	xa_unlock(&vfio_device_set_xa);
    111
    112	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
    113	if (!new_dev_set)
    114		return -ENOMEM;
    115	mutex_init(&new_dev_set->lock);
    116	INIT_LIST_HEAD(&new_dev_set->device_list);
    117	new_dev_set->set_id = set_id;
    118
    119	xa_lock(&vfio_device_set_xa);
    120	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
    121			       GFP_KERNEL);
    122	if (!dev_set) {
    123		dev_set = new_dev_set;
    124		goto found_get_ref;
    125	}
    126
    127	kfree(new_dev_set);
    128	if (xa_is_err(dev_set)) {
    129		xa_unlock(&vfio_device_set_xa);
    130		return xa_err(dev_set);
    131	}
    132
    133found_get_ref:
    134	dev_set->device_count++;
    135	xa_unlock(&vfio_device_set_xa);
    136	mutex_lock(&dev_set->lock);
    137	device->dev_set = dev_set;
    138	list_add_tail(&device->dev_set_list, &dev_set->device_list);
    139	mutex_unlock(&dev_set->lock);
    140	return 0;
    141}
    142EXPORT_SYMBOL_GPL(vfio_assign_device_set);
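       /*
        * Usage sketch: a driver keys the set on whatever object scopes
        * operations that must quiesce several devices together.  A PCI bus
        * driver might, roughly as vfio-pci-core does, use the slot or bus
        * that is reset as a unit (vdev/pdev are illustrative):
        *
        *      vfio_assign_device_set(&vdev->vdev, pdev->slot);
        *
        * All devices registered with the same set_id share dev_set->lock and
        * appear on dev_set->device_list.
        */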
    143
    144static void vfio_release_device_set(struct vfio_device *device)
    145{
    146	struct vfio_device_set *dev_set = device->dev_set;
    147
    148	if (!dev_set)
    149		return;
    150
    151	mutex_lock(&dev_set->lock);
    152	list_del(&device->dev_set_list);
    153	mutex_unlock(&dev_set->lock);
    154
    155	xa_lock(&vfio_device_set_xa);
    156	if (!--dev_set->device_count) {
    157		__xa_erase(&vfio_device_set_xa,
    158			   (unsigned long)dev_set->set_id);
    159		mutex_destroy(&dev_set->lock);
    160		kfree(dev_set);
    161	}
    162	xa_unlock(&vfio_device_set_xa);
    163}
    164
    165#ifdef CONFIG_VFIO_NOIOMMU
    166static void *vfio_noiommu_open(unsigned long arg)
    167{
    168	if (arg != VFIO_NOIOMMU_IOMMU)
    169		return ERR_PTR(-EINVAL);
    170	if (!capable(CAP_SYS_RAWIO))
    171		return ERR_PTR(-EPERM);
    172
    173	return NULL;
    174}
    175
    176static void vfio_noiommu_release(void *iommu_data)
    177{
    178}
    179
    180static long vfio_noiommu_ioctl(void *iommu_data,
    181			       unsigned int cmd, unsigned long arg)
    182{
    183	if (cmd == VFIO_CHECK_EXTENSION)
    184		return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
    185
    186	return -ENOTTY;
    187}
    188
    189static int vfio_noiommu_attach_group(void *iommu_data,
    190		struct iommu_group *iommu_group, enum vfio_group_type type)
    191{
    192	return 0;
    193}
    194
    195static void vfio_noiommu_detach_group(void *iommu_data,
    196				      struct iommu_group *iommu_group)
    197{
    198}
    199
    200static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
    201	.name = "vfio-noiommu",
    202	.owner = THIS_MODULE,
    203	.open = vfio_noiommu_open,
    204	.release = vfio_noiommu_release,
    205	.ioctl = vfio_noiommu_ioctl,
    206	.attach_group = vfio_noiommu_attach_group,
    207	.detach_group = vfio_noiommu_detach_group,
    208};
    209
    210/*
    211 * Only noiommu containers can use vfio-noiommu and noiommu containers can only
    212 * use vfio-noiommu.
    213 */
    214static inline bool vfio_iommu_driver_allowed(struct vfio_container *container,
    215		const struct vfio_iommu_driver *driver)
    216{
    217	return container->noiommu == (driver->ops == &vfio_noiommu_ops);
    218}
    219#else
    220static inline bool vfio_iommu_driver_allowed(struct vfio_container *container,
    221		const struct vfio_iommu_driver *driver)
    222{
    223	return true;
    224}
    225#endif /* CONFIG_VFIO_NOIOMMU */
    226
    227/*
    228 * IOMMU driver registration
    229 */
    230int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
    231{
    232	struct vfio_iommu_driver *driver, *tmp;
    233
    234	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
    235	if (!driver)
    236		return -ENOMEM;
    237
    238	driver->ops = ops;
    239
    240	mutex_lock(&vfio.iommu_drivers_lock);
    241
    242	/* Check for duplicates */
    243	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
    244		if (tmp->ops == ops) {
    245			mutex_unlock(&vfio.iommu_drivers_lock);
    246			kfree(driver);
    247			return -EINVAL;
    248		}
    249	}
    250
    251	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
    252
    253	mutex_unlock(&vfio.iommu_drivers_lock);
    254
    255	return 0;
    256}
    257EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
    258
    259void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
    260{
    261	struct vfio_iommu_driver *driver;
    262
    263	mutex_lock(&vfio.iommu_drivers_lock);
    264	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
    265		if (driver->ops == ops) {
    266			list_del(&driver->vfio_next);
    267			mutex_unlock(&vfio.iommu_drivers_lock);
    268			kfree(driver);
    269			return;
    270		}
    271	}
    272	mutex_unlock(&vfio.iommu_drivers_lock);
    273}
    274EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
    275
    276static void vfio_group_get(struct vfio_group *group);
    277
    278/*
    279 * Container objects - containers are created when /dev/vfio/vfio is
    280 * opened, but their lifecycle extends until the last user is done, so
    281 * it's freed via kref.  Must support container/group/device being
    282 * closed in any order.
    283 */
    284static void vfio_container_get(struct vfio_container *container)
    285{
    286	kref_get(&container->kref);
    287}
    288
    289static void vfio_container_release(struct kref *kref)
    290{
    291	struct vfio_container *container;
    292	container = container_of(kref, struct vfio_container, kref);
    293
    294	kfree(container);
    295}
    296
    297static void vfio_container_put(struct vfio_container *container)
    298{
    299	kref_put(&container->kref, vfio_container_release);
    300}
    301
    302/*
    303 * Group objects - create, release, get, put, search
    304 */
    305static struct vfio_group *
    306__vfio_group_get_from_iommu(struct iommu_group *iommu_group)
    307{
    308	struct vfio_group *group;
    309
    310	list_for_each_entry(group, &vfio.group_list, vfio_next) {
    311		if (group->iommu_group == iommu_group) {
    312			vfio_group_get(group);
    313			return group;
    314		}
    315	}
    316	return NULL;
    317}
    318
    319static struct vfio_group *
    320vfio_group_get_from_iommu(struct iommu_group *iommu_group)
    321{
    322	struct vfio_group *group;
    323
    324	mutex_lock(&vfio.group_lock);
    325	group = __vfio_group_get_from_iommu(iommu_group);
    326	mutex_unlock(&vfio.group_lock);
    327	return group;
    328}
    329
    330static void vfio_group_release(struct device *dev)
    331{
    332	struct vfio_group *group = container_of(dev, struct vfio_group, dev);
    333
    334	mutex_destroy(&group->device_lock);
    335	iommu_group_put(group->iommu_group);
    336	ida_free(&vfio.group_ida, MINOR(group->dev.devt));
    337	kfree(group);
    338}
    339
    340static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group,
    341					   enum vfio_group_type type)
    342{
    343	struct vfio_group *group;
    344	int minor;
    345
    346	group = kzalloc(sizeof(*group), GFP_KERNEL);
    347	if (!group)
    348		return ERR_PTR(-ENOMEM);
    349
    350	minor = ida_alloc_max(&vfio.group_ida, MINORMASK, GFP_KERNEL);
    351	if (minor < 0) {
    352		kfree(group);
    353		return ERR_PTR(minor);
    354	}
    355
    356	device_initialize(&group->dev);
    357	group->dev.devt = MKDEV(MAJOR(vfio.group_devt), minor);
    358	group->dev.class = vfio.class;
    359	group->dev.release = vfio_group_release;
    360	cdev_init(&group->cdev, &vfio_group_fops);
    361	group->cdev.owner = THIS_MODULE;
    362
    363	refcount_set(&group->users, 1);
    364	init_rwsem(&group->group_rwsem);
    365	INIT_LIST_HEAD(&group->device_list);
    366	mutex_init(&group->device_lock);
    367	group->iommu_group = iommu_group;
    368	/* put in vfio_group_release() */
    369	iommu_group_ref_get(iommu_group);
    370	group->type = type;
    371	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
    372
    373	return group;
    374}
    375
    376static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group,
    377		enum vfio_group_type type)
    378{
    379	struct vfio_group *group;
    380	struct vfio_group *ret;
    381	int err;
    382
    383	group = vfio_group_alloc(iommu_group, type);
    384	if (IS_ERR(group))
    385		return group;
    386
    387	err = dev_set_name(&group->dev, "%s%d",
    388			   group->type == VFIO_NO_IOMMU ? "noiommu-" : "",
    389			   iommu_group_id(iommu_group));
    390	if (err) {
    391		ret = ERR_PTR(err);
    392		goto err_put;
    393	}
    394
    395	mutex_lock(&vfio.group_lock);
    396
    397	/* Did we race creating this group? */
    398	ret = __vfio_group_get_from_iommu(iommu_group);
    399	if (ret)
    400		goto err_unlock;
    401
    402	err = cdev_device_add(&group->cdev, &group->dev);
    403	if (err) {
    404		ret = ERR_PTR(err);
    405		goto err_unlock;
    406	}
    407
    408	list_add(&group->vfio_next, &vfio.group_list);
    409
    410	mutex_unlock(&vfio.group_lock);
    411	return group;
    412
    413err_unlock:
    414	mutex_unlock(&vfio.group_lock);
    415err_put:
    416	put_device(&group->dev);
    417	return ret;
    418}
    419
    420static void vfio_group_put(struct vfio_group *group)
    421{
    422	if (!refcount_dec_and_mutex_lock(&group->users, &vfio.group_lock))
    423		return;
    424
    425	/*
    426	 * These data structures all have paired operations that can only be
    427	 * undone when the caller holds a live reference on the group. Since all
    428	 * pairs must be undone these WARN_ON's indicate some caller did not
    429	 * properly hold the group reference.
    430	 */
    431	WARN_ON(!list_empty(&group->device_list));
    432	WARN_ON(group->container || group->container_users);
    433	WARN_ON(group->notifier.head);
    434
    435	list_del(&group->vfio_next);
    436	cdev_device_del(&group->cdev, &group->dev);
    437	mutex_unlock(&vfio.group_lock);
    438
    439	put_device(&group->dev);
    440}
    441
    442static void vfio_group_get(struct vfio_group *group)
    443{
    444	refcount_inc(&group->users);
    445}
    446
    447/*
    448 * Device objects - create, release, get, put, search
    449 */
    450/* Device reference always implies a group reference */
    451static void vfio_device_put(struct vfio_device *device)
    452{
    453	if (refcount_dec_and_test(&device->refcount))
    454		complete(&device->comp);
    455}
    456
    457static bool vfio_device_try_get(struct vfio_device *device)
    458{
    459	return refcount_inc_not_zero(&device->refcount);
    460}
    461
    462static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
    463						 struct device *dev)
    464{
    465	struct vfio_device *device;
    466
    467	mutex_lock(&group->device_lock);
    468	list_for_each_entry(device, &group->device_list, group_next) {
    469		if (device->dev == dev && vfio_device_try_get(device)) {
    470			mutex_unlock(&group->device_lock);
    471			return device;
    472		}
    473	}
    474	mutex_unlock(&group->device_lock);
    475	return NULL;
    476}
    477
    478/*
    479 * VFIO driver API
    480 */
    481void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
    482			 const struct vfio_device_ops *ops)
    483{
    484	init_completion(&device->comp);
    485	device->dev = dev;
    486	device->ops = ops;
    487}
    488EXPORT_SYMBOL_GPL(vfio_init_group_dev);
    489
    490void vfio_uninit_group_dev(struct vfio_device *device)
    491{
    492	vfio_release_device_set(device);
    493}
    494EXPORT_SYMBOL_GPL(vfio_uninit_group_dev);
    495
    496static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev,
    497		enum vfio_group_type type)
    498{
    499	struct iommu_group *iommu_group;
    500	struct vfio_group *group;
    501	int ret;
    502
    503	iommu_group = iommu_group_alloc();
    504	if (IS_ERR(iommu_group))
    505		return ERR_CAST(iommu_group);
    506
    507	iommu_group_set_name(iommu_group, "vfio-noiommu");
    508	ret = iommu_group_add_device(iommu_group, dev);
    509	if (ret)
    510		goto out_put_group;
    511
    512	group = vfio_create_group(iommu_group, type);
    513	if (IS_ERR(group)) {
    514		ret = PTR_ERR(group);
    515		goto out_remove_device;
    516	}
    517	iommu_group_put(iommu_group);
    518	return group;
    519
    520out_remove_device:
    521	iommu_group_remove_device(dev);
    522out_put_group:
    523	iommu_group_put(iommu_group);
    524	return ERR_PTR(ret);
    525}
    526
    527static struct vfio_group *vfio_group_find_or_alloc(struct device *dev)
    528{
    529	struct iommu_group *iommu_group;
    530	struct vfio_group *group;
    531
    532	iommu_group = iommu_group_get(dev);
    533#ifdef CONFIG_VFIO_NOIOMMU
    534	if (!iommu_group && noiommu) {
    535		/*
    536		 * With noiommu enabled, create an IOMMU group for devices that
    537		 * don't already have one, implying no IOMMU hardware/driver
    538		 * exists.  Taint the kernel because we're about to give a DMA
    539		 * capable device to a user without IOMMU protection.
    540		 */
    541		group = vfio_noiommu_group_alloc(dev, VFIO_NO_IOMMU);
    542		if (!IS_ERR(group)) {
    543			add_taint(TAINT_USER, LOCKDEP_STILL_OK);
    544			dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
    545		}
    546		return group;
    547	}
    548#endif
    549	if (!iommu_group)
    550		return ERR_PTR(-EINVAL);
    551
    552	group = vfio_group_get_from_iommu(iommu_group);
    553	if (!group)
    554		group = vfio_create_group(iommu_group, VFIO_IOMMU);
    555
    556	/* The vfio_group holds a reference to the iommu_group */
    557	iommu_group_put(iommu_group);
    558	return group;
    559}
    560
    561static int __vfio_register_dev(struct vfio_device *device,
    562		struct vfio_group *group)
    563{
    564	struct vfio_device *existing_device;
    565
    566	if (IS_ERR(group))
    567		return PTR_ERR(group);
    568
    569	/*
    570	 * If the driver doesn't specify a set then the device is added to a
    571	 * singleton set just for itself.
    572	 */
    573	if (!device->dev_set)
    574		vfio_assign_device_set(device, device);
    575
    576	existing_device = vfio_group_get_device(group, device->dev);
    577	if (existing_device) {
    578		dev_WARN(device->dev, "Device already exists on group %d\n",
    579			 iommu_group_id(group->iommu_group));
    580		vfio_device_put(existing_device);
    581		if (group->type == VFIO_NO_IOMMU ||
    582		    group->type == VFIO_EMULATED_IOMMU)
    583			iommu_group_remove_device(device->dev);
    584		vfio_group_put(group);
    585		return -EBUSY;
    586	}
    587
    588	/* Our reference on group is moved to the device */
    589	device->group = group;
    590
    591	/* Refcounting can't start until the driver calls register */
    592	refcount_set(&device->refcount, 1);
    593
    594	mutex_lock(&group->device_lock);
    595	list_add(&device->group_next, &group->device_list);
    596	group->dev_counter++;
    597	mutex_unlock(&group->device_lock);
    598
    599	return 0;
    600}
    601
    602int vfio_register_group_dev(struct vfio_device *device)
    603{
    604	/*
    605	 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
    606	 * restore cache coherency.
    607	 */
    608	if (!iommu_capable(device->dev->bus, IOMMU_CAP_CACHE_COHERENCY))
    609		return -EINVAL;
    610
    611	return __vfio_register_dev(device,
    612		vfio_group_find_or_alloc(device->dev));
    613}
    614EXPORT_SYMBOL_GPL(vfio_register_group_dev);
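       /*
        * Registration sketch for a bus driver (names are illustrative):
        *
        *      vfio_init_group_dev(&vdev->vdev, &pdev->dev, &my_vfio_ops);
        *      err = vfio_register_group_dev(&vdev->vdev);
        *      ...
        *      vfio_unregister_group_dev(&vdev->vdev);
        *      vfio_uninit_group_dev(&vdev->vdev);
        *
        * Assigning a device set is optional; without vfio_assign_device_set()
        * the device gets a singleton set keyed on itself, see
        * __vfio_register_dev() above.
        */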
    615
    616/*
    617 * Register a virtual device without IOMMU backing.  The user of this
    618 * device must not be able to directly trigger unmediated DMA.
    619 */
    620int vfio_register_emulated_iommu_dev(struct vfio_device *device)
    621{
    622	return __vfio_register_dev(device,
    623		vfio_noiommu_group_alloc(device->dev, VFIO_EMULATED_IOMMU));
    624}
    625EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
    626
    627static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
    628						     char *buf)
    629{
    630	struct vfio_device *it, *device = ERR_PTR(-ENODEV);
    631
    632	mutex_lock(&group->device_lock);
    633	list_for_each_entry(it, &group->device_list, group_next) {
    634		int ret;
    635
    636		if (it->ops->match) {
    637			ret = it->ops->match(it, buf);
    638			if (ret < 0) {
    639				device = ERR_PTR(ret);
    640				break;
    641			}
    642		} else {
    643			ret = !strcmp(dev_name(it->dev), buf);
    644		}
    645
    646		if (ret && vfio_device_try_get(it)) {
    647			device = it;
    648			break;
    649		}
    650	}
    651	mutex_unlock(&group->device_lock);
    652
    653	return device;
    654}
    655
    656/*
    657 * Decrement the device reference count and wait for the device to be
    658 * removed.  Open file descriptors for the device... */
    659void vfio_unregister_group_dev(struct vfio_device *device)
    660{
    661	struct vfio_group *group = device->group;
    662	unsigned int i = 0;
    663	bool interrupted = false;
    664	long rc;
    665
    666	vfio_device_put(device);
    667	rc = try_wait_for_completion(&device->comp);
    668	while (rc <= 0) {
    669		if (device->ops->request)
    670			device->ops->request(device, i++);
    671
    672		if (interrupted) {
    673			rc = wait_for_completion_timeout(&device->comp,
    674							 HZ * 10);
    675		} else {
    676			rc = wait_for_completion_interruptible_timeout(
    677				&device->comp, HZ * 10);
    678			if (rc < 0) {
    679				interrupted = true;
    680				dev_warn(device->dev,
    681					 "Device is currently in use, task"
    682					 " \"%s\" (%d) "
    683					 "blocked until device is released",
    684					 current->comm, task_pid_nr(current));
    685			}
    686		}
    687	}
    688
    689	mutex_lock(&group->device_lock);
    690	list_del(&device->group_next);
    691	group->dev_counter--;
    692	mutex_unlock(&group->device_lock);
    693
    694	if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU)
    695		iommu_group_remove_device(device->dev);
    696
    697	/* Matches the get in vfio_register_group_dev() */
    698	vfio_group_put(group);
    699}
    700EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
    701
    702/*
    703 * VFIO base fd, /dev/vfio/vfio
    704 */
    705static long vfio_ioctl_check_extension(struct vfio_container *container,
    706				       unsigned long arg)
    707{
    708	struct vfio_iommu_driver *driver;
    709	long ret = 0;
    710
    711	down_read(&container->group_lock);
    712
    713	driver = container->iommu_driver;
    714
    715	switch (arg) {
    716		/* No base extensions yet */
    717	default:
    718		/*
    719		 * If no driver is set, poll all registered drivers for
    720		 * extensions and return the first positive result.  If
    721		 * a driver is already set, further queries will be passed
    722		 * only to that driver.
    723		 */
    724		if (!driver) {
    725			mutex_lock(&vfio.iommu_drivers_lock);
    726			list_for_each_entry(driver, &vfio.iommu_drivers_list,
    727					    vfio_next) {
    728
    729				if (!list_empty(&container->group_list) &&
    730				    !vfio_iommu_driver_allowed(container,
    731							       driver))
    732					continue;
    733				if (!try_module_get(driver->ops->owner))
    734					continue;
    735
    736				ret = driver->ops->ioctl(NULL,
    737							 VFIO_CHECK_EXTENSION,
    738							 arg);
    739				module_put(driver->ops->owner);
    740				if (ret > 0)
    741					break;
    742			}
    743			mutex_unlock(&vfio.iommu_drivers_lock);
    744		} else
    745			ret = driver->ops->ioctl(container->iommu_data,
    746						 VFIO_CHECK_EXTENSION, arg);
    747	}
    748
    749	up_read(&container->group_lock);
    750
    751	return ret;
    752}
    753
    754/* hold write lock on container->group_lock */
    755static int __vfio_container_attach_groups(struct vfio_container *container,
    756					  struct vfio_iommu_driver *driver,
    757					  void *data)
    758{
    759	struct vfio_group *group;
    760	int ret = -ENODEV;
    761
    762	list_for_each_entry(group, &container->group_list, container_next) {
    763		ret = driver->ops->attach_group(data, group->iommu_group,
    764						group->type);
    765		if (ret)
    766			goto unwind;
    767	}
    768
    769	return ret;
    770
    771unwind:
    772	list_for_each_entry_continue_reverse(group, &container->group_list,
    773					     container_next) {
    774		driver->ops->detach_group(data, group->iommu_group);
    775	}
    776
    777	return ret;
    778}
    779
    780static long vfio_ioctl_set_iommu(struct vfio_container *container,
    781				 unsigned long arg)
    782{
    783	struct vfio_iommu_driver *driver;
    784	long ret = -ENODEV;
    785
    786	down_write(&container->group_lock);
    787
    788	/*
    789	 * The container is designed to be an unprivileged interface while
    790	 * the group can be assigned to specific users.  Therefore, only by
    791	 * adding a group to a container does the user get the privilege of
    792	 * enabling the iommu, which may allocate finite resources.  There
    793	 * is no unset_iommu, but by removing all the groups from a container,
    794	 * the container is deprivileged and returns to an unset state.
    795	 */
    796	if (list_empty(&container->group_list) || container->iommu_driver) {
    797		up_write(&container->group_lock);
    798		return -EINVAL;
    799	}
    800
    801	mutex_lock(&vfio.iommu_drivers_lock);
    802	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
    803		void *data;
    804
    805		if (!vfio_iommu_driver_allowed(container, driver))
    806			continue;
    807		if (!try_module_get(driver->ops->owner))
    808			continue;
    809
    810		/*
    811		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
    812		 * so test which iommu driver reported support for this
    813		 * extension and call open on them.  We also pass them the
    814		 * magic, allowing a single driver to support multiple
    815		 * interfaces if they'd like.
    816		 */
    817		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
    818			module_put(driver->ops->owner);
    819			continue;
    820		}
    821
    822		data = driver->ops->open(arg);
    823		if (IS_ERR(data)) {
    824			ret = PTR_ERR(data);
    825			module_put(driver->ops->owner);
    826			continue;
    827		}
    828
    829		ret = __vfio_container_attach_groups(container, driver, data);
    830		if (ret) {
    831			driver->ops->release(data);
    832			module_put(driver->ops->owner);
    833			continue;
    834		}
    835
    836		container->iommu_driver = driver;
    837		container->iommu_data = data;
    838		break;
    839	}
    840
    841	mutex_unlock(&vfio.iommu_drivers_lock);
    842	up_write(&container->group_lock);
    843
    844	return ret;
    845}
    846
    847static long vfio_fops_unl_ioctl(struct file *filep,
    848				unsigned int cmd, unsigned long arg)
    849{
    850	struct vfio_container *container = filep->private_data;
    851	struct vfio_iommu_driver *driver;
    852	void *data;
    853	long ret = -EINVAL;
    854
    855	if (!container)
    856		return ret;
    857
    858	switch (cmd) {
    859	case VFIO_GET_API_VERSION:
    860		ret = VFIO_API_VERSION;
    861		break;
    862	case VFIO_CHECK_EXTENSION:
    863		ret = vfio_ioctl_check_extension(container, arg);
    864		break;
    865	case VFIO_SET_IOMMU:
    866		ret = vfio_ioctl_set_iommu(container, arg);
    867		break;
    868	default:
    869		driver = container->iommu_driver;
    870		data = container->iommu_data;
    871
    872		if (driver) /* passthrough all unrecognized ioctls */
    873			ret = driver->ops->ioctl(data, cmd, arg);
    874	}
    875
    876	return ret;
    877}
    878
    879static int vfio_fops_open(struct inode *inode, struct file *filep)
    880{
    881	struct vfio_container *container;
    882
    883	container = kzalloc(sizeof(*container), GFP_KERNEL);
    884	if (!container)
    885		return -ENOMEM;
    886
    887	INIT_LIST_HEAD(&container->group_list);
    888	init_rwsem(&container->group_lock);
    889	kref_init(&container->kref);
    890
    891	filep->private_data = container;
    892
    893	return 0;
    894}
    895
    896static int vfio_fops_release(struct inode *inode, struct file *filep)
    897{
    898	struct vfio_container *container = filep->private_data;
    899	struct vfio_iommu_driver *driver = container->iommu_driver;
    900
    901	if (driver && driver->ops->notify)
    902		driver->ops->notify(container->iommu_data,
    903				    VFIO_IOMMU_CONTAINER_CLOSE);
    904
    905	filep->private_data = NULL;
    906
    907	vfio_container_put(container);
    908
    909	return 0;
    910}
    911
    912static const struct file_operations vfio_fops = {
    913	.owner		= THIS_MODULE,
    914	.open		= vfio_fops_open,
    915	.release	= vfio_fops_release,
    916	.unlocked_ioctl	= vfio_fops_unl_ioctl,
    917	.compat_ioctl	= compat_ptr_ioctl,
    918};
    919
    920/*
    921 * VFIO Group fd, /dev/vfio/$GROUP
    922 */
    923static void __vfio_group_unset_container(struct vfio_group *group)
    924{
    925	struct vfio_container *container = group->container;
    926	struct vfio_iommu_driver *driver;
    927
    928	lockdep_assert_held_write(&group->group_rwsem);
    929
    930	down_write(&container->group_lock);
    931
    932	driver = container->iommu_driver;
    933	if (driver)
    934		driver->ops->detach_group(container->iommu_data,
    935					  group->iommu_group);
    936
    937	if (group->type == VFIO_IOMMU)
    938		iommu_group_release_dma_owner(group->iommu_group);
    939
    940	group->container = NULL;
    941	group->container_users = 0;
    942	list_del(&group->container_next);
    943
    944	/* Detaching the last group deprivileges a container, remove iommu */
    945	if (driver && list_empty(&container->group_list)) {
    946		driver->ops->release(container->iommu_data);
    947		module_put(driver->ops->owner);
    948		container->iommu_driver = NULL;
    949		container->iommu_data = NULL;
    950	}
    951
    952	up_write(&container->group_lock);
    953
    954	vfio_container_put(container);
    955}
    956
    957/*
    958 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
    959 * if there was no container to unset.  Since the ioctl is called on
    960 * the group, we know it still exists, therefore the only valid
    961 * transition here is 1->0.
    962 */
    963static int vfio_group_unset_container(struct vfio_group *group)
    964{
    965	lockdep_assert_held_write(&group->group_rwsem);
    966
    967	if (!group->container)
    968		return -EINVAL;
    969	if (group->container_users != 1)
    970		return -EBUSY;
    971	__vfio_group_unset_container(group);
    972	return 0;
    973}
    974
    975static int vfio_group_set_container(struct vfio_group *group, int container_fd)
    976{
    977	struct fd f;
    978	struct vfio_container *container;
    979	struct vfio_iommu_driver *driver;
    980	int ret = 0;
    981
    982	lockdep_assert_held_write(&group->group_rwsem);
    983
    984	if (group->container || WARN_ON(group->container_users))
    985		return -EINVAL;
    986
    987	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
    988		return -EPERM;
    989
    990	f = fdget(container_fd);
    991	if (!f.file)
    992		return -EBADF;
    993
    994	/* Sanity check, is this really our fd? */
    995	if (f.file->f_op != &vfio_fops) {
    996		fdput(f);
    997		return -EINVAL;
    998	}
    999
   1000	container = f.file->private_data;
   1001	WARN_ON(!container); /* fget ensures we don't race vfio_release */
   1002
   1003	down_write(&container->group_lock);
   1004
   1005	/* Real groups and fake groups cannot mix */
   1006	if (!list_empty(&container->group_list) &&
   1007	    container->noiommu != (group->type == VFIO_NO_IOMMU)) {
   1008		ret = -EPERM;
   1009		goto unlock_out;
   1010	}
   1011
   1012	if (group->type == VFIO_IOMMU) {
   1013		ret = iommu_group_claim_dma_owner(group->iommu_group, f.file);
   1014		if (ret)
   1015			goto unlock_out;
   1016	}
   1017
   1018	driver = container->iommu_driver;
   1019	if (driver) {
   1020		ret = driver->ops->attach_group(container->iommu_data,
   1021						group->iommu_group,
   1022						group->type);
   1023		if (ret) {
   1024			if (group->type == VFIO_IOMMU)
   1025				iommu_group_release_dma_owner(
   1026					group->iommu_group);
   1027			goto unlock_out;
   1028		}
   1029	}
   1030
   1031	group->container = container;
   1032	group->container_users = 1;
   1033	container->noiommu = (group->type == VFIO_NO_IOMMU);
   1034	list_add(&group->container_next, &container->group_list);
   1035
   1036	/* Get a reference on the container and mark a user within the group */
   1037	vfio_container_get(container);
   1038
   1039unlock_out:
   1040	up_write(&container->group_lock);
   1041	fdput(f);
   1042	return ret;
   1043}
   1044
   1045static const struct file_operations vfio_device_fops;
   1046
   1047/* true if the vfio_device has open_device() called but not close_device() */
   1048static bool vfio_assert_device_open(struct vfio_device *device)
   1049{
   1050	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
   1051}
   1052
   1053static int vfio_device_assign_container(struct vfio_device *device)
   1054{
   1055	struct vfio_group *group = device->group;
   1056
   1057	lockdep_assert_held_write(&group->group_rwsem);
   1058
   1059	if (!group->container || !group->container->iommu_driver ||
   1060	    WARN_ON(!group->container_users))
   1061		return -EINVAL;
   1062
   1063	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
   1064		return -EPERM;
   1065
   1066	get_file(group->opened_file);
   1067	group->container_users++;
   1068	return 0;
   1069}
   1070
   1071static void vfio_device_unassign_container(struct vfio_device *device)
   1072{
   1073	down_write(&device->group->group_rwsem);
   1074	WARN_ON(device->group->container_users <= 1);
   1075	device->group->container_users--;
   1076	fput(device->group->opened_file);
   1077	up_write(&device->group->group_rwsem);
   1078}
   1079
   1080static struct file *vfio_device_open(struct vfio_device *device)
   1081{
   1082	struct file *filep;
   1083	int ret;
   1084
   1085	down_write(&device->group->group_rwsem);
   1086	ret = vfio_device_assign_container(device);
   1087	up_write(&device->group->group_rwsem);
   1088	if (ret)
   1089		return ERR_PTR(ret);
   1090
   1091	if (!try_module_get(device->dev->driver->owner)) {
   1092		ret = -ENODEV;
   1093		goto err_unassign_container;
   1094	}
   1095
   1096	mutex_lock(&device->dev_set->lock);
   1097	device->open_count++;
   1098	if (device->open_count == 1) {
   1099		/*
   1100		 * Here we pass the KVM pointer with the group under the read
   1101		 * lock.  If the device driver will use it, it must obtain a
   1102		 * reference and release it during close_device.
   1103		 */
   1104		down_read(&device->group->group_rwsem);
   1105		device->kvm = device->group->kvm;
   1106
   1107		if (device->ops->open_device) {
   1108			ret = device->ops->open_device(device);
   1109			if (ret)
   1110				goto err_undo_count;
   1111		}
   1112		up_read(&device->group->group_rwsem);
   1113	}
   1114	mutex_unlock(&device->dev_set->lock);
   1115
   1116	/*
   1117	 * We can't use anon_inode_getfd() because we need to modify
   1118	 * the f_mode flags directly to allow more than just ioctls
   1119	 */
   1120	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
   1121				   device, O_RDWR);
   1122	if (IS_ERR(filep)) {
   1123		ret = PTR_ERR(filep);
   1124		goto err_close_device;
   1125	}
   1126
   1127	/*
   1128	 * TODO: add an anon_inode interface to do this.
   1129	 * Appears to be missing by lack of need rather than
   1130	 * explicitly prevented.  Now there's need.
   1131	 */
   1132	filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
   1133
   1134	if (device->group->type == VFIO_NO_IOMMU)
   1135		dev_warn(device->dev, "vfio-noiommu device opened by user "
   1136			 "(%s:%d)\n", current->comm, task_pid_nr(current));
   1137	/*
   1138	 * On success the ref of device is moved to the file and
   1139	 * put in vfio_device_fops_release()
   1140	 */
   1141	return filep;
   1142
   1143err_close_device:
   1144	mutex_lock(&device->dev_set->lock);
   1145	down_read(&device->group->group_rwsem);
   1146	if (device->open_count == 1 && device->ops->close_device)
   1147		device->ops->close_device(device);
   1148err_undo_count:
   1149	device->open_count--;
   1150	if (device->open_count == 0 && device->kvm)
   1151		device->kvm = NULL;
   1152	up_read(&device->group->group_rwsem);
   1153	mutex_unlock(&device->dev_set->lock);
   1154	module_put(device->dev->driver->owner);
   1155err_unassign_container:
   1156	vfio_device_unassign_container(device);
   1157	return ERR_PTR(ret);
   1158}
   1159
   1160static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
   1161{
   1162	struct vfio_device *device;
   1163	struct file *filep;
   1164	int fdno;
   1165	int ret;
   1166
   1167	device = vfio_device_get_from_name(group, buf);
   1168	if (IS_ERR(device))
   1169		return PTR_ERR(device);
   1170
   1171	fdno = get_unused_fd_flags(O_CLOEXEC);
   1172	if (fdno < 0) {
   1173		ret = fdno;
   1174		goto err_put_device;
   1175	}
   1176
   1177	filep = vfio_device_open(device);
   1178	if (IS_ERR(filep)) {
   1179		ret = PTR_ERR(filep);
   1180		goto err_put_fdno;
   1181	}
   1182
   1183	fd_install(fdno, filep);
   1184	return fdno;
   1185
   1186err_put_fdno:
   1187	put_unused_fd(fdno);
   1188err_put_device:
   1189	vfio_device_put(device);
   1190	return ret;
   1191}
   1192
   1193static long vfio_group_fops_unl_ioctl(struct file *filep,
   1194				      unsigned int cmd, unsigned long arg)
   1195{
   1196	struct vfio_group *group = filep->private_data;
   1197	long ret = -ENOTTY;
   1198
   1199	switch (cmd) {
   1200	case VFIO_GROUP_GET_STATUS:
   1201	{
   1202		struct vfio_group_status status;
   1203		unsigned long minsz;
   1204
   1205		minsz = offsetofend(struct vfio_group_status, flags);
   1206
   1207		if (copy_from_user(&status, (void __user *)arg, minsz))
   1208			return -EFAULT;
   1209
   1210		if (status.argsz < minsz)
   1211			return -EINVAL;
   1212
   1213		status.flags = 0;
   1214
   1215		down_read(&group->group_rwsem);
   1216		if (group->container)
   1217			status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET |
   1218					VFIO_GROUP_FLAGS_VIABLE;
   1219		else if (!iommu_group_dma_owner_claimed(group->iommu_group))
   1220			status.flags |= VFIO_GROUP_FLAGS_VIABLE;
   1221		up_read(&group->group_rwsem);
   1222
   1223		if (copy_to_user((void __user *)arg, &status, minsz))
   1224			return -EFAULT;
   1225
   1226		ret = 0;
   1227		break;
   1228	}
   1229	case VFIO_GROUP_SET_CONTAINER:
   1230	{
   1231		int fd;
   1232
   1233		if (get_user(fd, (int __user *)arg))
   1234			return -EFAULT;
   1235
   1236		if (fd < 0)
   1237			return -EINVAL;
   1238
   1239		down_write(&group->group_rwsem);
   1240		ret = vfio_group_set_container(group, fd);
   1241		up_write(&group->group_rwsem);
   1242		break;
   1243	}
   1244	case VFIO_GROUP_UNSET_CONTAINER:
   1245		down_write(&group->group_rwsem);
   1246		ret = vfio_group_unset_container(group);
   1247		up_write(&group->group_rwsem);
   1248		break;
   1249	case VFIO_GROUP_GET_DEVICE_FD:
   1250	{
   1251		char *buf;
   1252
   1253		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
   1254		if (IS_ERR(buf))
   1255			return PTR_ERR(buf);
   1256
   1257		ret = vfio_group_get_device_fd(group, buf);
   1258		kfree(buf);
   1259		break;
   1260	}
   1261	}
   1262
   1263	return ret;
   1264}
   1265
   1266static int vfio_group_fops_open(struct inode *inode, struct file *filep)
   1267{
   1268	struct vfio_group *group =
   1269		container_of(inode->i_cdev, struct vfio_group, cdev);
   1270	int ret;
   1271
   1272	down_write(&group->group_rwsem);
   1273
   1274	/* users can be zero if this races with vfio_group_put() */
   1275	if (!refcount_inc_not_zero(&group->users)) {
   1276		ret = -ENODEV;
   1277		goto err_unlock;
   1278	}
   1279
   1280	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) {
   1281		ret = -EPERM;
   1282		goto err_put;
   1283	}
   1284
   1285	/*
   1286	 * Do we need multiple instances of the group open?  Seems not.
   1287	 */
   1288	if (group->opened_file) {
   1289		ret = -EBUSY;
   1290		goto err_put;
   1291	}
   1292	group->opened_file = filep;
   1293	filep->private_data = group;
   1294
   1295	up_write(&group->group_rwsem);
   1296	return 0;
   1297err_put:
   1298	vfio_group_put(group);
   1299err_unlock:
   1300	up_write(&group->group_rwsem);
   1301	return ret;
   1302}
   1303
   1304static int vfio_group_fops_release(struct inode *inode, struct file *filep)
   1305{
   1306	struct vfio_group *group = filep->private_data;
   1307
   1308	filep->private_data = NULL;
   1309
   1310	down_write(&group->group_rwsem);
   1311	/*
   1312	 * Device FDs hold a group file reference, therefore the group release
   1313	 * is only called when there are no open devices.
   1314	 */
   1315	WARN_ON(group->notifier.head);
   1316	if (group->container) {
   1317		WARN_ON(group->container_users != 1);
   1318		__vfio_group_unset_container(group);
   1319	}
   1320	group->opened_file = NULL;
   1321	up_write(&group->group_rwsem);
   1322
   1323	vfio_group_put(group);
   1324
   1325	return 0;
   1326}
   1327
   1328static const struct file_operations vfio_group_fops = {
   1329	.owner		= THIS_MODULE,
   1330	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
   1331	.compat_ioctl	= compat_ptr_ioctl,
   1332	.open		= vfio_group_fops_open,
   1333	.release	= vfio_group_fops_release,
   1334};
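       /*
        * Userspace sketch of the container/group handshake served by the two
        * fops tables above (group number and device name are illustrative,
        * see Documentation/driver-api/vfio.rst):
        *
        *      container = open("/dev/vfio/vfio", O_RDWR);
        *      ioctl(container, VFIO_GET_API_VERSION);       returns VFIO_API_VERSION
        *      ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU);
        *      group = open("/dev/vfio/26", O_RDWR);
        *      ioctl(group, VFIO_GROUP_GET_STATUS, &status);  expect VIABLE
        *      ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
        *      ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
        *      device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
        */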
   1335
   1336/*
   1337 * VFIO Device fd
   1338 */
   1339static int vfio_device_fops_release(struct inode *inode, struct file *filep)
   1340{
   1341	struct vfio_device *device = filep->private_data;
   1342
   1343	mutex_lock(&device->dev_set->lock);
   1344	vfio_assert_device_open(device);
   1345	down_read(&device->group->group_rwsem);
   1346	if (device->open_count == 1 && device->ops->close_device)
   1347		device->ops->close_device(device);
   1348	up_read(&device->group->group_rwsem);
   1349	device->open_count--;
   1350	if (device->open_count == 0)
   1351		device->kvm = NULL;
   1352	mutex_unlock(&device->dev_set->lock);
   1353
   1354	module_put(device->dev->driver->owner);
   1355
   1356	vfio_device_unassign_container(device);
   1357
   1358	vfio_device_put(device);
   1359
   1360	return 0;
   1361}
   1362
   1363/*
   1364 * vfio_mig_get_next_state - Compute the next step in the FSM
   1365 * @cur_fsm - The current state the device is in
   1366 * @new_fsm - The target state to reach
   1367 * @next_fsm - Pointer to the next step to get to new_fsm
   1368 *
   1369 * Return 0 upon success, otherwise -errno
   1370 * Upon success the next step in the state progression between cur_fsm and
   1371 * new_fsm will be set in next_fsm.
   1372 *
   1373 * This breaks down requests for combination transitions into smaller steps and
   1374 * returns the next step to get to new_fsm. The function may need to be called
   1375 * multiple times before reaching new_fsm.
   1376 *
   1377 */
   1378int vfio_mig_get_next_state(struct vfio_device *device,
   1379			    enum vfio_device_mig_state cur_fsm,
   1380			    enum vfio_device_mig_state new_fsm,
   1381			    enum vfio_device_mig_state *next_fsm)
   1382{
   1383	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_RUNNING_P2P + 1 };
   1384	/*
   1385	 * The coding in this table requires the driver to implement the
   1386	 * following FSM arcs:
   1387	 *         RESUMING -> STOP
   1388	 *         STOP -> RESUMING
   1389	 *         STOP -> STOP_COPY
   1390	 *         STOP_COPY -> STOP
   1391	 *
   1392	 * If P2P is supported then the driver must also implement these FSM
   1393	 * arcs:
   1394	 *         RUNNING -> RUNNING_P2P
   1395	 *         RUNNING_P2P -> RUNNING
   1396	 *         RUNNING_P2P -> STOP
   1397	 *         STOP -> RUNNING_P2P
   1398	 * Without P2P the driver must implement:
   1399	 *         RUNNING -> STOP
   1400	 *         STOP -> RUNNING
   1401	 *
   1402	 * The coding will step through multiple states for some combination
   1403	 * transitions; if all optional features are supported, this means the
   1404	 * following ones:
   1405	 *         RESUMING -> STOP -> RUNNING_P2P
   1406	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
   1407	 *         RESUMING -> STOP -> STOP_COPY
   1408	 *         RUNNING -> RUNNING_P2P -> STOP
   1409	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
   1410	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
   1411	 *         RUNNING_P2P -> STOP -> RESUMING
   1412	 *         RUNNING_P2P -> STOP -> STOP_COPY
   1413	 *         STOP -> RUNNING_P2P -> RUNNING
   1414	 *         STOP_COPY -> STOP -> RESUMING
   1415	 *         STOP_COPY -> STOP -> RUNNING_P2P
   1416	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
   1417	 */
   1418	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
   1419		[VFIO_DEVICE_STATE_STOP] = {
   1420			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
   1421			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
   1422			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
   1423			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
   1424			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
   1425			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
   1426		},
   1427		[VFIO_DEVICE_STATE_RUNNING] = {
   1428			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
   1429			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
   1430			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
   1431			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
   1432			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
   1433			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
   1434		},
   1435		[VFIO_DEVICE_STATE_STOP_COPY] = {
   1436			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
   1437			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
   1438			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
   1439			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
   1440			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
   1441			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
   1442		},
   1443		[VFIO_DEVICE_STATE_RESUMING] = {
   1444			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
   1445			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
   1446			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
   1447			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
   1448			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
   1449			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
   1450		},
   1451		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
   1452			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
   1453			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
   1454			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
   1455			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
   1456			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
   1457			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
   1458		},
   1459		[VFIO_DEVICE_STATE_ERROR] = {
   1460			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
   1461			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
   1462			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
   1463			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
   1464			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
   1465			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
   1466		},
   1467	};
   1468
   1469	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
   1470		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
   1471		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
   1472		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
   1473		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
   1474		[VFIO_DEVICE_STATE_RUNNING_P2P] =
   1475			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
   1476		[VFIO_DEVICE_STATE_ERROR] = ~0U,
   1477	};
   1478
   1479	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
   1480		    (state_flags_table[cur_fsm] & device->migration_flags) !=
   1481			state_flags_table[cur_fsm]))
   1482		return -EINVAL;
   1483
   1484	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
   1485	   (state_flags_table[new_fsm] & device->migration_flags) !=
   1486			state_flags_table[new_fsm])
   1487		return -EINVAL;
   1488
   1489	/*
   1490	 * Arcs touching optional and unsupported states are skipped over. The
   1491	 * driver will instead see an arc from the original state to the next
   1492	 * logical state, as per the above comment.
   1493	 */
   1494	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
   1495	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
   1496			state_flags_table[*next_fsm])
   1497		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
   1498
   1499	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
   1500}
   1501EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
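       /*
        * A driver's migration_set_state() implementation typically walks the
        * FSM with this helper, one supported arc at a time (sketch; error
        * handling and the returned data_fd plumbing omitted):
        *
        *      while (cur != new) {
        *              ret = vfio_mig_get_next_state(vdev, cur, new, &next);
        *              if (ret)
        *                      return ERR_PTR(ret);
        *              ...perform the cur -> next arc on the device...
        *              cur = next;
        *      }
        */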
   1502
   1503/*
   1504 * Convert the driver's struct file into an FD number and return it to userspace
   1505 */
   1506static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
   1507				   struct vfio_device_feature_mig_state *mig)
   1508{
   1509	int ret;
   1510	int fd;
   1511
   1512	fd = get_unused_fd_flags(O_CLOEXEC);
   1513	if (fd < 0) {
   1514		ret = fd;
   1515		goto out_fput;
   1516	}
   1517
   1518	mig->data_fd = fd;
   1519	if (copy_to_user(arg, mig, sizeof(*mig))) {
   1520		ret = -EFAULT;
   1521		goto out_put_unused;
   1522	}
   1523	fd_install(fd, filp);
   1524	return 0;
   1525
   1526out_put_unused:
   1527	put_unused_fd(fd);
   1528out_fput:
   1529	fput(filp);
   1530	return ret;
   1531}
   1532
   1533static int
   1534vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
   1535					   u32 flags, void __user *arg,
   1536					   size_t argsz)
   1537{
   1538	size_t minsz =
   1539		offsetofend(struct vfio_device_feature_mig_state, data_fd);
   1540	struct vfio_device_feature_mig_state mig;
   1541	struct file *filp = NULL;
   1542	int ret;
   1543
   1544	if (!device->ops->migration_set_state ||
   1545	    !device->ops->migration_get_state)
   1546		return -ENOTTY;
   1547
   1548	ret = vfio_check_feature(flags, argsz,
   1549				 VFIO_DEVICE_FEATURE_SET |
   1550				 VFIO_DEVICE_FEATURE_GET,
   1551				 sizeof(mig));
   1552	if (ret != 1)
   1553		return ret;
   1554
   1555	if (copy_from_user(&mig, arg, minsz))
   1556		return -EFAULT;
   1557
   1558	if (flags & VFIO_DEVICE_FEATURE_GET) {
   1559		enum vfio_device_mig_state curr_state;
   1560
   1561		ret = device->ops->migration_get_state(device, &curr_state);
   1562		if (ret)
   1563			return ret;
   1564		mig.device_state = curr_state;
   1565		goto out_copy;
   1566	}
   1567
   1568	/* Handle the VFIO_DEVICE_FEATURE_SET */
   1569	filp = device->ops->migration_set_state(device, mig.device_state);
   1570	if (IS_ERR(filp) || !filp)
   1571		goto out_copy;
   1572
   1573	return vfio_ioct_mig_return_fd(filp, arg, &mig);
   1574out_copy:
   1575	mig.data_fd = -1;
   1576	if (copy_to_user(arg, &mig, sizeof(mig)))
   1577		return -EFAULT;
   1578	if (IS_ERR(filp))
   1579		return PTR_ERR(filp);
   1580	return 0;
   1581}
   1582
   1583static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
   1584					       u32 flags, void __user *arg,
   1585					       size_t argsz)
   1586{
   1587	struct vfio_device_feature_migration mig = {
   1588		.flags = device->migration_flags,
   1589	};
   1590	int ret;
   1591
   1592	if (!device->ops->migration_set_state ||
   1593	    !device->ops->migration_get_state)
   1594		return -ENOTTY;
   1595
   1596	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
   1597				 sizeof(mig));
   1598	if (ret != 1)
   1599		return ret;
   1600	if (copy_to_user(arg, &mig, sizeof(mig)))
   1601		return -EFAULT;
   1602	return 0;
   1603}
   1604
   1605static int vfio_ioctl_device_feature(struct vfio_device *device,
   1606				     struct vfio_device_feature __user *arg)
   1607{
   1608	size_t minsz = offsetofend(struct vfio_device_feature, flags);
   1609	struct vfio_device_feature feature;
   1610
   1611	if (copy_from_user(&feature, arg, minsz))
   1612		return -EFAULT;
   1613
   1614	if (feature.argsz < minsz)
   1615		return -EINVAL;
   1616
   1617	/* Check unknown flags */
   1618	if (feature.flags &
   1619	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
   1620	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
   1621		return -EINVAL;
   1622
   1623	/* GET & SET are mutually exclusive except with PROBE */
   1624	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
   1625	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
   1626	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
   1627		return -EINVAL;
   1628
   1629	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
   1630	case VFIO_DEVICE_FEATURE_MIGRATION:
   1631		return vfio_ioctl_device_feature_migration(
   1632			device, feature.flags, arg->data,
   1633			feature.argsz - minsz);
   1634	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
   1635		return vfio_ioctl_device_feature_mig_device_state(
   1636			device, feature.flags, arg->data,
   1637			feature.argsz - minsz);
   1638	default:
   1639		if (unlikely(!device->ops->device_feature))
   1640			return -EINVAL;
   1641		return device->ops->device_feature(device, feature.flags,
   1642						   arg->data,
   1643						   feature.argsz - minsz);
   1644	}
   1645}
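       /*
        * Wire-format sketch: userspace passes a struct vfio_device_feature
        * header (argsz, flags) immediately followed by the feature-specific
        * payload in data[].  For VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE with
        * VFIO_DEVICE_FEATURE_SET, the payload is a struct
        * vfio_device_feature_mig_state, and argsz must cover header plus
        * payload; the handlers above verify this via vfio_check_feature()
        * against the argsz - minsz remainder.
        */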
   1646
   1647static long vfio_device_fops_unl_ioctl(struct file *filep,
   1648				       unsigned int cmd, unsigned long arg)
   1649{
   1650	struct vfio_device *device = filep->private_data;
   1651
   1652	switch (cmd) {
   1653	case VFIO_DEVICE_FEATURE:
   1654		return vfio_ioctl_device_feature(device, (void __user *)arg);
   1655	default:
   1656		if (unlikely(!device->ops->ioctl))
   1657			return -EINVAL;
   1658		return device->ops->ioctl(device, cmd, arg);
   1659	}
   1660}
   1661
   1662static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
   1663				     size_t count, loff_t *ppos)
   1664{
   1665	struct vfio_device *device = filep->private_data;
   1666
   1667	if (unlikely(!device->ops->read))
   1668		return -EINVAL;
   1669
   1670	return device->ops->read(device, buf, count, ppos);
   1671}
   1672
   1673static ssize_t vfio_device_fops_write(struct file *filep,
   1674				      const char __user *buf,
   1675				      size_t count, loff_t *ppos)
   1676{
   1677	struct vfio_device *device = filep->private_data;
   1678
   1679	if (unlikely(!device->ops->write))
   1680		return -EINVAL;
   1681
   1682	return device->ops->write(device, buf, count, ppos);
   1683}
   1684
   1685static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
   1686{
   1687	struct vfio_device *device = filep->private_data;
   1688
   1689	if (unlikely(!device->ops->mmap))
   1690		return -EINVAL;
   1691
   1692	return device->ops->mmap(device, vma);
   1693}
   1694
   1695static const struct file_operations vfio_device_fops = {
   1696	.owner		= THIS_MODULE,
   1697	.release	= vfio_device_fops_release,
   1698	.read		= vfio_device_fops_read,
   1699	.write		= vfio_device_fops_write,
   1700	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
   1701	.compat_ioctl	= compat_ptr_ioctl,
   1702	.mmap		= vfio_device_fops_mmap,
   1703};
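       /*
        * These fops only dispatch into the parent driver's vfio_device_ops;
        * for vfio-pci, for example, read/write/mmap implement region (config
        * space/BAR) access and BAR mmap, while unrecognized ioctls are handled
        * entirely by the driver's ->ioctl() callback.
        */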
   1704
   1705/**
   1706 * vfio_file_iommu_group - Return the struct iommu_group for the vfio group file
   1707 * @file: VFIO group file
   1708 *
   1709 * The returned iommu_group is valid as long as a ref is held on the file.
   1710 */
   1711struct iommu_group *vfio_file_iommu_group(struct file *file)
   1712{
   1713	struct vfio_group *group = file->private_data;
   1714
   1715	if (file->f_op != &vfio_group_fops)
   1716		return NULL;
   1717	return group->iommu_group;
   1718}
   1719EXPORT_SYMBOL_GPL(vfio_file_iommu_group);
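
        /*
         * Example (illustrative sketch): a kernel consumer such as the
         * kvm-vfio device can use this helper both to recognise a VFIO group
         * file handed in by user space and to obtain its iommu_group.  "fd"
         * is a hypothetical file descriptor number received from user space.
         *
         *	struct file *file = fget(fd);
         *	struct iommu_group *grp;
         *
         *	if (!file)
         *		return -EBADF;
         *	grp = vfio_file_iommu_group(file);
         *	if (!grp) {
         *		fput(file);
         *		return -EINVAL;	(not a VFIO group file)
         *	}
         *	... grp stays valid for as long as the reference on file is held ...
         */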
   1720
   1721/**
   1722 * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
   1723 *        is always CPU cache coherent
   1724 * @file: VFIO group file
   1725 *
   1726 * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
   1727 * bit in DMA transactions. A return of false indicates that the user has
   1728 * rights to access additional instructions such as wbinvd on x86.
   1729 */
   1730bool vfio_file_enforced_coherent(struct file *file)
   1731{
   1732	struct vfio_group *group = file->private_data;
   1733	bool ret;
   1734
   1735	if (file->f_op != &vfio_group_fops)
   1736		return true;
   1737
   1738	down_read(&group->group_rwsem);
   1739	if (group->container) {
   1740		ret = vfio_ioctl_check_extension(group->container,
   1741						 VFIO_DMA_CC_IOMMU);
   1742	} else {
   1743		/*
    1744		 * Since the coherency state is determined only once a container
    1745		 * is attached, the user must attach one before they can prove
    1746		 * they have permission.
   1747		 */
   1748		ret = true;
   1749	}
   1750	up_read(&group->group_rwsem);
   1751	return ret;
   1752}
   1753EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);
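
        /*
         * Example (illustrative sketch): a hypervisor-side caller might use
         * this to decide whether the VM must be prepared for non-coherent
         * DMA, e.g. whether to let the guest execute wbinvd.  "file" is a
         * placeholder for a VFIO group file the caller holds a reference on.
         *
         *	if (!vfio_file_enforced_coherent(file))
         *		... DMA may be non-coherent; grant the guest
         *		    wbinvd-style cache management ...
         */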
   1754
   1755/**
   1756 * vfio_file_set_kvm - Link a kvm with VFIO drivers
   1757 * @file: VFIO group file
   1758 * @kvm: KVM to link
   1759 *
    1760 * When a VFIO device is first opened, the KVM will be available in
   1761 * device->kvm if one was associated with the group.
   1762 */
   1763void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
   1764{
   1765	struct vfio_group *group = file->private_data;
   1766
   1767	if (file->f_op != &vfio_group_fops)
   1768		return;
   1769
   1770	down_write(&group->group_rwsem);
   1771	group->kvm = kvm;
   1772	up_write(&group->group_rwsem);
   1773}
   1774EXPORT_SYMBOL_GPL(vfio_file_set_kvm);
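
        /*
         * Example (illustrative sketch): when user space registers a VFIO
         * group fd with a VM, the hypervisor side links its struct kvm to the
         * group before any device in the group is opened, and breaks the link
         * again on teardown.  "group_file" is a placeholder for the group's
         * struct file.
         *
         *	vfio_file_set_kvm(group_file, kvm);
         *	...
         *	vfio_file_set_kvm(group_file, NULL);
         */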
   1775
   1776/**
    1777 * vfio_file_has_dev - True if the VFIO file is a handle for the device
   1778 * @file: VFIO file to check
   1779 * @device: Device that must be part of the file
   1780 *
    1781 * Returns true if the given file has permission to manipulate the given device.
   1782 */
   1783bool vfio_file_has_dev(struct file *file, struct vfio_device *device)
   1784{
   1785	struct vfio_group *group = file->private_data;
   1786
   1787	if (file->f_op != &vfio_group_fops)
   1788		return false;
   1789
   1790	return group == device->group;
   1791}
   1792EXPORT_SYMBOL_GPL(vfio_file_has_dev);
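
        /*
         * Example (illustrative sketch): before acting on a request that
         * names both a device and a user-supplied group file, a caller can
         * verify that the file really covers the device ("file" and "device"
         * are placeholders for objects the caller already holds):
         *
         *	if (!vfio_file_has_dev(file, device))
         *		return -EINVAL;
         */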
   1793
   1794/*
   1795 * Sub-module support
   1796 */
   1797/*
    1798 * Helper for managing a buffer of info chain capabilities: allocates or
    1799 * reallocates the buffer with an additional @size bytes, filling in @id and
    1800 * @version of the new capability.  A pointer to the new capability is returned.
   1801 *
    1802 * NB. The chain is based at the head of the buffer, so new entries are
    1803 * added to the tail; vfio_info_cap_shift() should be called to fix up the
    1804 * next offsets prior to copying to the user buffer.
   1805 */
   1806struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
   1807					       size_t size, u16 id, u16 version)
   1808{
   1809	void *buf;
   1810	struct vfio_info_cap_header *header, *tmp;
   1811
   1812	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
   1813	if (!buf) {
   1814		kfree(caps->buf);
   1815		caps->size = 0;
   1816		return ERR_PTR(-ENOMEM);
   1817	}
   1818
   1819	caps->buf = buf;
   1820	header = buf + caps->size;
   1821
   1822	/* Eventually copied to user buffer, zero */
   1823	memset(header, 0, size);
   1824
   1825	header->id = id;
   1826	header->version = version;
   1827
   1828	/* Add to the end of the capability chain */
   1829	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
   1830		; /* nothing */
   1831
   1832	tmp->next = caps->size;
   1833	caps->size += size;
   1834
   1835	return header;
   1836}
   1837EXPORT_SYMBOL_GPL(vfio_info_cap_add);
   1838
   1839void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
   1840{
   1841	struct vfio_info_cap_header *tmp;
   1842	void *buf = (void *)caps->buf;
   1843
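        	/*
        	 * Each ->next is stored relative to the start of caps->buf;
        	 * adding @offset rebases it for the user-visible layout, so
        	 * subtract @offset again when advancing in order to stay
        	 * within the kernel buffer.
        	 */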
   1844	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
   1845		tmp->next += offset;
   1846}
   1847EXPORT_SYMBOL(vfio_info_cap_shift);
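
        /*
         * Example (illustrative sketch): the typical driver pattern for
         * building a capability chain in an INFO ioctl, loosely following the
         * vfio-pci region info code.  vfio_info_cap_shift() rebases the ->next
         * offsets to the user-visible structure before the copy-out.  "struct
         * my_cap", "MY_CAP_ID", "info" and "arg" are placeholders.
         *
         *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
         *	struct vfio_info_cap_header *hdr;
         *
         *	hdr = vfio_info_cap_add(&caps, sizeof(struct my_cap),
         *				MY_CAP_ID, 1);
         *	if (IS_ERR(hdr))
         *		return PTR_ERR(hdr);
         *	... fill in the capability body that follows hdr ...
         *
         *	vfio_info_cap_shift(&caps, sizeof(info));
         *	if (copy_to_user((void __user *)arg + sizeof(info),
         *			 caps.buf, caps.size))
         *		ret = -EFAULT;
         *	kfree(caps.buf);
         */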
   1848
   1849int vfio_info_add_capability(struct vfio_info_cap *caps,
   1850			     struct vfio_info_cap_header *cap, size_t size)
   1851{
   1852	struct vfio_info_cap_header *header;
   1853
   1854	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
   1855	if (IS_ERR(header))
   1856		return PTR_ERR(header);
   1857
   1858	memcpy(header + 1, cap + 1, size - sizeof(*header));
   1859
   1860	return 0;
   1861}
   1862EXPORT_SYMBOL(vfio_info_add_capability);
   1863
   1864int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
   1865				       int max_irq_type, size_t *data_size)
   1866{
   1867	unsigned long minsz;
   1868	size_t size;
   1869
   1870	minsz = offsetofend(struct vfio_irq_set, count);
   1871
   1872	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
   1873	    (hdr->count >= (U32_MAX - hdr->start)) ||
   1874	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
   1875				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
   1876		return -EINVAL;
   1877
   1878	if (data_size)
   1879		*data_size = 0;
   1880
   1881	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
   1882		return -EINVAL;
   1883
   1884	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
   1885	case VFIO_IRQ_SET_DATA_NONE:
   1886		size = 0;
   1887		break;
   1888	case VFIO_IRQ_SET_DATA_BOOL:
   1889		size = sizeof(uint8_t);
   1890		break;
   1891	case VFIO_IRQ_SET_DATA_EVENTFD:
   1892		size = sizeof(int32_t);
   1893		break;
   1894	default:
   1895		return -EINVAL;
   1896	}
   1897
   1898	if (size) {
   1899		if (hdr->argsz - minsz < hdr->count * size)
   1900			return -EINVAL;
   1901
   1902		if (!data_size)
   1903			return -EINVAL;
   1904
   1905		*data_size = hdr->count * size;
   1906	}
   1907
   1908	return 0;
   1909}
   1910EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
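
        /*
         * Example (illustrative sketch): the usual driver-side SET_IRQS ioctl
         * flow, loosely following the vfio-pci code.  "num_irqs" is a
         * placeholder for the device's interrupt count and "arg" for the raw
         * ioctl argument.
         *
         *	size_t minsz = offsetofend(struct vfio_irq_set, count);
         *	struct vfio_irq_set hdr;
         *	size_t data_size = 0;
         *	u8 *data = NULL;
         *	int ret;
         *
         *	if (copy_from_user(&hdr, (void __user *)arg, minsz))
         *		return -EFAULT;
         *
         *	ret = vfio_set_irqs_validate_and_prepare(&hdr, num_irqs,
         *						 VFIO_PCI_NUM_IRQS,
         *						 &data_size);
         *	if (ret)
         *		return ret;
         *
         *	if (data_size) {
         *		data = memdup_user((void __user *)(arg + minsz),
         *				   data_size);
         *		if (IS_ERR(data))
         *			return PTR_ERR(data);
         *	}
         */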
   1911
   1912/*
   1913 * Pin a set of guest PFNs and return their associated host PFNs for local
   1914 * domain only.
   1915 * @device [in]  : device
   1916 * @user_pfn [in]: array of user/guest PFNs to be pinned.
   1917 * @npage [in]   : count of elements in user_pfn array.  This count should not
    1918 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
   1919 * @prot [in]    : protection flags
   1920 * @phys_pfn[out]: array of host PFNs
   1921 * Return error or number of pages pinned.
   1922 */
   1923int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn,
   1924		   int npage, int prot, unsigned long *phys_pfn)
   1925{
   1926	struct vfio_container *container;
   1927	struct vfio_group *group = device->group;
   1928	struct vfio_iommu_driver *driver;
   1929	int ret;
   1930
   1931	if (!user_pfn || !phys_pfn || !npage ||
   1932	    !vfio_assert_device_open(device))
   1933		return -EINVAL;
   1934
   1935	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
   1936		return -E2BIG;
   1937
   1938	if (group->dev_counter > 1)
   1939		return -EINVAL;
   1940
   1941	/* group->container cannot change while a vfio device is open */
   1942	container = group->container;
   1943	driver = container->iommu_driver;
   1944	if (likely(driver && driver->ops->pin_pages))
   1945		ret = driver->ops->pin_pages(container->iommu_data,
   1946					     group->iommu_group, user_pfn,
   1947					     npage, prot, phys_pfn);
   1948	else
   1949		ret = -ENOTTY;
   1950
   1951	return ret;
   1952}
   1953EXPORT_SYMBOL(vfio_pin_pages);
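
        /*
         * Example (illustrative sketch): an mdev-style driver pinning a
         * single guest page for software access and unpinning it afterwards.
         * "gfn" stands for a guest page frame number the driver already
         * knows; IOMMU_READ/IOMMU_WRITE come from <linux/iommu.h>.
         *
         *	unsigned long pfn;
         *	int ret;
         *
         *	ret = vfio_pin_pages(device, &gfn, 1,
         *			     IOMMU_READ | IOMMU_WRITE, &pfn);
         *	if (ret != 1)
         *		return ret < 0 ? ret : -EFAULT;
         *	... access the page at host PFN pfn ...
         *	vfio_unpin_pages(device, &gfn, 1);
         */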
   1954
   1955/*
   1956 * Unpin set of host PFNs for local domain only.
   1957 * @device [in]  : device
   1958 * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest
   1959 *		   PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
   1960 * @npage [in]   : count of elements in user_pfn array.  This count should not
   1961 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
   1962 * Return error or number of pages unpinned.
   1963 */
   1964int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
   1965		     int npage)
   1966{
   1967	struct vfio_container *container;
   1968	struct vfio_iommu_driver *driver;
   1969	int ret;
   1970
   1971	if (!user_pfn || !npage || !vfio_assert_device_open(device))
   1972		return -EINVAL;
   1973
   1974	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
   1975		return -E2BIG;
   1976
   1977	/* group->container cannot change while a vfio device is open */
   1978	container = device->group->container;
   1979	driver = container->iommu_driver;
   1980	if (likely(driver && driver->ops->unpin_pages))
   1981		ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
   1982					       npage);
   1983	else
   1984		ret = -ENOTTY;
   1985
   1986	return ret;
   1987}
   1988EXPORT_SYMBOL(vfio_unpin_pages);
   1989
   1990/*
    1991 * This interface lets the CPUs perform a form of virtual DMA on behalf of
    1992 * the device.
    1993 *
    1994 * The CPUs copy data between a kernel buffer and a range of IOVAs that map
    1995 * user space memory.
    1996 *
    1997 * Because the access is performed by the CPUs rather than by real device
    1998 * DMA, the user space memory does not need to be pinned.
   1999 *
   2000 * @device [in]		: VFIO device
   2001 * @user_iova [in]	: base IOVA of a user space buffer
   2002 * @data [in]		: pointer to kernel buffer
   2003 * @len [in]		: kernel buffer length
   2004 * @write		: indicate read or write
   2005 * Return error code on failure or 0 on success.
   2006 */
   2007int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova, void *data,
   2008		size_t len, bool write)
   2009{
   2010	struct vfio_container *container;
   2011	struct vfio_iommu_driver *driver;
   2012	int ret = 0;
   2013
   2014	if (!data || len <= 0 || !vfio_assert_device_open(device))
   2015		return -EINVAL;
   2016
   2017	/* group->container cannot change while a vfio device is open */
   2018	container = device->group->container;
   2019	driver = container->iommu_driver;
   2020
   2021	if (likely(driver && driver->ops->dma_rw))
   2022		ret = driver->ops->dma_rw(container->iommu_data,
   2023					  user_iova, data, len, write);
   2024	else
   2025		ret = -ENOTTY;
   2026	return ret;
   2027}
   2028EXPORT_SYMBOL(vfio_dma_rw);
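
        /*
         * Example (illustrative sketch): reading a small guest-visible
         * structure out of user IOVA space without pinning the backing
         * memory.  "iova" and "struct my_desc" are placeholders; @write is
         * false because the CPU reads guest memory into the kernel buffer.
         *
         *	struct my_desc desc;
         *	int ret;
         *
         *	ret = vfio_dma_rw(device, iova, &desc, sizeof(desc), false);
         *	if (ret)
         *		return ret;
         *	... desc now holds a snapshot of guest memory at iova ...
         */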
   2029
   2030static int vfio_register_iommu_notifier(struct vfio_group *group,
   2031					unsigned long *events,
   2032					struct notifier_block *nb)
   2033{
   2034	struct vfio_container *container;
   2035	struct vfio_iommu_driver *driver;
   2036	int ret;
   2037
   2038	lockdep_assert_held_read(&group->group_rwsem);
   2039
   2040	container = group->container;
   2041	driver = container->iommu_driver;
   2042	if (likely(driver && driver->ops->register_notifier))
   2043		ret = driver->ops->register_notifier(container->iommu_data,
   2044						     events, nb);
   2045	else
   2046		ret = -ENOTTY;
   2047
   2048	return ret;
   2049}
   2050
   2051static int vfio_unregister_iommu_notifier(struct vfio_group *group,
   2052					  struct notifier_block *nb)
   2053{
   2054	struct vfio_container *container;
   2055	struct vfio_iommu_driver *driver;
   2056	int ret;
   2057
   2058	lockdep_assert_held_read(&group->group_rwsem);
   2059
   2060	container = group->container;
   2061	driver = container->iommu_driver;
   2062	if (likely(driver && driver->ops->unregister_notifier))
   2063		ret = driver->ops->unregister_notifier(container->iommu_data,
   2064						       nb);
   2065	else
   2066		ret = -ENOTTY;
   2067
   2068	return ret;
   2069}
   2070
   2071int vfio_register_notifier(struct vfio_device *device,
   2072			   enum vfio_notify_type type, unsigned long *events,
   2073			   struct notifier_block *nb)
   2074{
   2075	struct vfio_group *group = device->group;
   2076	int ret;
   2077
   2078	if (!nb || !events || (*events == 0) ||
   2079	    !vfio_assert_device_open(device))
   2080		return -EINVAL;
   2081
   2082	switch (type) {
   2083	case VFIO_IOMMU_NOTIFY:
   2084		ret = vfio_register_iommu_notifier(group, events, nb);
   2085		break;
   2086	default:
   2087		ret = -EINVAL;
   2088	}
   2089	return ret;
   2090}
   2091EXPORT_SYMBOL(vfio_register_notifier);
   2092
   2093int vfio_unregister_notifier(struct vfio_device *device,
   2094			     enum vfio_notify_type type,
   2095			     struct notifier_block *nb)
   2096{
   2097	struct vfio_group *group = device->group;
   2098	int ret;
   2099
   2100	if (!nb || !vfio_assert_device_open(device))
   2101		return -EINVAL;
   2102
   2103	switch (type) {
   2104	case VFIO_IOMMU_NOTIFY:
   2105		ret = vfio_unregister_iommu_notifier(group, nb);
   2106		break;
   2107	default:
   2108		ret = -EINVAL;
   2109	}
   2110	return ret;
   2111}
   2112EXPORT_SYMBOL(vfio_unregister_notifier);
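
        /*
         * Example (illustrative sketch): a driver that pins pages typically
         * registers for DMA unmap notifications so it can drop its pins when
         * the mapping goes away.  "dev_state" and "my_dma_notifier" are
         * placeholders for the driver's per-device state and its
         * notifier_call implementation; the notifier_block must outlive the
         * registration.
         *
         *	unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
         *	int ret;
         *
         *	dev_state->nb.notifier_call = my_dma_notifier;
         *	ret = vfio_register_notifier(device, VFIO_IOMMU_NOTIFY,
         *				     &events, &dev_state->nb);
         *	if (ret)
         *		return ret;
         *	...
         *	vfio_unregister_notifier(device, VFIO_IOMMU_NOTIFY,
         *				 &dev_state->nb);
         */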
   2113
   2114/*
   2115 * Module/class support
   2116 */
   2117static char *vfio_devnode(struct device *dev, umode_t *mode)
   2118{
   2119	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
   2120}
   2121
   2122static struct miscdevice vfio_dev = {
   2123	.minor = VFIO_MINOR,
   2124	.name = "vfio",
   2125	.fops = &vfio_fops,
   2126	.nodename = "vfio/vfio",
   2127	.mode = S_IRUGO | S_IWUGO,
   2128};
   2129
   2130static int __init vfio_init(void)
   2131{
   2132	int ret;
   2133
   2134	ida_init(&vfio.group_ida);
   2135	mutex_init(&vfio.group_lock);
   2136	mutex_init(&vfio.iommu_drivers_lock);
   2137	INIT_LIST_HEAD(&vfio.group_list);
   2138	INIT_LIST_HEAD(&vfio.iommu_drivers_list);
   2139
   2140	ret = misc_register(&vfio_dev);
   2141	if (ret) {
   2142		pr_err("vfio: misc device register failed\n");
   2143		return ret;
   2144	}
   2145
   2146	/* /dev/vfio/$GROUP */
   2147	vfio.class = class_create(THIS_MODULE, "vfio");
   2148	if (IS_ERR(vfio.class)) {
   2149		ret = PTR_ERR(vfio.class);
   2150		goto err_class;
   2151	}
   2152
   2153	vfio.class->devnode = vfio_devnode;
   2154
   2155	ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
   2156	if (ret)
   2157		goto err_alloc_chrdev;
   2158
   2159	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
   2160
   2161#ifdef CONFIG_VFIO_NOIOMMU
   2162	vfio_register_iommu_driver(&vfio_noiommu_ops);
   2163#endif
   2164	return 0;
   2165
   2166err_alloc_chrdev:
   2167	class_destroy(vfio.class);
   2168	vfio.class = NULL;
   2169err_class:
   2170	misc_deregister(&vfio_dev);
   2171	return ret;
   2172}
   2173
   2174static void __exit vfio_cleanup(void)
   2175{
   2176	WARN_ON(!list_empty(&vfio.group_list));
   2177
   2178#ifdef CONFIG_VFIO_NOIOMMU
   2179	vfio_unregister_iommu_driver(&vfio_noiommu_ops);
   2180#endif
   2181	ida_destroy(&vfio.group_ida);
   2182	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
   2183	class_destroy(vfio.class);
   2184	vfio.class = NULL;
   2185	misc_deregister(&vfio_dev);
   2186	xa_destroy(&vfio_device_set_xa);
   2187}
   2188
   2189module_init(vfio_init);
   2190module_exit(vfio_cleanup);
   2191
   2192MODULE_VERSION(DRIVER_VERSION);
   2193MODULE_LICENSE("GPL v2");
   2194MODULE_AUTHOR(DRIVER_AUTHOR);
   2195MODULE_DESCRIPTION(DRIVER_DESC);
   2196MODULE_ALIAS_MISCDEV(VFIO_MINOR);
   2197MODULE_ALIAS("devname:vfio/vfio");
   2198MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");