cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

vfio_iommu_spapr_tce.c (32952B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * VFIO: IOMMU DMA mapping support for TCE on POWER
      4 *
      5 * Copyright (C) 2013 IBM Corp.  All rights reserved.
      6 *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
      7 *
      8 * Derived from original vfio_iommu_type1.c:
      9 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
     10 *     Author: Alex Williamson <alex.williamson@redhat.com>
     11 */
     12
     13#include <linux/module.h>
     14#include <linux/pci.h>
     15#include <linux/slab.h>
     16#include <linux/uaccess.h>
     17#include <linux/err.h>
     18#include <linux/vfio.h>
     19#include <linux/vmalloc.h>
     20#include <linux/sched/mm.h>
     21#include <linux/sched/signal.h>
     22#include <linux/mm.h>
     23#include "vfio.h"
     24
     25#include <asm/iommu.h>
     26#include <asm/tce.h>
     27#include <asm/mmu_context.h>
     28
     29#define DRIVER_VERSION  "0.1"
     30#define DRIVER_AUTHOR   "aik@ozlabs.ru"
     31#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
     32
     33static void tce_iommu_detach_group(void *iommu_data,
     34		struct iommu_group *iommu_group);
     35
     36/*
     37 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
     38 *
     39 * This code handles mapping and unmapping of user data buffers
     40 * into DMA'ble space using the IOMMU
     41 */
     42
     43struct tce_iommu_group {
     44	struct list_head next;
     45	struct iommu_group *grp;
     46};
     47
     48/*
      49 * A container needs to remember which preregistered region it has
     50 * referenced to do proper cleanup at the userspace process exit.
     51 */
     52struct tce_iommu_prereg {
     53	struct list_head next;
     54	struct mm_iommu_table_group_mem_t *mem;
     55};
     56
     57/*
     58 * The container descriptor supports only a single group per container.
     59 * Required by the API as the container is not supplied with the IOMMU group
     60 * at the moment of initialization.
     61 */
     62struct tce_container {
     63	struct mutex lock;
     64	bool enabled;
     65	bool v2;
     66	bool def_window_pending;
     67	unsigned long locked_pages;
     68	struct mm_struct *mm;
     69	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
     70	struct list_head group_list;
     71	struct list_head prereg_list;
     72};
     73
     74static long tce_iommu_mm_set(struct tce_container *container)
     75{
     76	if (container->mm) {
     77		if (container->mm == current->mm)
     78			return 0;
     79		return -EPERM;
     80	}
     81	BUG_ON(!current->mm);
     82	container->mm = current->mm;
     83	mmgrab(container->mm);
     84
     85	return 0;
     86}
     87
     88static long tce_iommu_prereg_free(struct tce_container *container,
     89		struct tce_iommu_prereg *tcemem)
     90{
     91	long ret;
     92
     93	ret = mm_iommu_put(container->mm, tcemem->mem);
     94	if (ret)
     95		return ret;
     96
     97	list_del(&tcemem->next);
     98	kfree(tcemem);
     99
    100	return 0;
    101}
    102
    103static long tce_iommu_unregister_pages(struct tce_container *container,
    104		__u64 vaddr, __u64 size)
    105{
    106	struct mm_iommu_table_group_mem_t *mem;
    107	struct tce_iommu_prereg *tcemem;
    108	bool found = false;
    109	long ret;
    110
    111	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
    112		return -EINVAL;
    113
    114	mem = mm_iommu_get(container->mm, vaddr, size >> PAGE_SHIFT);
    115	if (!mem)
    116		return -ENOENT;
    117
    118	list_for_each_entry(tcemem, &container->prereg_list, next) {
    119		if (tcemem->mem == mem) {
    120			found = true;
    121			break;
    122		}
    123	}
    124
    125	if (!found)
    126		ret = -ENOENT;
    127	else
    128		ret = tce_iommu_prereg_free(container, tcemem);
    129
    130	mm_iommu_put(container->mm, mem);
    131
    132	return ret;
    133}
    134
    135static long tce_iommu_register_pages(struct tce_container *container,
    136		__u64 vaddr, __u64 size)
    137{
    138	long ret = 0;
    139	struct mm_iommu_table_group_mem_t *mem = NULL;
    140	struct tce_iommu_prereg *tcemem;
    141	unsigned long entries = size >> PAGE_SHIFT;
    142
    143	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
    144			((vaddr + size) < vaddr))
    145		return -EINVAL;
    146
    147	mem = mm_iommu_get(container->mm, vaddr, entries);
    148	if (mem) {
    149		list_for_each_entry(tcemem, &container->prereg_list, next) {
    150			if (tcemem->mem == mem) {
    151				ret = -EBUSY;
    152				goto put_exit;
    153			}
    154		}
    155	} else {
    156		ret = mm_iommu_new(container->mm, vaddr, entries, &mem);
    157		if (ret)
    158			return ret;
    159	}
    160
    161	tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL);
    162	if (!tcemem) {
    163		ret = -ENOMEM;
    164		goto put_exit;
    165	}
    166
    167	tcemem->mem = mem;
    168	list_add(&tcemem->next, &container->prereg_list);
    169
    170	container->enabled = true;
    171
    172	return 0;
    173
    174put_exit:
    175	mm_iommu_put(container->mm, mem);
    176	return ret;
    177}
    178
    179static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa,
    180		unsigned int it_page_shift)
    181{
    182	struct page *page;
    183	unsigned long size = 0;
    184
    185	if (mm_iommu_is_devmem(mm, hpa, it_page_shift, &size))
    186		return size == (1UL << it_page_shift);
    187
    188	page = pfn_to_page(hpa >> PAGE_SHIFT);
    189	/*
    190	 * Check that the TCE table granularity is not bigger than the size of
    191	 * a page we just found. Otherwise the hardware can get access to
     192	 * a bigger memory chunk than it should.
    193	 */
    194	return page_shift(compound_head(page)) >= it_page_shift;
    195}
    196
    197static inline bool tce_groups_attached(struct tce_container *container)
    198{
    199	return !list_empty(&container->group_list);
    200}
    201
    202static long tce_iommu_find_table(struct tce_container *container,
    203		phys_addr_t ioba, struct iommu_table **ptbl)
    204{
    205	long i;
    206
    207	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
    208		struct iommu_table *tbl = container->tables[i];
    209
    210		if (tbl) {
    211			unsigned long entry = ioba >> tbl->it_page_shift;
    212			unsigned long start = tbl->it_offset;
    213			unsigned long end = start + tbl->it_size;
    214
    215			if ((start <= entry) && (entry < end)) {
    216				*ptbl = tbl;
    217				return i;
    218			}
    219		}
    220	}
    221
    222	return -1;
    223}
    224
    225static int tce_iommu_find_free_table(struct tce_container *container)
    226{
    227	int i;
    228
    229	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
    230		if (!container->tables[i])
    231			return i;
    232	}
    233
    234	return -ENOSPC;
    235}
    236
    237static int tce_iommu_enable(struct tce_container *container)
    238{
    239	int ret = 0;
    240	unsigned long locked;
    241	struct iommu_table_group *table_group;
    242	struct tce_iommu_group *tcegrp;
    243
    244	if (container->enabled)
    245		return -EBUSY;
    246
    247	/*
    248	 * When userspace pages are mapped into the IOMMU, they are effectively
    249	 * locked memory, so, theoretically, we need to update the accounting
     250	 * of locked pages on each map and unmap.  For powerpc, the map/unmap
     251	 * paths can be very hot, though, and the accounting would kill
     252	 * performance, especially since it would be difficult or impossible
    253	 * to handle the accounting in real mode only.
    254	 *
    255	 * To address that, rather than precisely accounting every page, we
    256	 * instead account for a worst case on locked memory when the iommu is
    257	 * enabled and disabled.  The worst case upper bound on locked memory
    258	 * is the size of the whole iommu window, which is usually relatively
    259	 * small (compared to total memory sizes) on POWER hardware.
    260	 *
     261	 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits;
     262	 * that would effectively kill the guest at random points. It is much
     263	 * better to enforce the limit based on the max the guest can map.
     264	 *
     265	 * Unfortunately at the moment it counts whole tables, no matter how
     266	 * much memory the guest has. I.e. for a 4GB guest and 4 IOMMU groups,
     267	 * each with a 2GB DMA window, 8GB will be counted here. The reason for
    268	 * this is that we cannot tell here the amount of RAM used by the guest
    269	 * as this information is only available from KVM and VFIO is
    270	 * KVM agnostic.
    271	 *
    272	 * So we do not allow enabling a container without a group attached
    273	 * as there is no way to know how much we should increment
    274	 * the locked_vm counter.
    275	 */
    276	if (!tce_groups_attached(container))
    277		return -ENODEV;
    278
    279	tcegrp = list_first_entry(&container->group_list,
    280			struct tce_iommu_group, next);
    281	table_group = iommu_group_get_iommudata(tcegrp->grp);
    282	if (!table_group)
    283		return -ENODEV;
    284
    285	if (!table_group->tce32_size)
    286		return -EPERM;
    287
    288	ret = tce_iommu_mm_set(container);
    289	if (ret)
    290		return ret;
    291
    292	locked = table_group->tce32_size >> PAGE_SHIFT;
    293	ret = account_locked_vm(container->mm, locked, true);
    294	if (ret)
    295		return ret;
    296
    297	container->locked_pages = locked;
    298
    299	container->enabled = true;
    300
    301	return ret;
    302}
    303
    304static void tce_iommu_disable(struct tce_container *container)
    305{
    306	if (!container->enabled)
    307		return;
    308
    309	container->enabled = false;
    310
    311	BUG_ON(!container->mm);
    312	account_locked_vm(container->mm, container->locked_pages, false);
    313}
    314
    315static void *tce_iommu_open(unsigned long arg)
    316{
    317	struct tce_container *container;
    318
    319	if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
    320		pr_err("tce_vfio: Wrong IOMMU type\n");
    321		return ERR_PTR(-EINVAL);
    322	}
    323
    324	container = kzalloc(sizeof(*container), GFP_KERNEL);
    325	if (!container)
    326		return ERR_PTR(-ENOMEM);
    327
    328	mutex_init(&container->lock);
    329	INIT_LIST_HEAD_RCU(&container->group_list);
    330	INIT_LIST_HEAD_RCU(&container->prereg_list);
    331
    332	container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;
    333
    334	return container;
    335}
    336
    337static int tce_iommu_clear(struct tce_container *container,
    338		struct iommu_table *tbl,
    339		unsigned long entry, unsigned long pages);
    340static void tce_iommu_free_table(struct tce_container *container,
    341		struct iommu_table *tbl);
    342
    343static void tce_iommu_release(void *iommu_data)
    344{
    345	struct tce_container *container = iommu_data;
    346	struct tce_iommu_group *tcegrp;
    347	struct tce_iommu_prereg *tcemem, *tmtmp;
    348	long i;
    349
    350	while (tce_groups_attached(container)) {
    351		tcegrp = list_first_entry(&container->group_list,
    352				struct tce_iommu_group, next);
    353		tce_iommu_detach_group(iommu_data, tcegrp->grp);
    354	}
    355
    356	/*
     357	 * If VFIO created a table, it was not disposed of
     358	 * by tce_iommu_detach_group(), so do it now.
    359	 */
    360	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
    361		struct iommu_table *tbl = container->tables[i];
    362
    363		if (!tbl)
    364			continue;
    365
    366		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
    367		tce_iommu_free_table(container, tbl);
    368	}
    369
    370	list_for_each_entry_safe(tcemem, tmtmp, &container->prereg_list, next)
    371		WARN_ON(tce_iommu_prereg_free(container, tcemem));
    372
    373	tce_iommu_disable(container);
    374	if (container->mm)
    375		mmdrop(container->mm);
    376	mutex_destroy(&container->lock);
    377
    378	kfree(container);
    379}
    380
    381static void tce_iommu_unuse_page(struct tce_container *container,
    382		unsigned long hpa)
    383{
    384	struct page *page;
    385
    386	page = pfn_to_page(hpa >> PAGE_SHIFT);
    387	unpin_user_page(page);
    388}
    389
    390static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
    391		unsigned long tce, unsigned long shift,
    392		unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
    393{
    394	long ret = 0;
    395	struct mm_iommu_table_group_mem_t *mem;
    396
    397	mem = mm_iommu_lookup(container->mm, tce, 1ULL << shift);
    398	if (!mem)
    399		return -EINVAL;
    400
    401	ret = mm_iommu_ua_to_hpa(mem, tce, shift, phpa);
    402	if (ret)
    403		return -EINVAL;
    404
    405	*pmem = mem;
    406
    407	return 0;
    408}
    409
    410static void tce_iommu_unuse_page_v2(struct tce_container *container,
    411		struct iommu_table *tbl, unsigned long entry)
    412{
    413	struct mm_iommu_table_group_mem_t *mem = NULL;
    414	int ret;
    415	unsigned long hpa = 0;
    416	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
    417
    418	if (!pua)
    419		return;
    420
    421	ret = tce_iommu_prereg_ua_to_hpa(container, be64_to_cpu(*pua),
    422			tbl->it_page_shift, &hpa, &mem);
    423	if (ret)
    424		pr_debug("%s: tce %llx at #%lx was not cached, ret=%d\n",
    425				__func__, be64_to_cpu(*pua), entry, ret);
    426	if (mem)
    427		mm_iommu_mapped_dec(mem);
    428
    429	*pua = cpu_to_be64(0);
    430}
    431
    432static int tce_iommu_clear(struct tce_container *container,
    433		struct iommu_table *tbl,
    434		unsigned long entry, unsigned long pages)
    435{
    436	unsigned long oldhpa;
    437	long ret;
    438	enum dma_data_direction direction;
    439	unsigned long lastentry = entry + pages, firstentry = entry;
    440
    441	for ( ; entry < lastentry; ++entry) {
    442		if (tbl->it_indirect_levels && tbl->it_userspace) {
    443			/*
    444			 * For multilevel tables, we can take a shortcut here
    445			 * and skip some TCEs as we know that the userspace
    446			 * addresses cache is a mirror of the real TCE table
    447			 * and if it is missing some indirect levels, then
    448			 * the hardware table does not have them allocated
    449			 * either and therefore does not require updating.
    450			 */
    451			__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl,
    452					entry);
    453			if (!pua) {
    454				/* align to level_size which is power of two */
    455				entry |= tbl->it_level_size - 1;
    456				continue;
    457			}
    458		}
    459
    460		cond_resched();
    461
    462		direction = DMA_NONE;
    463		oldhpa = 0;
    464		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry, &oldhpa,
    465				&direction);
    466		if (ret)
    467			continue;
    468
    469		if (direction == DMA_NONE)
    470			continue;
    471
    472		if (container->v2) {
    473			tce_iommu_unuse_page_v2(container, tbl, entry);
    474			continue;
    475		}
    476
    477		tce_iommu_unuse_page(container, oldhpa);
    478	}
    479
    480	iommu_tce_kill(tbl, firstentry, pages);
    481
    482	return 0;
    483}
    484
    485static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
    486{
    487	struct page *page = NULL;
    488	enum dma_data_direction direction = iommu_tce_direction(tce);
    489
    490	if (pin_user_pages_fast(tce & PAGE_MASK, 1,
    491			direction != DMA_TO_DEVICE ? FOLL_WRITE : 0,
    492			&page) != 1)
    493		return -EFAULT;
    494
    495	*hpa = __pa((unsigned long) page_address(page));
    496
    497	return 0;
    498}
    499
    500static long tce_iommu_build(struct tce_container *container,
    501		struct iommu_table *tbl,
    502		unsigned long entry, unsigned long tce, unsigned long pages,
    503		enum dma_data_direction direction)
    504{
    505	long i, ret = 0;
    506	unsigned long hpa;
    507	enum dma_data_direction dirtmp;
    508
    509	for (i = 0; i < pages; ++i) {
    510		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
    511
    512		ret = tce_iommu_use_page(tce, &hpa);
    513		if (ret)
    514			break;
    515
    516		if (!tce_page_is_contained(container->mm, hpa,
    517				tbl->it_page_shift)) {
    518			ret = -EPERM;
    519			break;
    520		}
    521
    522		hpa |= offset;
    523		dirtmp = direction;
    524		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
    525				&hpa, &dirtmp);
    526		if (ret) {
    527			tce_iommu_unuse_page(container, hpa);
    528			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
    529					__func__, entry << tbl->it_page_shift,
    530					tce, ret);
    531			break;
    532		}
    533
    534		if (dirtmp != DMA_NONE)
    535			tce_iommu_unuse_page(container, hpa);
    536
    537		tce += IOMMU_PAGE_SIZE(tbl);
    538	}
    539
    540	if (ret)
    541		tce_iommu_clear(container, tbl, entry, i);
    542	else
    543		iommu_tce_kill(tbl, entry, pages);
    544
    545	return ret;
    546}
    547
    548static long tce_iommu_build_v2(struct tce_container *container,
    549		struct iommu_table *tbl,
    550		unsigned long entry, unsigned long tce, unsigned long pages,
    551		enum dma_data_direction direction)
    552{
    553	long i, ret = 0;
    554	unsigned long hpa;
    555	enum dma_data_direction dirtmp;
    556
    557	for (i = 0; i < pages; ++i) {
    558		struct mm_iommu_table_group_mem_t *mem = NULL;
    559		__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry + i);
    560
    561		ret = tce_iommu_prereg_ua_to_hpa(container,
    562				tce, tbl->it_page_shift, &hpa, &mem);
    563		if (ret)
    564			break;
    565
    566		if (!tce_page_is_contained(container->mm, hpa,
    567				tbl->it_page_shift)) {
    568			ret = -EPERM;
    569			break;
    570		}
    571
    572		/* Preserve offset within IOMMU page */
    573		hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
    574		dirtmp = direction;
    575
    576		/* The registered region is being unregistered */
    577		if (mm_iommu_mapped_inc(mem))
    578			break;
    579
    580		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
    581				&hpa, &dirtmp);
    582		if (ret) {
    583			/* dirtmp cannot be DMA_NONE here */
    584			tce_iommu_unuse_page_v2(container, tbl, entry + i);
    585			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
    586					__func__, entry << tbl->it_page_shift,
    587					tce, ret);
    588			break;
    589		}
    590
    591		if (dirtmp != DMA_NONE)
    592			tce_iommu_unuse_page_v2(container, tbl, entry + i);
    593
    594		*pua = cpu_to_be64(tce);
    595
    596		tce += IOMMU_PAGE_SIZE(tbl);
    597	}
    598
    599	if (ret)
    600		tce_iommu_clear(container, tbl, entry, i);
    601	else
    602		iommu_tce_kill(tbl, entry, pages);
    603
    604	return ret;
    605}
    606
    607static long tce_iommu_create_table(struct tce_container *container,
    608			struct iommu_table_group *table_group,
    609			int num,
    610			__u32 page_shift,
    611			__u64 window_size,
    612			__u32 levels,
    613			struct iommu_table **ptbl)
    614{
    615	long ret, table_size;
    616
    617	table_size = table_group->ops->get_table_size(page_shift, window_size,
    618			levels);
    619	if (!table_size)
    620		return -EINVAL;
    621
    622	ret = account_locked_vm(container->mm, table_size >> PAGE_SHIFT, true);
    623	if (ret)
    624		return ret;
    625
    626	ret = table_group->ops->create_table(table_group, num,
    627			page_shift, window_size, levels, ptbl);
    628
    629	WARN_ON(!ret && !(*ptbl)->it_ops->free);
    630	WARN_ON(!ret && ((*ptbl)->it_allocated_size > table_size));
    631
    632	return ret;
    633}
    634
    635static void tce_iommu_free_table(struct tce_container *container,
    636		struct iommu_table *tbl)
    637{
    638	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
    639
    640	iommu_tce_table_put(tbl);
    641	account_locked_vm(container->mm, pages, false);
    642}
    643
    644static long tce_iommu_create_window(struct tce_container *container,
    645		__u32 page_shift, __u64 window_size, __u32 levels,
    646		__u64 *start_addr)
    647{
    648	struct tce_iommu_group *tcegrp;
    649	struct iommu_table_group *table_group;
    650	struct iommu_table *tbl = NULL;
    651	long ret, num;
    652
    653	num = tce_iommu_find_free_table(container);
    654	if (num < 0)
    655		return num;
    656
    657	/* Get the first group for ops::create_table */
    658	tcegrp = list_first_entry(&container->group_list,
    659			struct tce_iommu_group, next);
    660	table_group = iommu_group_get_iommudata(tcegrp->grp);
    661	if (!table_group)
    662		return -EFAULT;
    663
    664	if (!(table_group->pgsizes & (1ULL << page_shift)))
    665		return -EINVAL;
    666
    667	if (!table_group->ops->set_window || !table_group->ops->unset_window ||
    668			!table_group->ops->get_table_size ||
    669			!table_group->ops->create_table)
    670		return -EPERM;
    671
    672	/* Create TCE table */
    673	ret = tce_iommu_create_table(container, table_group, num,
    674			page_shift, window_size, levels, &tbl);
    675	if (ret)
    676		return ret;
    677
    678	BUG_ON(!tbl->it_ops->free);
    679
    680	/*
    681	 * Program the table to every group.
    682	 * Groups have been tested for compatibility at the attach time.
    683	 */
    684	list_for_each_entry(tcegrp, &container->group_list, next) {
    685		table_group = iommu_group_get_iommudata(tcegrp->grp);
    686
    687		ret = table_group->ops->set_window(table_group, num, tbl);
    688		if (ret)
    689			goto unset_exit;
    690	}
    691
    692	container->tables[num] = tbl;
    693
    694	/* Return start address assigned by platform in create_table() */
    695	*start_addr = tbl->it_offset << tbl->it_page_shift;
    696
    697	return 0;
    698
    699unset_exit:
    700	list_for_each_entry(tcegrp, &container->group_list, next) {
    701		table_group = iommu_group_get_iommudata(tcegrp->grp);
    702		table_group->ops->unset_window(table_group, num);
    703	}
    704	tce_iommu_free_table(container, tbl);
    705
    706	return ret;
    707}
    708
    709static long tce_iommu_remove_window(struct tce_container *container,
    710		__u64 start_addr)
    711{
    712	struct iommu_table_group *table_group = NULL;
    713	struct iommu_table *tbl;
    714	struct tce_iommu_group *tcegrp;
    715	int num;
    716
    717	num = tce_iommu_find_table(container, start_addr, &tbl);
    718	if (num < 0)
    719		return -EINVAL;
    720
    721	BUG_ON(!tbl->it_size);
    722
    723	/* Detach groups from IOMMUs */
    724	list_for_each_entry(tcegrp, &container->group_list, next) {
    725		table_group = iommu_group_get_iommudata(tcegrp->grp);
    726
    727		/*
    728		 * SPAPR TCE IOMMU exposes the default DMA window to
    729		 * the guest via dma32_window_start/size of
    730		 * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow
    731		 * the userspace to remove this window, some do not so
     732	 * userspace to remove this window and some do not, so
    733		 */
    734		if (!table_group->ops || !table_group->ops->unset_window)
    735			return -EPERM;
    736
    737		table_group->ops->unset_window(table_group, num);
    738	}
    739
    740	/* Free table */
    741	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
    742	tce_iommu_free_table(container, tbl);
    743	container->tables[num] = NULL;
    744
    745	return 0;
    746}
    747
    748static long tce_iommu_create_default_window(struct tce_container *container)
    749{
    750	long ret;
    751	__u64 start_addr = 0;
    752	struct tce_iommu_group *tcegrp;
    753	struct iommu_table_group *table_group;
    754
    755	if (!container->def_window_pending)
    756		return 0;
    757
    758	if (!tce_groups_attached(container))
    759		return -ENODEV;
    760
    761	tcegrp = list_first_entry(&container->group_list,
    762			struct tce_iommu_group, next);
    763	table_group = iommu_group_get_iommudata(tcegrp->grp);
    764	if (!table_group)
    765		return -ENODEV;
    766
    767	ret = tce_iommu_create_window(container, IOMMU_PAGE_SHIFT_4K,
    768			table_group->tce32_size, 1, &start_addr);
    769	WARN_ON_ONCE(!ret && start_addr);
    770
    771	if (!ret)
    772		container->def_window_pending = false;
    773
    774	return ret;
    775}
    776
    777static long tce_iommu_ioctl(void *iommu_data,
    778				 unsigned int cmd, unsigned long arg)
    779{
    780	struct tce_container *container = iommu_data;
    781	unsigned long minsz, ddwsz;
    782	long ret;
    783
    784	switch (cmd) {
    785	case VFIO_CHECK_EXTENSION:
    786		switch (arg) {
    787		case VFIO_SPAPR_TCE_IOMMU:
    788		case VFIO_SPAPR_TCE_v2_IOMMU:
    789			ret = 1;
    790			break;
    791		default:
    792			ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
    793			break;
    794		}
    795
    796		return (ret < 0) ? 0 : ret;
    797	}
    798
    799	/*
    800	 * Sanity check to prevent one userspace from manipulating
    801	 * another userspace mm.
    802	 */
    803	BUG_ON(!container);
    804	if (container->mm && container->mm != current->mm)
    805		return -EPERM;
    806
    807	switch (cmd) {
    808	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
    809		struct vfio_iommu_spapr_tce_info info;
    810		struct tce_iommu_group *tcegrp;
    811		struct iommu_table_group *table_group;
    812
    813		if (!tce_groups_attached(container))
    814			return -ENXIO;
    815
    816		tcegrp = list_first_entry(&container->group_list,
    817				struct tce_iommu_group, next);
    818		table_group = iommu_group_get_iommudata(tcegrp->grp);
    819
    820		if (!table_group)
    821			return -ENXIO;
    822
    823		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
    824				dma32_window_size);
    825
    826		if (copy_from_user(&info, (void __user *)arg, minsz))
    827			return -EFAULT;
    828
    829		if (info.argsz < minsz)
    830			return -EINVAL;
    831
    832		info.dma32_window_start = table_group->tce32_start;
    833		info.dma32_window_size = table_group->tce32_size;
    834		info.flags = 0;
    835		memset(&info.ddw, 0, sizeof(info.ddw));
    836
    837		if (table_group->max_dynamic_windows_supported &&
    838				container->v2) {
    839			info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
    840			info.ddw.pgsizes = table_group->pgsizes;
    841			info.ddw.max_dynamic_windows_supported =
    842				table_group->max_dynamic_windows_supported;
    843			info.ddw.levels = table_group->max_levels;
    844		}
    845
    846		ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);
    847
    848		if (info.argsz >= ddwsz)
    849			minsz = ddwsz;
    850
    851		if (copy_to_user((void __user *)arg, &info, minsz))
    852			return -EFAULT;
    853
    854		return 0;
    855	}
    856	case VFIO_IOMMU_MAP_DMA: {
    857		struct vfio_iommu_type1_dma_map param;
    858		struct iommu_table *tbl = NULL;
    859		long num;
    860		enum dma_data_direction direction;
    861
    862		if (!container->enabled)
    863			return -EPERM;
    864
    865		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
    866
    867		if (copy_from_user(&param, (void __user *)arg, minsz))
    868			return -EFAULT;
    869
    870		if (param.argsz < minsz)
    871			return -EINVAL;
    872
    873		if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
    874				VFIO_DMA_MAP_FLAG_WRITE))
    875			return -EINVAL;
    876
    877		ret = tce_iommu_create_default_window(container);
    878		if (ret)
    879			return ret;
    880
    881		num = tce_iommu_find_table(container, param.iova, &tbl);
    882		if (num < 0)
    883			return -ENXIO;
    884
    885		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
    886				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
    887			return -EINVAL;
    888
    889		/* iova is checked by the IOMMU API */
    890		if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
    891			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
    892				direction = DMA_BIDIRECTIONAL;
    893			else
    894				direction = DMA_TO_DEVICE;
    895		} else {
    896			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
    897				direction = DMA_FROM_DEVICE;
    898			else
    899				return -EINVAL;
    900		}
    901
    902		ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
    903		if (ret)
    904			return ret;
    905
    906		if (container->v2)
    907			ret = tce_iommu_build_v2(container, tbl,
    908					param.iova >> tbl->it_page_shift,
    909					param.vaddr,
    910					param.size >> tbl->it_page_shift,
    911					direction);
    912		else
    913			ret = tce_iommu_build(container, tbl,
    914					param.iova >> tbl->it_page_shift,
    915					param.vaddr,
    916					param.size >> tbl->it_page_shift,
    917					direction);
    918
    919		iommu_flush_tce(tbl);
    920
    921		return ret;
    922	}
    923	case VFIO_IOMMU_UNMAP_DMA: {
    924		struct vfio_iommu_type1_dma_unmap param;
    925		struct iommu_table *tbl = NULL;
    926		long num;
    927
    928		if (!container->enabled)
    929			return -EPERM;
    930
    931		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
    932				size);
    933
    934		if (copy_from_user(&param, (void __user *)arg, minsz))
    935			return -EFAULT;
    936
    937		if (param.argsz < minsz)
    938			return -EINVAL;
    939
    940		/* No flag is supported now */
    941		if (param.flags)
    942			return -EINVAL;
    943
    944		ret = tce_iommu_create_default_window(container);
    945		if (ret)
    946			return ret;
    947
    948		num = tce_iommu_find_table(container, param.iova, &tbl);
    949		if (num < 0)
    950			return -ENXIO;
    951
    952		if (param.size & ~IOMMU_PAGE_MASK(tbl))
    953			return -EINVAL;
    954
    955		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
    956				param.size >> tbl->it_page_shift);
    957		if (ret)
    958			return ret;
    959
    960		ret = tce_iommu_clear(container, tbl,
    961				param.iova >> tbl->it_page_shift,
    962				param.size >> tbl->it_page_shift);
    963		iommu_flush_tce(tbl);
    964
    965		return ret;
    966	}
    967	case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
    968		struct vfio_iommu_spapr_register_memory param;
    969
    970		if (!container->v2)
    971			break;
    972
    973		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
    974				size);
    975
    976		ret = tce_iommu_mm_set(container);
    977		if (ret)
    978			return ret;
    979
    980		if (copy_from_user(&param, (void __user *)arg, minsz))
    981			return -EFAULT;
    982
    983		if (param.argsz < minsz)
    984			return -EINVAL;
    985
    986		/* No flag is supported now */
    987		if (param.flags)
    988			return -EINVAL;
    989
    990		mutex_lock(&container->lock);
    991		ret = tce_iommu_register_pages(container, param.vaddr,
    992				param.size);
    993		mutex_unlock(&container->lock);
    994
    995		return ret;
    996	}
    997	case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
    998		struct vfio_iommu_spapr_register_memory param;
    999
   1000		if (!container->v2)
   1001			break;
   1002
   1003		if (!container->mm)
   1004			return -EPERM;
   1005
   1006		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
   1007				size);
   1008
   1009		if (copy_from_user(&param, (void __user *)arg, minsz))
   1010			return -EFAULT;
   1011
   1012		if (param.argsz < minsz)
   1013			return -EINVAL;
   1014
   1015		/* No flag is supported now */
   1016		if (param.flags)
   1017			return -EINVAL;
   1018
   1019		mutex_lock(&container->lock);
   1020		ret = tce_iommu_unregister_pages(container, param.vaddr,
   1021				param.size);
   1022		mutex_unlock(&container->lock);
   1023
   1024		return ret;
   1025	}
   1026	case VFIO_IOMMU_ENABLE:
   1027		if (container->v2)
   1028			break;
   1029
   1030		mutex_lock(&container->lock);
   1031		ret = tce_iommu_enable(container);
   1032		mutex_unlock(&container->lock);
   1033		return ret;
   1034
   1035
   1036	case VFIO_IOMMU_DISABLE:
   1037		if (container->v2)
   1038			break;
   1039
   1040		mutex_lock(&container->lock);
   1041		tce_iommu_disable(container);
   1042		mutex_unlock(&container->lock);
   1043		return 0;
   1044
   1045	case VFIO_EEH_PE_OP: {
   1046		struct tce_iommu_group *tcegrp;
   1047
   1048		ret = 0;
   1049		list_for_each_entry(tcegrp, &container->group_list, next) {
   1050			ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
   1051					cmd, arg);
   1052			if (ret)
   1053				return ret;
   1054		}
   1055		return ret;
   1056	}
   1057
   1058	case VFIO_IOMMU_SPAPR_TCE_CREATE: {
   1059		struct vfio_iommu_spapr_tce_create create;
   1060
   1061		if (!container->v2)
   1062			break;
   1063
   1064		ret = tce_iommu_mm_set(container);
   1065		if (ret)
   1066			return ret;
   1067
   1068		if (!tce_groups_attached(container))
   1069			return -ENXIO;
   1070
   1071		minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
   1072				start_addr);
   1073
   1074		if (copy_from_user(&create, (void __user *)arg, minsz))
   1075			return -EFAULT;
   1076
   1077		if (create.argsz < minsz)
   1078			return -EINVAL;
   1079
   1080		if (create.flags)
   1081			return -EINVAL;
   1082
   1083		mutex_lock(&container->lock);
   1084
   1085		ret = tce_iommu_create_default_window(container);
   1086		if (!ret)
   1087			ret = tce_iommu_create_window(container,
   1088					create.page_shift,
   1089					create.window_size, create.levels,
   1090					&create.start_addr);
   1091
   1092		mutex_unlock(&container->lock);
   1093
   1094		if (!ret && copy_to_user((void __user *)arg, &create, minsz))
   1095			ret = -EFAULT;
   1096
   1097		return ret;
   1098	}
   1099	case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
   1100		struct vfio_iommu_spapr_tce_remove remove;
   1101
   1102		if (!container->v2)
   1103			break;
   1104
   1105		ret = tce_iommu_mm_set(container);
   1106		if (ret)
   1107			return ret;
   1108
   1109		if (!tce_groups_attached(container))
   1110			return -ENXIO;
   1111
   1112		minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
   1113				start_addr);
   1114
   1115		if (copy_from_user(&remove, (void __user *)arg, minsz))
   1116			return -EFAULT;
   1117
   1118		if (remove.argsz < minsz)
   1119			return -EINVAL;
   1120
   1121		if (remove.flags)
   1122			return -EINVAL;
   1123
   1124		if (container->def_window_pending && !remove.start_addr) {
   1125			container->def_window_pending = false;
   1126			return 0;
   1127		}
   1128
   1129		mutex_lock(&container->lock);
   1130
   1131		ret = tce_iommu_remove_window(container, remove.start_addr);
   1132
   1133		mutex_unlock(&container->lock);
   1134
   1135		return ret;
   1136	}
   1137	}
   1138
   1139	return -ENOTTY;
   1140}
   1141
   1142static void tce_iommu_release_ownership(struct tce_container *container,
   1143		struct iommu_table_group *table_group)
   1144{
   1145	int i;
   1146
   1147	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
   1148		struct iommu_table *tbl = container->tables[i];
   1149
   1150		if (!tbl)
   1151			continue;
   1152
   1153		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
   1154		if (tbl->it_map)
   1155			iommu_release_ownership(tbl);
   1156
   1157		container->tables[i] = NULL;
   1158	}
   1159}
   1160
   1161static int tce_iommu_take_ownership(struct tce_container *container,
   1162		struct iommu_table_group *table_group)
   1163{
   1164	int i, j, rc = 0;
   1165
   1166	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
   1167		struct iommu_table *tbl = table_group->tables[i];
   1168
   1169		if (!tbl || !tbl->it_map)
   1170			continue;
   1171
   1172		rc = iommu_take_ownership(tbl);
   1173		if (rc) {
   1174			for (j = 0; j < i; ++j)
   1175				iommu_release_ownership(
   1176						table_group->tables[j]);
   1177
   1178			return rc;
   1179		}
   1180	}
   1181
   1182	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
   1183		container->tables[i] = table_group->tables[i];
   1184
   1185	return 0;
   1186}
   1187
   1188static void tce_iommu_release_ownership_ddw(struct tce_container *container,
   1189		struct iommu_table_group *table_group)
   1190{
   1191	long i;
   1192
   1193	if (!table_group->ops->unset_window) {
   1194		WARN_ON_ONCE(1);
   1195		return;
   1196	}
   1197
   1198	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
   1199		if (container->tables[i])
   1200			table_group->ops->unset_window(table_group, i);
   1201
   1202	table_group->ops->release_ownership(table_group);
   1203}
   1204
   1205static long tce_iommu_take_ownership_ddw(struct tce_container *container,
   1206		struct iommu_table_group *table_group)
   1207{
   1208	long i, ret = 0;
   1209
   1210	if (!table_group->ops->create_table || !table_group->ops->set_window ||
   1211			!table_group->ops->release_ownership) {
   1212		WARN_ON_ONCE(1);
   1213		return -EFAULT;
   1214	}
   1215
   1216	table_group->ops->take_ownership(table_group);
   1217
   1218	/* Set all windows to the new group */
   1219	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
   1220		struct iommu_table *tbl = container->tables[i];
   1221
   1222		if (!tbl)
   1223			continue;
   1224
   1225		ret = table_group->ops->set_window(table_group, i, tbl);
   1226		if (ret)
   1227			goto release_exit;
   1228	}
   1229
   1230	return 0;
   1231
   1232release_exit:
   1233	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
   1234		table_group->ops->unset_window(table_group, i);
   1235
   1236	table_group->ops->release_ownership(table_group);
   1237
   1238	return ret;
   1239}
   1240
   1241static int tce_iommu_attach_group(void *iommu_data,
   1242		struct iommu_group *iommu_group, enum vfio_group_type type)
   1243{
   1244	int ret = 0;
   1245	struct tce_container *container = iommu_data;
   1246	struct iommu_table_group *table_group;
   1247	struct tce_iommu_group *tcegrp = NULL;
   1248
   1249	if (type == VFIO_EMULATED_IOMMU)
   1250		return -EINVAL;
   1251
   1252	mutex_lock(&container->lock);
   1253
   1254	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
   1255			iommu_group_id(iommu_group), iommu_group); */
   1256	table_group = iommu_group_get_iommudata(iommu_group);
   1257	if (!table_group) {
   1258		ret = -ENODEV;
   1259		goto unlock_exit;
   1260	}
   1261
   1262	if (tce_groups_attached(container) && (!table_group->ops ||
   1263			!table_group->ops->take_ownership ||
   1264			!table_group->ops->release_ownership)) {
   1265		ret = -EBUSY;
   1266		goto unlock_exit;
   1267	}
   1268
   1269	/* Check if new group has the same iommu_ops (i.e. compatible) */
   1270	list_for_each_entry(tcegrp, &container->group_list, next) {
   1271		struct iommu_table_group *table_group_tmp;
   1272
   1273		if (tcegrp->grp == iommu_group) {
   1274			pr_warn("tce_vfio: Group %d is already attached\n",
   1275					iommu_group_id(iommu_group));
   1276			ret = -EBUSY;
   1277			goto unlock_exit;
   1278		}
   1279		table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
   1280		if (table_group_tmp->ops->create_table !=
   1281				table_group->ops->create_table) {
   1282			pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
   1283					iommu_group_id(iommu_group),
   1284					iommu_group_id(tcegrp->grp));
   1285			ret = -EPERM;
   1286			goto unlock_exit;
   1287		}
   1288	}
   1289
   1290	tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
   1291	if (!tcegrp) {
   1292		ret = -ENOMEM;
   1293		goto unlock_exit;
   1294	}
   1295
   1296	if (!table_group->ops || !table_group->ops->take_ownership ||
   1297			!table_group->ops->release_ownership) {
   1298		if (container->v2) {
   1299			ret = -EPERM;
   1300			goto free_exit;
   1301		}
   1302		ret = tce_iommu_take_ownership(container, table_group);
   1303	} else {
   1304		if (!container->v2) {
   1305			ret = -EPERM;
   1306			goto free_exit;
   1307		}
   1308		ret = tce_iommu_take_ownership_ddw(container, table_group);
   1309		if (!tce_groups_attached(container) && !container->tables[0])
   1310			container->def_window_pending = true;
   1311	}
   1312
   1313	if (!ret) {
   1314		tcegrp->grp = iommu_group;
   1315		list_add(&tcegrp->next, &container->group_list);
   1316	}
   1317
   1318free_exit:
   1319	if (ret && tcegrp)
   1320		kfree(tcegrp);
   1321
   1322unlock_exit:
   1323	mutex_unlock(&container->lock);
   1324
   1325	return ret;
   1326}
   1327
   1328static void tce_iommu_detach_group(void *iommu_data,
   1329		struct iommu_group *iommu_group)
   1330{
   1331	struct tce_container *container = iommu_data;
   1332	struct iommu_table_group *table_group;
   1333	bool found = false;
   1334	struct tce_iommu_group *tcegrp;
   1335
   1336	mutex_lock(&container->lock);
   1337
   1338	list_for_each_entry(tcegrp, &container->group_list, next) {
   1339		if (tcegrp->grp == iommu_group) {
   1340			found = true;
   1341			break;
   1342		}
   1343	}
   1344
   1345	if (!found) {
   1346		pr_warn("tce_vfio: detaching unattached group #%u\n",
   1347				iommu_group_id(iommu_group));
   1348		goto unlock_exit;
   1349	}
   1350
   1351	list_del(&tcegrp->next);
   1352	kfree(tcegrp);
   1353
   1354	table_group = iommu_group_get_iommudata(iommu_group);
   1355	BUG_ON(!table_group);
   1356
   1357	if (!table_group->ops || !table_group->ops->release_ownership)
   1358		tce_iommu_release_ownership(container, table_group);
   1359	else
   1360		tce_iommu_release_ownership_ddw(container, table_group);
   1361
   1362unlock_exit:
   1363	mutex_unlock(&container->lock);
   1364}
   1365
   1366static const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
   1367	.name		= "iommu-vfio-powerpc",
   1368	.owner		= THIS_MODULE,
   1369	.open		= tce_iommu_open,
   1370	.release	= tce_iommu_release,
   1371	.ioctl		= tce_iommu_ioctl,
   1372	.attach_group	= tce_iommu_attach_group,
   1373	.detach_group	= tce_iommu_detach_group,
   1374};
   1375
   1376static int __init tce_iommu_init(void)
   1377{
   1378	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
   1379}
   1380
   1381static void __exit tce_iommu_cleanup(void)
   1382{
   1383	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
   1384}
   1385
   1386module_init(tce_iommu_init);
   1387module_exit(tce_iommu_cleanup);
   1388
   1389MODULE_VERSION(DRIVER_VERSION);
   1390MODULE_LICENSE("GPL v2");
   1391MODULE_AUTHOR(DRIVER_AUTHOR);
   1392MODULE_DESCRIPTION(DRIVER_DESC);
   1393
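For context, here is a minimal userspace sketch of how the ioctl paths implemented in this file are typically exercised, using only the standard VFIO uAPI from <linux/vfio.h>. The IOMMU group number (26), the single 4K mapping and the omitted error handling are illustrative assumptions, not anything taken from this driver.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/vfio.h>

int main(void)
{
	int container, group;
	struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };
	struct vfio_iommu_type1_dma_map map = { .argsz = sizeof(map) };
	void *buf;

	container = open("/dev/vfio/vfio", O_RDWR);
	group = open("/dev/vfio/26", O_RDWR);	/* hypothetical IOMMU group */

	if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION ||
	    !ioctl(container, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU))
		return 1;

	/* Bind the group to the container, then select this backend */
	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
	ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);

	/* Served by VFIO_IOMMU_SPAPR_TCE_GET_INFO in tce_iommu_ioctl() */
	ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
	printf("default window: start=0x%x size=0x%x\n",
			info.dma32_window_start, info.dma32_window_size);

	/* v1 containers must be enabled; this accounts the locked memory */
	ioctl(container, VFIO_IOMMU_ENABLE);

	/* Map one page at the start of the 32-bit window (tce_iommu_build()) */
	buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	map.vaddr = (unsigned long)buf;
	map.iova = info.dma32_window_start;
	map.size = 4096;
	map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);

	return 0;
}

On a v2 container (VFIO_SPAPR_TCE_v2_IOMMU) the VFIO_IOMMU_ENABLE step does not apply; the buffer would instead be preregistered with VFIO_IOMMU_SPAPR_REGISTER_MEMORY, which is the path served by tce_iommu_register_pages() above.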