cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

iommu.c (30373B)


      1// SPDX-License-Identifier: GPL-2.0-or-later
      2/*
      3 * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation
      4 * 
      5 * Rewrite, cleanup, new allocation schemes, virtual merging: 
      6 * Copyright (C) 2004 Olof Johansson, IBM Corporation
      7 *               and  Ben. Herrenschmidt, IBM Corporation
      8 *
      9 * Dynamic DMA mapping support, bus-independent parts.
     10 */
     11
     12
     13#include <linux/init.h>
     14#include <linux/types.h>
     15#include <linux/slab.h>
     16#include <linux/mm.h>
     17#include <linux/spinlock.h>
     18#include <linux/string.h>
     19#include <linux/dma-mapping.h>
     20#include <linux/bitmap.h>
     21#include <linux/iommu-helper.h>
     22#include <linux/crash_dump.h>
     23#include <linux/hash.h>
     24#include <linux/fault-inject.h>
     25#include <linux/pci.h>
     26#include <linux/iommu.h>
     27#include <linux/sched.h>
     28#include <linux/debugfs.h>
     29#include <asm/io.h>
     30#include <asm/iommu.h>
     31#include <asm/pci-bridge.h>
     32#include <asm/machdep.h>
     33#include <asm/kdump.h>
     34#include <asm/fadump.h>
     35#include <asm/vio.h>
     36#include <asm/tce.h>
     37#include <asm/mmu_context.h>
     38
     39#define DBG(...)
     40
     41#ifdef CONFIG_IOMMU_DEBUGFS
     42static int iommu_debugfs_weight_get(void *data, u64 *val)
     43{
     44	struct iommu_table *tbl = data;
     45	*val = bitmap_weight(tbl->it_map, tbl->it_size);
     46	return 0;
     47}
     48DEFINE_DEBUGFS_ATTRIBUTE(iommu_debugfs_fops_weight, iommu_debugfs_weight_get, NULL, "%llu\n");
     49
     50static void iommu_debugfs_add(struct iommu_table *tbl)
     51{
     52	char name[10];
     53	struct dentry *liobn_entry;
     54
     55	sprintf(name, "%08lx", tbl->it_index);
     56	liobn_entry = debugfs_create_dir(name, iommu_debugfs_dir);
     57
     58	debugfs_create_file_unsafe("weight", 0400, liobn_entry, tbl, &iommu_debugfs_fops_weight);
     59	debugfs_create_ulong("it_size", 0400, liobn_entry, &tbl->it_size);
     60	debugfs_create_ulong("it_page_shift", 0400, liobn_entry, &tbl->it_page_shift);
     61	debugfs_create_ulong("it_reserved_start", 0400, liobn_entry, &tbl->it_reserved_start);
     62	debugfs_create_ulong("it_reserved_end", 0400, liobn_entry, &tbl->it_reserved_end);
     63	debugfs_create_ulong("it_indirect_levels", 0400, liobn_entry, &tbl->it_indirect_levels);
     64	debugfs_create_ulong("it_level_size", 0400, liobn_entry, &tbl->it_level_size);
     65}
     66
     67static void iommu_debugfs_del(struct iommu_table *tbl)
     68{
     69	char name[10];
     70	struct dentry *liobn_entry;
     71
     72	sprintf(name, "%08lx", tbl->it_index);
     73	liobn_entry = debugfs_lookup(name, iommu_debugfs_dir);
     74	debugfs_remove(liobn_entry);
     75}
     76#else
     77static void iommu_debugfs_add(struct iommu_table *tbl){}
     78static void iommu_debugfs_del(struct iommu_table *tbl){}
     79#endif
     80
     81static int novmerge;
     82
     83static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned int);
     84
     85static int __init setup_iommu(char *str)
     86{
     87	if (!strcmp(str, "novmerge"))
     88		novmerge = 1;
     89	else if (!strcmp(str, "vmerge"))
     90		novmerge = 0;
     91	return 1;
     92}
     93
     94__setup("iommu=", setup_iommu);
     95
     96static DEFINE_PER_CPU(unsigned int, iommu_pool_hash);
     97
     98/*
     99 * We precalculate the hash to avoid doing it on every allocation.
    100 *
     101 * The hash is important to spread CPUs across all the pools. For example, on
     102 * a POWER7 with 4-way SMT we want interrupts on the primary threads, yet with
     103 * 4 pools and no hashing all primary threads would map to the same pool.
    104 */
    105static int __init setup_iommu_pool_hash(void)
    106{
    107	unsigned int i;
    108
    109	for_each_possible_cpu(i)
    110		per_cpu(iommu_pool_hash, i) = hash_32(i, IOMMU_POOL_HASHBITS);
    111
    112	return 0;
    113}
    114subsys_initcall(setup_iommu_pool_hash);
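
        /*
         * Illustrative note: hash_32(i, IOMMU_POOL_HASHBITS) is later masked
         * with (tbl->nr_pools - 1) in iommu_range_alloc().  Had the raw CPU
         * number been used instead (e.g. "cpu & 3" with four pools), every
         * primary SMT thread (CPUs 0, 4, 8, ...) would land in pool 0; the
         * multiplicative hash mixes the bits so those CPUs are spread across
         * all of the pools.
         */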
    115
    116#ifdef CONFIG_FAIL_IOMMU
    117
    118static DECLARE_FAULT_ATTR(fail_iommu);
    119
    120static int __init setup_fail_iommu(char *str)
    121{
    122	return setup_fault_attr(&fail_iommu, str);
    123}
    124__setup("fail_iommu=", setup_fail_iommu);
    125
    126static bool should_fail_iommu(struct device *dev)
    127{
    128	return dev->archdata.fail_iommu && should_fail(&fail_iommu, 1);
    129}
    130
    131static int __init fail_iommu_debugfs(void)
    132{
    133	struct dentry *dir = fault_create_debugfs_attr("fail_iommu",
    134						       NULL, &fail_iommu);
    135
    136	return PTR_ERR_OR_ZERO(dir);
    137}
    138late_initcall(fail_iommu_debugfs);
    139
    140static ssize_t fail_iommu_show(struct device *dev,
    141			       struct device_attribute *attr, char *buf)
    142{
    143	return sprintf(buf, "%d\n", dev->archdata.fail_iommu);
    144}
    145
    146static ssize_t fail_iommu_store(struct device *dev,
    147				struct device_attribute *attr, const char *buf,
    148				size_t count)
    149{
    150	int i;
    151
    152	if (count > 0 && sscanf(buf, "%d", &i) > 0)
    153		dev->archdata.fail_iommu = (i == 0) ? 0 : 1;
    154
    155	return count;
    156}
    157
    158static DEVICE_ATTR_RW(fail_iommu);
    159
    160static int fail_iommu_bus_notify(struct notifier_block *nb,
    161				 unsigned long action, void *data)
    162{
    163	struct device *dev = data;
    164
    165	if (action == BUS_NOTIFY_ADD_DEVICE) {
    166		if (device_create_file(dev, &dev_attr_fail_iommu))
    167			pr_warn("Unable to create IOMMU fault injection sysfs "
    168				"entries\n");
    169	} else if (action == BUS_NOTIFY_DEL_DEVICE) {
    170		device_remove_file(dev, &dev_attr_fail_iommu);
    171	}
    172
    173	return 0;
    174}
    175
    176static struct notifier_block fail_iommu_bus_notifier = {
    177	.notifier_call = fail_iommu_bus_notify
    178};
    179
    180static int __init fail_iommu_setup(void)
    181{
    182#ifdef CONFIG_PCI
    183	bus_register_notifier(&pci_bus_type, &fail_iommu_bus_notifier);
    184#endif
    185#ifdef CONFIG_IBMVIO
    186	bus_register_notifier(&vio_bus_type, &fail_iommu_bus_notifier);
    187#endif
    188
    189	return 0;
    190}
    191/*
     192 * Must execute after the PCI and VIO subsystems have initialised but
     193 * before devices are probed.
    194 */
    195arch_initcall(fail_iommu_setup);
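
        /*
         * Usage sketch for the fault injection hooks above (paths are
         * examples and assume debugfs is mounted at /sys/kernel/debug):
         *
         *   echo 1   > /sys/bus/pci/devices/0000:00:01.0/fail_iommu
         *   echo 100 > /sys/kernel/debug/fail_iommu/probability
         *   echo -1  > /sys/kernel/debug/fail_iommu/times
         *
         * Alternatively, boot with
         * fail_iommu=<interval>,<probability>,<space>,<times> to configure
         * the shared fault_attr from the kernel command line.
         */
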
    196#else
    197static inline bool should_fail_iommu(struct device *dev)
    198{
    199	return false;
    200}
    201#endif
    202
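        /*
         * Allocate @npages contiguous entries from the table bitmap.  Small
         * requests (up to 15 pages) are served from one of the per-CPU-hashed
         * pools, larger ones from the dedicated large pool; on failure the
         * search falls back to the remaining pools and finally to the large
         * pool.  When non-NULL, @handle carries the scan hint between
         * successive calls while a scatterlist is being mapped.  Returns a
         * table-relative entry number, or DMA_MAPPING_ERROR if no suitable
         * range exists.
         */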
    203static unsigned long iommu_range_alloc(struct device *dev,
    204				       struct iommu_table *tbl,
    205                                       unsigned long npages,
    206                                       unsigned long *handle,
    207                                       unsigned long mask,
    208                                       unsigned int align_order)
    209{ 
    210	unsigned long n, end, start;
    211	unsigned long limit;
    212	int largealloc = npages > 15;
    213	int pass = 0;
    214	unsigned long align_mask;
    215	unsigned long flags;
    216	unsigned int pool_nr;
    217	struct iommu_pool *pool;
    218
    219	align_mask = (1ull << align_order) - 1;
    220
    221	/* This allocator was derived from x86_64's bit string search */
    222
    223	/* Sanity check */
    224	if (unlikely(npages == 0)) {
    225		if (printk_ratelimit())
    226			WARN_ON(1);
    227		return DMA_MAPPING_ERROR;
    228	}
    229
    230	if (should_fail_iommu(dev))
    231		return DMA_MAPPING_ERROR;
    232
    233	/*
    234	 * We don't need to disable preemption here because any CPU can
    235	 * safely use any IOMMU pool.
    236	 */
    237	pool_nr = raw_cpu_read(iommu_pool_hash) & (tbl->nr_pools - 1);
    238
    239	if (largealloc)
    240		pool = &(tbl->large_pool);
    241	else
    242		pool = &(tbl->pools[pool_nr]);
    243
    244	spin_lock_irqsave(&(pool->lock), flags);
    245
    246again:
    247	if ((pass == 0) && handle && *handle &&
    248	    (*handle >= pool->start) && (*handle < pool->end))
    249		start = *handle;
    250	else
    251		start = pool->hint;
    252
    253	limit = pool->end;
    254
    255	/* The case below can happen if we have a small segment appended
     256	 * to a large one, or when the previous alloc was at the very end of
    257	 * the available space. If so, go back to the initial start.
    258	 */
    259	if (start >= limit)
    260		start = pool->start;
    261
    262	if (limit + tbl->it_offset > mask) {
    263		limit = mask - tbl->it_offset + 1;
    264		/* If we're constrained on address range, first try
    265		 * at the masked hint to avoid O(n) search complexity,
    266		 * but on second pass, start at 0 in pool 0.
    267		 */
    268		if ((start & mask) >= limit || pass > 0) {
    269			spin_unlock(&(pool->lock));
    270			pool = &(tbl->pools[0]);
    271			spin_lock(&(pool->lock));
    272			start = pool->start;
    273		} else {
    274			start &= mask;
    275		}
    276	}
    277
    278	n = iommu_area_alloc(tbl->it_map, limit, start, npages, tbl->it_offset,
    279			dma_get_seg_boundary_nr_pages(dev, tbl->it_page_shift),
    280			align_mask);
    281	if (n == -1) {
    282		if (likely(pass == 0)) {
    283			/* First try the pool from the start */
    284			pool->hint = pool->start;
    285			pass++;
    286			goto again;
    287
    288		} else if (pass <= tbl->nr_pools) {
    289			/* Now try scanning all the other pools */
    290			spin_unlock(&(pool->lock));
    291			pool_nr = (pool_nr + 1) & (tbl->nr_pools - 1);
    292			pool = &tbl->pools[pool_nr];
    293			spin_lock(&(pool->lock));
    294			pool->hint = pool->start;
    295			pass++;
    296			goto again;
    297
    298		} else if (pass == tbl->nr_pools + 1) {
    299			/* Last resort: try largepool */
    300			spin_unlock(&pool->lock);
    301			pool = &tbl->large_pool;
    302			spin_lock(&pool->lock);
    303			pool->hint = pool->start;
    304			pass++;
    305			goto again;
    306
    307		} else {
    308			/* Give up */
    309			spin_unlock_irqrestore(&(pool->lock), flags);
    310			return DMA_MAPPING_ERROR;
    311		}
    312	}
    313
    314	end = n + npages;
    315
    316	/* Bump the hint to a new block for small allocs. */
    317	if (largealloc) {
    318		/* Don't bump to new block to avoid fragmentation */
    319		pool->hint = end;
    320	} else {
    321		/* Overflow will be taken care of at the next allocation */
    322		pool->hint = (end + tbl->it_blocksize - 1) &
    323		                ~(tbl->it_blocksize - 1);
    324	}
    325
    326	/* Update handle for SG allocations */
    327	if (handle)
    328		*handle = end;
    329
    330	spin_unlock_irqrestore(&(pool->lock), flags);
    331
    332	return n;
    333}
    334
    335static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
    336			      void *page, unsigned int npages,
    337			      enum dma_data_direction direction,
    338			      unsigned long mask, unsigned int align_order,
    339			      unsigned long attrs)
    340{
    341	unsigned long entry;
    342	dma_addr_t ret = DMA_MAPPING_ERROR;
    343	int build_fail;
    344
    345	entry = iommu_range_alloc(dev, tbl, npages, NULL, mask, align_order);
    346
    347	if (unlikely(entry == DMA_MAPPING_ERROR))
    348		return DMA_MAPPING_ERROR;
    349
    350	entry += tbl->it_offset;	/* Offset into real TCE table */
    351	ret = entry << tbl->it_page_shift;	/* Set the return dma address */
    352
    353	/* Put the TCEs in the HW table */
    354	build_fail = tbl->it_ops->set(tbl, entry, npages,
    355				      (unsigned long)page &
    356				      IOMMU_PAGE_MASK(tbl), direction, attrs);
    357
    358	/* tbl->it_ops->set() only returns non-zero for transient errors.
    359	 * Clean up the table bitmap in this case and return
    360	 * DMA_MAPPING_ERROR. For all other errors the functionality is
    361	 * not altered.
    362	 */
    363	if (unlikely(build_fail)) {
    364		__iommu_free(tbl, ret, npages);
    365		return DMA_MAPPING_ERROR;
    366	}
    367
    368	/* Flush/invalidate TLB caches if necessary */
    369	if (tbl->it_ops->flush)
    370		tbl->it_ops->flush(tbl);
    371
    372	/* Make sure updates are seen by hardware */
    373	mb();
    374
    375	return ret;
    376}
    377
    378static bool iommu_free_check(struct iommu_table *tbl, dma_addr_t dma_addr,
    379			     unsigned int npages)
    380{
    381	unsigned long entry, free_entry;
    382
    383	entry = dma_addr >> tbl->it_page_shift;
    384	free_entry = entry - tbl->it_offset;
    385
    386	if (((free_entry + npages) > tbl->it_size) ||
    387	    (entry < tbl->it_offset)) {
    388		if (printk_ratelimit()) {
    389			printk(KERN_INFO "iommu_free: invalid entry\n");
    390			printk(KERN_INFO "\tentry     = 0x%lx\n", entry); 
    391			printk(KERN_INFO "\tdma_addr  = 0x%llx\n", (u64)dma_addr);
    392			printk(KERN_INFO "\tTable     = 0x%llx\n", (u64)tbl);
    393			printk(KERN_INFO "\tbus#      = 0x%llx\n", (u64)tbl->it_busno);
    394			printk(KERN_INFO "\tsize      = 0x%llx\n", (u64)tbl->it_size);
    395			printk(KERN_INFO "\tstartOff  = 0x%llx\n", (u64)tbl->it_offset);
    396			printk(KERN_INFO "\tindex     = 0x%llx\n", (u64)tbl->it_index);
    397			WARN_ON(1);
    398		}
    399
    400		return false;
    401	}
    402
    403	return true;
    404}
    405
    406static struct iommu_pool *get_pool(struct iommu_table *tbl,
    407				   unsigned long entry)
    408{
    409	struct iommu_pool *p;
    410	unsigned long largepool_start = tbl->large_pool.start;
    411
    412	/* The large pool is the last pool at the top of the table */
    413	if (entry >= largepool_start) {
    414		p = &tbl->large_pool;
    415	} else {
    416		unsigned int pool_nr = entry / tbl->poolsize;
    417
    418		BUG_ON(pool_nr > tbl->nr_pools);
    419		p = &tbl->pools[pool_nr];
    420	}
    421
    422	return p;
    423}
    424
    425static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
    426			 unsigned int npages)
    427{
    428	unsigned long entry, free_entry;
    429	unsigned long flags;
    430	struct iommu_pool *pool;
    431
    432	entry = dma_addr >> tbl->it_page_shift;
    433	free_entry = entry - tbl->it_offset;
    434
    435	pool = get_pool(tbl, free_entry);
    436
    437	if (!iommu_free_check(tbl, dma_addr, npages))
    438		return;
    439
    440	tbl->it_ops->clear(tbl, entry, npages);
    441
    442	spin_lock_irqsave(&(pool->lock), flags);
    443	bitmap_clear(tbl->it_map, free_entry, npages);
    444	spin_unlock_irqrestore(&(pool->lock), flags);
    445}
    446
    447static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
    448		unsigned int npages)
    449{
    450	__iommu_free(tbl, dma_addr, npages);
    451
    452	/* Make sure TLB cache is flushed if the HW needs it. We do
    453	 * not do an mb() here on purpose, it is not needed on any of
    454	 * the current platforms.
    455	 */
    456	if (tbl->it_ops->flush)
    457		tbl->it_ops->flush(tbl);
    458}
    459
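        /*
         * Map a scatterlist through the TCE table.  Segments whose DMA
         * addresses turn out to be contiguous are merged (up to
         * dma_get_max_seg_size()) unless the kernel was booted with
         * "iommu=novmerge".  Returns the number of DMA segments produced or a
         * negative errno; on failure any partially built mappings are torn
         * down again.
         */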
    460int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl,
    461		     struct scatterlist *sglist, int nelems,
    462		     unsigned long mask, enum dma_data_direction direction,
    463		     unsigned long attrs)
    464{
    465	dma_addr_t dma_next = 0, dma_addr;
    466	struct scatterlist *s, *outs, *segstart;
    467	int outcount, incount, i, build_fail = 0;
    468	unsigned int align;
    469	unsigned long handle;
    470	unsigned int max_seg_size;
    471
    472	BUG_ON(direction == DMA_NONE);
    473
    474	if ((nelems == 0) || !tbl)
    475		return -EINVAL;
    476
    477	outs = s = segstart = &sglist[0];
    478	outcount = 1;
    479	incount = nelems;
    480	handle = 0;
    481
    482	/* Init first segment length for backout at failure */
    483	outs->dma_length = 0;
    484
    485	DBG("sg mapping %d elements:\n", nelems);
    486
    487	max_seg_size = dma_get_max_seg_size(dev);
    488	for_each_sg(sglist, s, nelems, i) {
    489		unsigned long vaddr, npages, entry, slen;
    490
    491		slen = s->length;
    492		/* Sanity check */
    493		if (slen == 0) {
    494			dma_next = 0;
    495			continue;
    496		}
    497		/* Allocate iommu entries for that segment */
    498		vaddr = (unsigned long) sg_virt(s);
    499		npages = iommu_num_pages(vaddr, slen, IOMMU_PAGE_SIZE(tbl));
    500		align = 0;
    501		if (tbl->it_page_shift < PAGE_SHIFT && slen >= PAGE_SIZE &&
    502		    (vaddr & ~PAGE_MASK) == 0)
    503			align = PAGE_SHIFT - tbl->it_page_shift;
    504		entry = iommu_range_alloc(dev, tbl, npages, &handle,
    505					  mask >> tbl->it_page_shift, align);
    506
    507		DBG("  - vaddr: %lx, size: %lx\n", vaddr, slen);
    508
    509		/* Handle failure */
    510		if (unlikely(entry == DMA_MAPPING_ERROR)) {
    511			if (!(attrs & DMA_ATTR_NO_WARN) &&
    512			    printk_ratelimit())
    513				dev_info(dev, "iommu_alloc failed, tbl %p "
    514					 "vaddr %lx npages %lu\n", tbl, vaddr,
    515					 npages);
    516			goto failure;
    517		}
    518
    519		/* Convert entry to a dma_addr_t */
    520		entry += tbl->it_offset;
    521		dma_addr = entry << tbl->it_page_shift;
    522		dma_addr |= (s->offset & ~IOMMU_PAGE_MASK(tbl));
    523
    524		DBG("  - %lu pages, entry: %lx, dma_addr: %lx\n",
    525			    npages, entry, dma_addr);
    526
    527		/* Insert into HW table */
    528		build_fail = tbl->it_ops->set(tbl, entry, npages,
    529					      vaddr & IOMMU_PAGE_MASK(tbl),
    530					      direction, attrs);
     531		if (unlikely(build_fail))
    532			goto failure;
    533
    534		/* If we are in an open segment, try merging */
    535		if (segstart != s) {
    536			DBG("  - trying merge...\n");
    537			/* We cannot merge if:
    538			 * - allocated dma_addr isn't contiguous to previous allocation
    539			 */
    540			if (novmerge || (dma_addr != dma_next) ||
    541			    (outs->dma_length + s->length > max_seg_size)) {
    542				/* Can't merge: create a new segment */
    543				segstart = s;
    544				outcount++;
    545				outs = sg_next(outs);
    546				DBG("    can't merge, new segment.\n");
    547			} else {
    548				outs->dma_length += s->length;
     549				DBG("    merged, new len: %x\n", outs->dma_length);
    550			}
    551		}
    552
    553		if (segstart == s) {
    554			/* This is a new segment, fill entries */
    555			DBG("  - filling new segment.\n");
    556			outs->dma_address = dma_addr;
    557			outs->dma_length = slen;
    558		}
    559
    560		/* Calculate next page pointer for contiguous check */
    561		dma_next = dma_addr + slen;
    562
    563		DBG("  - dma next is: %lx\n", dma_next);
    564	}
    565
    566	/* Flush/invalidate TLB caches if necessary */
    567	if (tbl->it_ops->flush)
    568		tbl->it_ops->flush(tbl);
    569
    570	DBG("mapped %d elements:\n", outcount);
    571
    572	/* For the sake of ppc_iommu_unmap_sg, we clear out the length in the
    573	 * next entry of the sglist if we didn't fill the list completely
    574	 */
    575	if (outcount < incount) {
    576		outs = sg_next(outs);
    577		outs->dma_length = 0;
    578	}
    579
    580	/* Make sure updates are seen by hardware */
    581	mb();
    582
    583	return outcount;
    584
    585 failure:
    586	for_each_sg(sglist, s, nelems, i) {
    587		if (s->dma_length != 0) {
    588			unsigned long vaddr, npages;
    589
    590			vaddr = s->dma_address & IOMMU_PAGE_MASK(tbl);
    591			npages = iommu_num_pages(s->dma_address, s->dma_length,
    592						 IOMMU_PAGE_SIZE(tbl));
    593			__iommu_free(tbl, vaddr, npages);
    594			s->dma_length = 0;
    595		}
    596		if (s == outs)
    597			break;
    598	}
    599	return -EIO;
    600}
    601
    602
    603void ppc_iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
    604			int nelems, enum dma_data_direction direction,
    605			unsigned long attrs)
    606{
    607	struct scatterlist *sg;
    608
    609	BUG_ON(direction == DMA_NONE);
    610
    611	if (!tbl)
    612		return;
    613
    614	sg = sglist;
    615	while (nelems--) {
    616		unsigned int npages;
    617		dma_addr_t dma_handle = sg->dma_address;
    618
    619		if (sg->dma_length == 0)
    620			break;
    621		npages = iommu_num_pages(dma_handle, sg->dma_length,
    622					 IOMMU_PAGE_SIZE(tbl));
    623		__iommu_free(tbl, dma_handle, npages);
    624		sg = sg_next(sg);
    625	}
    626
    627	/* Flush/invalidate TLBs if necessary. As for iommu_free(), we
    628	 * do not do an mb() here, the affected platforms do not need it
    629	 * when freeing.
    630	 */
    631	if (tbl->it_ops->flush)
    632		tbl->it_ops->flush(tbl);
    633}
    634
    635static void iommu_table_clear(struct iommu_table *tbl)
    636{
    637	/*
     638	 * In case of a firmware-assisted dump, the system goes through a
     639	 * clean reboot process at the time of the crash. Hence it's safe to
     640	 * clear the TCE entries if firmware-assisted dump is active.
    641	 */
    642	if (!is_kdump_kernel() || is_fadump_active()) {
    643		/* Clear the table in case firmware left allocations in it */
    644		tbl->it_ops->clear(tbl, tbl->it_offset, tbl->it_size);
    645		return;
    646	}
    647
    648#ifdef CONFIG_CRASH_DUMP
    649	if (tbl->it_ops->get) {
    650		unsigned long index, tceval, tcecount = 0;
    651
    652		/* Reserve the existing mappings left by the first kernel. */
    653		for (index = 0; index < tbl->it_size; index++) {
    654			tceval = tbl->it_ops->get(tbl, index + tbl->it_offset);
    655			/*
    656			 * Freed TCE entry contains 0x7fffffffffffffff on JS20
    657			 */
    658			if (tceval && (tceval != 0x7fffffffffffffffUL)) {
    659				__set_bit(index, tbl->it_map);
    660				tcecount++;
    661			}
    662		}
    663
    664		if ((tbl->it_size - tcecount) < KDUMP_MIN_TCE_ENTRIES) {
    665			printk(KERN_WARNING "TCE table is full; freeing ");
    666			printk(KERN_WARNING "%d entries for the kdump boot\n",
    667				KDUMP_MIN_TCE_ENTRIES);
    668			for (index = tbl->it_size - KDUMP_MIN_TCE_ENTRIES;
    669				index < tbl->it_size; index++)
    670				__clear_bit(index, tbl->it_map);
    671		}
    672	}
    673#endif
    674}
    675
    676static void iommu_table_reserve_pages(struct iommu_table *tbl,
    677		unsigned long res_start, unsigned long res_end)
    678{
    679	int i;
    680
    681	WARN_ON_ONCE(res_end < res_start);
    682	/*
    683	 * Reserve page 0 so it will not be used for any mappings.
     684	 * This prevents buggy drivers that consider page 0 to be invalid
     685	 * from crashing the machine or even losing data.
    686	 */
    687	if (tbl->it_offset == 0)
    688		set_bit(0, tbl->it_map);
    689
    690	if (res_start < tbl->it_offset)
    691		res_start = tbl->it_offset;
    692
    693	if (res_end > (tbl->it_offset + tbl->it_size))
    694		res_end = tbl->it_offset + tbl->it_size;
    695
    696	/* Check if res_start..res_end is a valid range in the table */
    697	if (res_start >= res_end) {
    698		tbl->it_reserved_start = tbl->it_offset;
    699		tbl->it_reserved_end = tbl->it_offset;
    700		return;
    701	}
    702
    703	tbl->it_reserved_start = res_start;
    704	tbl->it_reserved_end = res_end;
    705
    706	for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
    707		set_bit(i - tbl->it_offset, tbl->it_map);
    708}
    709
    710/*
     711 * Build an iommu_table structure.  This contains a bit map which
    712 * is used to manage allocation of the tce space.
    713 */
    714struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid,
    715		unsigned long res_start, unsigned long res_end)
    716{
    717	unsigned long sz;
    718	static int welcomed = 0;
    719	unsigned int i;
    720	struct iommu_pool *p;
    721
    722	BUG_ON(!tbl->it_ops);
    723
    724	/* number of bytes needed for the bitmap */
    725	sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
    726
    727	tbl->it_map = vzalloc_node(sz, nid);
    728	if (!tbl->it_map) {
    729		pr_err("%s: Can't allocate %ld bytes\n", __func__, sz);
    730		return NULL;
    731	}
    732
    733	iommu_table_reserve_pages(tbl, res_start, res_end);
    734
    735	/* We only split the IOMMU table if we have 1GB or more of space */
    736	if ((tbl->it_size << tbl->it_page_shift) >= (1UL * 1024 * 1024 * 1024))
    737		tbl->nr_pools = IOMMU_NR_POOLS;
    738	else
    739		tbl->nr_pools = 1;
    740
    741	/* We reserve the top 1/4 of the table for large allocations */
    742	tbl->poolsize = (tbl->it_size * 3 / 4) / tbl->nr_pools;
    743
    744	for (i = 0; i < tbl->nr_pools; i++) {
    745		p = &tbl->pools[i];
    746		spin_lock_init(&(p->lock));
    747		p->start = tbl->poolsize * i;
    748		p->hint = p->start;
    749		p->end = p->start + tbl->poolsize;
    750	}
    751
    752	p = &tbl->large_pool;
    753	spin_lock_init(&(p->lock));
    754	p->start = tbl->poolsize * i;
    755	p->hint = p->start;
    756	p->end = tbl->it_size;
    757
    758	iommu_table_clear(tbl);
    759
    760	if (!welcomed) {
    761		printk(KERN_INFO "IOMMU table initialized, virtual merging %s\n",
    762		       novmerge ? "disabled" : "enabled");
    763		welcomed = 1;
    764	}
    765
    766	iommu_debugfs_add(tbl);
    767
    768	return tbl;
    769}
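
        /*
         * Sizing example, following the arithmetic above: a table with
         * it_size = 0x100000 entries and a 4K IOMMU page size spans 4GB, so
         * it is split into nr_pools = IOMMU_NR_POOLS pools.  Assuming four
         * pools, poolsize = (0x100000 * 3 / 4) / 4 = 0x30000 entries per
         * pool, and the large pool covers the remaining top quarter of the
         * table, entries 0xc0000 .. 0xfffff.
         */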
    770
    771bool iommu_table_in_use(struct iommu_table *tbl)
    772{
    773	unsigned long start = 0, end;
    774
    775	/* ignore reserved bit0 */
    776	if (tbl->it_offset == 0)
    777		start = 1;
    778	end = tbl->it_reserved_start - tbl->it_offset;
    779	if (find_next_bit(tbl->it_map, end, start) != end)
    780		return true;
    781
    782	start = tbl->it_reserved_end - tbl->it_offset;
    783	end = tbl->it_size;
    784	return find_next_bit(tbl->it_map, end, start) != end;
    785}
    786
    787static void iommu_table_free(struct kref *kref)
    788{
    789	struct iommu_table *tbl;
    790
    791	tbl = container_of(kref, struct iommu_table, it_kref);
    792
    793	if (tbl->it_ops->free)
    794		tbl->it_ops->free(tbl);
    795
    796	if (!tbl->it_map) {
    797		kfree(tbl);
    798		return;
    799	}
    800
    801	iommu_debugfs_del(tbl);
    802
    803	/* verify that table contains no entries */
    804	if (iommu_table_in_use(tbl))
    805		pr_warn("%s: Unexpected TCEs\n", __func__);
    806
    807	/* free bitmap */
    808	vfree(tbl->it_map);
    809
    810	/* free table */
    811	kfree(tbl);
    812}
    813
    814struct iommu_table *iommu_tce_table_get(struct iommu_table *tbl)
    815{
    816	if (kref_get_unless_zero(&tbl->it_kref))
    817		return tbl;
    818
    819	return NULL;
    820}
    821EXPORT_SYMBOL_GPL(iommu_tce_table_get);
    822
    823int iommu_tce_table_put(struct iommu_table *tbl)
    824{
    825	if (WARN_ON(!tbl))
    826		return 0;
    827
    828	return kref_put(&tbl->it_kref, iommu_table_free);
    829}
    830EXPORT_SYMBOL_GPL(iommu_tce_table_put);
    831
    832/* Creates TCEs for a user provided buffer.  The user buffer must be
    833 * contiguous real kernel storage (not vmalloc).  The address passed here
    834 * comprises a page address and offset into that page. The dma_addr_t
    835 * returned will point to the same byte within the page as was passed in.
    836 */
    837dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl,
    838			  struct page *page, unsigned long offset, size_t size,
    839			  unsigned long mask, enum dma_data_direction direction,
    840			  unsigned long attrs)
    841{
    842	dma_addr_t dma_handle = DMA_MAPPING_ERROR;
    843	void *vaddr;
    844	unsigned long uaddr;
    845	unsigned int npages, align;
    846
    847	BUG_ON(direction == DMA_NONE);
    848
    849	vaddr = page_address(page) + offset;
    850	uaddr = (unsigned long)vaddr;
    851
    852	if (tbl) {
    853		npages = iommu_num_pages(uaddr, size, IOMMU_PAGE_SIZE(tbl));
    854		align = 0;
    855		if (tbl->it_page_shift < PAGE_SHIFT && size >= PAGE_SIZE &&
    856		    ((unsigned long)vaddr & ~PAGE_MASK) == 0)
    857			align = PAGE_SHIFT - tbl->it_page_shift;
    858
    859		dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction,
    860					 mask >> tbl->it_page_shift, align,
    861					 attrs);
    862		if (dma_handle == DMA_MAPPING_ERROR) {
    863			if (!(attrs & DMA_ATTR_NO_WARN) &&
    864			    printk_ratelimit())  {
    865				dev_info(dev, "iommu_alloc failed, tbl %p "
    866					 "vaddr %p npages %d\n", tbl, vaddr,
    867					 npages);
    868			}
    869		} else
    870			dma_handle |= (uaddr & ~IOMMU_PAGE_MASK(tbl));
    871	}
    872
    873	return dma_handle;
    874}
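
        /*
         * Example of the offset handling above: with a 4K IOMMU page size and
         * a buffer starting 0x234 bytes into a page, whole IOMMU pages are
         * mapped and the low bits (uaddr & ~IOMMU_PAGE_MASK(tbl) == 0x234)
         * are OR-ed into the returned handle, so the DMA address points at
         * the same byte within the page as the kernel virtual address did.
         */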
    875
    876void iommu_unmap_page(struct iommu_table *tbl, dma_addr_t dma_handle,
    877		      size_t size, enum dma_data_direction direction,
    878		      unsigned long attrs)
    879{
    880	unsigned int npages;
    881
    882	BUG_ON(direction == DMA_NONE);
    883
    884	if (tbl) {
    885		npages = iommu_num_pages(dma_handle, size,
    886					 IOMMU_PAGE_SIZE(tbl));
    887		iommu_free(tbl, dma_handle, npages);
    888	}
    889}
    890
    891/* Allocates a contiguous real buffer and creates mappings over it.
    892 * Returns the virtual address of the buffer and sets dma_handle
    893 * to the dma address (mapping) of the first page.
    894 */
    895void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl,
    896			   size_t size,	dma_addr_t *dma_handle,
    897			   unsigned long mask, gfp_t flag, int node)
    898{
    899	void *ret = NULL;
    900	dma_addr_t mapping;
    901	unsigned int order;
    902	unsigned int nio_pages, io_order;
    903	struct page *page;
    904
    905	size = PAGE_ALIGN(size);
    906	order = get_order(size);
    907
    908 	/*
    909	 * Client asked for way too much space.  This is checked later
    910	 * anyway.  It is easier to debug here for the drivers than in
    911	 * the tce tables.
    912	 */
    913	if (order >= IOMAP_MAX_ORDER) {
    914		dev_info(dev, "iommu_alloc_consistent size too large: 0x%lx\n",
    915			 size);
    916		return NULL;
    917	}
    918
    919	if (!tbl)
    920		return NULL;
    921
    922	/* Alloc enough pages (and possibly more) */
    923	page = alloc_pages_node(node, flag, order);
    924	if (!page)
    925		return NULL;
    926	ret = page_address(page);
    927	memset(ret, 0, size);
    928
    929	/* Set up tces to cover the allocated range */
    930	nio_pages = size >> tbl->it_page_shift;
    931	io_order = get_iommu_order(size, tbl);
    932	mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL,
    933			      mask >> tbl->it_page_shift, io_order, 0);
    934	if (mapping == DMA_MAPPING_ERROR) {
    935		free_pages((unsigned long)ret, order);
    936		return NULL;
    937	}
    938	*dma_handle = mapping;
    939	return ret;
    940}
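
        /*
         * Example, assuming both the kernel page size and the IOMMU page size
         * are 4K: a 16KB request is backed by an order-2 page allocation and
         * mapped with nio_pages = 4 TCEs, and the io_order alignment passed
         * to iommu_alloc() makes the mapping start on a 4-entry (16KB)
         * boundary within the table.
         */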
    941
    942void iommu_free_coherent(struct iommu_table *tbl, size_t size,
    943			 void *vaddr, dma_addr_t dma_handle)
    944{
    945	if (tbl) {
    946		unsigned int nio_pages;
    947
    948		size = PAGE_ALIGN(size);
    949		nio_pages = size >> tbl->it_page_shift;
    950		iommu_free(tbl, dma_handle, nio_pages);
    952		free_pages((unsigned long)vaddr, get_order(size));
    953	}
    954}
    955
    956unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir)
    957{
    958	switch (dir) {
    959	case DMA_BIDIRECTIONAL:
    960		return TCE_PCI_READ | TCE_PCI_WRITE;
    961	case DMA_FROM_DEVICE:
    962		return TCE_PCI_WRITE;
    963	case DMA_TO_DEVICE:
    964		return TCE_PCI_READ;
    965	default:
    966		return 0;
    967	}
    968}
    969EXPORT_SYMBOL_GPL(iommu_direction_to_tce_perm);
    970
    971#ifdef CONFIG_IOMMU_API
    972/*
    973 * SPAPR TCE API
    974 */
    975static void group_release(void *iommu_data)
    976{
    977	struct iommu_table_group *table_group = iommu_data;
    978
    979	table_group->group = NULL;
    980}
    981
    982void iommu_register_group(struct iommu_table_group *table_group,
    983		int pci_domain_number, unsigned long pe_num)
    984{
    985	struct iommu_group *grp;
    986	char *name;
    987
    988	grp = iommu_group_alloc();
    989	if (IS_ERR(grp)) {
    990		pr_warn("powerpc iommu api: cannot create new group, err=%ld\n",
    991				PTR_ERR(grp));
    992		return;
    993	}
    994	table_group->group = grp;
    995	iommu_group_set_iommudata(grp, table_group, group_release);
    996	name = kasprintf(GFP_KERNEL, "domain%d-pe%lx",
    997			pci_domain_number, pe_num);
    998	if (!name)
    999		return;
   1000	iommu_group_set_name(grp, name);
   1001	kfree(name);
   1002}
   1003
   1004enum dma_data_direction iommu_tce_direction(unsigned long tce)
   1005{
   1006	if ((tce & TCE_PCI_READ) && (tce & TCE_PCI_WRITE))
   1007		return DMA_BIDIRECTIONAL;
   1008	else if (tce & TCE_PCI_READ)
   1009		return DMA_TO_DEVICE;
   1010	else if (tce & TCE_PCI_WRITE)
   1011		return DMA_FROM_DEVICE;
   1012	else
   1013		return DMA_NONE;
   1014}
   1015EXPORT_SYMBOL_GPL(iommu_tce_direction);
   1016
   1017void iommu_flush_tce(struct iommu_table *tbl)
   1018{
   1019	/* Flush/invalidate TLB caches if necessary */
   1020	if (tbl->it_ops->flush)
   1021		tbl->it_ops->flush(tbl);
   1022
   1023	/* Make sure updates are seen by hardware */
   1024	mb();
   1025}
   1026EXPORT_SYMBOL_GPL(iommu_flush_tce);
   1027
   1028int iommu_tce_check_ioba(unsigned long page_shift,
   1029		unsigned long offset, unsigned long size,
   1030		unsigned long ioba, unsigned long npages)
   1031{
   1032	unsigned long mask = (1UL << page_shift) - 1;
   1033
   1034	if (ioba & mask)
   1035		return -EINVAL;
   1036
   1037	ioba >>= page_shift;
   1038	if (ioba < offset)
   1039		return -EINVAL;
   1040
   1041	if ((ioba + 1) > (offset + size))
   1042		return -EINVAL;
   1043
   1044	return 0;
   1045}
   1046EXPORT_SYMBOL_GPL(iommu_tce_check_ioba);
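
        /*
         * Example for the check above: with page_shift = 12, offset = 0 and a
         * window of size = 0x1000 pages, ioba 0x1000 (page 1) passes, ioba
         * 0x1234 fails the alignment test, and ioba 0x1000000 (page 0x1000)
         * fails the range test because 0x1000 + 1 > offset + size.
         */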
   1047
   1048int iommu_tce_check_gpa(unsigned long page_shift, unsigned long gpa)
   1049{
   1050	unsigned long mask = (1UL << page_shift) - 1;
   1051
   1052	if (gpa & mask)
   1053		return -EINVAL;
   1054
   1055	return 0;
   1056}
   1057EXPORT_SYMBOL_GPL(iommu_tce_check_gpa);
   1058
    1059long iommu_tce_xchg_no_kill(struct mm_struct *mm,
   1060		struct iommu_table *tbl,
   1061		unsigned long entry, unsigned long *hpa,
   1062		enum dma_data_direction *direction)
   1063{
   1064	long ret;
   1065	unsigned long size = 0;
   1066
   1067	ret = tbl->it_ops->xchg_no_kill(tbl, entry, hpa, direction);
   1068	if (!ret && ((*direction == DMA_FROM_DEVICE) ||
   1069			(*direction == DMA_BIDIRECTIONAL)) &&
   1070			!mm_iommu_is_devmem(mm, *hpa, tbl->it_page_shift,
   1071					&size))
   1072		SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT));
   1073
   1074	return ret;
   1075}
   1076EXPORT_SYMBOL_GPL(iommu_tce_xchg_no_kill);
   1077
   1078void iommu_tce_kill(struct iommu_table *tbl,
   1079		unsigned long entry, unsigned long pages)
   1080{
   1081	if (tbl->it_ops->tce_kill)
   1082		tbl->it_ops->tce_kill(tbl, entry, pages);
   1083}
   1084EXPORT_SYMBOL_GPL(iommu_tce_kill);
   1085
   1086int iommu_take_ownership(struct iommu_table *tbl)
   1087{
   1088	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
   1089	int ret = 0;
   1090
   1091	/*
    1092	 * VFIO does not control TCE entry allocation and the guest
    1093	 * can write new TCEs on top of existing ones, so iommu_tce_build()
    1094	 * must be able to release old pages. This functionality
    1095	 * requires the exchange() callback to be defined, so if it is not
    1096	 * implemented, we disallow taking ownership over the table.
   1097	 */
   1098	if (!tbl->it_ops->xchg_no_kill)
   1099		return -EINVAL;
   1100
   1101	spin_lock_irqsave(&tbl->large_pool.lock, flags);
   1102	for (i = 0; i < tbl->nr_pools; i++)
   1103		spin_lock_nest_lock(&tbl->pools[i].lock, &tbl->large_pool.lock);
   1104
   1105	if (iommu_table_in_use(tbl)) {
    1106		pr_err("iommu_tce: it_map is not empty\n");
   1107		ret = -EBUSY;
   1108	} else {
   1109		memset(tbl->it_map, 0xff, sz);
   1110	}
   1111
   1112	for (i = 0; i < tbl->nr_pools; i++)
   1113		spin_unlock(&tbl->pools[i].lock);
   1114	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
   1115
   1116	return ret;
   1117}
   1118EXPORT_SYMBOL_GPL(iommu_take_ownership);
   1119
   1120void iommu_release_ownership(struct iommu_table *tbl)
   1121{
   1122	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
   1123
   1124	spin_lock_irqsave(&tbl->large_pool.lock, flags);
   1125	for (i = 0; i < tbl->nr_pools; i++)
   1126		spin_lock_nest_lock(&tbl->pools[i].lock, &tbl->large_pool.lock);
   1127
   1128	memset(tbl->it_map, 0, sz);
   1129
   1130	iommu_table_reserve_pages(tbl, tbl->it_reserved_start,
   1131			tbl->it_reserved_end);
   1132
   1133	for (i = 0; i < tbl->nr_pools; i++)
   1134		spin_unlock(&tbl->pools[i].lock);
   1135	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
   1136}
   1137EXPORT_SYMBOL_GPL(iommu_release_ownership);
   1138
   1139int iommu_add_device(struct iommu_table_group *table_group, struct device *dev)
   1140{
   1141	/*
    1142	 * The sysfs entries should be populated before
    1143	 * binding the IOMMU group. If the sysfs entries
    1144	 * aren't ready, we simply bail.
   1145	 */
   1146	if (!device_is_registered(dev))
   1147		return -ENOENT;
   1148
   1149	if (device_iommu_mapped(dev)) {
   1150		pr_debug("%s: Skipping device %s with iommu group %d\n",
   1151			 __func__, dev_name(dev),
   1152			 iommu_group_id(dev->iommu_group));
   1153		return -EBUSY;
   1154	}
   1155
   1156	pr_debug("%s: Adding %s to iommu group %d\n",
   1157		 __func__, dev_name(dev),  iommu_group_id(table_group->group));
   1158
   1159	return iommu_group_add_device(table_group->group, dev);
   1160}
   1161EXPORT_SYMBOL_GPL(iommu_add_device);
   1162
   1163void iommu_del_device(struct device *dev)
   1164{
   1165	/*
    1166	 * Some devices might not have an IOMMU table and group,
    1167	 * and we needn't detach them from the associated
    1168	 * IOMMU groups.
   1169	 */
   1170	if (!device_iommu_mapped(dev)) {
   1171		pr_debug("iommu_tce: skipping device %s with no tbl\n",
   1172			 dev_name(dev));
   1173		return;
   1174	}
   1175
   1176	iommu_group_remove_device(dev);
   1177}
   1178EXPORT_SYMBOL_GPL(iommu_del_device);
   1179#endif /* CONFIG_IOMMU_API */