cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

sba_iommu.c (57992B)


      1// SPDX-License-Identifier: GPL-2.0-or-later
      2/*
      3**  IA64 System Bus Adapter (SBA) I/O MMU manager
      4**
      5**	(c) Copyright 2002-2005 Alex Williamson
      6**	(c) Copyright 2002-2003 Grant Grundler
      7**	(c) Copyright 2002-2005 Hewlett-Packard Company
      8**
      9**	Portions (c) 2000 Grant Grundler (from parisc I/O MMU code)
     10**	Portions (c) 1999 Dave S. Miller (from sparc64 I/O MMU code)
     11**
     12**
     13**
     14** This module initializes the IOC (I/O Controller) found on HP
     15** McKinley machines and their successors.
     16**
     17*/
     18
     19#include <linux/types.h>
     20#include <linux/kernel.h>
     21#include <linux/module.h>
     22#include <linux/spinlock.h>
     23#include <linux/slab.h>
     24#include <linux/init.h>
     25#include <linux/mm.h>
     26#include <linux/string.h>
     27#include <linux/pci.h>
     28#include <linux/proc_fs.h>
     29#include <linux/seq_file.h>
     30#include <linux/acpi.h>
     31#include <linux/efi.h>
     32#include <linux/nodemask.h>
     33#include <linux/bitops.h>         /* hweight64() */
     34#include <linux/crash_dump.h>
     35#include <linux/iommu-helper.h>
     36#include <linux/dma-map-ops.h>
     37#include <linux/prefetch.h>
     38#include <linux/swiotlb.h>
     39
     40#include <asm/delay.h>		/* ia64_get_itc() */
     41#include <asm/io.h>
     42#include <asm/page.h>		/* PAGE_OFFSET */
     43#include <asm/dma.h>
     44
     45#include <asm/acpi-ext.h>
     46
     47#define PFX "IOC: "
     48
     49/*
      50** Enables timing of pdir resource map searches.  Output in /proc.
     51** Disabled by default to optimize performance.
     52*/
     53#undef PDIR_SEARCH_TIMING
     54
     55/*
     56** This option allows cards capable of 64bit DMA to bypass the IOMMU.  If
     57** not defined, all DMA will be 32bit and go through the TLB.
     58** There's potentially a conflict in the bio merge code with us
     59** advertising an iommu, but then bypassing it.  Since I/O MMU bypassing
     60** appears to give more performance than bio-level virtual merging, we'll
     61** do the former for now.  NOTE: BYPASS_SG also needs to be undef'd to
     62** completely restrict DMA to the IOMMU.
     63*/
     64#define ALLOW_IOV_BYPASS
     65
     66/*
     67** This option specifically allows/disallows bypassing scatterlists with
     68** multiple entries.  Coalescing these entries can allow better DMA streaming
     69** and in some cases shows better performance than entirely bypassing the
     70** IOMMU.  Performance increase on the order of 1-2% sequential output/input
     71** using bonnie++ on a RAID0 MD device (sym2 & mpt).
     72*/
     73#undef ALLOW_IOV_BYPASS_SG
     74
     75/*
     76** If a device prefetches beyond the end of a valid pdir entry, it will cause
     77** a hard failure, ie. MCA.  Version 3.0 and later of the zx1 LBA should
     78** disconnect on 4k boundaries and prevent such issues.  If the device is
     79** particularly aggressive, this option will keep the entire pdir valid such
     80** that prefetching will hit a valid address.  This could severely impact
     81** error containment, and is therefore off by default.  The page that is
     82** used for spill-over is poisoned, so that should help debugging somewhat.
     83*/
     84#undef FULL_VALID_PDIR
     85
     86#define ENABLE_MARK_CLEAN
     87
     88/*
     89** The number of debug flags is a clue - this code is fragile.  NOTE: since
     90** tightening the use of res_lock the resource bitmap and actual pdir are no
     91** longer guaranteed to stay in sync.  The sanity checking code isn't going to
     92** like that.
     93*/
     94#undef DEBUG_SBA_INIT
     95#undef DEBUG_SBA_RUN
     96#undef DEBUG_SBA_RUN_SG
     97#undef DEBUG_SBA_RESOURCE
     98#undef ASSERT_PDIR_SANITY
     99#undef DEBUG_LARGE_SG_ENTRIES
    100#undef DEBUG_BYPASS
    101
    102#if defined(FULL_VALID_PDIR) && defined(ASSERT_PDIR_SANITY)
    103#error FULL_VALID_PDIR and ASSERT_PDIR_SANITY are mutually exclusive
    104#endif
    105
    106#define SBA_INLINE	__inline__
    107/* #define SBA_INLINE */
    108
    109#ifdef DEBUG_SBA_INIT
    110#define DBG_INIT(x...)	printk(x)
    111#else
    112#define DBG_INIT(x...)
    113#endif
    114
    115#ifdef DEBUG_SBA_RUN
    116#define DBG_RUN(x...)	printk(x)
    117#else
    118#define DBG_RUN(x...)
    119#endif
    120
    121#ifdef DEBUG_SBA_RUN_SG
    122#define DBG_RUN_SG(x...)	printk(x)
    123#else
    124#define DBG_RUN_SG(x...)
    125#endif
    126
    127
    128#ifdef DEBUG_SBA_RESOURCE
    129#define DBG_RES(x...)	printk(x)
    130#else
    131#define DBG_RES(x...)
    132#endif
    133
    134#ifdef DEBUG_BYPASS
    135#define DBG_BYPASS(x...)	printk(x)
    136#else
    137#define DBG_BYPASS(x...)
    138#endif
    139
    140#ifdef ASSERT_PDIR_SANITY
    141#define ASSERT(expr) \
    142        if(!(expr)) { \
    143                printk( "\n" __FILE__ ":%d: Assertion " #expr " failed!\n",__LINE__); \
    144                panic(#expr); \
    145        }
    146#else
    147#define ASSERT(expr)
    148#endif
    149
    150/*
    151** The number of pdir entries to "free" before issuing
    152** a read to PCOM register to flush out PCOM writes.
    153** Interacts with allocation granularity (ie 4 or 8 entries
    154** allocated and free'd/purged at a time might make this
    155** less interesting).
    156*/
    157#define DELAYED_RESOURCE_CNT	64
    158
    159#define PCI_DEVICE_ID_HP_SX2000_IOC	0x12ec
    160
    161#define ZX1_IOC_ID	((PCI_DEVICE_ID_HP_ZX1_IOC << 16) | PCI_VENDOR_ID_HP)
    162#define ZX2_IOC_ID	((PCI_DEVICE_ID_HP_ZX2_IOC << 16) | PCI_VENDOR_ID_HP)
    163#define REO_IOC_ID	((PCI_DEVICE_ID_HP_REO_IOC << 16) | PCI_VENDOR_ID_HP)
    164#define SX1000_IOC_ID	((PCI_DEVICE_ID_HP_SX1000_IOC << 16) | PCI_VENDOR_ID_HP)
    165#define SX2000_IOC_ID	((PCI_DEVICE_ID_HP_SX2000_IOC << 16) | PCI_VENDOR_ID_HP)
    166
    167#define ZX1_IOC_OFFSET	0x1000	/* ACPI reports SBA, we want IOC */
    168
    169#define IOC_FUNC_ID	0x000
    170#define IOC_FCLASS	0x008	/* function class, bist, header, rev... */
    171#define IOC_IBASE	0x300	/* IO TLB */
    172#define IOC_IMASK	0x308
    173#define IOC_PCOM	0x310
    174#define IOC_TCNFG	0x318
    175#define IOC_PDIR_BASE	0x320
    176
    177#define IOC_ROPE0_CFG	0x500
    178#define   IOC_ROPE_AO	  0x10	/* Allow "Relaxed Ordering" */
    179
    180
    181/* AGP GART driver looks for this */
    182#define ZX1_SBA_IOMMU_COOKIE	0x0000badbadc0ffeeUL
    183
    184/*
    185** The zx1 IOC supports 4/8/16/64KB page sizes (see TCNFG register)
    186**
     187** Some IOCs (sx1000) can run at the above page sizes, but are
    188** really only supported using the IOC at a 4k page size.
    189**
    190** iovp_size could only be greater than PAGE_SIZE if we are
    191** confident the drivers really only touch the next physical
    192** page iff that driver instance owns it.
    193*/
    194static unsigned long iovp_size;
    195static unsigned long iovp_shift;
    196static unsigned long iovp_mask;
    197
    198struct ioc {
    199	void __iomem	*ioc_hpa;	/* I/O MMU base address */
    200	char		*res_map;	/* resource map, bit == pdir entry */
    201	u64		*pdir_base;	/* physical base address */
    202	unsigned long	ibase;		/* pdir IOV Space base */
    203	unsigned long	imask;		/* pdir IOV Space mask */
    204
    205	unsigned long	*res_hint;	/* next avail IOVP - circular search */
    206	unsigned long	dma_mask;
    207	spinlock_t	res_lock;	/* protects the resource bitmap, but must be held when */
    208					/* clearing pdir to prevent races with allocations. */
    209	unsigned int	res_bitshift;	/* from the RIGHT! */
    210	unsigned int	res_size;	/* size of resource map in bytes */
    211#ifdef CONFIG_NUMA
    212	unsigned int	node;		/* node where this IOC lives */
    213#endif
    214#if DELAYED_RESOURCE_CNT > 0
    215	spinlock_t	saved_lock;	/* may want to try to get this on a separate cacheline */
    216					/* than res_lock for bigger systems. */
    217	int		saved_cnt;
    218	struct sba_dma_pair {
    219		dma_addr_t	iova;
    220		size_t		size;
    221	} saved[DELAYED_RESOURCE_CNT];
    222#endif
    223
    224#ifdef PDIR_SEARCH_TIMING
    225#define SBA_SEARCH_SAMPLE	0x100
    226	unsigned long avg_search[SBA_SEARCH_SAMPLE];
    227	unsigned long avg_idx;	/* current index into avg_search */
    228#endif
    229
    230	/* Stuff we don't need in performance path */
    231	struct ioc	*next;		/* list of IOC's in system */
    232	acpi_handle	handle;		/* for multiple IOC's */
    233	const char 	*name;
    234	unsigned int	func_id;
    235	unsigned int	rev;		/* HW revision of chip */
    236	u32		iov_size;
    237	unsigned int	pdir_size;	/* in bytes, determined by IOV Space size */
    238	struct pci_dev	*sac_only_dev;
    239};
    240
    241static struct ioc *ioc_list, *ioc_found;
    242static int reserve_sba_gart = 1;
    243
    244static SBA_INLINE void sba_mark_invalid(struct ioc *, dma_addr_t, size_t);
    245static SBA_INLINE void sba_free_range(struct ioc *, dma_addr_t, size_t);
    246
    247#define sba_sg_address(sg)	sg_virt((sg))
    248
    249#ifdef FULL_VALID_PDIR
    250static u64 prefetch_spill_page;
    251#endif
    252
    253#define GET_IOC(dev)	((dev_is_pci(dev))						\
    254			 ? ((struct ioc *) PCI_CONTROLLER(to_pci_dev(dev))->iommu) : NULL)
    255
    256/*
    257** DMA_CHUNK_SIZE is used by the SCSI mid-layer to break up
    258** (or rather not merge) DMAs into manageable chunks.
     259** On parisc, this is more of a software/tuning constraint
     260** than a HW one. I/O MMU allocation algorithms can be
    261** faster with smaller sizes (to some degree).
    262*/
    263#define DMA_CHUNK_SIZE  (BITS_PER_LONG*iovp_size)
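/* e.g. with 64-bit longs and a 4k iovp_size this evaluates to 256KB */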
    264
    265#define ROUNDUP(x,y) ((x + ((y)-1)) & ~((y)-1))
    266
    267/************************************
    268** SBA register read and write support
    269**
    270** BE WARNED: register writes are posted.
    271**  (ie follow writes which must reach HW with a read)
    272**
    273*/
    274#define READ_REG(addr)       __raw_readq(addr)
    275#define WRITE_REG(val, addr) __raw_writeq(val, addr)
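
/*
** Illustrative note on the warning above: the pattern used in the purge
** path below forces a posted register write out to the hardware by
** following it with a read from the same device, e.g.
**
**	WRITE_REG(iovp | ioc->ibase, ioc->ioc_hpa + IOC_PCOM);
**	READ_REG(ioc->ioc_hpa + IOC_PCOM);	// flush posted PCOM writes
*/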
    276
    277#ifdef DEBUG_SBA_INIT
    278
    279/**
    280 * sba_dump_tlb - debugging only - print IOMMU operating parameters
    281 * @hpa: base address of the IOMMU
    282 *
    283 * Print the size/location of the IO MMU PDIR.
    284 */
    285static void
    286sba_dump_tlb(char *hpa)
    287{
    288	DBG_INIT("IO TLB at 0x%p\n", (void *)hpa);
    289	DBG_INIT("IOC_IBASE    : %016lx\n", READ_REG(hpa+IOC_IBASE));
    290	DBG_INIT("IOC_IMASK    : %016lx\n", READ_REG(hpa+IOC_IMASK));
    291	DBG_INIT("IOC_TCNFG    : %016lx\n", READ_REG(hpa+IOC_TCNFG));
    292	DBG_INIT("IOC_PDIR_BASE: %016lx\n", READ_REG(hpa+IOC_PDIR_BASE));
    293	DBG_INIT("\n");
    294}
    295#endif
    296
    297
    298#ifdef ASSERT_PDIR_SANITY
    299
    300/**
    301 * sba_dump_pdir_entry - debugging only - print one IOMMU PDIR entry
    302 * @ioc: IO MMU structure which owns the pdir we are interested in.
     303 * @msg: text to print on the output line.
    304 * @pide: pdir index.
    305 *
    306 * Print one entry of the IO MMU PDIR in human readable form.
    307 */
    308static void
    309sba_dump_pdir_entry(struct ioc *ioc, char *msg, uint pide)
    310{
    311	/* start printing from lowest pde in rval */
    312	u64 *ptr = &ioc->pdir_base[pide  & ~(BITS_PER_LONG - 1)];
    313	unsigned long *rptr = (unsigned long *) &ioc->res_map[(pide >>3) & -sizeof(unsigned long)];
    314	uint rcnt;
    315
    316	printk(KERN_DEBUG "SBA: %s rp %p bit %d rval 0x%lx\n",
    317		 msg, rptr, pide & (BITS_PER_LONG - 1), *rptr);
    318
    319	rcnt = 0;
    320	while (rcnt < BITS_PER_LONG) {
    321		printk(KERN_DEBUG "%s %2d %p %016Lx\n",
    322		       (rcnt == (pide & (BITS_PER_LONG - 1)))
    323		       ? "    -->" : "       ",
    324		       rcnt, ptr, (unsigned long long) *ptr );
    325		rcnt++;
    326		ptr++;
    327	}
    328	printk(KERN_DEBUG "%s", msg);
    329}
    330
    331
    332/**
    333 * sba_check_pdir - debugging only - consistency checker
    334 * @ioc: IO MMU structure which owns the pdir we are interested in.
     335 * @msg: text to print on the output line.
    336 *
     337 * Verify that the resource map and pdir state are consistent
    338 */
    339static int
    340sba_check_pdir(struct ioc *ioc, char *msg)
    341{
    342	u64 *rptr_end = (u64 *) &(ioc->res_map[ioc->res_size]);
    343	u64 *rptr = (u64 *) ioc->res_map;	/* resource map ptr */
    344	u64 *pptr = ioc->pdir_base;	/* pdir ptr */
    345	uint pide = 0;
    346
    347	while (rptr < rptr_end) {
    348		u64 rval;
    349		int rcnt; /* number of bits we might check */
    350
    351		rval = *rptr;
    352		rcnt = 64;
    353
    354		while (rcnt) {
    355			/* Get last byte and highest bit from that */
    356			u32 pde = ((u32)((*pptr >> (63)) & 0x1));
    357			if ((rval & 0x1) ^ pde)
    358			{
    359				/*
    360				** BUMMER!  -- res_map != pdir --
    361				** Dump rval and matching pdir entries
    362				*/
    363				sba_dump_pdir_entry(ioc, msg, pide);
    364				return(1);
    365			}
    366			rcnt--;
    367			rval >>= 1;	/* try the next bit */
    368			pptr++;
    369			pide++;
    370		}
    371		rptr++;	/* look at next word of res_map */
    372	}
    373	/* It'd be nice if we always got here :^) */
    374	return 0;
    375}
    376
    377
    378/**
    379 * sba_dump_sg - debugging only - print Scatter-Gather list
    380 * @ioc: IO MMU structure which owns the pdir we are interested in.
    381 * @startsg: head of the SG list
    382 * @nents: number of entries in SG list
    383 *
    384 * print the SG list so we can verify it's correct by hand.
    385 */
    386static void
    387sba_dump_sg( struct ioc *ioc, struct scatterlist *startsg, int nents)
    388{
    389	while (nents-- > 0) {
    390		printk(KERN_DEBUG " %d : DMA %08lx/%05x CPU %p\n", nents,
    391		       startsg->dma_address, startsg->dma_length,
    392		       sba_sg_address(startsg));
    393		startsg = sg_next(startsg);
    394	}
    395}
    396
    397static void
    398sba_check_sg( struct ioc *ioc, struct scatterlist *startsg, int nents)
    399{
    400	struct scatterlist *the_sg = startsg;
    401	int the_nents = nents;
    402
    403	while (the_nents-- > 0) {
    404		if (sba_sg_address(the_sg) == 0x0UL)
    405			sba_dump_sg(NULL, startsg, nents);
    406		the_sg = sg_next(the_sg);
    407	}
    408}
    409
    410#endif /* ASSERT_PDIR_SANITY */
    411
    412
    413
    414
    415/**************************************************************
    416*
    417*   I/O Pdir Resource Management
    418*
    419*   Bits set in the resource map are in use.
    420*   Each bit can represent a number of pages.
    421*   LSbs represent lower addresses (IOVA's).
    422*
    423***************************************************************/
    424#define PAGES_PER_RANGE 1	/* could increase this to 4 or 8 if needed */
    425
    426/* Convert from IOVP to IOVA and vice versa. */
    427#define SBA_IOVA(ioc,iovp,offset) ((ioc->ibase) | (iovp) | (offset))
    428#define SBA_IOVP(ioc,iova) ((iova) & ~(ioc->ibase))
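
/*
** Worked example (values assumed for illustration): with ibase ==
** 0x40000000, iovp == 0x3000 and an in-page offset of 0x123,
** SBA_IOVA() yields 0x40003123 and SBA_IOVP() recovers 0x3123;
** PDIR_INDEX() below then shifts away the offset bits, giving pdir
** index 3.
*/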
    429
    430#define PDIR_ENTRY_SIZE	sizeof(u64)
    431
    432#define PDIR_INDEX(iovp)   ((iovp)>>iovp_shift)
    433
    434#define RESMAP_MASK(n)    ~(~0UL << (n))
    435#define RESMAP_IDX_MASK   (sizeof(unsigned long) - 1)
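
/*
** Worked example (illustration only): RESMAP_MASK(4) == 0xf, so marking
** a 4-entry allocation starting at bit 8 of a resource word is
** *res_ptr |= (RESMAP_MASK(4) << 8).  RESMAP_IDX_MASK (7 on 64-bit) is
** used to round a res_map byte index down to the unsigned long that
** contains it.
*/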
    436
    437
    438/**
     439 * For most cases the normal get_order is sufficient; however, it limits us
    440 * to PAGE_SIZE being the minimum mapping alignment and TC flush granularity.
    441 * It only incurs about 1 clock cycle to use this one with the static variable
    442 * and makes the code more intuitive.
    443 */
    444static SBA_INLINE int
    445get_iovp_order (unsigned long size)
    446{
    447	long double d = size - 1;
    448	long order;
    449
    450	order = ia64_getf_exp(d);
    451	order = order - iovp_shift - 0xffff + 1;
    452	if (order < 0)
    453		order = 0;
    454	return order;
    455}
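
/*
** Illustrative sketch only (assumed equivalent, not used by the driver):
** the same "order of iovp-sized pages" could be computed with generic
** bit operations instead of the ia64 getf.exp trick, at the cost of a
** few more instructions:
*/
#if 0
static SBA_INLINE int
get_iovp_order_generic (unsigned long size)
{
	unsigned long pages = (size + iovp_size - 1) >> iovp_shift;
	int order = 0;

	/* order such that (1UL << order) iovp-sized pages cover size */
	while ((1UL << order) < pages)
		order++;
	return order;
}
#endif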
    456
    457static unsigned long ptr_to_pide(struct ioc *ioc, unsigned long *res_ptr,
    458				 unsigned int bitshiftcnt)
    459{
    460	return (((unsigned long)res_ptr - (unsigned long)ioc->res_map) << 3)
    461		+ bitshiftcnt;
    462}
    463
    464/**
    465 * sba_search_bitmap - find free space in IO PDIR resource bitmap
    466 * @ioc: IO MMU structure which owns the pdir we are interested in.
    467 * @bits_wanted: number of entries we need.
    468 * @use_hint: use res_hint to indicate where to start looking
    469 *
    470 * Find consecutive free bits in resource bitmap.
    471 * Each bit represents one entry in the IO Pdir.
    472 * Cool perf optimization: search for log2(size) bits at a time.
    473 */
    474static SBA_INLINE unsigned long
    475sba_search_bitmap(struct ioc *ioc, struct device *dev,
    476		  unsigned long bits_wanted, int use_hint)
    477{
    478	unsigned long *res_ptr;
    479	unsigned long *res_end = (unsigned long *) &(ioc->res_map[ioc->res_size]);
    480	unsigned long flags, pide = ~0UL, tpide;
    481	unsigned long boundary_size;
    482	unsigned long shift;
    483	int ret;
    484
    485	ASSERT(((unsigned long) ioc->res_hint & (sizeof(unsigned long) - 1UL)) == 0);
    486	ASSERT(res_ptr < res_end);
    487
    488	boundary_size = dma_get_seg_boundary_nr_pages(dev, iovp_shift);
    489
    490	BUG_ON(ioc->ibase & ~iovp_mask);
    491	shift = ioc->ibase >> iovp_shift;
    492
    493	spin_lock_irqsave(&ioc->res_lock, flags);
    494
    495	/* Allow caller to force a search through the entire resource space */
    496	if (likely(use_hint)) {
    497		res_ptr = ioc->res_hint;
    498	} else {
    499		res_ptr = (ulong *)ioc->res_map;
    500		ioc->res_bitshift = 0;
    501	}
    502
    503	/*
    504	 * N.B.  REO/Grande defect AR2305 can cause TLB fetch timeouts
    505	 * if a TLB entry is purged while in use.  sba_mark_invalid()
    506	 * purges IOTLB entries in power-of-two sizes, so we also
    507	 * allocate IOVA space in power-of-two sizes.
    508	 */
    509	bits_wanted = 1UL << get_iovp_order(bits_wanted << iovp_shift);
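	/* e.g. a request for 3 iovp-sized pages is rounded up to 4 here */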
    510
    511	if (likely(bits_wanted == 1)) {
    512		unsigned int bitshiftcnt;
    513		for(; res_ptr < res_end ; res_ptr++) {
    514			if (likely(*res_ptr != ~0UL)) {
    515				bitshiftcnt = ffz(*res_ptr);
    516				*res_ptr |= (1UL << bitshiftcnt);
    517				pide = ptr_to_pide(ioc, res_ptr, bitshiftcnt);
    518				ioc->res_bitshift = bitshiftcnt + bits_wanted;
    519				goto found_it;
    520			}
    521		}
    522		goto not_found;
    523
    524	}
    525	
    526	if (likely(bits_wanted <= BITS_PER_LONG/2)) {
    527		/*
    528		** Search the resource bit map on well-aligned values.
    529		** "o" is the alignment.
    530		** We need the alignment to invalidate I/O TLB using
    531		** SBA HW features in the unmap path.
    532		*/
    533		unsigned long o = 1 << get_iovp_order(bits_wanted << iovp_shift);
    534		uint bitshiftcnt = ROUNDUP(ioc->res_bitshift, o);
    535		unsigned long mask, base_mask;
    536
    537		base_mask = RESMAP_MASK(bits_wanted);
    538		mask = base_mask << bitshiftcnt;
    539
    540		DBG_RES("%s() o %ld %p", __func__, o, res_ptr);
    541		for(; res_ptr < res_end ; res_ptr++)
    542		{ 
    543			DBG_RES("    %p %lx %lx\n", res_ptr, mask, *res_ptr);
    544			ASSERT(0 != mask);
    545			for (; mask ; mask <<= o, bitshiftcnt += o) {
    546				tpide = ptr_to_pide(ioc, res_ptr, bitshiftcnt);
    547				ret = iommu_is_span_boundary(tpide, bits_wanted,
    548							     shift,
    549							     boundary_size);
    550				if ((0 == ((*res_ptr) & mask)) && !ret) {
    551					*res_ptr |= mask;     /* mark resources busy! */
    552					pide = tpide;
    553					ioc->res_bitshift = bitshiftcnt + bits_wanted;
    554					goto found_it;
    555				}
    556			}
    557
    558			bitshiftcnt = 0;
    559			mask = base_mask;
    560
    561		}
    562
    563	} else {
    564		int qwords, bits, i;
    565		unsigned long *end;
    566
    567		qwords = bits_wanted >> 6; /* /64 */
    568		bits = bits_wanted - (qwords * BITS_PER_LONG);
    569
    570		end = res_end - qwords;
    571
    572		for (; res_ptr < end; res_ptr++) {
    573			tpide = ptr_to_pide(ioc, res_ptr, 0);
    574			ret = iommu_is_span_boundary(tpide, bits_wanted,
    575						     shift, boundary_size);
    576			if (ret)
    577				goto next_ptr;
    578			for (i = 0 ; i < qwords ; i++) {
    579				if (res_ptr[i] != 0)
    580					goto next_ptr;
    581			}
    582			if (bits && res_ptr[i] && (__ffs(res_ptr[i]) < bits))
    583				continue;
    584
    585			/* Found it, mark it */
    586			for (i = 0 ; i < qwords ; i++)
    587				res_ptr[i] = ~0UL;
    588			res_ptr[i] |= RESMAP_MASK(bits);
    589
    590			pide = tpide;
    591			res_ptr += qwords;
    592			ioc->res_bitshift = bits;
    593			goto found_it;
    594next_ptr:
    595			;
    596		}
    597	}
    598
    599not_found:
    600	prefetch(ioc->res_map);
    601	ioc->res_hint = (unsigned long *) ioc->res_map;
    602	ioc->res_bitshift = 0;
    603	spin_unlock_irqrestore(&ioc->res_lock, flags);
    604	return (pide);
    605
    606found_it:
    607	ioc->res_hint = res_ptr;
    608	spin_unlock_irqrestore(&ioc->res_lock, flags);
    609	return (pide);
    610}
    611
    612
    613/**
    614 * sba_alloc_range - find free bits and mark them in IO PDIR resource bitmap
    615 * @ioc: IO MMU structure which owns the pdir we are interested in.
    616 * @size: number of bytes to create a mapping for
    617 *
     618 * Given a size, find consecutive unmarked bits and then mark them in the
    619 * resource bit map.
    620 */
    621static int
    622sba_alloc_range(struct ioc *ioc, struct device *dev, size_t size)
    623{
    624	unsigned int pages_needed = size >> iovp_shift;
    625#ifdef PDIR_SEARCH_TIMING
    626	unsigned long itc_start;
    627#endif
    628	unsigned long pide;
    629
    630	ASSERT(pages_needed);
    631	ASSERT(0 == (size & ~iovp_mask));
    632
    633#ifdef PDIR_SEARCH_TIMING
    634	itc_start = ia64_get_itc();
    635#endif
    636	/*
    637	** "seek and ye shall find"...praying never hurts either...
    638	*/
    639	pide = sba_search_bitmap(ioc, dev, pages_needed, 1);
    640	if (unlikely(pide >= (ioc->res_size << 3))) {
    641		pide = sba_search_bitmap(ioc, dev, pages_needed, 0);
    642		if (unlikely(pide >= (ioc->res_size << 3))) {
    643#if DELAYED_RESOURCE_CNT > 0
    644			unsigned long flags;
    645
    646			/*
    647			** With delayed resource freeing, we can give this one more shot.  We're
    648			** getting close to being in trouble here, so do what we can to make this
    649			** one count.
    650			*/
    651			spin_lock_irqsave(&ioc->saved_lock, flags);
    652			if (ioc->saved_cnt > 0) {
    653				struct sba_dma_pair *d;
    654				int cnt = ioc->saved_cnt;
    655
    656				d = &(ioc->saved[ioc->saved_cnt - 1]);
    657
    658				spin_lock(&ioc->res_lock);
    659				while (cnt--) {
    660					sba_mark_invalid(ioc, d->iova, d->size);
    661					sba_free_range(ioc, d->iova, d->size);
    662					d--;
    663				}
    664				ioc->saved_cnt = 0;
    665				READ_REG(ioc->ioc_hpa+IOC_PCOM);	/* flush purges */
    666				spin_unlock(&ioc->res_lock);
    667			}
    668			spin_unlock_irqrestore(&ioc->saved_lock, flags);
    669
    670			pide = sba_search_bitmap(ioc, dev, pages_needed, 0);
    671			if (unlikely(pide >= (ioc->res_size << 3))) {
     672				printk(KERN_WARNING "%s: I/O MMU @ %p is "
     673				       "out of mapping resources, %u %u %lx\n",
    674				       __func__, ioc->ioc_hpa, ioc->res_size,
    675				       pages_needed, dma_get_seg_boundary(dev));
    676				return -1;
    677			}
    678#else
     679			printk(KERN_WARNING "%s: I/O MMU @ %p is "
     680			       "out of mapping resources, %u %u %lx\n",
    681			       __func__, ioc->ioc_hpa, ioc->res_size,
    682			       pages_needed, dma_get_seg_boundary(dev));
    683			return -1;
    684#endif
    685		}
    686	}
    687
    688#ifdef PDIR_SEARCH_TIMING
    689	ioc->avg_search[ioc->avg_idx++] = (ia64_get_itc() - itc_start) / pages_needed;
    690	ioc->avg_idx &= SBA_SEARCH_SAMPLE - 1;
    691#endif
    692
    693	prefetchw(&(ioc->pdir_base[pide]));
    694
    695#ifdef ASSERT_PDIR_SANITY
    696	/* verify the first enable bit is clear */
    697	if(0x00 != ((u8 *) ioc->pdir_base)[pide*PDIR_ENTRY_SIZE + 7]) {
    698		sba_dump_pdir_entry(ioc, "sba_search_bitmap() botched it?", pide);
    699	}
    700#endif
    701
    702	DBG_RES("%s(%x) %d -> %lx hint %x/%x\n",
    703		__func__, size, pages_needed, pide,
    704		(uint) ((unsigned long) ioc->res_hint - (unsigned long) ioc->res_map),
    705		ioc->res_bitshift );
    706
    707	return (pide);
    708}
    709
    710
    711/**
    712 * sba_free_range - unmark bits in IO PDIR resource bitmap
    713 * @ioc: IO MMU structure which owns the pdir we are interested in.
    714 * @iova: IO virtual address which was previously allocated.
    715 * @size: number of bytes to create a mapping for
    716 *
    717 * clear bits in the ioc's resource map
    718 */
    719static SBA_INLINE void
    720sba_free_range(struct ioc *ioc, dma_addr_t iova, size_t size)
    721{
    722	unsigned long iovp = SBA_IOVP(ioc, iova);
    723	unsigned int pide = PDIR_INDEX(iovp);
    724	unsigned int ridx = pide >> 3;	/* convert bit to byte address */
    725	unsigned long *res_ptr = (unsigned long *) &((ioc)->res_map[ridx & ~RESMAP_IDX_MASK]);
    726	int bits_not_wanted = size >> iovp_shift;
    727	unsigned long m;
    728
    729	/* Round up to power-of-two size: see AR2305 note above */
    730	bits_not_wanted = 1UL << get_iovp_order(bits_not_wanted << iovp_shift);
    731	for (; bits_not_wanted > 0 ; res_ptr++) {
    732		
    733		if (unlikely(bits_not_wanted > BITS_PER_LONG)) {
    734
    735			/* these mappings start 64bit aligned */
    736			*res_ptr = 0UL;
    737			bits_not_wanted -= BITS_PER_LONG;
    738			pide += BITS_PER_LONG;
    739
    740		} else {
    741
    742			/* 3-bits "bit" address plus 2 (or 3) bits for "byte" == bit in word */
    743			m = RESMAP_MASK(bits_not_wanted) << (pide & (BITS_PER_LONG - 1));
    744			bits_not_wanted = 0;
    745
    746			DBG_RES("%s( ,%x,%x) %x/%lx %x %p %lx\n", __func__, (uint) iova, size,
    747			        bits_not_wanted, m, pide, res_ptr, *res_ptr);
    748
    749			ASSERT(m != 0);
    750			ASSERT(bits_not_wanted);
    751			ASSERT((*res_ptr & m) == m); /* verify same bits are set */
    752			*res_ptr &= ~m;
    753		}
    754	}
    755}
    756
    757
    758/**************************************************************
    759*
    760*   "Dynamic DMA Mapping" support (aka "Coherent I/O")
    761*
    762***************************************************************/
    763
    764/**
    765 * sba_io_pdir_entry - fill in one IO PDIR entry
    766 * @pdir_ptr:  pointer to IO PDIR entry
    767 * @vba: Virtual CPU address of buffer to map
    768 *
    769 * SBA Mapping Routine
    770 *
    771 * Given a virtual address (vba, arg1) sba_io_pdir_entry()
    772 * loads the I/O PDIR entry pointed to by pdir_ptr (arg0).
    773 * Each IO Pdir entry consists of 8 bytes as shown below
    774 * (LSB == bit 0):
    775 *
    776 *  63                    40                                 11    7        0
    777 * +-+---------------------+----------------------------------+----+--------+
    778 * |V|        U            |            PPN[39:12]            | U  |   FF   |
    779 * +-+---------------------+----------------------------------+----+--------+
    780 *
    781 *  V  == Valid Bit
    782 *  U  == Unused
    783 * PPN == Physical Page Number
    784 *
    785 * The physical address fields are filled with the results of virt_to_phys()
    786 * on the vba.
    787 */
    788
    789#if 1
    790#define sba_io_pdir_entry(pdir_ptr, vba) *pdir_ptr = ((vba & ~0xE000000000000FFFULL)	\
    791						      | 0x8000000000000000ULL)
    792#else
    793void SBA_INLINE
    794sba_io_pdir_entry(u64 *pdir_ptr, unsigned long vba)
    795{
    796	*pdir_ptr = ((vba & ~0xE000000000000FFFULL) | 0x80000000000000FFULL);
    797}
    798#endif
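
/*
** Worked example of the encoding above (values assumed): for
** vba == 0xe000000012345678 (region-7 identity-mapped kernel address),
** the ~0xE000000000000FFF mask clears the region bits and the page
** offset, and bit 63 marks the entry valid, so the pdir entry becomes
** 0x8000000012345000.
*/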
    799
    800#ifdef ENABLE_MARK_CLEAN
    801/**
    802 * Since DMA is i-cache coherent, any (complete) pages that were written via
    803 * DMA can be marked as "clean" so that lazy_mmu_prot_update() doesn't have to
    804 * flush them when they get mapped into an executable vm-area.
    805 */
    806static void
    807mark_clean (void *addr, size_t size)
    808{
    809	unsigned long pg_addr, end;
    810
    811	pg_addr = PAGE_ALIGN((unsigned long) addr);
    812	end = (unsigned long) addr + size;
    813	while (pg_addr + PAGE_SIZE <= end) {
    814		struct page *page = virt_to_page((void *)pg_addr);
    815		set_bit(PG_arch_1, &page->flags);
    816		pg_addr += PAGE_SIZE;
    817	}
    818}
    819#endif
    820
    821/**
    822 * sba_mark_invalid - invalidate one or more IO PDIR entries
    823 * @ioc: IO MMU structure which owns the pdir we are interested in.
    824 * @iova:  IO Virtual Address mapped earlier
    825 * @byte_cnt:  number of bytes this mapping covers.
    826 *
     827 * Mark the IO PDIR entry(ies) as Invalid and invalidate the
     828 * corresponding IO TLB entry. The PCOM (Purge Command Register)
     829 * is used to purge stale entries from the IO TLB when unmapping.
    830 *
     831 * The PCOM register supports purging of multiple pages, with a minimum
    832 * of 1 page and a maximum of 2GB. Hardware requires the address be
    833 * aligned to the size of the range being purged. The size of the range
    834 * must be a power of 2. The "Cool perf optimization" in the
    835 * allocation routine helps keep that true.
    836 */
    837static SBA_INLINE void
    838sba_mark_invalid(struct ioc *ioc, dma_addr_t iova, size_t byte_cnt)
    839{
    840	u32 iovp = (u32) SBA_IOVP(ioc,iova);
    841
    842	int off = PDIR_INDEX(iovp);
    843
    844	/* Must be non-zero and rounded up */
    845	ASSERT(byte_cnt > 0);
    846	ASSERT(0 == (byte_cnt & ~iovp_mask));
    847
    848#ifdef ASSERT_PDIR_SANITY
    849	/* Assert first pdir entry is set */
    850	if (!(ioc->pdir_base[off] >> 60)) {
    851		sba_dump_pdir_entry(ioc,"sba_mark_invalid()", PDIR_INDEX(iovp));
    852	}
    853#endif
    854
    855	if (byte_cnt <= iovp_size)
    856	{
    857		ASSERT(off < ioc->pdir_size);
    858
    859		iovp |= iovp_shift;     /* set "size" field for PCOM */
    860
    861#ifndef FULL_VALID_PDIR
    862		/*
    863		** clear I/O PDIR entry "valid" bit
    864		** Do NOT clear the rest - save it for debugging.
    865		** We should only clear bits that have previously
    866		** been enabled.
    867		*/
    868		ioc->pdir_base[off] &= ~(0x80000000000000FFULL);
    869#else
    870		/*
    871  		** If we want to maintain the PDIR as valid, put in
    872		** the spill page so devices prefetching won't
    873		** cause a hard fail.
    874		*/
    875		ioc->pdir_base[off] = (0x80000000000000FFULL | prefetch_spill_page);
    876#endif
    877	} else {
    878		u32 t = get_iovp_order(byte_cnt) + iovp_shift;
    879
    880		iovp |= t;
    881		ASSERT(t <= 31);   /* 2GB! Max value of "size" field */
    882
    883		do {
    884			/* verify this pdir entry is enabled */
    885			ASSERT(ioc->pdir_base[off]  >> 63);
    886#ifndef FULL_VALID_PDIR
    887			/* clear I/O Pdir entry "valid" bit first */
    888			ioc->pdir_base[off] &= ~(0x80000000000000FFULL);
    889#else
    890			ioc->pdir_base[off] = (0x80000000000000FFULL | prefetch_spill_page);
    891#endif
    892			off++;
    893			byte_cnt -= iovp_size;
    894		} while (byte_cnt > 0);
    895	}
    896
    897	WRITE_REG(iovp | ioc->ibase, ioc->ioc_hpa+IOC_PCOM);
    898}
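
/*
** Worked example of the PCOM encoding above (values assumed): purging
** 8KB at iovp 0x4000 with 4k pages sets the size field to
** get_iovp_order(8192) + iovp_shift == 13, so the register write is
** WRITE_REG(ioc->ibase | 0x400d, ioc->ioc_hpa + IOC_PCOM).
*/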
    899
    900/**
    901 * sba_map_page - map one buffer and return IOVA for DMA
    902 * @dev: instance of PCI owned by the driver that's asking.
    903 * @page: page to map
    904 * @poff: offset into page
    905 * @size: number of bytes to map
    906 * @dir: dma direction
    907 * @attrs: optional dma attributes
    908 *
    909 * See Documentation/core-api/dma-api-howto.rst
    910 */
    911static dma_addr_t sba_map_page(struct device *dev, struct page *page,
    912			       unsigned long poff, size_t size,
    913			       enum dma_data_direction dir,
    914			       unsigned long attrs)
    915{
    916	struct ioc *ioc;
    917	void *addr = page_address(page) + poff;
    918	dma_addr_t iovp;
    919	dma_addr_t offset;
    920	u64 *pdir_start;
    921	int pide;
    922#ifdef ASSERT_PDIR_SANITY
    923	unsigned long flags;
    924#endif
    925#ifdef ALLOW_IOV_BYPASS
    926	unsigned long pci_addr = virt_to_phys(addr);
    927#endif
    928
    929#ifdef ALLOW_IOV_BYPASS
    930	ASSERT(to_pci_dev(dev)->dma_mask);
    931	/*
    932 	** Check if the PCI device can DMA to ptr... if so, just return ptr
    933 	*/
    934	if (likely((pci_addr & ~to_pci_dev(dev)->dma_mask) == 0)) {
    935		/*
     936 		** Device is capable of DMA'ing to the buffer...
    937		** just return the PCI address of ptr
    938 		*/
    939		DBG_BYPASS("sba_map_page() bypass mask/addr: "
    940			   "0x%lx/0x%lx\n",
    941		           to_pci_dev(dev)->dma_mask, pci_addr);
    942		return pci_addr;
    943	}
    944#endif
    945	ioc = GET_IOC(dev);
    946	ASSERT(ioc);
    947
    948	prefetch(ioc->res_hint);
    949
    950	ASSERT(size > 0);
    951	ASSERT(size <= DMA_CHUNK_SIZE);
    952
    953	/* save offset bits */
    954	offset = ((dma_addr_t) (long) addr) & ~iovp_mask;
    955
    956	/* round up to nearest iovp_size */
    957	size = (size + offset + ~iovp_mask) & iovp_mask;
    958
    959#ifdef ASSERT_PDIR_SANITY
    960	spin_lock_irqsave(&ioc->res_lock, flags);
    961	if (sba_check_pdir(ioc,"Check before sba_map_page()"))
    962		panic("Sanity check failed");
    963	spin_unlock_irqrestore(&ioc->res_lock, flags);
    964#endif
    965
    966	pide = sba_alloc_range(ioc, dev, size);
    967	if (pide < 0)
    968		return DMA_MAPPING_ERROR;
    969
    970	iovp = (dma_addr_t) pide << iovp_shift;
    971
    972	DBG_RUN("%s() 0x%p -> 0x%lx\n", __func__, addr, (long) iovp | offset);
    973
    974	pdir_start = &(ioc->pdir_base[pide]);
    975
    976	while (size > 0) {
    977		ASSERT(((u8 *)pdir_start)[7] == 0); /* verify availability */
    978		sba_io_pdir_entry(pdir_start, (unsigned long) addr);
    979
    980		DBG_RUN("     pdir 0x%p %lx\n", pdir_start, *pdir_start);
    981
    982		addr += iovp_size;
    983		size -= iovp_size;
    984		pdir_start++;
    985	}
    986	/* force pdir update */
    987	wmb();
    988
    989	/* form complete address */
    990#ifdef ASSERT_PDIR_SANITY
    991	spin_lock_irqsave(&ioc->res_lock, flags);
    992	sba_check_pdir(ioc,"Check after sba_map_page()");
    993	spin_unlock_irqrestore(&ioc->res_lock, flags);
    994#endif
    995	return SBA_IOVA(ioc, iovp, offset);
    996}
    997
    998#ifdef ENABLE_MARK_CLEAN
    999static SBA_INLINE void
   1000sba_mark_clean(struct ioc *ioc, dma_addr_t iova, size_t size)
   1001{
   1002	u32	iovp = (u32) SBA_IOVP(ioc,iova);
   1003	int	off = PDIR_INDEX(iovp);
   1004	void	*addr;
   1005
   1006	if (size <= iovp_size) {
   1007		addr = phys_to_virt(ioc->pdir_base[off] &
   1008		                    ~0xE000000000000FFFULL);
   1009		mark_clean(addr, size);
   1010	} else {
   1011		do {
   1012			addr = phys_to_virt(ioc->pdir_base[off] &
   1013			                    ~0xE000000000000FFFULL);
   1014			mark_clean(addr, min(size, iovp_size));
   1015			off++;
   1016			size -= iovp_size;
   1017		} while (size > 0);
   1018	}
   1019}
   1020#endif
   1021
   1022/**
   1023 * sba_unmap_page - unmap one IOVA and free resources
   1024 * @dev: instance of PCI owned by the driver that's asking.
   1025 * @iova:  IOVA of driver buffer previously mapped.
   1026 * @size:  number of bytes mapped in driver buffer.
   1027 * @dir:  R/W or both.
   1028 * @attrs: optional dma attributes
   1029 *
   1030 * See Documentation/core-api/dma-api-howto.rst
   1031 */
   1032static void sba_unmap_page(struct device *dev, dma_addr_t iova, size_t size,
   1033			   enum dma_data_direction dir, unsigned long attrs)
   1034{
   1035	struct ioc *ioc;
   1036#if DELAYED_RESOURCE_CNT > 0
   1037	struct sba_dma_pair *d;
   1038#endif
   1039	unsigned long flags;
   1040	dma_addr_t offset;
   1041
   1042	ioc = GET_IOC(dev);
   1043	ASSERT(ioc);
   1044
   1045#ifdef ALLOW_IOV_BYPASS
   1046	if (likely((iova & ioc->imask) != ioc->ibase)) {
   1047		/*
   1048		** Address does not fall w/in IOVA, must be bypassing
   1049		*/
   1050		DBG_BYPASS("sba_unmap_page() bypass addr: 0x%lx\n",
   1051			   iova);
   1052
   1053#ifdef ENABLE_MARK_CLEAN
   1054		if (dir == DMA_FROM_DEVICE) {
   1055			mark_clean(phys_to_virt(iova), size);
   1056		}
   1057#endif
   1058		return;
   1059	}
   1060#endif
   1061	offset = iova & ~iovp_mask;
   1062
   1063	DBG_RUN("%s() iovp 0x%lx/%x\n", __func__, (long) iova, size);
   1064
   1065	iova ^= offset;        /* clear offset bits */
   1066	size += offset;
   1067	size = ROUNDUP(size, iovp_size);
   1068
   1069#ifdef ENABLE_MARK_CLEAN
   1070	if (dir == DMA_FROM_DEVICE)
   1071		sba_mark_clean(ioc, iova, size);
   1072#endif
   1073
   1074#if DELAYED_RESOURCE_CNT > 0
   1075	spin_lock_irqsave(&ioc->saved_lock, flags);
   1076	d = &(ioc->saved[ioc->saved_cnt]);
   1077	d->iova = iova;
   1078	d->size = size;
   1079	if (unlikely(++(ioc->saved_cnt) >= DELAYED_RESOURCE_CNT)) {
   1080		int cnt = ioc->saved_cnt;
   1081		spin_lock(&ioc->res_lock);
   1082		while (cnt--) {
   1083			sba_mark_invalid(ioc, d->iova, d->size);
   1084			sba_free_range(ioc, d->iova, d->size);
   1085			d--;
   1086		}
   1087		ioc->saved_cnt = 0;
   1088		READ_REG(ioc->ioc_hpa+IOC_PCOM);	/* flush purges */
   1089		spin_unlock(&ioc->res_lock);
   1090	}
   1091	spin_unlock_irqrestore(&ioc->saved_lock, flags);
   1092#else /* DELAYED_RESOURCE_CNT == 0 */
   1093	spin_lock_irqsave(&ioc->res_lock, flags);
   1094	sba_mark_invalid(ioc, iova, size);
   1095	sba_free_range(ioc, iova, size);
   1096	READ_REG(ioc->ioc_hpa+IOC_PCOM);	/* flush purges */
   1097	spin_unlock_irqrestore(&ioc->res_lock, flags);
   1098#endif /* DELAYED_RESOURCE_CNT == 0 */
   1099}
   1100
   1101/**
   1102 * sba_alloc_coherent - allocate/map shared mem for DMA
   1103 * @dev: instance of PCI owned by the driver that's asking.
   1104 * @size:  number of bytes mapped in driver buffer.
   1105 * @dma_handle:  IOVA of new buffer.
   1106 *
   1107 * See Documentation/core-api/dma-api-howto.rst
   1108 */
   1109static void *
   1110sba_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
   1111		   gfp_t flags, unsigned long attrs)
   1112{
   1113	struct page *page;
   1114	struct ioc *ioc;
   1115	int node = -1;
   1116	void *addr;
   1117
   1118	ioc = GET_IOC(dev);
   1119	ASSERT(ioc);
   1120#ifdef CONFIG_NUMA
   1121	node = ioc->node;
   1122#endif
   1123
   1124	page = alloc_pages_node(node, flags, get_order(size));
   1125	if (unlikely(!page))
   1126		return NULL;
   1127
   1128	addr = page_address(page);
   1129	memset(addr, 0, size);
   1130	*dma_handle = page_to_phys(page);
   1131
   1132#ifdef ALLOW_IOV_BYPASS
   1133	ASSERT(dev->coherent_dma_mask);
   1134	/*
   1135 	** Check if the PCI device can DMA to ptr... if so, just return ptr
   1136 	*/
   1137	if (likely((*dma_handle & ~dev->coherent_dma_mask) == 0)) {
   1138		DBG_BYPASS("sba_alloc_coherent() bypass mask/addr: 0x%lx/0x%lx\n",
   1139		           dev->coherent_dma_mask, *dma_handle);
   1140
   1141		return addr;
   1142	}
   1143#endif
   1144
   1145	/*
   1146	 * If device can't bypass or bypass is disabled, pass the 32bit fake
    1147	 * device to sba_map_page() to get an iova mapping.
   1148	 */
   1149	*dma_handle = sba_map_page(&ioc->sac_only_dev->dev, page, 0, size,
   1150			DMA_BIDIRECTIONAL, 0);
   1151	if (dma_mapping_error(dev, *dma_handle))
   1152		return NULL;
   1153	return addr;
   1154}
   1155
   1156
   1157/**
   1158 * sba_free_coherent - free/unmap shared mem for DMA
   1159 * @dev: instance of PCI owned by the driver that's asking.
   1160 * @size:  number of bytes mapped in driver buffer.
   1161 * @vaddr:  virtual address IOVA of "consistent" buffer.
    1162 * @dma_handle:  IO virtual address of "consistent" buffer.
   1163 *
   1164 * See Documentation/core-api/dma-api-howto.rst
   1165 */
   1166static void sba_free_coherent(struct device *dev, size_t size, void *vaddr,
   1167			      dma_addr_t dma_handle, unsigned long attrs)
   1168{
   1169	sba_unmap_page(dev, dma_handle, size, 0, 0);
   1170	free_pages((unsigned long) vaddr, get_order(size));
   1171}
   1172
   1173
   1174/*
    1175** Since 0 is a valid pdir_base index value, we can't use that
   1176** to determine if a value is valid or not. Use a flag to indicate
   1177** the SG list entry contains a valid pdir index.
   1178*/
   1179#define PIDE_FLAG 0x1UL
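
/*
** Example of the encoding (values assumed): sba_coalesce_chunks() below
** stashes  PIDE_FLAG | (idx << iovp_shift) | dma_offset  in dma_address,
** e.g. idx 5 with a 4k iovp_size and offset 0x10 gives 0x5011.
** sba_fill_pdir() strips PIDE_FLAG, keeps the offset and ORs in
** ioc->ibase to form the DMA address handed back to the caller.
*/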
   1180
   1181#ifdef DEBUG_LARGE_SG_ENTRIES
   1182int dump_run_sg = 0;
   1183#endif
   1184
   1185
   1186/**
   1187 * sba_fill_pdir - write allocated SG entries into IO PDIR
   1188 * @ioc: IO MMU structure which owns the pdir we are interested in.
   1189 * @startsg:  list of IOVA/size pairs
   1190 * @nents: number of entries in startsg list
   1191 *
   1192 * Take preprocessed SG list and write corresponding entries
   1193 * in the IO PDIR.
   1194 */
   1195
   1196static SBA_INLINE int
   1197sba_fill_pdir(
   1198	struct ioc *ioc,
   1199	struct scatterlist *startsg,
   1200	int nents)
   1201{
   1202	struct scatterlist *dma_sg = startsg;	/* pointer to current DMA */
   1203	int n_mappings = 0;
   1204	u64 *pdirp = NULL;
   1205	unsigned long dma_offset = 0;
   1206
   1207	while (nents-- > 0) {
   1208		int     cnt = startsg->dma_length;
   1209		startsg->dma_length = 0;
   1210
   1211#ifdef DEBUG_LARGE_SG_ENTRIES
   1212		if (dump_run_sg)
   1213			printk(" %2d : %08lx/%05x %p\n",
   1214				nents, startsg->dma_address, cnt,
   1215				sba_sg_address(startsg));
   1216#else
   1217		DBG_RUN_SG(" %d : %08lx/%05x %p\n",
   1218				nents, startsg->dma_address, cnt,
   1219				sba_sg_address(startsg));
   1220#endif
   1221		/*
   1222		** Look for the start of a new DMA stream
   1223		*/
   1224		if (startsg->dma_address & PIDE_FLAG) {
   1225			u32 pide = startsg->dma_address & ~PIDE_FLAG;
   1226			dma_offset = (unsigned long) pide & ~iovp_mask;
   1227			startsg->dma_address = 0;
   1228			if (n_mappings)
   1229				dma_sg = sg_next(dma_sg);
   1230			dma_sg->dma_address = pide | ioc->ibase;
   1231			pdirp = &(ioc->pdir_base[pide >> iovp_shift]);
   1232			n_mappings++;
   1233		}
   1234
   1235		/*
   1236		** Look for a VCONTIG chunk
   1237		*/
   1238		if (cnt) {
   1239			unsigned long vaddr = (unsigned long) sba_sg_address(startsg);
   1240			ASSERT(pdirp);
   1241
   1242			/* Since multiple Vcontig blocks could make up
   1243			** one DMA stream, *add* cnt to dma_len.
   1244			*/
   1245			dma_sg->dma_length += cnt;
   1246			cnt += dma_offset;
   1247			dma_offset=0;	/* only want offset on first chunk */
   1248			cnt = ROUNDUP(cnt, iovp_size);
   1249			do {
   1250				sba_io_pdir_entry(pdirp, vaddr);
   1251				vaddr += iovp_size;
   1252				cnt -= iovp_size;
   1253				pdirp++;
   1254			} while (cnt > 0);
   1255		}
   1256		startsg = sg_next(startsg);
   1257	}
   1258	/* force pdir update */
   1259	wmb();
   1260
   1261#ifdef DEBUG_LARGE_SG_ENTRIES
   1262	dump_run_sg = 0;
   1263#endif
   1264	return(n_mappings);
   1265}
   1266
   1267
   1268/*
   1269** Two address ranges are DMA contiguous *iff* "end of prev" and
   1270** "start of next" are both on an IOV page boundary.
   1271**
   1272** (shift left is a quick trick to mask off upper bits)
   1273*/
   1274#define DMA_CONTIG(__X, __Y) \
   1275	(((((unsigned long) __X) | ((unsigned long) __Y)) << (BITS_PER_LONG - iovp_shift)) == 0UL)
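
/*
** Example (assuming a 4k iovp_size): an end-of-prev of 0xa0004000 and a
** start-of-next of 0xb0008000 both have zero low 12 bits, so the ORed
** value shifted left by (BITS_PER_LONG - 12) is 0 and the ranges are
** DMA contiguous; any non-zero page offset in either address survives
** the shift and fails the test.
*/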
   1276
   1277
   1278/**
   1279 * sba_coalesce_chunks - preprocess the SG list
   1280 * @ioc: IO MMU structure which owns the pdir we are interested in.
   1281 * @startsg:  list of IOVA/size pairs
   1282 * @nents: number of entries in startsg list
   1283 *
   1284 * First pass is to walk the SG list and determine where the breaks are
   1285 * in the DMA stream. Allocates PDIR entries but does not fill them.
   1286 * Returns the number of DMA chunks.
   1287 *
   1288 * Doing the fill separate from the coalescing/allocation keeps the
   1289 * code simpler. Future enhancement could make one pass through
   1290 * the sglist do both.
   1291 */
   1292static SBA_INLINE int
   1293sba_coalesce_chunks(struct ioc *ioc, struct device *dev,
   1294	struct scatterlist *startsg,
   1295	int nents)
   1296{
   1297	struct scatterlist *vcontig_sg;    /* VCONTIG chunk head */
   1298	unsigned long vcontig_len;         /* len of VCONTIG chunk */
   1299	unsigned long vcontig_end;
   1300	struct scatterlist *dma_sg;        /* next DMA stream head */
   1301	unsigned long dma_offset, dma_len; /* start/len of DMA stream */
   1302	int n_mappings = 0;
   1303	unsigned int max_seg_size = dma_get_max_seg_size(dev);
   1304	int idx;
   1305
   1306	while (nents > 0) {
   1307		unsigned long vaddr = (unsigned long) sba_sg_address(startsg);
   1308
   1309		/*
   1310		** Prepare for first/next DMA stream
   1311		*/
   1312		dma_sg = vcontig_sg = startsg;
   1313		dma_len = vcontig_len = vcontig_end = startsg->length;
   1314		vcontig_end +=  vaddr;
   1315		dma_offset = vaddr & ~iovp_mask;
   1316
   1317		/* PARANOID: clear entries */
   1318		startsg->dma_address = startsg->dma_length = 0;
   1319
   1320		/*
   1321		** This loop terminates one iteration "early" since
   1322		** it's always looking one "ahead".
   1323		*/
   1324		while (--nents > 0) {
   1325			unsigned long vaddr;	/* tmp */
   1326
   1327			startsg = sg_next(startsg);
   1328
   1329			/* PARANOID */
   1330			startsg->dma_address = startsg->dma_length = 0;
   1331
   1332			/* catch brokenness in SCSI layer */
   1333			ASSERT(startsg->length <= DMA_CHUNK_SIZE);
   1334
   1335			/*
   1336			** First make sure current dma stream won't
   1337			** exceed DMA_CHUNK_SIZE if we coalesce the
   1338			** next entry.
   1339			*/
   1340			if (((dma_len + dma_offset + startsg->length + ~iovp_mask) & iovp_mask)
   1341			    > DMA_CHUNK_SIZE)
   1342				break;
   1343
   1344			if (dma_len + startsg->length > max_seg_size)
   1345				break;
   1346
   1347			/*
   1348			** Then look for virtually contiguous blocks.
   1349			**
   1350			** append the next transaction?
   1351			*/
   1352			vaddr = (unsigned long) sba_sg_address(startsg);
   1353			if  (vcontig_end == vaddr)
   1354			{
   1355				vcontig_len += startsg->length;
   1356				vcontig_end += startsg->length;
   1357				dma_len     += startsg->length;
   1358				continue;
   1359			}
   1360
   1361#ifdef DEBUG_LARGE_SG_ENTRIES
   1362			dump_run_sg = (vcontig_len > iovp_size);
   1363#endif
   1364
   1365			/*
   1366			** Not virtually contiguous.
   1367			** Terminate prev chunk.
   1368			** Start a new chunk.
   1369			**
   1370			** Once we start a new VCONTIG chunk, dma_offset
   1371			** can't change. And we need the offset from the first
    1372			** chunk - not the last one. Ergo successive chunks
    1373			** must start on page boundaries and dovetail
    1374			** with their predecessor.
   1375			*/
   1376			vcontig_sg->dma_length = vcontig_len;
   1377
   1378			vcontig_sg = startsg;
   1379			vcontig_len = startsg->length;
   1380
   1381			/*
   1382			** 3) do the entries end/start on page boundaries?
   1383			**    Don't update vcontig_end until we've checked.
   1384			*/
   1385			if (DMA_CONTIG(vcontig_end, vaddr))
   1386			{
   1387				vcontig_end = vcontig_len + vaddr;
   1388				dma_len += vcontig_len;
   1389				continue;
   1390			} else {
   1391				break;
   1392			}
   1393		}
   1394
   1395		/*
   1396		** End of DMA Stream
   1397		** Terminate last VCONTIG block.
   1398		** Allocate space for DMA stream.
   1399		*/
   1400		vcontig_sg->dma_length = vcontig_len;
   1401		dma_len = (dma_len + dma_offset + ~iovp_mask) & iovp_mask;
   1402		ASSERT(dma_len <= DMA_CHUNK_SIZE);
   1403		idx = sba_alloc_range(ioc, dev, dma_len);
   1404		if (idx < 0) {
   1405			dma_sg->dma_length = 0;
   1406			return -1;
   1407		}
   1408		dma_sg->dma_address = (dma_addr_t)(PIDE_FLAG | (idx << iovp_shift)
   1409						   | dma_offset);
   1410		n_mappings++;
   1411	}
   1412
   1413	return n_mappings;
   1414}
   1415
   1416static void sba_unmap_sg_attrs(struct device *dev, struct scatterlist *sglist,
   1417			       int nents, enum dma_data_direction dir,
   1418			       unsigned long attrs);
   1419/**
   1420 * sba_map_sg - map Scatter/Gather list
   1421 * @dev: instance of PCI owned by the driver that's asking.
   1422 * @sglist:  array of buffer/length pairs
   1423 * @nents:  number of entries in list
   1424 * @dir:  R/W or both.
   1425 * @attrs: optional dma attributes
   1426 *
   1427 * See Documentation/core-api/dma-api-howto.rst
   1428 */
   1429static int sba_map_sg_attrs(struct device *dev, struct scatterlist *sglist,
   1430			    int nents, enum dma_data_direction dir,
   1431			    unsigned long attrs)
   1432{
   1433	struct ioc *ioc;
   1434	int coalesced, filled = 0;
   1435#ifdef ASSERT_PDIR_SANITY
   1436	unsigned long flags;
   1437#endif
   1438#ifdef ALLOW_IOV_BYPASS_SG
   1439	struct scatterlist *sg;
   1440#endif
   1441
   1442	DBG_RUN_SG("%s() START %d entries\n", __func__, nents);
   1443	ioc = GET_IOC(dev);
   1444	ASSERT(ioc);
   1445
   1446#ifdef ALLOW_IOV_BYPASS_SG
   1447	ASSERT(to_pci_dev(dev)->dma_mask);
   1448	if (likely((ioc->dma_mask & ~to_pci_dev(dev)->dma_mask) == 0)) {
   1449		for_each_sg(sglist, sg, nents, filled) {
   1450			sg->dma_length = sg->length;
   1451			sg->dma_address = virt_to_phys(sba_sg_address(sg));
   1452		}
   1453		return filled;
   1454	}
   1455#endif
   1456	/* Fast path single entry scatterlists. */
   1457	if (nents == 1) {
   1458		sglist->dma_length = sglist->length;
   1459		sglist->dma_address = sba_map_page(dev, sg_page(sglist),
   1460				sglist->offset, sglist->length, dir, attrs);
   1461		if (dma_mapping_error(dev, sglist->dma_address))
   1462			return -EIO;
   1463		return 1;
   1464	}
   1465
   1466#ifdef ASSERT_PDIR_SANITY
   1467	spin_lock_irqsave(&ioc->res_lock, flags);
   1468	if (sba_check_pdir(ioc,"Check before sba_map_sg_attrs()"))
   1469	{
   1470		sba_dump_sg(ioc, sglist, nents);
   1471		panic("Check before sba_map_sg_attrs()");
   1472	}
   1473	spin_unlock_irqrestore(&ioc->res_lock, flags);
   1474#endif
   1475
   1476	prefetch(ioc->res_hint);
   1477
   1478	/*
   1479	** First coalesce the chunks and allocate I/O pdir space
   1480	**
   1481	** If this is one DMA stream, we can properly map using the
   1482	** correct virtual address associated with each DMA page.
    1483	** Without this association, we wouldn't have coherent DMA!
   1484	** Access to the virtual address is what forces a two pass algorithm.
   1485	*/
   1486	coalesced = sba_coalesce_chunks(ioc, dev, sglist, nents);
   1487	if (coalesced < 0) {
   1488		sba_unmap_sg_attrs(dev, sglist, nents, dir, attrs);
   1489		return -ENOMEM;
   1490	}
   1491
   1492	/*
   1493	** Program the I/O Pdir
   1494	**
   1495	** map the virtual addresses to the I/O Pdir
   1496	** o dma_address will contain the pdir index
   1497	** o dma_len will contain the number of bytes to map
   1498	** o address contains the virtual address.
   1499	*/
   1500	filled = sba_fill_pdir(ioc, sglist, nents);
   1501
   1502#ifdef ASSERT_PDIR_SANITY
   1503	spin_lock_irqsave(&ioc->res_lock, flags);
   1504	if (sba_check_pdir(ioc,"Check after sba_map_sg_attrs()"))
   1505	{
   1506		sba_dump_sg(ioc, sglist, nents);
   1507		panic("Check after sba_map_sg_attrs()\n");
   1508	}
   1509	spin_unlock_irqrestore(&ioc->res_lock, flags);
   1510#endif
   1511
   1512	ASSERT(coalesced == filled);
   1513	DBG_RUN_SG("%s() DONE %d mappings\n", __func__, filled);
   1514
   1515	return filled;
   1516}
   1517
   1518/**
   1519 * sba_unmap_sg_attrs - unmap Scatter/Gather list
   1520 * @dev: instance of PCI owned by the driver that's asking.
   1521 * @sglist:  array of buffer/length pairs
   1522 * @nents:  number of entries in list
   1523 * @dir:  R/W or both.
   1524 * @attrs: optional dma attributes
   1525 *
   1526 * See Documentation/core-api/dma-api-howto.rst
   1527 */
   1528static void sba_unmap_sg_attrs(struct device *dev, struct scatterlist *sglist,
   1529			       int nents, enum dma_data_direction dir,
   1530			       unsigned long attrs)
   1531{
   1532#ifdef ASSERT_PDIR_SANITY
   1533	struct ioc *ioc;
   1534	unsigned long flags;
   1535#endif
   1536
   1537	DBG_RUN_SG("%s() START %d entries,  %p,%x\n",
   1538		   __func__, nents, sba_sg_address(sglist), sglist->length);
   1539
   1540#ifdef ASSERT_PDIR_SANITY
   1541	ioc = GET_IOC(dev);
   1542	ASSERT(ioc);
   1543
   1544	spin_lock_irqsave(&ioc->res_lock, flags);
   1545	sba_check_pdir(ioc,"Check before sba_unmap_sg_attrs()");
   1546	spin_unlock_irqrestore(&ioc->res_lock, flags);
   1547#endif
   1548
   1549	while (nents && sglist->dma_length) {
   1550
   1551		sba_unmap_page(dev, sglist->dma_address, sglist->dma_length,
   1552			       dir, attrs);
   1553		sglist = sg_next(sglist);
   1554		nents--;
   1555	}
   1556
   1557	DBG_RUN_SG("%s() DONE (nents %d)\n", __func__,  nents);
   1558
   1559#ifdef ASSERT_PDIR_SANITY
   1560	spin_lock_irqsave(&ioc->res_lock, flags);
   1561	sba_check_pdir(ioc,"Check after sba_unmap_sg_attrs()");
   1562	spin_unlock_irqrestore(&ioc->res_lock, flags);
   1563#endif
   1564
   1565}
   1566
   1567/**************************************************************
   1568*
   1569*   Initialization and claim
   1570*
   1571***************************************************************/
   1572
   1573static void
   1574ioc_iova_init(struct ioc *ioc)
   1575{
   1576	int tcnfg;
   1577	int agp_found = 0;
   1578	struct pci_dev *device = NULL;
   1579#ifdef FULL_VALID_PDIR
   1580	unsigned long index;
   1581#endif
   1582
   1583	/*
   1584	** Firmware programs the base and size of a "safe IOVA space"
   1585	** (one that doesn't overlap memory or LMMIO space) in the
   1586	** IBASE and IMASK registers.
   1587	*/
   1588	ioc->ibase = READ_REG(ioc->ioc_hpa + IOC_IBASE) & ~0x1UL;
   1589	ioc->imask = READ_REG(ioc->ioc_hpa + IOC_IMASK) | 0xFFFFFFFF00000000UL;
   1590
   1591	ioc->iov_size = ~ioc->imask + 1;
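	/* e.g. an IMASK of 0xfffffffff0000000 yields a 256MB IOV space */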
   1592
   1593	DBG_INIT("%s() hpa %p IOV base 0x%lx mask 0x%lx (%dMB)\n",
   1594		__func__, ioc->ioc_hpa, ioc->ibase, ioc->imask,
   1595		ioc->iov_size >> 20);
   1596
   1597	switch (iovp_size) {
   1598		case  4*1024: tcnfg = 0; break;
   1599		case  8*1024: tcnfg = 1; break;
   1600		case 16*1024: tcnfg = 2; break;
   1601		case 64*1024: tcnfg = 3; break;
   1602		default:
   1603			panic(PFX "Unsupported IOTLB page size %ldK",
   1604				iovp_size >> 10);
   1605			break;
   1606	}
   1607	WRITE_REG(tcnfg, ioc->ioc_hpa + IOC_TCNFG);
   1608
   1609	ioc->pdir_size = (ioc->iov_size / iovp_size) * PDIR_ENTRY_SIZE;
   1610	ioc->pdir_base = (void *) __get_free_pages(GFP_KERNEL,
   1611						   get_order(ioc->pdir_size));
   1612	if (!ioc->pdir_base)
   1613		panic(PFX "Couldn't allocate I/O Page Table\n");
   1614
   1615	memset(ioc->pdir_base, 0, ioc->pdir_size);
   1616
   1617	DBG_INIT("%s() IOV page size %ldK pdir %p size %x\n", __func__,
   1618		iovp_size >> 10, ioc->pdir_base, ioc->pdir_size);
   1619
   1620	ASSERT(ALIGN((unsigned long) ioc->pdir_base, 4*1024) == (unsigned long) ioc->pdir_base);
   1621	WRITE_REG(virt_to_phys(ioc->pdir_base), ioc->ioc_hpa + IOC_PDIR_BASE);
   1622
   1623	/*
   1624	** If an AGP device is present, only use half of the IOV space
   1625	** for PCI DMA.  Unfortunately we can't know ahead of time
    1626	** whether GART support will actually be used; for now we
    1627	** just key on an AGP device found in the system.
   1628	** We program the next pdir index after we stop w/ a key for
   1629	** the GART code to handshake on.
   1630	*/
    1631	for_each_pci_dev(device)
   1632		agp_found |= pci_find_capability(device, PCI_CAP_ID_AGP);
   1633
   1634	if (agp_found && reserve_sba_gart) {
    1635		printk(KERN_INFO PFX "reserving %dMB of IOVA space at 0x%lx for agpgart\n",
   1636		      ioc->iov_size/2 >> 20, ioc->ibase + ioc->iov_size/2);
   1637		ioc->pdir_size /= 2;
   1638		((u64 *)ioc->pdir_base)[PDIR_INDEX(ioc->iov_size/2)] = ZX1_SBA_IOMMU_COOKIE;
   1639	}
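        	/*
        	** When the GART half is reserved above, the halved pdir_size also
        	** halves res_size in ioc_resource_init(), so the upper half of the
        	** IOVA window is never handed out for regular DMA mappings.
        	*/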
   1640#ifdef FULL_VALID_PDIR
   1641	/*
    1642	** Check to see if the spill page has already been allocated;
    1643	** we don't need more than one across multiple SBAs.
   1644	*/
   1645	if (!prefetch_spill_page) {
   1646		char *spill_poison = "SBAIOMMU POISON";
   1647		int poison_size = 16;
   1648		void *poison_addr, *addr;
   1649
   1650		addr = (void *)__get_free_pages(GFP_KERNEL, get_order(iovp_size));
   1651		if (!addr)
   1652			panic(PFX "Couldn't allocate PDIR spill page\n");
   1653
   1654		poison_addr = addr;
    1655		for ( ; (u64) poison_addr < (u64) addr + iovp_size; poison_addr += poison_size)
   1656			memcpy(poison_addr, spill_poison, poison_size);
   1657
   1658		prefetch_spill_page = virt_to_phys(addr);
   1659
   1660		DBG_INIT("%s() prefetch spill addr: 0x%lx\n", __func__, prefetch_spill_page);
   1661	}
   1662	/*
    1663	** Set all the PDIR entries valid, with the spill page as the target.
   1664	*/
   1665	for (index = 0 ; index < (ioc->pdir_size / PDIR_ENTRY_SIZE) ; index++)
   1666		((u64 *)ioc->pdir_base)[index] = (0x80000000000000FF | prefetch_spill_page);
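        	/*
        	** 0x8000000000000000 is the pdir "valid" bit; the low 0xFF appears
        	** to carry the same attribute bits used for real mappings, so a
        	** stray prefetch lands harmlessly on the poisoned spill page.
        	*/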
   1667#endif
   1668
   1669	/* Clear I/O TLB of any possible entries */
   1670	WRITE_REG(ioc->ibase | (get_iovp_order(ioc->iov_size) + iovp_shift), ioc->ioc_hpa + IOC_PCOM);
   1671	READ_REG(ioc->ioc_hpa + IOC_PCOM);
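        	/*
        	** The read back flushes the posted write, making sure the purge
        	** has reached the IOC before translation is enabled below.
        	*/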
   1672
   1673	/* Enable IOVA translation */
   1674	WRITE_REG(ioc->ibase | 1, ioc->ioc_hpa + IOC_IBASE);
   1675	READ_REG(ioc->ioc_hpa + IOC_IBASE);
   1676}
   1677
   1678static void __init
   1679ioc_resource_init(struct ioc *ioc)
   1680{
   1681	spin_lock_init(&ioc->res_lock);
   1682#if DELAYED_RESOURCE_CNT > 0
   1683	spin_lock_init(&ioc->saved_lock);
   1684#endif
   1685
   1686	/* resource map size dictated by pdir_size */
   1687	ioc->res_size = ioc->pdir_size / PDIR_ENTRY_SIZE; /* entries */
   1688	ioc->res_size >>= 3;  /* convert bit count to byte count */
   1689	DBG_INIT("%s() res_size 0x%x\n", __func__, ioc->res_size);
   1690
   1691	ioc->res_map = (char *) __get_free_pages(GFP_KERNEL,
   1692						 get_order(ioc->res_size));
   1693	if (!ioc->res_map)
   1694		panic(PFX "Couldn't allocate resource map\n");
   1695
   1696	memset(ioc->res_map, 0, ioc->res_size);
   1697	/* next available IOVP - circular search */
   1698	ioc->res_hint = (unsigned long *) ioc->res_map;
   1699
   1700#ifdef ASSERT_PDIR_SANITY
   1701	/* Mark first bit busy - ie no IOVA 0 */
   1702	ioc->res_map[0] = 0x1;
   1703	ioc->pdir_base[0] = 0x8000000000000000ULL | ZX1_SBA_IOMMU_COOKIE;
   1704#endif
   1705#ifdef FULL_VALID_PDIR
   1706	/* Mark the last resource used so we don't prefetch beyond IOVA space */
   1707	ioc->res_map[ioc->res_size - 1] |= 0x80UL; /* res_map is chars */
   1708	ioc->pdir_base[(ioc->pdir_size / PDIR_ENTRY_SIZE) - 1] = (0x80000000000000FF
   1709							      | prefetch_spill_page);
   1710#endif
   1711
   1712	DBG_INIT("%s() res_map %x %p\n", __func__,
   1713		 ioc->res_size, (void *) ioc->res_map);
   1714}
   1715
   1716static void __init
   1717ioc_sac_init(struct ioc *ioc)
   1718{
   1719	struct pci_dev *sac = NULL;
   1720	struct pci_controller *controller = NULL;
   1721
   1722	/*
   1723	 * pci_alloc_coherent() must return a DMA address which is
   1724	 * SAC (single address cycle) addressable, so allocate a
   1725	 * pseudo-device to enforce that.
   1726	 */
   1727	sac = kzalloc(sizeof(*sac), GFP_KERNEL);
   1728	if (!sac)
   1729		panic(PFX "Couldn't allocate struct pci_dev");
   1730
   1731	controller = kzalloc(sizeof(*controller), GFP_KERNEL);
   1732	if (!controller)
   1733		panic(PFX "Couldn't allocate struct pci_controller");
   1734
   1735	controller->iommu = ioc;
   1736	sac->sysdata = controller;
   1737	sac->dma_mask = 0xFFFFFFFFUL;
   1738	sac->dev.bus = &pci_bus_type;
   1739	ioc->sac_only_dev = sac;
   1740}
   1741
   1742static void __init
   1743ioc_zx1_init(struct ioc *ioc)
   1744{
   1745	unsigned long rope_config;
   1746	unsigned int i;
   1747
   1748	if (ioc->rev < 0x20)
   1749		panic(PFX "IOC 2.0 or later required for IOMMU support\n");
   1750
   1751	/* 38 bit memory controller + extra bit for range displaced by MMIO */
   1752	ioc->dma_mask = (0x1UL << 39) - 1;
   1753
   1754	/*
   1755	** Clear ROPE(N)_CONFIG AO bit.
   1756	** Disables "NT Ordering" (~= !"Relaxed Ordering")
   1757	** Overrides bit 1 in DMA Hint Sets.
   1758	** Improves netperf UDP_STREAM by ~10% for tg3 on bcm5701.
   1759	*/
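        	/* Eight ropes; each ROPE(N)_CONFIG register is 8 bytes past the previous one. */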
    1760	for (i = 0; i < (8*8); i += 8) {
   1761		rope_config = READ_REG(ioc->ioc_hpa + IOC_ROPE0_CFG + i);
   1762		rope_config &= ~IOC_ROPE_AO;
   1763		WRITE_REG(rope_config, ioc->ioc_hpa + IOC_ROPE0_CFG + i);
   1764	}
   1765}
   1766
   1767typedef void (initfunc)(struct ioc *);
   1768
   1769struct ioc_iommu {
   1770	u32 func_id;
   1771	char *name;
   1772	initfunc *init;
   1773};
   1774
   1775static struct ioc_iommu ioc_iommu_info[] __initdata = {
   1776	{ ZX1_IOC_ID, "zx1", ioc_zx1_init },
   1777	{ ZX2_IOC_ID, "zx2", NULL },
   1778	{ SX1000_IOC_ID, "sx1000", NULL },
   1779	{ SX2000_IOC_ID, "sx2000", NULL },
   1780};
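        /*
        ** Controllers whose FUNC_ID is not listed above are still brought up;
        ** ioc_init() just labels them "Unknown (xxxx:xxxx)" and applies no
        ** per-chip quirks.
        */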
   1781
   1782static void __init ioc_init(unsigned long hpa, struct ioc *ioc)
   1783{
   1784	struct ioc_iommu *info;
   1785
   1786	ioc->next = ioc_list;
   1787	ioc_list = ioc;
   1788
   1789	ioc->ioc_hpa = ioremap(hpa, 0x1000);
   1790
   1791	ioc->func_id = READ_REG(ioc->ioc_hpa + IOC_FUNC_ID);
   1792	ioc->rev = READ_REG(ioc->ioc_hpa + IOC_FCLASS) & 0xFFUL;
   1793	ioc->dma_mask = 0xFFFFFFFFFFFFFFFFUL;	/* conservative */
   1794
   1795	for (info = ioc_iommu_info; info < ioc_iommu_info + ARRAY_SIZE(ioc_iommu_info); info++) {
   1796		if (ioc->func_id == info->func_id) {
   1797			ioc->name = info->name;
   1798			if (info->init)
   1799				(info->init)(ioc);
   1800		}
   1801	}
   1802
   1803	iovp_size = (1 << iovp_shift);
   1804	iovp_mask = ~(iovp_size - 1);
   1805
   1806	DBG_INIT("%s: PAGE_SIZE %ldK, iovp_size %ldK\n", __func__,
   1807		PAGE_SIZE >> 10, iovp_size >> 10);
   1808
   1809	if (!ioc->name) {
   1810		ioc->name = kmalloc(24, GFP_KERNEL);
   1811		if (ioc->name)
   1812			sprintf((char *) ioc->name, "Unknown (%04x:%04x)",
   1813				ioc->func_id & 0xFFFF, (ioc->func_id >> 16) & 0xFFFF);
   1814		else
   1815			ioc->name = "Unknown";
   1816	}
   1817
   1818	ioc_iova_init(ioc);
   1819	ioc_resource_init(ioc);
   1820	ioc_sac_init(ioc);
   1821
   1822	printk(KERN_INFO PFX
    1823		"%s %d.%d HPA 0x%lx IOVA space %dMB at 0x%lx\n",
   1824		ioc->name, (ioc->rev >> 4) & 0xF, ioc->rev & 0xF,
   1825		hpa, ioc->iov_size >> 20, ioc->ibase);
   1826}
   1827
   1828
   1829
   1830/**************************************************************************
   1831**
   1832**   SBA initialization code (HW and SW)
   1833**
   1834**   o identify SBA chip itself
   1835**   o FIXME: initialize DMA hints for reasonable defaults
   1836**
   1837**************************************************************************/
   1838
   1839#ifdef CONFIG_PROC_FS
   1840static void *
   1841ioc_start(struct seq_file *s, loff_t *pos)
   1842{
   1843	struct ioc *ioc;
   1844	loff_t n = *pos;
   1845
   1846	for (ioc = ioc_list; ioc; ioc = ioc->next)
   1847		if (!n--)
   1848			return ioc;
   1849
   1850	return NULL;
   1851}
   1852
   1853static void *
   1854ioc_next(struct seq_file *s, void *v, loff_t *pos)
   1855{
   1856	struct ioc *ioc = v;
   1857
   1858	++*pos;
   1859	return ioc->next;
   1860}
   1861
   1862static void
   1863ioc_stop(struct seq_file *s, void *v)
   1864{
   1865}
   1866
   1867static int
   1868ioc_show(struct seq_file *s, void *v)
   1869{
   1870	struct ioc *ioc = v;
   1871	unsigned long *res_ptr = (unsigned long *)ioc->res_map;
   1872	int i, used = 0;
   1873
   1874	seq_printf(s, "Hewlett Packard %s IOC rev %d.%d\n",
   1875		ioc->name, ((ioc->rev >> 4) & 0xF), (ioc->rev & 0xF));
   1876#ifdef CONFIG_NUMA
   1877	if (ioc->node != NUMA_NO_NODE)
   1878		seq_printf(s, "NUMA node       : %d\n", ioc->node);
   1879#endif
   1880	seq_printf(s, "IOVA size       : %ld MB\n", ((ioc->pdir_size >> 3) * iovp_size)/(1024*1024));
    1881	seq_printf(s, "IOVA page size  : %ld KB\n", iovp_size/1024);
   1882
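        	/* Each set bit in the resource map marks one allocated IOVA page. */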
   1883	for (i = 0; i < (ioc->res_size / sizeof(unsigned long)); ++i, ++res_ptr)
   1884		used += hweight64(*res_ptr);
   1885
   1886	seq_printf(s, "PDIR size       : %d entries\n", ioc->pdir_size >> 3);
   1887	seq_printf(s, "PDIR used       : %d entries\n", used);
   1888
   1889#ifdef PDIR_SEARCH_TIMING
   1890	{
   1891		unsigned long i = 0, avg = 0, min, max;
   1892		min = max = ioc->avg_search[0];
   1893		for (i = 0; i < SBA_SEARCH_SAMPLE; i++) {
   1894			avg += ioc->avg_search[i];
   1895			if (ioc->avg_search[i] > max) max = ioc->avg_search[i];
   1896			if (ioc->avg_search[i] < min) min = ioc->avg_search[i];
   1897		}
   1898		avg /= SBA_SEARCH_SAMPLE;
   1899		seq_printf(s, "Bitmap search   : %ld/%ld/%ld (min/avg/max CPU Cycles/IOVA page)\n",
   1900		           min, avg, max);
   1901	}
   1902#endif
   1903#ifndef ALLOW_IOV_BYPASS
    1904	seq_printf(s, "IOVA bypass disabled\n");
   1905#endif
   1906	return 0;
   1907}
   1908
   1909static const struct seq_operations ioc_seq_ops = {
   1910	.start = ioc_start,
   1911	.next  = ioc_next,
   1912	.stop  = ioc_stop,
   1913	.show  = ioc_show
   1914};
   1915
   1916static void __init
   1917ioc_proc_init(void)
   1918{
   1919	struct proc_dir_entry *dir;
   1920
   1921	dir = proc_mkdir("bus/mckinley", NULL);
   1922	if (!dir)
   1923		return;
   1924
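        	/*
        	** The proc entry is named after the first IOC on the list, but the
        	** seq_ops above walk every IOC in ioc_list.
        	*/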
   1925	proc_create_seq(ioc_list->name, 0, dir, &ioc_seq_ops);
   1926}
   1927#endif
   1928
   1929static void
   1930sba_connect_bus(struct pci_bus *bus)
   1931{
   1932	acpi_handle handle, parent;
   1933	acpi_status status;
   1934	struct ioc *ioc;
   1935
   1936	if (!PCI_CONTROLLER(bus))
   1937		panic(PFX "no sysdata on bus %d!\n", bus->number);
   1938
   1939	if (PCI_CONTROLLER(bus)->iommu)
   1940		return;
   1941
   1942	handle = acpi_device_handle(PCI_CONTROLLER(bus)->companion);
   1943	if (!handle)
   1944		return;
   1945
   1946	/*
   1947	 * The IOC scope encloses PCI root bridges in the ACPI
   1948	 * namespace, so work our way out until we find an IOC we
   1949	 * claimed previously.
   1950	 */
   1951	do {
   1952		for (ioc = ioc_list; ioc; ioc = ioc->next)
   1953			if (ioc->handle == handle) {
   1954				PCI_CONTROLLER(bus)->iommu = ioc;
   1955				return;
   1956			}
   1957
   1958		status = acpi_get_parent(handle, &parent);
   1959		handle = parent;
   1960	} while (ACPI_SUCCESS(status));
   1961
   1962	printk(KERN_WARNING "No IOC for PCI Bus %04x:%02x in ACPI\n", pci_domain_nr(bus), bus->number);
   1963}
   1964
   1965static void __init
   1966sba_map_ioc_to_node(struct ioc *ioc, acpi_handle handle)
   1967{
   1968#ifdef CONFIG_NUMA
   1969	unsigned int node;
   1970
   1971	node = acpi_get_node(handle);
   1972	if (node != NUMA_NO_NODE && !node_online(node))
   1973		node = NUMA_NO_NODE;
   1974
   1975	ioc->node = node;
   1976#endif
   1977}
   1978
   1979static void __init acpi_sba_ioc_add(struct ioc *ioc)
   1980{
   1981	acpi_handle handle = ioc->handle;
   1982	acpi_status status;
   1983	u64 hpa, length;
   1984	struct acpi_device_info *adi;
   1985
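        	/*
        	** Unlink this IOC from the ioc_found list; it is either fully
        	** initialized below or freed on the error path.
        	*/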
   1986	ioc_found = ioc->next;
   1987	status = hp_acpi_csr_space(handle, &hpa, &length);
   1988	if (ACPI_FAILURE(status))
   1989		goto err;
   1990
   1991	status = acpi_get_object_info(handle, &adi);
   1992	if (ACPI_FAILURE(status))
   1993		goto err;
   1994
   1995	/*
   1996	 * For HWP0001, only SBA appears in ACPI namespace.  It encloses the PCI
   1997	 * root bridges, and its CSR space includes the IOC function.
   1998	 */
   1999	if (strncmp("HWP0001", adi->hardware_id.string, 7) == 0) {
   2000		hpa += ZX1_IOC_OFFSET;
   2001		/* zx1 based systems default to kernel page size iommu pages */
   2002		if (!iovp_shift)
   2003			iovp_shift = min(PAGE_SHIFT, 16);
   2004	}
   2005	kfree(adi);
   2006
   2007	/*
   2008	 * default anything not caught above or specified on cmdline to 4k
   2009	 * iommu page size
   2010	 */
   2011	if (!iovp_shift)
   2012		iovp_shift = 12;
   2013
   2014	ioc_init(hpa, ioc);
   2015	/* setup NUMA node association */
   2016	sba_map_ioc_to_node(ioc, handle);
   2017	return;
   2018
   2019 err:
   2020	kfree(ioc);
   2021}
   2022
   2023static const struct acpi_device_id hp_ioc_iommu_device_ids[] = {
   2024	{"HWP0001", 0},
   2025	{"HWP0004", 0},
   2026	{"", 0},
   2027};
   2028
   2029static int acpi_sba_ioc_attach(struct acpi_device *device,
   2030			       const struct acpi_device_id *not_used)
   2031{
   2032	struct ioc *ioc;
   2033
   2034	ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
   2035	if (!ioc)
   2036		return -ENOMEM;
   2037
   2038	ioc->next = ioc_found;
   2039	ioc_found = ioc;
   2040	ioc->handle = device->handle;
   2041	return 1;
   2042}
   2043
   2044
   2045static struct acpi_scan_handler acpi_sba_ioc_handler = {
   2046	.ids	= hp_ioc_iommu_device_ids,
   2047	.attach	= acpi_sba_ioc_attach,
   2048};
   2049
   2050static int __init acpi_sba_ioc_init_acpi(void)
   2051{
   2052	return acpi_scan_add_handler(&acpi_sba_ioc_handler);
   2053}
   2054/* This has to run before acpi_scan_init(). */
   2055arch_initcall(acpi_sba_ioc_init_acpi);
   2056
    2057static int sba_dma_supported(struct device *dev, u64 mask)
   2058{
   2059	/* make sure it's at least 32bit capable */
   2060	return ((mask & 0xFFFFFFFFUL) == 0xFFFFFFFFUL);
   2061}
   2062
   2063static const struct dma_map_ops sba_dma_ops = {
   2064	.alloc			= sba_alloc_coherent,
   2065	.free			= sba_free_coherent,
   2066	.map_page		= sba_map_page,
   2067	.unmap_page		= sba_unmap_page,
   2068	.map_sg			= sba_map_sg_attrs,
   2069	.unmap_sg		= sba_unmap_sg_attrs,
   2070	.dma_supported		= sba_dma_supported,
   2071	.mmap			= dma_common_mmap,
   2072	.get_sgtable		= dma_common_get_sgtable,
   2073	.alloc_pages		= dma_common_alloc_pages,
   2074	.free_pages		= dma_common_free_pages,
   2075};
   2076
   2077static int __init
   2078sba_init(void)
   2079{
   2080	/*
   2081	 * If we are booting a kdump kernel, the sba_iommu will cause devices
   2082	 * that were not shutdown properly to MCA as soon as they are turned
   2083	 * back on.  Our only option for a successful kdump kernel boot is to
   2084	 * use swiotlb.
   2085	 */
   2086	if (is_kdump_kernel())
   2087		return 0;
   2088
   2089	/*
   2090	 * ioc_found should be populated by the acpi_sba_ioc_handler's .attach()
   2091	 * routine, but that only happens if acpi_scan_init() has already run.
   2092	 */
   2093	while (ioc_found)
   2094		acpi_sba_ioc_add(ioc_found);
   2095
   2096	if (!ioc_list)
   2097		return 0;
   2098
   2099	{
   2100		struct pci_bus *b = NULL;
   2101		while ((b = pci_find_next_bus(b)) != NULL)
   2102			sba_connect_bus(b);
   2103	}
   2104
   2105	/* no need for swiotlb with the iommu */
   2106	swiotlb_exit();
   2107	dma_ops = &sba_dma_ops;
   2108
   2109#ifdef CONFIG_PROC_FS
   2110	ioc_proc_init();
   2111#endif
   2112	return 0;
   2113}
   2114
   2115subsys_initcall(sba_init); /* must be initialized after ACPI etc., but before any drivers... */
   2116
   2117static int __init
   2118nosbagart(char *str)
   2119{
   2120	reserve_sba_gart = 0;
   2121	return 1;
   2122}
   2123
   2124__setup("nosbagart", nosbagart);
   2125
   2126static int __init
   2127sba_page_override(char *str)
   2128{
   2129	unsigned long page_size;
   2130
   2131	page_size = memparse(str, &str);
   2132	switch (page_size) {
   2133		case 4096:
   2134		case 8192:
   2135		case 16384:
   2136		case 65536:
   2137			iovp_shift = ffs(page_size) - 1;
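        			/*
        			** For a power of two, ffs() returns log2() + 1, so the
        			** subtraction above yields the page shift (12, 13, 14 or 16).
        			*/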
   2138			break;
   2139		default:
    2140			printk(KERN_ERR "%s: unknown/unsupported iommu page size %ld\n",
   2141			       __func__, page_size);
   2142	}
   2143
   2144	return 1;
   2145}
   2146
    2147	__setup("sbapagesize=", sba_page_override);