cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

iommu.c (45825B)


// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation
 *
 * Rewrite, cleanup:
 *
 * Copyright (C) 2004 Olof Johansson <olof@lixom.net>, IBM Corporation
 * Copyright (C) 2006 Olof Johansson <olof@lixom.net>
 *
 * Dynamic DMA mapping support, pSeries-specific parts, both SMP and LPAR.
 */

#include <linux/init.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/memblock.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/pci.h>
#include <linux/dma-mapping.h>
#include <linux/crash_dump.h>
#include <linux/memory.h>
#include <linux/of.h>
#include <linux/iommu.h>
#include <linux/rculist.h>
#include <asm/io.h>
#include <asm/prom.h>
#include <asm/rtas.h>
#include <asm/iommu.h>
#include <asm/pci-bridge.h>
#include <asm/machdep.h>
#include <asm/firmware.h>
#include <asm/tce.h>
#include <asm/ppc-pci.h>
#include <asm/udbg.h>
#include <asm/mmzone.h>
#include <asm/plpar_wrappers.h>

#include "pseries.h"

enum {
	DDW_QUERY_PE_DMA_WIN  = 0,
	DDW_CREATE_PE_DMA_WIN = 1,
	DDW_REMOVE_PE_DMA_WIN = 2,

	DDW_APPLICABLE_SIZE
};

enum {
	DDW_EXT_SIZE = 0,
	DDW_EXT_RESET_DMA_WIN = 1,
	DDW_EXT_QUERY_OUT_SIZE = 2
};

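/*
 * Allocate an iommu_table on the given NUMA node and initialize its
 * group list and refcount.
 */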
static struct iommu_table *iommu_pseries_alloc_table(int node)
{
	struct iommu_table *tbl;

	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node);
	if (!tbl)
		return NULL;

	INIT_LIST_HEAD_RCU(&tbl->it_group_list);
	kref_init(&tbl->it_kref);
	return tbl;
}

static struct iommu_table_group *iommu_pseries_alloc_group(int node)
{
	struct iommu_table_group *table_group;

	table_group = kzalloc_node(sizeof(*table_group), GFP_KERNEL, node);
	if (!table_group)
		return NULL;

	table_group->tables[0] = iommu_pseries_alloc_table(node);
	if (table_group->tables[0])
		return table_group;

	kfree(table_group);
	return NULL;
}

static void iommu_pseries_free_group(struct iommu_table_group *table_group,
		const char *node_name)
{
	struct iommu_table *tbl;

	if (!table_group)
		return;

	tbl = table_group->tables[0];
#ifdef CONFIG_IOMMU_API
	if (table_group->group) {
		iommu_group_put(table_group->group);
		BUG_ON(table_group->group);
	}
#endif
	iommu_tce_table_put(tbl);

	kfree(table_group);
}

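/*
 * Build TCEs for a non-LPAR (bare metal) system by writing them
 * directly into the TCE table pointed to by tbl->it_base.
 */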
static int tce_build_pSeries(struct iommu_table *tbl, long index,
			      long npages, unsigned long uaddr,
			      enum dma_data_direction direction,
			      unsigned long attrs)
{
	u64 proto_tce;
	__be64 *tcep;
	u64 rpn;
	const unsigned long tceshift = tbl->it_page_shift;
	const unsigned long pagesize = IOMMU_PAGE_SIZE(tbl);

	proto_tce = TCE_PCI_READ; // Read allowed

	if (direction != DMA_TO_DEVICE)
		proto_tce |= TCE_PCI_WRITE;

	tcep = ((__be64 *)tbl->it_base) + index;

	while (npages--) {
		/* can't move this out since we might cross MEMBLOCK boundary */
		rpn = __pa(uaddr) >> tceshift;
		*tcep = cpu_to_be64(proto_tce | rpn << tceshift);

		uaddr += pagesize;
		tcep++;
	}
	return 0;
}


static void tce_free_pSeries(struct iommu_table *tbl, long index, long npages)
{
	__be64 *tcep;

	tcep = ((__be64 *)tbl->it_base) + index;

	while (npages--)
		*(tcep++) = 0;
}

static unsigned long tce_get_pseries(struct iommu_table *tbl, long index)
{
	__be64 *tcep;

	tcep = ((__be64 *)tbl->it_base) + index;

	return be64_to_cpu(*tcep);
}

static void tce_free_pSeriesLP(unsigned long liobn, long, long, long);
static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long);

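/*
 * Build TCEs one at a time through the H_PUT_TCE hypervisor call
 * (plpar_tce_put). On H_NOT_ENOUGH_RESOURCES the partially built
 * range is freed again before returning the error.
 */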
static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,
				long npages, unsigned long uaddr,
				enum dma_data_direction direction,
				unsigned long attrs)
{
	u64 rc = 0;
	u64 proto_tce, tce;
	u64 rpn;
	int ret = 0;
	long tcenum_start = tcenum, npages_start = npages;

	rpn = __pa(uaddr) >> tceshift;
	proto_tce = TCE_PCI_READ;
	if (direction != DMA_TO_DEVICE)
		proto_tce |= TCE_PCI_WRITE;

	while (npages--) {
		tce = proto_tce | rpn << tceshift;
		rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, tce);

		if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
			ret = (int)rc;
			tce_free_pSeriesLP(liobn, tcenum_start, tceshift,
			                   (npages_start - (npages + 1)));
			break;
		}

		if (rc && printk_ratelimit()) {
			printk("tce_build_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
			printk("\tindex   = 0x%llx\n", (u64)liobn);
			printk("\ttcenum  = 0x%llx\n", (u64)tcenum);
			printk("\ttce val = 0x%llx\n", tce);
			dump_stack();
		}

		tcenum++;
		rpn++;
	}
	return ret;
}

static DEFINE_PER_CPU(__be64 *, tce_page);

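/*
 * Build TCEs in batches: up to one page of TCEs is staged in a per-CPU
 * buffer (tce_page) and handed to the hypervisor in a single
 * H_PUT_TCE_INDIRECT call (plpar_tce_put_indirect). Falls back to
 * tce_build_pSeriesLP for single pages or when the firmware does not
 * support the indirect call.
 */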
static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
				     long npages, unsigned long uaddr,
				     enum dma_data_direction direction,
				     unsigned long attrs)
{
	u64 rc = 0;
	u64 proto_tce;
	__be64 *tcep;
	u64 rpn;
	long l, limit;
	long tcenum_start = tcenum, npages_start = npages;
	int ret = 0;
	unsigned long flags;
	const unsigned long tceshift = tbl->it_page_shift;

	if ((npages == 1) || !firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) {
		return tce_build_pSeriesLP(tbl->it_index, tcenum,
					   tceshift, npages, uaddr,
		                           direction, attrs);
	}

	local_irq_save(flags);	/* to protect tcep and the page behind it */

	tcep = __this_cpu_read(tce_page);

	/* This is safe to do since interrupts are off when we're called
	 * from iommu_alloc{,_sg}()
	 */
	if (!tcep) {
		tcep = (__be64 *)__get_free_page(GFP_ATOMIC);
		/* If allocation fails, fall back to the loop implementation */
		if (!tcep) {
			local_irq_restore(flags);
			return tce_build_pSeriesLP(tbl->it_index, tcenum,
					tceshift,
					npages, uaddr, direction, attrs);
		}
		__this_cpu_write(tce_page, tcep);
	}

	rpn = __pa(uaddr) >> tceshift;
	proto_tce = TCE_PCI_READ;
	if (direction != DMA_TO_DEVICE)
		proto_tce |= TCE_PCI_WRITE;

	/* We can map max one pageful of TCEs at a time */
	do {
		/*
		 * Set up the page with TCE data, looping through and setting
		 * the values.
		 */
		limit = min_t(long, npages, 4096/TCE_ENTRY_SIZE);

		for (l = 0; l < limit; l++) {
			tcep[l] = cpu_to_be64(proto_tce | rpn << tceshift);
			rpn++;
		}

		rc = plpar_tce_put_indirect((u64)tbl->it_index,
					    (u64)tcenum << tceshift,
					    (u64)__pa(tcep),
					    limit);

		npages -= limit;
		tcenum += limit;
	} while (npages > 0 && !rc);

	local_irq_restore(flags);

	if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
		ret = (int)rc;
		tce_freemulti_pSeriesLP(tbl, tcenum_start,
		                        (npages_start - (npages + limit)));
		return ret;
	}

	if (rc && printk_ratelimit()) {
		printk("tce_buildmulti_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
		printk("\tindex   = 0x%llx\n", (u64)tbl->it_index);
		printk("\tnpages  = 0x%llx\n", (u64)npages);
		printk("\ttce[0] val = 0x%llx\n", tcep[0]);
		dump_stack();
	}
	return ret;
}

static void tce_free_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,
			       long npages)
{
	u64 rc;

	while (npages--) {
		rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, 0);

		if (rc && printk_ratelimit()) {
			printk("tce_free_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
			printk("\tindex   = 0x%llx\n", (u64)liobn);
			printk("\ttcenum  = 0x%llx\n", (u64)tcenum);
			dump_stack();
		}

		tcenum++;
	}
}


static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)
{
	u64 rc;

	if (!firmware_has_feature(FW_FEATURE_STUFF_TCE))
		return tce_free_pSeriesLP(tbl->it_index, tcenum,
					  tbl->it_page_shift, npages);

	rc = plpar_tce_stuff((u64)tbl->it_index,
			     (u64)tcenum << tbl->it_page_shift, 0, npages);

	if (rc && printk_ratelimit()) {
		printk("tce_freemulti_pSeriesLP: plpar_tce_stuff failed\n");
		printk("\trc      = %lld\n", rc);
		printk("\tindex   = 0x%llx\n", (u64)tbl->it_index);
		printk("\tnpages  = 0x%llx\n", (u64)npages);
		dump_stack();
	}
}

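/* Read a single TCE back through the H_GET_TCE hypervisor call (plpar_tce_get). */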
static unsigned long tce_get_pSeriesLP(struct iommu_table *tbl, long tcenum)
{
	u64 rc;
	unsigned long tce_ret;

	rc = plpar_tce_get((u64)tbl->it_index,
			   (u64)tcenum << tbl->it_page_shift, &tce_ret);

	if (rc && printk_ratelimit()) {
		printk("tce_get_pSeriesLP: plpar_tce_get failed. rc=%lld\n", rc);
		printk("\tindex   = 0x%llx\n", (u64)tbl->it_index);
		printk("\ttcenum  = 0x%llx\n", (u64)tcenum);
		dump_stack();
	}

	return tce_ret;
}

/* this is compatible with cells for the device tree property */
struct dynamic_dma_window_prop {
	__be32	liobn;		/* tce table number */
	__be64	dma_base;	/* address hi,lo */
	__be32	tce_shift;	/* ilog2(tce_page_size) */
	__be32	window_shift;	/* ilog2(tce_window_size) */
};

struct dma_win {
	struct device_node *device;
	const struct dynamic_dma_window_prop *prop;
	struct list_head list;
};

/* Dynamic DMA Window support */
struct ddw_query_response {
	u32 windows_available;
	u64 largest_available_block;
	u32 page_size;
	u32 migration_capable;
};

struct ddw_create_response {
	u32 liobn;
	u32 addr_hi;
	u32 addr_lo;
};

static LIST_HEAD(dma_win_list);
/* prevents races between memory on/offline and window creation */
static DEFINE_SPINLOCK(dma_win_list_lock);
/* protects initializing window twice for same device */
static DEFINE_MUTEX(dma_win_init_mutex);
#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
#define DMA64_PROPNAME "linux,dma64-ddr-window-info"

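/*
 * Clear the TCEs covering a pfn range of a dynamic DMA window,
 * stuffing up to 512 zeroed entries per hypervisor call.
 */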
static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
					unsigned long num_pfn, const void *arg)
{
	const struct dynamic_dma_window_prop *maprange = arg;
	int rc;
	u64 tce_size, num_tce, dma_offset, next;
	u32 tce_shift;
	long limit;

	tce_shift = be32_to_cpu(maprange->tce_shift);
	tce_size = 1ULL << tce_shift;
	next = start_pfn << PAGE_SHIFT;
	num_tce = num_pfn << PAGE_SHIFT;

	/* round back to the beginning of the tce page size */
	num_tce += next & (tce_size - 1);
	next &= ~(tce_size - 1);

	/* convert to number of tces */
	num_tce |= tce_size - 1;
	num_tce >>= tce_shift;

	do {
		/* Clear up to 512 TCEs per hypervisor call */
		limit = min_t(long, num_tce, 512);
		dma_offset = next + be64_to_cpu(maprange->dma_base);

		rc = plpar_tce_stuff((u64)be32_to_cpu(maprange->liobn),
					     dma_offset,
					     0, limit);
		next += limit * tce_size;
		num_tce -= limit;
	} while (num_tce > 0 && !rc);

	return rc;
}

static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn,
					unsigned long num_pfn, const void *arg)
{
	const struct dynamic_dma_window_prop *maprange = arg;
	u64 tce_size, num_tce, dma_offset, next, proto_tce, liobn;
	__be64 *tcep;
	u32 tce_shift;
	u64 rc = 0;
	long l, limit;

	if (!firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) {
		unsigned long tceshift = be32_to_cpu(maprange->tce_shift);
		unsigned long dmastart = (start_pfn << PAGE_SHIFT) +
				be64_to_cpu(maprange->dma_base);
		unsigned long tcenum = dmastart >> tceshift;
		unsigned long npages = num_pfn << PAGE_SHIFT >> tceshift;
		void *uaddr = __va(start_pfn << PAGE_SHIFT);

		return tce_build_pSeriesLP(be32_to_cpu(maprange->liobn),
				tcenum, tceshift, npages, (unsigned long) uaddr,
				DMA_BIDIRECTIONAL, 0);
	}

	local_irq_disable();	/* to protect tcep and the page behind it */
	tcep = __this_cpu_read(tce_page);

	if (!tcep) {
		tcep = (__be64 *)__get_free_page(GFP_ATOMIC);
		if (!tcep) {
			local_irq_enable();
			return -ENOMEM;
		}
		__this_cpu_write(tce_page, tcep);
	}

	proto_tce = TCE_PCI_READ | TCE_PCI_WRITE;

	liobn = (u64)be32_to_cpu(maprange->liobn);
	tce_shift = be32_to_cpu(maprange->tce_shift);
	tce_size = 1ULL << tce_shift;
	next = start_pfn << PAGE_SHIFT;
	num_tce = num_pfn << PAGE_SHIFT;

	/* round back to the beginning of the tce page size */
	num_tce += next & (tce_size - 1);
	next &= ~(tce_size - 1);

	/* convert to number of tces */
	num_tce |= tce_size - 1;
	num_tce >>= tce_shift;

	/* We can map max one pageful of TCEs at a time */
	do {
		/*
		 * Set up the page with TCE data, looping through and setting
		 * the values.
		 */
		limit = min_t(long, num_tce, 4096/TCE_ENTRY_SIZE);
		dma_offset = next + be64_to_cpu(maprange->dma_base);

		for (l = 0; l < limit; l++) {
			tcep[l] = cpu_to_be64(proto_tce | next);
			next += tce_size;
		}

		rc = plpar_tce_put_indirect(liobn,
					    dma_offset,
					    (u64)__pa(tcep),
					    limit);

		num_tce -= limit;
	} while (num_tce > 0 && !rc);

	/* error cleanup: caller will clear whole range */

	local_irq_enable();
	return rc;
}

static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn,
		unsigned long num_pfn, void *arg)
{
	return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg);
}

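/* Fill in the iommu_table fields common to all pSeries table variants. */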
static void iommu_table_setparms_common(struct iommu_table *tbl, unsigned long busno,
					unsigned long liobn, unsigned long win_addr,
					unsigned long window_size, unsigned long page_shift,
					void *base, struct iommu_table_ops *table_ops)
{
	tbl->it_busno = busno;
	tbl->it_index = liobn;
	tbl->it_offset = win_addr >> page_shift;
	tbl->it_size = window_size >> page_shift;
	tbl->it_page_shift = page_shift;
	tbl->it_base = (unsigned long)base;
	tbl->it_blocksize = 16;
	tbl->it_type = TCE_PCI;
	tbl->it_ops = table_ops;
}

struct iommu_table_ops iommu_table_pseries_ops;

static void iommu_table_setparms(struct pci_controller *phb,
				 struct device_node *dn,
				 struct iommu_table *tbl)
{
	struct device_node *node;
	const unsigned long *basep;
	const u32 *sizep;

	/* Test if we are going over 2GB of DMA space */
	if (phb->dma_window_base_cur + phb->dma_window_size > SZ_2G) {
		udbg_printf("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
		panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
	}

	node = phb->dn;
	basep = of_get_property(node, "linux,tce-base", NULL);
	sizep = of_get_property(node, "linux,tce-size", NULL);
	if (basep == NULL || sizep == NULL) {
		printk(KERN_ERR "PCI_DMA: iommu_table_setparms: %pOF has "
				"missing tce entries !\n", dn);
		return;
	}

	iommu_table_setparms_common(tbl, phb->bus->number, 0, phb->dma_window_base_cur,
				    phb->dma_window_size, IOMMU_PAGE_SHIFT_4K,
				    __va(*basep), &iommu_table_pseries_ops);

	if (!is_kdump_kernel())
		memset((void *)tbl->it_base, 0, *sizep);

	phb->dma_window_base_cur += phb->dma_window_size;
}

struct iommu_table_ops iommu_table_lpar_multi_ops;

/*
 * iommu_table_setparms_lpar
 *
 * Function: On pSeries LPAR systems, return TCE table info, given a pci bus.
 */
static void iommu_table_setparms_lpar(struct pci_controller *phb,
				      struct device_node *dn,
				      struct iommu_table *tbl,
				      struct iommu_table_group *table_group,
				      const __be32 *dma_window)
{
	unsigned long offset, size, liobn;

	of_parse_dma_window(dn, dma_window, &liobn, &offset, &size);

	iommu_table_setparms_common(tbl, phb->bus->number, liobn, offset, size, IOMMU_PAGE_SHIFT_4K, NULL,
				    &iommu_table_lpar_multi_ops);


	table_group->tce32_start = offset;
	table_group->tce32_size = size;
}

struct iommu_table_ops iommu_table_pseries_ops = {
	.set = tce_build_pSeries,
	.clear = tce_free_pSeries,
	.get = tce_get_pseries
};

static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
{
	struct device_node *dn;
	struct iommu_table *tbl;
	struct device_node *isa_dn, *isa_dn_orig;
	struct device_node *tmp;
	struct pci_dn *pci;
	int children;

	dn = pci_bus_to_OF_node(bus);

	pr_debug("pci_dma_bus_setup_pSeries: setting up bus %pOF\n", dn);

	if (bus->self) {
		/* This is not a root bus, any setup will be done for the
		 * device-side of the bridge in iommu_dev_setup_pSeries().
		 */
		return;
	}
	pci = PCI_DN(dn);

	/* Check if the ISA bus on the system is under
	 * this PHB.
	 */
	isa_dn = isa_dn_orig = of_find_node_by_type(NULL, "isa");

	while (isa_dn && isa_dn != dn)
		isa_dn = isa_dn->parent;

	of_node_put(isa_dn_orig);

	/* Count number of direct PCI children of the PHB. */
	for (children = 0, tmp = dn->child; tmp; tmp = tmp->sibling)
		children++;

	pr_debug("Children: %d\n", children);

	/* Calculate amount of DMA window per slot. Each window must be
	 * a power of two (due to pci_alloc_consistent requirements).
	 *
	 * Keep 256MB aside for PHBs with ISA.
	 */

	if (!isa_dn) {
		/* No ISA/IDE - just set window size and return */
		pci->phb->dma_window_size = 0x80000000ul; /* To be divided */

		while (pci->phb->dma_window_size * children > 0x80000000ul)
			pci->phb->dma_window_size >>= 1;
		pr_debug("No ISA/IDE, window size is 0x%llx\n",
			 pci->phb->dma_window_size);
		pci->phb->dma_window_base_cur = 0;

		return;
	}

	/* If we have ISA, then we probably have an IDE
	 * controller too. Allocate a 128MB table but
	 * skip the first 128MB to avoid stepping on ISA
	 * space.
	 */
	pci->phb->dma_window_size = 0x8000000ul;
	pci->phb->dma_window_base_cur = 0x8000000ul;

	pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
	tbl = pci->table_group->tables[0];

	iommu_table_setparms(pci->phb, dn, tbl);

	if (!iommu_init_table(tbl, pci->phb->node, 0, 0))
		panic("Failed to initialize iommu table");

	/* Divide the rest (1.75GB) among the children */
	pci->phb->dma_window_size = 0x80000000ul;
	while (pci->phb->dma_window_size * children > 0x70000000ul)
		pci->phb->dma_window_size >>= 1;

	pr_debug("ISA/IDE, window size is 0x%llx\n", pci->phb->dma_window_size);
}

#ifdef CONFIG_IOMMU_API
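/*
 * Exchange a TCE under the large pool lock: read the old entry, install
 * the new one, and return the old address and direction to the caller.
 */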
static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned
				long *tce, enum dma_data_direction *direction)
{
	long rc;
	unsigned long ioba = (unsigned long) index << tbl->it_page_shift;
	unsigned long flags, oldtce = 0;
	u64 proto_tce = iommu_direction_to_tce_perm(*direction);
	unsigned long newtce = *tce | proto_tce;

	spin_lock_irqsave(&tbl->large_pool.lock, flags);

	rc = plpar_tce_get((u64)tbl->it_index, ioba, &oldtce);
	if (!rc)
		rc = plpar_tce_put((u64)tbl->it_index, ioba, newtce);

	if (!rc) {
		*direction = iommu_tce_direction(oldtce);
		*tce = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
	}

	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);

	return rc;
}
#endif

struct iommu_table_ops iommu_table_lpar_multi_ops = {
	.set = tce_buildmulti_pSeriesLP,
#ifdef CONFIG_IOMMU_API
	.xchg_no_kill = tce_exchange_pseries,
#endif
	.clear = tce_freemulti_pSeriesLP,
	.get = tce_get_pSeriesLP
};

static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
{
	struct iommu_table *tbl;
	struct device_node *dn, *pdn;
	struct pci_dn *ppci;
	const __be32 *dma_window = NULL;

	dn = pci_bus_to_OF_node(bus);

	pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n",
		 dn);

	/*
	 * Find nearest ibm,dma-window (default DMA window), walking up the
	 * device tree
	 */
	for (pdn = dn; pdn != NULL; pdn = pdn->parent) {
		dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
		if (dma_window != NULL)
			break;
	}

	if (dma_window == NULL) {
		pr_debug("  no ibm,dma-window property !\n");
		return;
	}

	ppci = PCI_DN(pdn);

	pr_debug("  parent is %pOF, iommu_table: 0x%p\n",
		 pdn, ppci->table_group);

	if (!ppci->table_group) {
		ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node);
		tbl = ppci->table_group->tables[0];
		iommu_table_setparms_lpar(ppci->phb, pdn, tbl,
				ppci->table_group, dma_window);

		if (!iommu_init_table(tbl, ppci->phb->node, 0, 0))
			panic("Failed to initialize iommu table");
		iommu_register_group(ppci->table_group,
				pci_domain_nr(bus), 0);
		pr_debug("  created table: %p\n", ppci->table_group);
	}
}


static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
{
	struct device_node *dn;
	struct iommu_table *tbl;

	pr_debug("pci_dma_dev_setup_pSeries: %s\n", pci_name(dev));

	dn = dev->dev.of_node;

	/* If we're the direct child of a root bus, then we need to allocate
	 * an iommu table ourselves. The bus setup code should have setup
	 * the window sizes already.
	 */
	if (!dev->bus->self) {
		struct pci_controller *phb = PCI_DN(dn)->phb;

		pr_debug(" --> first child, no bridge. Allocating iommu table.\n");
		PCI_DN(dn)->table_group = iommu_pseries_alloc_group(phb->node);
		tbl = PCI_DN(dn)->table_group->tables[0];
		iommu_table_setparms(phb, dn, tbl);

		if (!iommu_init_table(tbl, phb->node, 0, 0))
			panic("Failed to initialize iommu table");

		set_iommu_table_base(&dev->dev, tbl);
		return;
	}

	/* If this device is further down the bus tree, search upwards until
	 * an already allocated iommu table is found and use that.
	 */

	while (dn && PCI_DN(dn) && PCI_DN(dn)->table_group == NULL)
		dn = dn->parent;

	if (dn && PCI_DN(dn))
		set_iommu_table_base(&dev->dev,
				PCI_DN(dn)->table_group->tables[0]);
	else
		printk(KERN_WARNING "iommu: Device %s has no iommu table\n",
		       pci_name(dev));
}

static int __read_mostly disable_ddw;

static int __init disable_ddw_setup(char *str)
{
	disable_ddw = 1;
	printk(KERN_INFO "ppc iommu: disabling ddw.\n");

	return 0;
}

early_param("disable_ddw", disable_ddw_setup);

static void clean_dma_window(struct device_node *np, struct dynamic_dma_window_prop *dwp)
{
	int ret;

	ret = tce_clearrange_multi_pSeriesLP(0,
		1ULL << (be32_to_cpu(dwp->window_shift) - PAGE_SHIFT), dwp);
	if (ret)
		pr_warn("%pOF failed to clear tces in window.\n",
			np);
	else
		pr_debug("%pOF successfully cleared tces in window.\n",
			 np);
}

/*
 * Call only if DMA window is clean.
 */
static void __remove_dma_window(struct device_node *np, u32 *ddw_avail, u64 liobn)
{
	int ret;

	ret = rtas_call(ddw_avail[DDW_REMOVE_PE_DMA_WIN], 1, 1, NULL, liobn);
	if (ret)
		pr_warn("%pOF: failed to remove DMA window: rtas returned "
			"%d to ibm,remove-pe-dma-window(%x) %llx\n",
			np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
	else
		pr_debug("%pOF: successfully removed DMA window: rtas returned "
			"%d to ibm,remove-pe-dma-window(%x) %llx\n",
			np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
}

static void remove_dma_window(struct device_node *np, u32 *ddw_avail,
			      struct property *win)
{
	struct dynamic_dma_window_prop *dwp;
	u64 liobn;

	dwp = win->value;
	liobn = (u64)be32_to_cpu(dwp->liobn);

	clean_dma_window(np, dwp);
	__remove_dma_window(np, ddw_avail, liobn);
}

static int remove_ddw(struct device_node *np, bool remove_prop, const char *win_name)
{
	struct property *win;
	u32 ddw_avail[DDW_APPLICABLE_SIZE];
	int ret = 0;

	win = of_find_property(np, win_name, NULL);
	if (!win)
		return -EINVAL;

	ret = of_property_read_u32_array(np, "ibm,ddw-applicable",
					 &ddw_avail[0], DDW_APPLICABLE_SIZE);
	if (ret)
		return 0;


	if (win->length >= sizeof(struct dynamic_dma_window_prop))
		remove_dma_window(np, ddw_avail, win);

	if (!remove_prop)
		return 0;

	ret = of_remove_property(np, win);
	if (ret)
		pr_warn("%pOF: failed to remove DMA window property: %d\n",
			np, ret);
	return 0;
}

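/*
 * Look up a previously created DMA window for this device node and, if
 * found, return its DMA base address and window shift.
 */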
static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int *window_shift)
{
	struct dma_win *window;
	const struct dynamic_dma_window_prop *dma64;
	bool found = false;

	spin_lock(&dma_win_list_lock);
	/* check if we already created a window and dupe that config if so */
	list_for_each_entry(window, &dma_win_list, list) {
		if (window->device == pdn) {
			dma64 = window->prop;
			*dma_addr = be64_to_cpu(dma64->dma_base);
			*window_shift = be32_to_cpu(dma64->window_shift);
			found = true;
			break;
		}
	}
	spin_unlock(&dma_win_list_lock);

	return found;
}

static struct dma_win *ddw_list_new_entry(struct device_node *pdn,
					  const struct dynamic_dma_window_prop *dma64)
{
	struct dma_win *window;

	window = kzalloc(sizeof(*window), GFP_KERNEL);
	if (!window)
		return NULL;

	window->device = pdn;
	window->prop = dma64;

	return window;
}

static void find_existing_ddw_windows_named(const char *name)
{
	int len;
	struct device_node *pdn;
	struct dma_win *window;
	const struct dynamic_dma_window_prop *dma64;

	for_each_node_with_property(pdn, name) {
		dma64 = of_get_property(pdn, name, &len);
		if (!dma64 || len < sizeof(*dma64)) {
			remove_ddw(pdn, true, name);
			continue;
		}

		window = ddw_list_new_entry(pdn, dma64);
		if (!window) {
			of_node_put(pdn);
			break;
		}

		spin_lock(&dma_win_list_lock);
		list_add(&window->list, &dma_win_list);
		spin_unlock(&dma_win_list_lock);
	}
}

static int find_existing_ddw_windows(void)
{
	if (!firmware_has_feature(FW_FEATURE_LPAR))
		return 0;

	find_existing_ddw_windows_named(DIRECT64_PROPNAME);
	find_existing_ddw_windows_named(DMA64_PROPNAME);

	return 0;
}
machine_arch_initcall(pseries, find_existing_ddw_windows);

/**
 * ddw_read_ext - Get the value of a DDW extension
 * @np:		device node from which the extension value is to be read.
 * @extnum:	index number of the extension.
 * @value:	pointer to return value, modified when extension is available.
 *
 * Checks if "ibm,ddw-extensions" exists for this node, and gets the value
 * at index 'extnum'.
 * It can be used only to check if a property exists, passing value == NULL.
 *
 * Returns:
 *	0 if extension successfully read
 *	-EINVAL if the "ibm,ddw-extensions" does not exist,
 *	-ENODATA if "ibm,ddw-extensions" does not have a value, and
 *	-EOVERFLOW if "ibm,ddw-extensions" does not contain this extension.
 */
static inline int ddw_read_ext(const struct device_node *np, int extnum,
			       u32 *value)
{
	static const char propname[] = "ibm,ddw-extensions";
	u32 count;
	int ret;

	ret = of_property_read_u32_index(np, propname, DDW_EXT_SIZE, &count);
	if (ret)
		return ret;

	if (count < extnum)
		return -EOVERFLOW;

	if (!value)
		value = &count;

	return of_property_read_u32_index(np, propname, extnum, value);
}

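/*
 * Call ibm,query-pe-dma-windows for the PE owning this device. The
 * number of output cells (5 or 6) depends on the DDW extensions, see
 * ddw_read_ext() above.
 */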
static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,
		     struct ddw_query_response *query,
		     struct device_node *parent)
{
	struct device_node *dn;
	struct pci_dn *pdn;
	u32 cfg_addr, ext_query, query_out[5];
	u64 buid;
	int ret, out_sz;

	/*
	 * From LoPAR level 2.8, "ibm,ddw-extensions" index 3 determines how
	 * many output parameters ibm,query-pe-dma-windows will have, ranging
	 * from 5 to 6.
	 */
	ret = ddw_read_ext(parent, DDW_EXT_QUERY_OUT_SIZE, &ext_query);
	if (!ret && ext_query == 1)
		out_sz = 6;
	else
		out_sz = 5;

	/*
	 * Get the config address and phb buid of the PE window.
	 * Rely on eeh to retrieve this for us.
	 * Retrieve them from the pci device, not the node with the
	 * dma-window property
	 */
	dn = pci_device_to_OF_node(dev);
	pdn = PCI_DN(dn);
	buid = pdn->phb->buid;
	cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));

	ret = rtas_call(ddw_avail[DDW_QUERY_PE_DMA_WIN], 3, out_sz, query_out,
			cfg_addr, BUID_HI(buid), BUID_LO(buid));
	dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x returned %d\n",
		 ddw_avail[DDW_QUERY_PE_DMA_WIN], cfg_addr, BUID_HI(buid),
		 BUID_LO(buid), ret);

	switch (out_sz) {
	case 5:
		query->windows_available = query_out[0];
		query->largest_available_block = query_out[1];
		query->page_size = query_out[2];
		query->migration_capable = query_out[3];
		break;
	case 6:
		query->windows_available = query_out[0];
		query->largest_available_block = ((u64)query_out[1] << 32) |
						 query_out[2];
		query->page_size = query_out[3];
		query->migration_capable = query_out[4];
		break;
	}

	return ret;
}

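/*
 * Call ibm,create-pe-dma-window to create a DMA window of
 * 2^window_shift bytes with 2^page_shift IO pages, retrying while RTAS
 * reports a busy status.
 */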
static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail,
			struct ddw_create_response *create, int page_shift,
			int window_shift)
{
	struct device_node *dn;
	struct pci_dn *pdn;
	u32 cfg_addr;
	u64 buid;
	int ret;

	/*
	 * Get the config address and phb buid of the PE window.
	 * Rely on eeh to retrieve this for us.
	 * Retrieve them from the pci device, not the node with the
	 * dma-window property
	 */
	dn = pci_device_to_OF_node(dev);
	pdn = PCI_DN(dn);
	buid = pdn->phb->buid;
	cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));

	do {
		/* extra outputs are LIOBN and dma-addr (hi, lo) */
		ret = rtas_call(ddw_avail[DDW_CREATE_PE_DMA_WIN], 5, 4,
				(u32 *)create, cfg_addr, BUID_HI(buid),
				BUID_LO(buid), page_shift, window_shift);
	} while (rtas_busy_delay(ret));
	dev_info(&dev->dev,
		"ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d "
		"(liobn = 0x%x starting addr = %x %x)\n",
		 ddw_avail[DDW_CREATE_PE_DMA_WIN], cfg_addr, BUID_HI(buid),
		 BUID_LO(buid), page_shift, window_shift, ret, create->liobn,
		 create->addr_hi, create->addr_lo);

	return ret;
}

struct failed_ddw_pdn {
	struct device_node *pdn;
	struct list_head list;
};

static LIST_HEAD(failed_ddw_pdn_list);

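/*
 * Return the highest address that DMA windows may need to cover, based
 * on the memory nodes present and the memory hotplug maximum.
 */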
static phys_addr_t ddw_memory_hotplug_max(void)
{
	phys_addr_t max_addr = memory_hotplug_max();
	struct device_node *memory;

	for_each_node_by_type(memory, "memory") {
		unsigned long start, size;
		int n_mem_addr_cells, n_mem_size_cells, len;
		const __be32 *memcell_buf;

		memcell_buf = of_get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		n_mem_addr_cells = of_n_addr_cells(memory);
		n_mem_size_cells = of_n_size_cells(memory);

		start = of_read_number(memcell_buf, n_mem_addr_cells);
		memcell_buf += n_mem_addr_cells;
		size = of_read_number(memcell_buf, n_mem_size_cells);
		memcell_buf += n_mem_size_cells;

		max_addr = max_t(phys_addr_t, max_addr, start + size);
	}

	return max_addr;
}

/*
 * Platforms supporting the DDW option starting with LoPAR level 2.7 implement
 * ibm,ddw-extensions, which carries the rtas token for
 * ibm,reset-pe-dma-windows.
 * That rtas-call can be used to restore the default DMA window for the device.
 */
static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn)
{
	int ret;
	u32 cfg_addr, reset_dma_win;
	u64 buid;
	struct device_node *dn;
	struct pci_dn *pdn;

	ret = ddw_read_ext(par_dn, DDW_EXT_RESET_DMA_WIN, &reset_dma_win);
	if (ret)
		return;

	dn = pci_device_to_OF_node(dev);
	pdn = PCI_DN(dn);
	buid = pdn->phb->buid;
	cfg_addr = (pdn->busno << 16) | (pdn->devfn << 8);

	ret = rtas_call(reset_dma_win, 3, 1, NULL, cfg_addr, BUID_HI(buid),
			BUID_LO(buid));
	if (ret)
		dev_info(&dev->dev,
			 "ibm,reset-pe-dma-windows(%x) %x %x %x returned %d ",
			 reset_dma_win, cfg_addr, BUID_HI(buid), BUID_LO(buid),
			 ret);
}

/* Return largest page shift based on "IO Page Sizes" output of ibm,query-pe-dma-window. */
static int iommu_get_page_shift(u32 query_page_size)
{
	/* Supported IO page-sizes according to LoPAR, note that 2M is out of order */
	const int shift[] = {
		__builtin_ctzll(SZ_4K),   __builtin_ctzll(SZ_64K), __builtin_ctzll(SZ_16M),
		__builtin_ctzll(SZ_32M),  __builtin_ctzll(SZ_64M), __builtin_ctzll(SZ_128M),
		__builtin_ctzll(SZ_256M), __builtin_ctzll(SZ_16G), __builtin_ctzll(SZ_2M)
	};

	int i = ARRAY_SIZE(shift) - 1;
	int ret = 0;

	/*
	 * On LoPAR, ibm,query-pe-dma-window outputs "IO Page Sizes" using a bit field:
	 * - bit 31 means 4k pages are supported,
	 * - bit 30 means 64k pages are supported, and so on.
	 * Larger pagesizes map more memory with the same amount of TCEs, so start probing them.
	 */
	for (; i >= 0 ; i--) {
		if (query_page_size & (1 << i))
			ret = max(ret, shift[i]);
	}

	return ret;
}

static struct property *ddw_property_create(const char *propname, u32 liobn, u64 dma_addr,
					    u32 page_shift, u32 window_shift)
{
	struct dynamic_dma_window_prop *ddwprop;
	struct property *win64;

	win64 = kzalloc(sizeof(*win64), GFP_KERNEL);
	if (!win64)
		return NULL;

	win64->name = kstrdup(propname, GFP_KERNEL);
	ddwprop = kzalloc(sizeof(*ddwprop), GFP_KERNEL);
	win64->value = ddwprop;
	win64->length = sizeof(*ddwprop);
	if (!win64->name || !win64->value) {
		kfree(win64->name);
		kfree(win64->value);
		kfree(win64);
		return NULL;
	}

	ddwprop->liobn = cpu_to_be32(liobn);
	ddwprop->dma_base = cpu_to_be64(dma_addr);
	ddwprop->tce_shift = cpu_to_be32(page_shift);
	ddwprop->window_shift = cpu_to_be32(window_shift);

	return win64;
}

/*
 * If the PE supports dynamic dma windows, and there is space for a table
 * that can map all pages in a linear offset, then set up such a table,
 * and record the dma-offset in the struct device.
 *
 * dev: the pci device we are checking
 * pdn: the parent pe node with the ibm,dma_window property
 * Future: also check if we can remap the base window for our base page size
 *
 * Returns true if it can map all pages (direct mapping), false otherwise.
 */
static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
{
	int len = 0, ret;
	int max_ram_len = order_base_2(ddw_memory_hotplug_max());
	struct ddw_query_response query;
	struct ddw_create_response create;
	int page_shift;
	u64 win_addr;
	const char *win_name;
	struct device_node *dn;
	u32 ddw_avail[DDW_APPLICABLE_SIZE];
	struct dma_win *window;
	struct property *win64;
	struct failed_ddw_pdn *fpdn;
	bool default_win_removed = false, direct_mapping = false;
	bool pmem_present;
	struct pci_dn *pci = PCI_DN(pdn);
	struct iommu_table *tbl = pci->table_group->tables[0];

	dn = of_find_node_by_type(NULL, "ibm,pmemory");
	pmem_present = dn != NULL;
	of_node_put(dn);

	mutex_lock(&dma_win_init_mutex);

	if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len)) {
		direct_mapping = (len >= max_ram_len);
		goto out_unlock;
	}

	/*
	 * If we already went through this for a previous function of
	 * the same device and failed, we don't want to muck with the
	 * DMA window again, as it will race with in-flight operations
	 * and can lead to EEHs. The above mutex protects access to the
	 * list.
	 */
	list_for_each_entry(fpdn, &failed_ddw_pdn_list, list) {
		if (fpdn->pdn == pdn)
			goto out_unlock;
	}

	/*
	 * the ibm,ddw-applicable property holds the tokens for:
	 * ibm,query-pe-dma-window
	 * ibm,create-pe-dma-window
	 * ibm,remove-pe-dma-window
	 * for the given node in that order.
	 * the property is actually in the parent, not the PE
	 */
	ret = of_property_read_u32_array(pdn, "ibm,ddw-applicable",
					 &ddw_avail[0], DDW_APPLICABLE_SIZE);
	if (ret)
		goto out_failed;

	/*
	 * Query if there is a second window of size to map the
	 * whole partition.  Query returns number of windows, largest
	 * block assigned to PE (partition endpoint), and two bitmasks
	 * of page sizes: supported and supported for migrate-dma.
	 */
	dn = pci_device_to_OF_node(dev);
	ret = query_ddw(dev, ddw_avail, &query, pdn);
	if (ret != 0)
		goto out_failed;

	/*
	 * If there is no window available, remove the default DMA window,
	 * if it's present. This will make all the resources available to the
	 * new DDW window.
	 * If anything fails after this, we need to restore it, so also check
	 * for extensions presence.
	 */
	if (query.windows_available == 0) {
		struct property *default_win;
		int reset_win_ext;

		/* DDW + IOMMU on single window may fail if there is any allocation */
		if (iommu_table_in_use(tbl)) {
			dev_warn(&dev->dev, "current IOMMU table in use, can't be replaced.\n");
			goto out_failed;
		}

		default_win = of_find_property(pdn, "ibm,dma-window", NULL);
		if (!default_win)
			goto out_failed;

		reset_win_ext = ddw_read_ext(pdn, DDW_EXT_RESET_DMA_WIN, NULL);
		if (reset_win_ext)
			goto out_failed;

		remove_dma_window(pdn, ddw_avail, default_win);
		default_win_removed = true;

		/* Query again, to check if the window is available */
		ret = query_ddw(dev, ddw_avail, &query, pdn);
		if (ret != 0)
			goto out_failed;

		if (query.windows_available == 0) {
			/* no windows are available for this device. */
			dev_dbg(&dev->dev, "no free dynamic windows");
			goto out_failed;
		}
	}

	page_shift = iommu_get_page_shift(query.page_size);
	if (!page_shift) {
		dev_dbg(&dev->dev, "no supported page size in mask %x",
			query.page_size);
		goto out_failed;
	}


	/*
	 * The "ibm,pmemory" can appear anywhere in the address space.
	 * Assuming it is still backed by page structs, try MAX_PHYSMEM_BITS
	 * for the upper limit and fall back to max RAM otherwise, though
	 * this disables device::dma_ops_bypass.
	 */
	len = max_ram_len;
	if (pmem_present) {
		if (query.largest_available_block >=
		    (1ULL << (MAX_PHYSMEM_BITS - page_shift)))
			len = MAX_PHYSMEM_BITS;
		else
			dev_info(&dev->dev, "Skipping ibm,pmemory");
	}

	/* check if the available block * number of ptes will map everything */
	if (query.largest_available_block < (1ULL << (len - page_shift))) {
		dev_dbg(&dev->dev,
			"can't map partition max 0x%llx with %llu %llu-sized pages\n",
			1ULL << len,
			query.largest_available_block,
			1ULL << page_shift);

		len = order_base_2(query.largest_available_block << page_shift);
		win_name = DMA64_PROPNAME;
	} else {
		direct_mapping = !default_win_removed ||
			(len == MAX_PHYSMEM_BITS) ||
			(!pmem_present && (len == max_ram_len));
		win_name = direct_mapping ? DIRECT64_PROPNAME : DMA64_PROPNAME;
	}

	ret = create_ddw(dev, ddw_avail, &create, page_shift, len);
	if (ret != 0)
		goto out_failed;

	dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %pOF\n",
		  create.liobn, dn);

	win_addr = ((u64)create.addr_hi << 32) | create.addr_lo;
	win64 = ddw_property_create(win_name, create.liobn, win_addr, page_shift, len);

	if (!win64) {
		dev_info(&dev->dev,
			 "couldn't allocate property, property name, or value\n");
		goto out_remove_win;
	}

	ret = of_add_property(pdn, win64);
	if (ret) {
		dev_err(&dev->dev, "unable to add DMA window property for %pOF: %d",
			pdn, ret);
		goto out_free_prop;
	}

	window = ddw_list_new_entry(pdn, win64->value);
	if (!window)
		goto out_del_prop;

	if (direct_mapping) {
		/* DDW maps the whole partition, so enable direct DMA mapping */
		ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT,
					    win64->value, tce_setrange_multi_pSeriesLP_walk);
		if (ret) {
			dev_info(&dev->dev, "failed to map DMA window for %pOF: %d\n",
				 dn, ret);

			/* Make sure to clean DDW if any TCE was set */
			clean_dma_window(pdn, win64->value);
			goto out_del_list;
		}
	} else {
		struct iommu_table *newtbl;
		int i;
		unsigned long start = 0, end = 0;

		for (i = 0; i < ARRAY_SIZE(pci->phb->mem_resources); i++) {
			const unsigned long mask = IORESOURCE_MEM_64 | IORESOURCE_MEM;

			/* Look for MMIO32 */
			if ((pci->phb->mem_resources[i].flags & mask) == IORESOURCE_MEM) {
				start = pci->phb->mem_resources[i].start;
				end = pci->phb->mem_resources[i].end;
				break;
			}
		}

		/* New table for using DDW instead of the default DMA window */
		newtbl = iommu_pseries_alloc_table(pci->phb->node);
		if (!newtbl) {
			dev_dbg(&dev->dev, "couldn't create new IOMMU table\n");
			goto out_del_list;
		}

		iommu_table_setparms_common(newtbl, pci->phb->bus->number, create.liobn, win_addr,
					    1UL << len, page_shift, NULL, &iommu_table_lpar_multi_ops);
		iommu_init_table(newtbl, pci->phb->node, start, end);

		pci->table_group->tables[1] = newtbl;

		/* Keep default DMA window struct if removed */
		if (default_win_removed) {
			tbl->it_size = 0;
			vfree(tbl->it_map);
			tbl->it_map = NULL;
		}

		set_iommu_table_base(&dev->dev, newtbl);
	}

	spin_lock(&dma_win_list_lock);
	list_add(&window->list, &dma_win_list);
	spin_unlock(&dma_win_list_lock);

	dev->dev.archdata.dma_offset = win_addr;
	goto out_unlock;

out_del_list:
	kfree(window);

out_del_prop:
	of_remove_property(pdn, win64);

out_free_prop:
	kfree(win64->name);
	kfree(win64->value);
	kfree(win64);

out_remove_win:
	/* DDW is clean, so it's ok to call this directly. */
	__remove_dma_window(pdn, ddw_avail, create.liobn);

out_failed:
	if (default_win_removed)
		reset_dma_window(dev, pdn);

	fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL);
	if (!fpdn)
		goto out_unlock;
	fpdn->pdn = pdn;
	list_add(&fpdn->list, &failed_ddw_pdn_list);

out_unlock:
	mutex_unlock(&dma_win_init_mutex);

	/*
	 * If we have persistent memory and the window size is only as big
	 * as RAM, then we failed to create a window to cover persistent
	 * memory and need to set the DMA limit.
	 */
	if (pmem_present && direct_mapping && len == max_ram_len)
		dev->dev.bus_dma_limit = dev->dev.archdata.dma_offset + (1ULL << len);

	return direct_mapping;
}

static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
{
	struct device_node *pdn, *dn;
	struct iommu_table *tbl;
	const __be32 *dma_window = NULL;
	struct pci_dn *pci;

	pr_debug("pci_dma_dev_setup_pSeriesLP: %s\n", pci_name(dev));

	/* dev setup for LPAR is a little tricky, since the device tree might
	 * contain the dma-window properties per-device and not necessarily
	 * for the bus. So we need to search upwards in the tree until we
	 * either hit a dma-window property, OR find a parent with a table
	 * already allocated.
	 */
	dn = pci_device_to_OF_node(dev);
	pr_debug("  node is %pOF\n", dn);

	for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group;
	     pdn = pdn->parent) {
		dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
		if (dma_window)
			break;
	}

	if (!pdn || !PCI_DN(pdn)) {
		printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: "
		       "no DMA window found for pci dev=%s dn=%pOF\n",
				 pci_name(dev), dn);
		return;
	}
	pr_debug("  parent is %pOF\n", pdn);

	pci = PCI_DN(pdn);
	if (!pci->table_group) {
		pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
		tbl = pci->table_group->tables[0];
		iommu_table_setparms_lpar(pci->phb, pdn, tbl,
				pci->table_group, dma_window);

		iommu_init_table(tbl, pci->phb->node, 0, 0);
		iommu_register_group(pci->table_group,
				pci_domain_nr(pci->phb->bus), 0);
		pr_debug("  created table: %p\n", pci->table_group);
	} else {
		pr_debug("  found DMA window, table: %p\n", pci->table_group);
	}

	set_iommu_table_base(&dev->dev, pci->table_group->tables[0]);
	iommu_add_device(pci->table_group, &dev->dev);
}

static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask)
{
	struct device_node *dn = pci_device_to_OF_node(pdev), *pdn;
	const __be32 *dma_window = NULL;

	/* only attempt to use a new window if 64-bit DMA is requested */
	if (dma_mask < DMA_BIT_MASK(64))
		return false;

	dev_dbg(&pdev->dev, "node is %pOF\n", dn);

	/*
	 * the device tree might contain the dma-window properties
	 * per-device and not necessarily for the bus. So we need to
	 * search upwards in the tree until we either hit a dma-window
	 * property, OR find a parent with a table already allocated.
	 */
	for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group;
			pdn = pdn->parent) {
		dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
		if (dma_window)
			break;
	}

	if (pdn && PCI_DN(pdn))
		return enable_ddw(pdev, pdn);

	return false;
}

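/*
 * Memory hotplug notifier: keep every registered dynamic DMA window in
 * sync by mapping TCEs for memory going online and clearing them for
 * memory going offline.
 */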
static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action,
		void *data)
{
	struct dma_win *window;
	struct memory_notify *arg = data;
	int ret = 0;

	switch (action) {
	case MEM_GOING_ONLINE:
		spin_lock(&dma_win_list_lock);
		list_for_each_entry(window, &dma_win_list, list) {
			ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn,
					arg->nr_pages, window->prop);
			/* XXX log error */
		}
		spin_unlock(&dma_win_list_lock);
		break;
	case MEM_CANCEL_ONLINE:
	case MEM_OFFLINE:
		spin_lock(&dma_win_list_lock);
		list_for_each_entry(window, &dma_win_list, list) {
			ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn,
					arg->nr_pages, window->prop);
			/* XXX log error */
		}
		spin_unlock(&dma_win_list_lock);
		break;
	default:
		break;
	}
	if (ret && action != MEM_CANCEL_ONLINE)
		return NOTIFY_BAD;

	return NOTIFY_OK;
}

static struct notifier_block iommu_mem_nb = {
	.notifier_call = iommu_mem_notifier,
};

static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *data)
{
	int err = NOTIFY_OK;
	struct of_reconfig_data *rd = data;
	struct device_node *np = rd->dn;
	struct pci_dn *pci = PCI_DN(np);
	struct dma_win *window;

	switch (action) {
	case OF_RECONFIG_DETACH_NODE:
		/*
		 * Removing the property will invoke the reconfig
		 * notifier again, which causes dead-lock on the
		 * read-write semaphore of the notifier chain. So
		 * we have to remove the property when releasing
		 * the device node.
		 */
		if (remove_ddw(np, false, DIRECT64_PROPNAME))
			remove_ddw(np, false, DMA64_PROPNAME);

		if (pci && pci->table_group)
			iommu_pseries_free_group(pci->table_group,
					np->full_name);

		spin_lock(&dma_win_list_lock);
		list_for_each_entry(window, &dma_win_list, list) {
			if (window->device == np) {
				list_del(&window->list);
				kfree(window);
				break;
			}
		}
		spin_unlock(&dma_win_list_lock);
		break;
	default:
		err = NOTIFY_DONE;
		break;
	}
	return err;
}

static struct notifier_block iommu_reconfig_nb = {
	.notifier_call = iommu_reconfig_notifier,
};

/* These are called very early. */
void __init iommu_init_early_pSeries(void)
{
	if (of_chosen && of_get_property(of_chosen, "linux,iommu-off", NULL))
		return;

	if (firmware_has_feature(FW_FEATURE_LPAR)) {
		pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeriesLP;
		pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeriesLP;
		if (!disable_ddw)
			pseries_pci_controller_ops.iommu_bypass_supported =
				iommu_bypass_supported_pSeriesLP;
	} else {
		pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeries;
		pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeries;
	}


	of_reconfig_notifier_register(&iommu_reconfig_nb);
	register_memory_notifier(&iommu_mem_nb);

	set_pci_dma_ops(&dma_iommu_ops);
}

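/*
 * "multitce=off" on the kernel command line forces one hypervisor call
 * per TCE by clearing the indirect-put and stuff TCE firmware features.
 */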
static int __init disable_multitce(char *str)
{
	if (strcmp(str, "off") == 0 &&
	    firmware_has_feature(FW_FEATURE_LPAR) &&
	    (firmware_has_feature(FW_FEATURE_PUT_TCE_IND) ||
	     firmware_has_feature(FW_FEATURE_STUFF_TCE))) {
		printk(KERN_INFO "Disabling MULTITCE firmware feature\n");
		powerpc_firmware_features &=
			~(FW_FEATURE_PUT_TCE_IND | FW_FEATURE_STUFF_TCE);
	}
	return 1;
}

__setup("multitce=", disable_multitce);

static int tce_iommu_bus_notifier(struct notifier_block *nb,
		unsigned long action, void *data)
{
	struct device *dev = data;

	switch (action) {
	case BUS_NOTIFY_DEL_DEVICE:
		iommu_del_device(dev);
		return 0;
	default:
		return 0;
	}
}

static struct notifier_block tce_iommu_bus_nb = {
	.notifier_call = tce_iommu_bus_notifier,
};

static int __init tce_iommu_bus_notifier_init(void)
{
	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
	return 0;
}
machine_subsys_initcall_sync(pseries, tce_iommu_bus_notifier_init);