cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

pci-ioda.c (88964B)


      1// SPDX-License-Identifier: GPL-2.0-or-later
      2/*
      3 * Support PCI/PCIe on PowerNV platforms
      4 *
      5 * Copyright 2011 Benjamin Herrenschmidt, IBM Corp.
      6 */
      7
      8#undef DEBUG
      9
     10#include <linux/kernel.h>
     11#include <linux/pci.h>
     12#include <linux/crash_dump.h>
     13#include <linux/delay.h>
     14#include <linux/string.h>
     15#include <linux/init.h>
     16#include <linux/memblock.h>
     17#include <linux/irq.h>
     18#include <linux/io.h>
     19#include <linux/msi.h>
     20#include <linux/iommu.h>
     21#include <linux/rculist.h>
     22#include <linux/sizes.h>
     23#include <linux/debugfs.h>
     24#include <linux/of_address.h>
     25#include <linux/of_irq.h>
     26
     27#include <asm/sections.h>
     28#include <asm/io.h>
     29#include <asm/pci-bridge.h>
     30#include <asm/machdep.h>
     31#include <asm/msi_bitmap.h>
     32#include <asm/ppc-pci.h>
     33#include <asm/opal.h>
     34#include <asm/iommu.h>
     35#include <asm/tce.h>
     36#include <asm/xics.h>
     37#include <asm/firmware.h>
     38#include <asm/pnv-pci.h>
     39#include <asm/mmzone.h>
     40#include <asm/xive.h>
     41
     42#include <misc/cxl-base.h>
     43
     44#include "powernv.h"
     45#include "pci.h"
     46#include "../../../../drivers/pci/pci.h"
     47
     48#define PNV_IODA1_M64_NUM	16	/* Number of M64 BARs	*/
     49#define PNV_IODA1_M64_SEGS	8	/* Segments per M64 BAR	*/
     50#define PNV_IODA1_DMA32_SEGSIZE	0x10000000
     51
     52static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU_OCAPI" };
     53
     54static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
     55static void pnv_pci_configure_bus(struct pci_bus *bus);
     56
     57void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
     58			    const char *fmt, ...)
     59{
     60	struct va_format vaf;
     61	va_list args;
     62	char pfix[32];
     63
     64	va_start(args, fmt);
     65
     66	vaf.fmt = fmt;
     67	vaf.va = &args;
     68
     69	if (pe->flags & PNV_IODA_PE_DEV)
     70		strlcpy(pfix, dev_name(&pe->pdev->dev), sizeof(pfix));
     71	else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
     72		sprintf(pfix, "%04x:%02x     ",
     73			pci_domain_nr(pe->pbus), pe->pbus->number);
     74#ifdef CONFIG_PCI_IOV
     75	else if (pe->flags & PNV_IODA_PE_VF)
     76		sprintf(pfix, "%04x:%02x:%2x.%d",
     77			pci_domain_nr(pe->parent_dev->bus),
     78			(pe->rid & 0xff00) >> 8,
     79			PCI_SLOT(pe->rid), PCI_FUNC(pe->rid));
     80#endif /* CONFIG_PCI_IOV*/
     81
     82	printk("%spci %s: [PE# %.2x] %pV",
     83	       level, pfix, pe->pe_number, &vaf);
     84
     85	va_end(args);
     86}
     87
     88static bool pnv_iommu_bypass_disabled __read_mostly;
     89static bool pci_reset_phbs __read_mostly;
     90
     91static int __init iommu_setup(char *str)
     92{
     93	if (!str)
     94		return -EINVAL;
     95
     96	while (*str) {
     97		if (!strncmp(str, "nobypass", 8)) {
     98			pnv_iommu_bypass_disabled = true;
     99			pr_info("PowerNV: IOMMU bypass window disabled.\n");
    100			break;
    101		}
    102		str += strcspn(str, ",");
    103		if (*str == ',')
    104			str++;
    105	}
    106
    107	return 0;
    108}
    109early_param("iommu", iommu_setup);
    110
    111static int __init pci_reset_phbs_setup(char *str)
    112{
    113	pci_reset_phbs = true;
    114	return 0;
    115}
    116
    117early_param("ppc_pci_reset_phbs", pci_reset_phbs_setup);
    118
    119static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no)
    120{
    121	s64 rc;
    122
    123	phb->ioda.pe_array[pe_no].phb = phb;
    124	phb->ioda.pe_array[pe_no].pe_number = pe_no;
    125	phb->ioda.pe_array[pe_no].dma_setup_done = false;
    126
    127	/*
     128	 * Clear the PE frozen state as it might have been put into the
     129	 * frozen state in the last PCI remove path. It's not harmful to do
     130	 * so when the PE is already in the unfrozen state.
    131	 */
    132	rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no,
    133				       OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
    134	if (rc != OPAL_SUCCESS && rc != OPAL_UNSUPPORTED)
    135		pr_warn("%s: Error %lld unfreezing PHB#%x-PE#%x\n",
    136			__func__, rc, phb->hose->global_number, pe_no);
    137
    138	return &phb->ioda.pe_array[pe_no];
    139}
    140
    141static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
    142{
    143	if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe_num)) {
    144		pr_warn("%s: Invalid PE %x on PHB#%x\n",
    145			__func__, pe_no, phb->hose->global_number);
    146		return;
    147	}
    148
    149	mutex_lock(&phb->ioda.pe_alloc_mutex);
    150	if (test_and_set_bit(pe_no, phb->ioda.pe_alloc))
    151		pr_debug("%s: PE %x was reserved on PHB#%x\n",
    152			 __func__, pe_no, phb->hose->global_number);
    153	mutex_unlock(&phb->ioda.pe_alloc_mutex);
    154
    155	pnv_ioda_init_pe(phb, pe_no);
    156}
    157
    158struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb, int count)
    159{
    160	struct pnv_ioda_pe *ret = NULL;
    161	int run = 0, pe, i;
    162
    163	mutex_lock(&phb->ioda.pe_alloc_mutex);
    164
    165	/* scan backwards for a run of @count cleared bits */
    166	for (pe = phb->ioda.total_pe_num - 1; pe >= 0; pe--) {
    167		if (test_bit(pe, phb->ioda.pe_alloc)) {
    168			run = 0;
    169			continue;
    170		}
    171
    172		run++;
    173		if (run == count)
    174			break;
    175	}
    176	if (run != count)
    177		goto out;
    178
    179	for (i = pe; i < pe + count; i++) {
    180		set_bit(i, phb->ioda.pe_alloc);
    181		pnv_ioda_init_pe(phb, i);
    182	}
    183	ret = &phb->ioda.pe_array[pe];
    184
    185out:
    186	mutex_unlock(&phb->ioda.pe_alloc_mutex);
    187	return ret;
    188}
    189
    190void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
    191{
    192	struct pnv_phb *phb = pe->phb;
    193	unsigned int pe_num = pe->pe_number;
    194
    195	WARN_ON(pe->pdev);
    196	memset(pe, 0, sizeof(struct pnv_ioda_pe));
    197
    198	mutex_lock(&phb->ioda.pe_alloc_mutex);
    199	clear_bit(pe_num, phb->ioda.pe_alloc);
    200	mutex_unlock(&phb->ioda.pe_alloc_mutex);
    201}
    202
    203/* The default M64 BAR is shared by all PEs */
    204static int pnv_ioda2_init_m64(struct pnv_phb *phb)
    205{
    206	const char *desc;
    207	struct resource *r;
    208	s64 rc;
    209
    210	/* Configure the default M64 BAR */
    211	rc = opal_pci_set_phb_mem_window(phb->opal_id,
    212					 OPAL_M64_WINDOW_TYPE,
    213					 phb->ioda.m64_bar_idx,
    214					 phb->ioda.m64_base,
    215					 0, /* unused */
    216					 phb->ioda.m64_size);
    217	if (rc != OPAL_SUCCESS) {
    218		desc = "configuring";
    219		goto fail;
    220	}
    221
    222	/* Enable the default M64 BAR */
    223	rc = opal_pci_phb_mmio_enable(phb->opal_id,
    224				      OPAL_M64_WINDOW_TYPE,
    225				      phb->ioda.m64_bar_idx,
    226				      OPAL_ENABLE_M64_SPLIT);
    227	if (rc != OPAL_SUCCESS) {
    228		desc = "enabling";
    229		goto fail;
    230	}
    231
    232	/*
     233	 * Exclude the segments for the reserved and root bus PEs, which
     234	 * are the first or last two PEs.
    235	 */
    236	r = &phb->hose->mem_resources[1];
    237	if (phb->ioda.reserved_pe_idx == 0)
    238		r->start += (2 * phb->ioda.m64_segsize);
    239	else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1))
    240		r->end -= (2 * phb->ioda.m64_segsize);
    241	else
    242		pr_warn("  Cannot strip M64 segment for reserved PE#%x\n",
    243			phb->ioda.reserved_pe_idx);
    244
    245	return 0;
    246
    247fail:
    248	pr_warn("  Failure %lld %s M64 BAR#%d\n",
    249		rc, desc, phb->ioda.m64_bar_idx);
    250	opal_pci_phb_mmio_enable(phb->opal_id,
    251				 OPAL_M64_WINDOW_TYPE,
    252				 phb->ioda.m64_bar_idx,
    253				 OPAL_DISABLE_M64);
    254	return -EIO;
    255}
    256
    257static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev,
    258					 unsigned long *pe_bitmap)
    259{
    260	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
    261	struct resource *r;
    262	resource_size_t base, sgsz, start, end;
    263	int segno, i;
    264
    265	base = phb->ioda.m64_base;
    266	sgsz = phb->ioda.m64_segsize;
    267	for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
    268		r = &pdev->resource[i];
    269		if (!r->parent || !pnv_pci_is_m64(phb, r))
    270			continue;
    271
    272		start = ALIGN_DOWN(r->start - base, sgsz);
    273		end = ALIGN(r->end - base, sgsz);
    274		for (segno = start / sgsz; segno < end / sgsz; segno++) {
    275			if (pe_bitmap)
    276				set_bit(segno, pe_bitmap);
    277			else
    278				pnv_ioda_reserve_pe(phb, segno);
    279		}
    280	}
    281}
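
/*
 * Worked example for the segment arithmetic above (values assumed for
 * illustration): with m64_segsize = 0x10000000 (256MB) and a 512MB
 * prefetchable BAR sitting 0x30000000 into the M64 window,
 *
 *	start = ALIGN_DOWN(0x30000000, 0x10000000) = 0x30000000
 *	end   = ALIGN(0x4fffffff, 0x10000000)      = 0x50000000
 *
 * so segments (and therefore PE numbers) 3 and 4 are reserved for that
 * device.
 */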
    282
    283static int pnv_ioda1_init_m64(struct pnv_phb *phb)
    284{
    285	struct resource *r;
    286	int index;
    287
    288	/*
    289	 * There are 16 M64 BARs, each of which has 8 segments. So
    290	 * there are as many M64 segments as the maximum number of
    291	 * PEs, which is 128.
    292	 */
    293	for (index = 0; index < PNV_IODA1_M64_NUM; index++) {
    294		unsigned long base, segsz = phb->ioda.m64_segsize;
    295		int64_t rc;
    296
    297		base = phb->ioda.m64_base +
    298		       index * PNV_IODA1_M64_SEGS * segsz;
    299		rc = opal_pci_set_phb_mem_window(phb->opal_id,
    300				OPAL_M64_WINDOW_TYPE, index, base, 0,
    301				PNV_IODA1_M64_SEGS * segsz);
    302		if (rc != OPAL_SUCCESS) {
    303			pr_warn("  Error %lld setting M64 PHB#%x-BAR#%d\n",
    304				rc, phb->hose->global_number, index);
    305			goto fail;
    306		}
    307
    308		rc = opal_pci_phb_mmio_enable(phb->opal_id,
    309				OPAL_M64_WINDOW_TYPE, index,
    310				OPAL_ENABLE_M64_SPLIT);
    311		if (rc != OPAL_SUCCESS) {
    312			pr_warn("  Error %lld enabling M64 PHB#%x-BAR#%d\n",
    313				rc, phb->hose->global_number, index);
    314			goto fail;
    315		}
    316	}
    317
    318	for (index = 0; index < phb->ioda.total_pe_num; index++) {
    319		int64_t rc;
    320
    321		/*
     322		 * P7IOC supports M64DT, which allows mapping an M64 segment
     323		 * to one particular PE#. However, PHB3 has a fixed mapping
     324		 * between M64 segments and PE#s. In order to have the same
     325		 * logic for P7IOC and PHB3, we enforce the fixed mapping
     326		 * between M64 segments and PE#s on P7IOC as well.
    327		 */
    328		rc = opal_pci_map_pe_mmio_window(phb->opal_id,
    329				index, OPAL_M64_WINDOW_TYPE,
    330				index / PNV_IODA1_M64_SEGS,
    331				index % PNV_IODA1_M64_SEGS);
    332		if (rc != OPAL_SUCCESS) {
    333			pr_warn("%s: Error %lld mapping M64 for PHB#%x-PE#%x\n",
    334				__func__, rc, phb->hose->global_number,
    335				index);
    336			goto fail;
    337		}
    338	}
    339
    340	/*
     341	 * Exclude the segments for the reserved and root bus PEs, which
     342	 * are the first or last two PEs.
    343	 */
    344	r = &phb->hose->mem_resources[1];
    345	if (phb->ioda.reserved_pe_idx == 0)
    346		r->start += (2 * phb->ioda.m64_segsize);
    347	else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1))
    348		r->end -= (2 * phb->ioda.m64_segsize);
    349	else
    350		WARN(1, "Wrong reserved PE#%x on PHB#%x\n",
    351		     phb->ioda.reserved_pe_idx, phb->hose->global_number);
    352
    353	return 0;
    354
    355fail:
    356	for ( ; index >= 0; index--)
    357		opal_pci_phb_mmio_enable(phb->opal_id,
    358			OPAL_M64_WINDOW_TYPE, index, OPAL_DISABLE_M64);
    359
    360	return -EIO;
    361}
    362
    363static void pnv_ioda_reserve_m64_pe(struct pci_bus *bus,
    364				    unsigned long *pe_bitmap,
    365				    bool all)
    366{
    367	struct pci_dev *pdev;
    368
    369	list_for_each_entry(pdev, &bus->devices, bus_list) {
    370		pnv_ioda_reserve_dev_m64_pe(pdev, pe_bitmap);
    371
    372		if (all && pdev->subordinate)
    373			pnv_ioda_reserve_m64_pe(pdev->subordinate,
    374						pe_bitmap, all);
    375	}
    376}
    377
    378static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all)
    379{
    380	struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
    381	struct pnv_ioda_pe *master_pe, *pe;
    382	unsigned long size, *pe_alloc;
    383	int i;
    384
    385	/* Root bus shouldn't use M64 */
    386	if (pci_is_root_bus(bus))
    387		return NULL;
    388
    389	/* Allocate bitmap */
    390	size = ALIGN(phb->ioda.total_pe_num / 8, sizeof(unsigned long));
    391	pe_alloc = kzalloc(size, GFP_KERNEL);
    392	if (!pe_alloc) {
    393		pr_warn("%s: Out of memory !\n",
    394			__func__);
    395		return NULL;
    396	}
    397
    398	/* Figure out reserved PE numbers by the PE */
    399	pnv_ioda_reserve_m64_pe(bus, pe_alloc, all);
    400
    401	/*
     402	 * The current bus might not own an M64 window at all; it may be
     403	 * contributed entirely by its child buses. In that case, we needn't
     404	 * pick an M64 dependent PE#.
    405	 */
    406	if (bitmap_empty(pe_alloc, phb->ioda.total_pe_num)) {
    407		kfree(pe_alloc);
    408		return NULL;
    409	}
    410
    411	/*
     412	 * Figure out the master PE and put all slave PEs into the master
     413	 * PE's list to form a compound PE.
    414	 */
    415	master_pe = NULL;
    416	i = -1;
    417	while ((i = find_next_bit(pe_alloc, phb->ioda.total_pe_num, i + 1)) <
    418		phb->ioda.total_pe_num) {
    419		pe = &phb->ioda.pe_array[i];
    420
    421		phb->ioda.m64_segmap[pe->pe_number] = pe->pe_number;
    422		if (!master_pe) {
    423			pe->flags |= PNV_IODA_PE_MASTER;
    424			INIT_LIST_HEAD(&pe->slaves);
    425			master_pe = pe;
    426		} else {
    427			pe->flags |= PNV_IODA_PE_SLAVE;
    428			pe->master = master_pe;
    429			list_add_tail(&pe->list, &master_pe->slaves);
    430		}
    431	}
    432
    433	kfree(pe_alloc);
    434	return master_pe;
    435}
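
/*
 * Illustration of the compound-PE construction above (segment numbers
 * assumed): if the devices below a bridge occupy M64 segments 3, 4 and 5,
 * the loop finds bit 3 first, so PE#3 becomes the master
 * (PNV_IODA_PE_MASTER) and PE#4/PE#5 are added to its ->slaves list as
 * PNV_IODA_PE_SLAVE. The caller then uses PE#3 as the PE for the bus.
 */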
    436
    437static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
    438{
    439	struct pci_controller *hose = phb->hose;
    440	struct device_node *dn = hose->dn;
    441	struct resource *res;
    442	u32 m64_range[2], i;
    443	const __be32 *r;
    444	u64 pci_addr;
    445
    446	if (phb->type != PNV_PHB_IODA1 && phb->type != PNV_PHB_IODA2) {
     447		pr_info("  M64 window not supported\n");
    448		return;
    449	}
    450
    451	if (!firmware_has_feature(FW_FEATURE_OPAL)) {
    452		pr_info("  Firmware too old to support M64 window\n");
    453		return;
    454	}
    455
    456	r = of_get_property(dn, "ibm,opal-m64-window", NULL);
    457	if (!r) {
    458		pr_info("  No <ibm,opal-m64-window> on %pOF\n",
    459			dn);
    460		return;
    461	}
    462
    463	/*
     464	 * Find the available M64 BAR range and pick the last one to cover
     465	 * the whole 64-bit space. We support only one range.
    466	 */
    467	if (of_property_read_u32_array(dn, "ibm,opal-available-m64-ranges",
    468				       m64_range, 2)) {
    469		/* In absence of the property, assume 0..15 */
    470		m64_range[0] = 0;
    471		m64_range[1] = 16;
    472	}
    473	/* We only support 64 bits in our allocator */
    474	if (m64_range[1] > 63) {
    475		pr_warn("%s: Limiting M64 range to 63 (from %d) on PHB#%x\n",
    476			__func__, m64_range[1], phb->hose->global_number);
    477		m64_range[1] = 63;
    478	}
    479	/* Empty range, no m64 */
    480	if (m64_range[1] <= m64_range[0]) {
    481		pr_warn("%s: M64 empty, disabling M64 usage on PHB#%x\n",
    482			__func__, phb->hose->global_number);
    483		return;
    484	}
    485
     486	/* Configure M64 information */
    487	res = &hose->mem_resources[1];
    488	res->name = dn->full_name;
    489	res->start = of_translate_address(dn, r + 2);
    490	res->end = res->start + of_read_number(r + 4, 2) - 1;
    491	res->flags = (IORESOURCE_MEM | IORESOURCE_MEM_64 | IORESOURCE_PREFETCH);
    492	pci_addr = of_read_number(r, 2);
    493	hose->mem_offset[1] = res->start - pci_addr;
    494
    495	phb->ioda.m64_size = resource_size(res);
    496	phb->ioda.m64_segsize = phb->ioda.m64_size / phb->ioda.total_pe_num;
    497	phb->ioda.m64_base = pci_addr;
    498
    499	/* This lines up nicely with the display from processing OF ranges */
    500	pr_info(" MEM 0x%016llx..0x%016llx -> 0x%016llx (M64 #%d..%d)\n",
    501		res->start, res->end, pci_addr, m64_range[0],
    502		m64_range[0] + m64_range[1] - 1);
    503
    504	/* Mark all M64 used up by default */
    505	phb->ioda.m64_bar_alloc = (unsigned long)-1;
    506
    507	/* Use last M64 BAR to cover M64 window */
    508	m64_range[1]--;
    509	phb->ioda.m64_bar_idx = m64_range[0] + m64_range[1];
    510
    511	pr_info(" Using M64 #%d as default window\n", phb->ioda.m64_bar_idx);
    512
    513	/* Mark remaining ones free */
    514	for (i = m64_range[0]; i < m64_range[1]; i++)
    515		clear_bit(i, &phb->ioda.m64_bar_alloc);
    516
    517	/*
     518	 * Set up the init function for M64 based on the IODA version;
     519	 * IODA3 uses the IODA2 code.
    520	 */
    521	if (phb->type == PNV_PHB_IODA1)
    522		phb->init_m64 = pnv_ioda1_init_m64;
    523	else
    524		phb->init_m64 = pnv_ioda2_init_m64;
    525}
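
/*
 * Worked example for the sizing above (numbers assumed): a 64GB M64
 * window on a PHB with 256 PEs gives
 *
 *	m64_segsize = 64GB / 256 = 256MB
 *
 * i.e. each PE number corresponds to one 256MB slice of the M64 space.
 * The last BAR of the available range is then set aside as the default
 * window (m64_bar_idx) and the remaining BARs are marked free in
 * m64_bar_alloc.
 */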
    526
    527static void pnv_ioda_freeze_pe(struct pnv_phb *phb, int pe_no)
    528{
    529	struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_no];
    530	struct pnv_ioda_pe *slave;
    531	s64 rc;
    532
    533	/* Fetch master PE */
    534	if (pe->flags & PNV_IODA_PE_SLAVE) {
    535		pe = pe->master;
    536		if (WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER)))
    537			return;
    538
    539		pe_no = pe->pe_number;
    540	}
    541
    542	/* Freeze master PE */
    543	rc = opal_pci_eeh_freeze_set(phb->opal_id,
    544				     pe_no,
    545				     OPAL_EEH_ACTION_SET_FREEZE_ALL);
    546	if (rc != OPAL_SUCCESS) {
    547		pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n",
    548			__func__, rc, phb->hose->global_number, pe_no);
    549		return;
    550	}
    551
    552	/* Freeze slave PEs */
    553	if (!(pe->flags & PNV_IODA_PE_MASTER))
    554		return;
    555
    556	list_for_each_entry(slave, &pe->slaves, list) {
    557		rc = opal_pci_eeh_freeze_set(phb->opal_id,
    558					     slave->pe_number,
    559					     OPAL_EEH_ACTION_SET_FREEZE_ALL);
    560		if (rc != OPAL_SUCCESS)
    561			pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n",
    562				__func__, rc, phb->hose->global_number,
    563				slave->pe_number);
    564	}
    565}
    566
    567static int pnv_ioda_unfreeze_pe(struct pnv_phb *phb, int pe_no, int opt)
    568{
    569	struct pnv_ioda_pe *pe, *slave;
    570	s64 rc;
    571
    572	/* Find master PE */
    573	pe = &phb->ioda.pe_array[pe_no];
    574	if (pe->flags & PNV_IODA_PE_SLAVE) {
    575		pe = pe->master;
    576		WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER));
    577		pe_no = pe->pe_number;
    578	}
    579
    580	/* Clear frozen state for master PE */
    581	rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no, opt);
    582	if (rc != OPAL_SUCCESS) {
    583		pr_warn("%s: Failure %lld clear %d on PHB#%x-PE#%x\n",
    584			__func__, rc, opt, phb->hose->global_number, pe_no);
    585		return -EIO;
    586	}
    587
    588	if (!(pe->flags & PNV_IODA_PE_MASTER))
    589		return 0;
    590
    591	/* Clear frozen state for slave PEs */
    592	list_for_each_entry(slave, &pe->slaves, list) {
    593		rc = opal_pci_eeh_freeze_clear(phb->opal_id,
    594					     slave->pe_number,
    595					     opt);
    596		if (rc != OPAL_SUCCESS) {
    597			pr_warn("%s: Failure %lld clear %d on PHB#%x-PE#%x\n",
    598				__func__, rc, opt, phb->hose->global_number,
    599				slave->pe_number);
    600			return -EIO;
    601		}
    602	}
    603
    604	return 0;
    605}
    606
    607static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no)
    608{
    609	struct pnv_ioda_pe *slave, *pe;
    610	u8 fstate = 0, state;
    611	__be16 pcierr = 0;
    612	s64 rc;
    613
    614	/* Sanity check on PE number */
    615	if (pe_no < 0 || pe_no >= phb->ioda.total_pe_num)
    616		return OPAL_EEH_STOPPED_PERM_UNAVAIL;
    617
    618	/*
     619	 * Fetch the master PE; the PE instance might not be
     620	 * initialized yet.
    621	 */
    622	pe = &phb->ioda.pe_array[pe_no];
    623	if (pe->flags & PNV_IODA_PE_SLAVE) {
    624		pe = pe->master;
    625		WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER));
    626		pe_no = pe->pe_number;
    627	}
    628
    629	/* Check the master PE */
    630	rc = opal_pci_eeh_freeze_status(phb->opal_id, pe_no,
    631					&state, &pcierr, NULL);
    632	if (rc != OPAL_SUCCESS) {
    633		pr_warn("%s: Failure %lld getting "
    634			"PHB#%x-PE#%x state\n",
    635			__func__, rc,
    636			phb->hose->global_number, pe_no);
    637		return OPAL_EEH_STOPPED_TEMP_UNAVAIL;
    638	}
    639
    640	/* Check the slave PE */
    641	if (!(pe->flags & PNV_IODA_PE_MASTER))
    642		return state;
    643
    644	list_for_each_entry(slave, &pe->slaves, list) {
    645		rc = opal_pci_eeh_freeze_status(phb->opal_id,
    646						slave->pe_number,
    647						&fstate,
    648						&pcierr,
    649						NULL);
    650		if (rc != OPAL_SUCCESS) {
    651			pr_warn("%s: Failure %lld getting "
    652				"PHB#%x-PE#%x state\n",
    653				__func__, rc,
    654				phb->hose->global_number, slave->pe_number);
    655			return OPAL_EEH_STOPPED_TEMP_UNAVAIL;
    656		}
    657
    658		/*
    659		 * Override the result based on the ascending
    660		 * priority.
    661		 */
    662		if (fstate > state)
    663			state = fstate;
    664	}
    665
    666	return state;
    667}
    668
    669struct pnv_ioda_pe *pnv_pci_bdfn_to_pe(struct pnv_phb *phb, u16 bdfn)
    670{
    671	int pe_number = phb->ioda.pe_rmap[bdfn];
    672
    673	if (pe_number == IODA_INVALID_PE)
    674		return NULL;
    675
    676	return &phb->ioda.pe_array[pe_number];
    677}
    678
    679struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
    680{
    681	struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
    682	struct pci_dn *pdn = pci_get_pdn(dev);
    683
    684	if (!pdn)
    685		return NULL;
    686	if (pdn->pe_number == IODA_INVALID_PE)
    687		return NULL;
    688	return &phb->ioda.pe_array[pdn->pe_number];
    689}
    690
    691static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
    692				  struct pnv_ioda_pe *parent,
    693				  struct pnv_ioda_pe *child,
    694				  bool is_add)
    695{
    696	const char *desc = is_add ? "adding" : "removing";
    697	uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN :
    698			      OPAL_REMOVE_PE_FROM_DOMAIN;
    699	struct pnv_ioda_pe *slave;
    700	long rc;
    701
    702	/* Parent PE affects child PE */
    703	rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
    704				child->pe_number, op);
    705	if (rc != OPAL_SUCCESS) {
    706		pe_warn(child, "OPAL error %ld %s to parent PELTV\n",
    707			rc, desc);
    708		return -ENXIO;
    709	}
    710
    711	if (!(child->flags & PNV_IODA_PE_MASTER))
    712		return 0;
    713
    714	/* Compound case: parent PE affects slave PEs */
    715	list_for_each_entry(slave, &child->slaves, list) {
    716		rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
    717					slave->pe_number, op);
    718		if (rc != OPAL_SUCCESS) {
    719			pe_warn(slave, "OPAL error %ld %s to parent PELTV\n",
    720				rc, desc);
    721			return -ENXIO;
    722		}
    723	}
    724
    725	return 0;
    726}
    727
    728static int pnv_ioda_set_peltv(struct pnv_phb *phb,
    729			      struct pnv_ioda_pe *pe,
    730			      bool is_add)
    731{
    732	struct pnv_ioda_pe *slave;
    733	struct pci_dev *pdev = NULL;
    734	int ret;
    735
    736	/*
     737	 * Clear PE frozen state. If it's a master PE, we need to
     738	 * clear the slave PEs' frozen state as well.
    739	 */
    740	if (is_add) {
    741		opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
    742					  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
    743		if (pe->flags & PNV_IODA_PE_MASTER) {
    744			list_for_each_entry(slave, &pe->slaves, list)
    745				opal_pci_eeh_freeze_clear(phb->opal_id,
    746							  slave->pe_number,
    747							  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
    748		}
    749	}
    750
    751	/*
     752	 * Associate the PE in the PELT. We need to add the PE into
     753	 * the corresponding PELT-V as well; otherwise, an error
     754	 * originating from the PE might contribute to other
     755	 * PEs.
    756	 */
    757	ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add);
    758	if (ret)
    759		return ret;
    760
    761	/* For compound PEs, any one affects all of them */
    762	if (pe->flags & PNV_IODA_PE_MASTER) {
    763		list_for_each_entry(slave, &pe->slaves, list) {
    764			ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add);
    765			if (ret)
    766				return ret;
    767		}
    768	}
    769
    770	if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
    771		pdev = pe->pbus->self;
    772	else if (pe->flags & PNV_IODA_PE_DEV)
    773		pdev = pe->pdev->bus->self;
    774#ifdef CONFIG_PCI_IOV
    775	else if (pe->flags & PNV_IODA_PE_VF)
    776		pdev = pe->parent_dev;
    777#endif /* CONFIG_PCI_IOV */
    778	while (pdev) {
    779		struct pci_dn *pdn = pci_get_pdn(pdev);
    780		struct pnv_ioda_pe *parent;
    781
    782		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
    783			parent = &phb->ioda.pe_array[pdn->pe_number];
    784			ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
    785			if (ret)
    786				return ret;
    787		}
    788
    789		pdev = pdev->bus->self;
    790	}
    791
    792	return 0;
    793}
    794
    795static void pnv_ioda_unset_peltv(struct pnv_phb *phb,
    796				 struct pnv_ioda_pe *pe,
    797				 struct pci_dev *parent)
    798{
    799	int64_t rc;
    800
    801	while (parent) {
    802		struct pci_dn *pdn = pci_get_pdn(parent);
    803
    804		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
    805			rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
    806						pe->pe_number,
    807						OPAL_REMOVE_PE_FROM_DOMAIN);
    808			/* XXX What to do in case of error ? */
    809		}
    810		parent = parent->bus->self;
    811	}
    812
    813	opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
    814				  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
    815
    816	/* Disassociate PE in PELT */
    817	rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
    818				pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
    819	if (rc)
    820		pe_warn(pe, "OPAL error %lld remove self from PELTV\n", rc);
    821}
    822
    823int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
    824{
    825	struct pci_dev *parent;
    826	uint8_t bcomp, dcomp, fcomp;
    827	int64_t rc;
    828	long rid_end, rid;
    829
     830	/* Currently, we just deconfigure VF PEs. Bus PEs will always be there. */
    831	if (pe->pbus) {
    832		int count;
    833
    834		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
    835		fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
    836		parent = pe->pbus->self;
    837		if (pe->flags & PNV_IODA_PE_BUS_ALL)
    838			count = resource_size(&pe->pbus->busn_res);
    839		else
    840			count = 1;
    841
    842		switch(count) {
    843		case  1: bcomp = OpalPciBusAll;         break;
    844		case  2: bcomp = OpalPciBus7Bits;       break;
    845		case  4: bcomp = OpalPciBus6Bits;       break;
    846		case  8: bcomp = OpalPciBus5Bits;       break;
    847		case 16: bcomp = OpalPciBus4Bits;       break;
    848		case 32: bcomp = OpalPciBus3Bits;       break;
    849		default:
    850			dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
    851			        count);
    852			/* Do an exact match only */
    853			bcomp = OpalPciBusAll;
    854		}
    855		rid_end = pe->rid + (count << 8);
    856	} else {
    857#ifdef CONFIG_PCI_IOV
    858		if (pe->flags & PNV_IODA_PE_VF)
    859			parent = pe->parent_dev;
    860		else
    861#endif
    862			parent = pe->pdev->bus->self;
    863		bcomp = OpalPciBusAll;
    864		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
    865		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
    866		rid_end = pe->rid + 1;
    867	}
    868
    869	/* Clear the reverse map */
    870	for (rid = pe->rid; rid < rid_end; rid++)
    871		phb->ioda.pe_rmap[rid] = IODA_INVALID_PE;
    872
    873	/*
     874	 * Release from all parents' PELT-V. NPUs don't have a PELTV
     875	 * table.
    876	 */
    877	if (phb->type != PNV_PHB_NPU_OCAPI)
    878		pnv_ioda_unset_peltv(phb, pe, parent);
    879
    880	rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
    881			     bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
    882	if (rc)
    883		pe_err(pe, "OPAL error %lld trying to setup PELT table\n", rc);
    884
    885	pe->pbus = NULL;
    886	pe->pdev = NULL;
    887#ifdef CONFIG_PCI_IOV
    888	pe->parent_dev = NULL;
    889#endif
    890
    891	return 0;
    892}
    893
    894int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
    895{
    896	uint8_t bcomp, dcomp, fcomp;
    897	long rc, rid_end, rid;
    898
    899	/* Bus validation ? */
    900	if (pe->pbus) {
    901		int count;
    902
    903		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
    904		fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
    905		if (pe->flags & PNV_IODA_PE_BUS_ALL)
    906			count = resource_size(&pe->pbus->busn_res);
    907		else
    908			count = 1;
    909
    910		switch(count) {
    911		case  1: bcomp = OpalPciBusAll;		break;
    912		case  2: bcomp = OpalPciBus7Bits;	break;
    913		case  4: bcomp = OpalPciBus6Bits;	break;
    914		case  8: bcomp = OpalPciBus5Bits;	break;
    915		case 16: bcomp = OpalPciBus4Bits;	break;
    916		case 32: bcomp = OpalPciBus3Bits;	break;
    917		default:
    918			dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
    919			        count);
    920			/* Do an exact match only */
    921			bcomp = OpalPciBusAll;
    922		}
    923		rid_end = pe->rid + (count << 8);
    924	} else {
    925		bcomp = OpalPciBusAll;
    926		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
    927		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
    928		rid_end = pe->rid + 1;
    929	}
    930
    931	/*
     932	 * Associate the PE in the PELT. We need to add the PE into
     933	 * the corresponding PELT-V as well; otherwise, an error
     934	 * originating from the PE might contribute to other
     935	 * PEs.
    936	 */
    937	rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
    938			     bcomp, dcomp, fcomp, OPAL_MAP_PE);
    939	if (rc) {
    940		pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
    941		return -ENXIO;
    942	}
    943
    944	/*
    945	 * Configure PELTV. NPUs don't have a PELTV table so skip
    946	 * configuration on them.
    947	 */
    948	if (phb->type != PNV_PHB_NPU_OCAPI)
    949		pnv_ioda_set_peltv(phb, pe, true);
    950
    951	/* Setup reverse map */
    952	for (rid = pe->rid; rid < rid_end; rid++)
    953		phb->ioda.pe_rmap[rid] = pe->pe_number;
    954
     955	/* Set up one MVT on IODA1 */
    956	if (phb->type != PNV_PHB_IODA1) {
    957		pe->mve_number = 0;
    958		goto out;
    959	}
    960
    961	pe->mve_number = pe->pe_number;
    962	rc = opal_pci_set_mve(phb->opal_id, pe->mve_number, pe->pe_number);
    963	if (rc != OPAL_SUCCESS) {
    964		pe_err(pe, "OPAL error %ld setting up MVE %x\n",
    965		       rc, pe->mve_number);
    966		pe->mve_number = -1;
    967	} else {
    968		rc = opal_pci_set_mve_enable(phb->opal_id,
    969					     pe->mve_number, OPAL_ENABLE_MVE);
    970		if (rc) {
    971			pe_err(pe, "OPAL error %ld enabling MVE %x\n",
    972			       rc, pe->mve_number);
    973			pe->mve_number = -1;
    974		}
    975	}
    976
    977out:
    978	return 0;
    979}
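
/*
 * Worked example for the bus-compare setup above (count assumed): a
 * PNV_IODA_PE_BUS_ALL PE spanning 16 subordinate buses gets count = 16,
 * so bcomp = OpalPciBus4Bits (only the upper 4 bits of the bus number are
 * compared when matching RIDs against this PE) and
 *
 *	rid_end = pe->rid + (16 << 8)
 *
 * so the reverse-map loop tags 16 * 256 = 4096 consecutive RIDs with this
 * PE number.
 */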
    980
    981static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
    982{
    983	struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
    984	struct pci_dn *pdn = pci_get_pdn(dev);
    985	struct pnv_ioda_pe *pe;
    986
    987	if (!pdn) {
    988		pr_err("%s: Device tree node not associated properly\n",
    989			   pci_name(dev));
    990		return NULL;
    991	}
    992	if (pdn->pe_number != IODA_INVALID_PE)
    993		return NULL;
    994
    995	pe = pnv_ioda_alloc_pe(phb, 1);
    996	if (!pe) {
    997		pr_warn("%s: Not enough PE# available, disabling device\n",
    998			pci_name(dev));
    999		return NULL;
   1000	}
   1001
   1002	/* NOTE: We don't get a reference for the pointer in the PE
    1003	 * data structure; both the device and PE structures should be
    1004	 * destroyed at the same time.
    1005	 *
    1006	 * At some point we want to remove the PDN completely anyway.
   1007	 */
   1008	pdn->pe_number = pe->pe_number;
   1009	pe->flags = PNV_IODA_PE_DEV;
   1010	pe->pdev = dev;
   1011	pe->pbus = NULL;
   1012	pe->mve_number = -1;
   1013	pe->rid = dev->bus->number << 8 | pdn->devfn;
   1014	pe->device_count++;
   1015
   1016	pe_info(pe, "Associated device to PE\n");
   1017
   1018	if (pnv_ioda_configure_pe(phb, pe)) {
   1019		/* XXX What do we do here ? */
   1020		pnv_ioda_free_pe(pe);
   1021		pdn->pe_number = IODA_INVALID_PE;
   1022		pe->pdev = NULL;
   1023		return NULL;
   1024	}
   1025
   1026	/* Put PE to the list */
   1027	mutex_lock(&phb->ioda.pe_list_mutex);
   1028	list_add_tail(&pe->list, &phb->ioda.pe_list);
   1029	mutex_unlock(&phb->ioda.pe_list_mutex);
   1030	return pe;
   1031}
   1032
   1033/*
    1034 * There are 2 types of PCI bus sensitive PEs: one that is comprised of a
    1035 * single PCI bus, and another that contains the primary PCI bus and its
    1036 * subordinate PCI devices and buses. The second type of PE is normally
    1037 * originated by a PCIe-to-PCI bridge or a PLX switch downstream port.
   1038 */
   1039static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
   1040{
   1041	struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
   1042	struct pnv_ioda_pe *pe = NULL;
   1043	unsigned int pe_num;
   1044
   1045	/*
    1046	 * In the partial hotplug case, the PE instance might still be alive.
   1047	 * We should reuse it instead of allocating a new one.
   1048	 */
   1049	pe_num = phb->ioda.pe_rmap[bus->number << 8];
   1050	if (WARN_ON(pe_num != IODA_INVALID_PE)) {
   1051		pe = &phb->ioda.pe_array[pe_num];
   1052		return NULL;
   1053	}
   1054
   1055	/* PE number for root bus should have been reserved */
   1056	if (pci_is_root_bus(bus))
   1057		pe = &phb->ioda.pe_array[phb->ioda.root_pe_idx];
   1058
   1059	/* Check if PE is determined by M64 */
   1060	if (!pe)
   1061		pe = pnv_ioda_pick_m64_pe(bus, all);
   1062
   1063	/* The PE number isn't pinned by M64 */
   1064	if (!pe)
   1065		pe = pnv_ioda_alloc_pe(phb, 1);
   1066
   1067	if (!pe) {
   1068		pr_warn("%s: Not enough PE# available for PCI bus %04x:%02x\n",
   1069			__func__, pci_domain_nr(bus), bus->number);
   1070		return NULL;
   1071	}
   1072
   1073	pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
   1074	pe->pbus = bus;
   1075	pe->pdev = NULL;
   1076	pe->mve_number = -1;
   1077	pe->rid = bus->busn_res.start << 8;
   1078
   1079	if (all)
   1080		pe_info(pe, "Secondary bus %pad..%pad associated with PE#%x\n",
   1081			&bus->busn_res.start, &bus->busn_res.end,
   1082			pe->pe_number);
   1083	else
   1084		pe_info(pe, "Secondary bus %pad associated with PE#%x\n",
   1085			&bus->busn_res.start, pe->pe_number);
   1086
   1087	if (pnv_ioda_configure_pe(phb, pe)) {
   1088		/* XXX What do we do here ? */
   1089		pnv_ioda_free_pe(pe);
   1090		pe->pbus = NULL;
   1091		return NULL;
   1092	}
   1093
   1094	/* Put PE to the list */
   1095	list_add_tail(&pe->list, &phb->ioda.pe_list);
   1096
   1097	return pe;
   1098}
   1099
   1100static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
   1101				       struct pnv_ioda_pe *pe);
   1102
   1103static void pnv_pci_ioda_dma_dev_setup(struct pci_dev *pdev)
   1104{
   1105	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
   1106	struct pci_dn *pdn = pci_get_pdn(pdev);
   1107	struct pnv_ioda_pe *pe;
   1108
   1109	/* Check if the BDFN for this device is associated with a PE yet */
   1110	pe = pnv_pci_bdfn_to_pe(phb, pdev->devfn | (pdev->bus->number << 8));
   1111	if (!pe) {
   1112		/* VF PEs should be pre-configured in pnv_pci_sriov_enable() */
   1113		if (WARN_ON(pdev->is_virtfn))
   1114			return;
   1115
   1116		pnv_pci_configure_bus(pdev->bus);
   1117		pe = pnv_pci_bdfn_to_pe(phb, pdev->devfn | (pdev->bus->number << 8));
   1118		pci_info(pdev, "Configured PE#%x\n", pe ? pe->pe_number : 0xfffff);
   1119
   1120
   1121		/*
   1122		 * If we can't setup the IODA PE something has gone horribly
   1123		 * wrong and we can't enable DMA for the device.
   1124		 */
   1125		if (WARN_ON(!pe))
   1126			return;
   1127	} else {
   1128		pci_info(pdev, "Added to existing PE#%x\n", pe->pe_number);
   1129	}
   1130
   1131	/*
   1132	 * We assume that bridges *probably* don't need to do any DMA so we can
   1133	 * skip allocating a TCE table, etc unless we get a non-bridge device.
   1134	 */
   1135	if (!pe->dma_setup_done && !pci_is_bridge(pdev)) {
   1136		switch (phb->type) {
   1137		case PNV_PHB_IODA1:
   1138			pnv_pci_ioda1_setup_dma_pe(phb, pe);
   1139			break;
   1140		case PNV_PHB_IODA2:
   1141			pnv_pci_ioda2_setup_dma_pe(phb, pe);
   1142			break;
   1143		default:
   1144			pr_warn("%s: No DMA for PHB#%x (type %d)\n",
   1145				__func__, phb->hose->global_number, phb->type);
   1146		}
   1147	}
   1148
   1149	if (pdn)
   1150		pdn->pe_number = pe->pe_number;
   1151	pe->device_count++;
   1152
   1153	WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
   1154	pdev->dev.archdata.dma_offset = pe->tce_bypass_base;
   1155	set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]);
   1156
   1157	/* PEs with a DMA weight of zero won't have a group */
   1158	if (pe->table_group.group)
   1159		iommu_add_device(&pe->table_group, &pdev->dev);
   1160}
   1161
   1162/*
   1163 * Reconfigure TVE#0 to be usable as 64-bit DMA space.
   1164 *
   1165 * The first 4GB of virtual memory for a PE is reserved for 32-bit accesses.
   1166 * Devices can only access more than that if bit 59 of the PCI address is set
   1167 * by hardware, which indicates TVE#1 should be used instead of TVE#0.
   1168 * Many PCI devices are not capable of addressing that many bits, and as a
   1169 * result are limited to the 4GB of virtual memory made available to 32-bit
   1170 * devices in TVE#0.
   1171 *
   1172 * In order to work around this, reconfigure TVE#0 to be suitable for 64-bit
   1173 * devices by configuring the virtual memory past the first 4GB inaccessible
   1174 * by 64-bit DMAs.  This should only be used by devices that want more than
   1175 * 4GB, and only on PEs that have no 32-bit devices.
   1176 *
   1177 * Currently this will only work on PHB3 (POWER8).
   1178 */
   1179static int pnv_pci_ioda_dma_64bit_bypass(struct pnv_ioda_pe *pe)
   1180{
   1181	u64 window_size, table_size, tce_count, addr;
   1182	struct page *table_pages;
   1183	u64 tce_order = 28; /* 256MB TCEs */
   1184	__be64 *tces;
   1185	s64 rc;
   1186
   1187	/*
   1188	 * Window size needs to be a power of two, but needs to account for
   1189	 * shifting memory by the 4GB offset required to skip 32bit space.
   1190	 */
   1191	window_size = roundup_pow_of_two(memory_hotplug_max() + (1ULL << 32));
   1192	tce_count = window_size >> tce_order;
   1193	table_size = tce_count << 3;
   1194
   1195	if (table_size < PAGE_SIZE)
   1196		table_size = PAGE_SIZE;
   1197
   1198	table_pages = alloc_pages_node(pe->phb->hose->node, GFP_KERNEL,
   1199				       get_order(table_size));
   1200	if (!table_pages)
   1201		goto err;
   1202
   1203	tces = page_address(table_pages);
   1204	if (!tces)
   1205		goto err;
   1206
   1207	memset(tces, 0, table_size);
   1208
   1209	for (addr = 0; addr < memory_hotplug_max(); addr += (1 << tce_order)) {
   1210		tces[(addr + (1ULL << 32)) >> tce_order] =
   1211			cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE);
   1212	}
   1213
   1214	rc = opal_pci_map_pe_dma_window(pe->phb->opal_id,
   1215					pe->pe_number,
   1216					/* reconfigure window 0 */
   1217					(pe->pe_number << 1) + 0,
   1218					1,
   1219					__pa(tces),
   1220					table_size,
   1221					1 << tce_order);
   1222	if (rc == OPAL_SUCCESS) {
   1223		pe_info(pe, "Using 64-bit DMA iommu bypass (through TVE#0)\n");
   1224		return 0;
   1225	}
   1226err:
   1227	pe_err(pe, "Error configuring 64-bit DMA bypass\n");
   1228	return -EIO;
   1229}
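
/*
 * Worked example for the table sizing above (memory size assumed): with
 * tce_order = 28 (256MB TCEs) and memory_hotplug_max() = 64GB,
 *
 *	window_size = roundup_pow_of_two(64GB + 4GB) = 128GB
 *	tce_count   = 128GB >> 28 = 512
 *	table_size  = 512 * 8 = 4KB (rounded up to PAGE_SIZE if smaller)
 *
 * The loop starts filling at index (1ULL << 32) >> 28 = 16, so physical
 * address addr becomes reachable at DMA address addr + 4GB, which is why
 * the caller sets the device's dma_offset to 1ULL << 32.
 */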
   1230
   1231static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev,
   1232		u64 dma_mask)
   1233{
   1234	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
   1235	struct pci_dn *pdn = pci_get_pdn(pdev);
   1236	struct pnv_ioda_pe *pe;
   1237
   1238	if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
   1239		return false;
   1240
   1241	pe = &phb->ioda.pe_array[pdn->pe_number];
   1242	if (pe->tce_bypass_enabled) {
   1243		u64 top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1;
   1244		if (dma_mask >= top)
   1245			return true;
   1246	}
   1247
   1248	/*
   1249	 * If the device can't set the TCE bypass bit but still wants
   1250	 * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to
   1251	 * bypass the 32-bit region and be usable for 64-bit DMAs.
   1252	 * The device needs to be able to address all of this space.
   1253	 */
   1254	if (dma_mask >> 32 &&
   1255	    dma_mask > (memory_hotplug_max() + (1ULL << 32)) &&
   1256	    /* pe->pdev should be set if it's a single device, pe->pbus if not */
   1257	    (pe->device_count == 1 || !pe->pbus) &&
   1258	    phb->model == PNV_PHB_MODEL_PHB3) {
   1259		/* Configure the bypass mode */
   1260		s64 rc = pnv_pci_ioda_dma_64bit_bypass(pe);
   1261		if (rc)
   1262			return false;
   1263		/* 4GB offset bypasses 32-bit space */
   1264		pdev->dev.archdata.dma_offset = (1ULL << 32);
   1265		return true;
   1266	}
   1267
   1268	return false;
   1269}
   1270
   1271static inline __be64 __iomem *pnv_ioda_get_inval_reg(struct pnv_phb *phb)
   1272{
   1273	return phb->regs + 0x210;
   1274}
   1275
   1276static void pnv_pci_p7ioc_tce_invalidate(struct iommu_table *tbl,
   1277		unsigned long index, unsigned long npages)
   1278{
   1279	struct iommu_table_group_link *tgl = list_first_entry_or_null(
   1280			&tbl->it_group_list, struct iommu_table_group_link,
   1281			next);
   1282	struct pnv_ioda_pe *pe = container_of(tgl->table_group,
   1283			struct pnv_ioda_pe, table_group);
   1284	__be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb);
   1285	unsigned long start, end, inc;
   1286
   1287	start = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset);
   1288	end = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset +
   1289			npages - 1);
   1290
   1291	/* p7ioc-style invalidation, 2 TCEs per write */
   1292	start |= (1ull << 63);
   1293	end |= (1ull << 63);
   1294	inc = 16;
    1295	end |= inc - 1;	/* round up end to be different than start */
    1296
    1297	mb(); /* Ensure above stores are visible */
    1298	while (start <= end) {
    1299		__raw_writeq_be(start, invalidate);
    1300		start += inc;
    1301	}
   1302
   1303	/*
   1304	 * The iommu layer will do another mb() for us on build()
   1305	 * and we don't care on free()
   1306	 */
   1307}
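
/*
 * Worked example for the invalidation loop above (index/npages assumed):
 * for index = 0 and npages = 4, start and end are the physical addresses
 * of the first and last affected 8-byte TCE entries (offsets 0 and 24
 * into the table). With inc = 16, each write to the invalidate register
 * covers two TCEs, so after end is rounded up the loop issues two writes
 * (table offsets 0 and 16), both with bit 63 ORed in as the code does.
 */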
   1308
   1309static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index,
   1310		long npages, unsigned long uaddr,
   1311		enum dma_data_direction direction,
   1312		unsigned long attrs)
   1313{
   1314	int ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
   1315			attrs);
   1316
   1317	if (!ret)
   1318		pnv_pci_p7ioc_tce_invalidate(tbl, index, npages);
   1319
   1320	return ret;
   1321}
   1322
   1323#ifdef CONFIG_IOMMU_API
   1324/* Common for IODA1 and IODA2 */
   1325static int pnv_ioda_tce_xchg_no_kill(struct iommu_table *tbl, long index,
   1326		unsigned long *hpa, enum dma_data_direction *direction)
   1327{
   1328	return pnv_tce_xchg(tbl, index, hpa, direction);
   1329}
   1330#endif
   1331
   1332static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
   1333		long npages)
   1334{
   1335	pnv_tce_free(tbl, index, npages);
   1336
   1337	pnv_pci_p7ioc_tce_invalidate(tbl, index, npages);
   1338}
   1339
   1340static struct iommu_table_ops pnv_ioda1_iommu_ops = {
   1341	.set = pnv_ioda1_tce_build,
   1342#ifdef CONFIG_IOMMU_API
   1343	.xchg_no_kill = pnv_ioda_tce_xchg_no_kill,
   1344	.tce_kill = pnv_pci_p7ioc_tce_invalidate,
   1345	.useraddrptr = pnv_tce_useraddrptr,
   1346#endif
   1347	.clear = pnv_ioda1_tce_free,
   1348	.get = pnv_tce_get,
   1349};
   1350
   1351#define PHB3_TCE_KILL_INVAL_ALL		PPC_BIT(0)
   1352#define PHB3_TCE_KILL_INVAL_PE		PPC_BIT(1)
   1353#define PHB3_TCE_KILL_INVAL_ONE		PPC_BIT(2)
   1354
   1355static inline void pnv_pci_phb3_tce_invalidate_pe(struct pnv_ioda_pe *pe)
   1356{
   1357	/* 01xb - invalidate TCEs that match the specified PE# */
   1358	__be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb);
   1359	unsigned long val = PHB3_TCE_KILL_INVAL_PE | (pe->pe_number & 0xFF);
   1360
   1361	mb(); /* Ensure above stores are visible */
   1362	__raw_writeq_be(val, invalidate);
   1363}
   1364
   1365static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe,
   1366					unsigned shift, unsigned long index,
   1367					unsigned long npages)
   1368{
   1369	__be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb);
   1370	unsigned long start, end, inc;
   1371
   1372	/* We'll invalidate DMA address in PE scope */
   1373	start = PHB3_TCE_KILL_INVAL_ONE;
   1374	start |= (pe->pe_number & 0xFF);
   1375	end = start;
   1376
   1377	/* Figure out the start, end and step */
   1378	start |= (index << shift);
   1379	end |= ((index + npages - 1) << shift);
   1380	inc = (0x1ull << shift);
   1381	mb();
   1382
   1383	while (start <= end) {
   1384		__raw_writeq_be(start, invalidate);
   1385		start += inc;
   1386	}
   1387}
   1388
   1389static inline void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe)
   1390{
   1391	struct pnv_phb *phb = pe->phb;
   1392
   1393	if (phb->model == PNV_PHB_MODEL_PHB3 && phb->regs)
   1394		pnv_pci_phb3_tce_invalidate_pe(pe);
   1395	else
   1396		opal_pci_tce_kill(phb->opal_id, OPAL_PCI_TCE_KILL_PE,
   1397				  pe->pe_number, 0, 0, 0);
   1398}
   1399
   1400static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
   1401		unsigned long index, unsigned long npages)
   1402{
   1403	struct iommu_table_group_link *tgl;
   1404
   1405	list_for_each_entry_lockless(tgl, &tbl->it_group_list, next) {
   1406		struct pnv_ioda_pe *pe = container_of(tgl->table_group,
   1407				struct pnv_ioda_pe, table_group);
   1408		struct pnv_phb *phb = pe->phb;
   1409		unsigned int shift = tbl->it_page_shift;
   1410
   1411		if (phb->model == PNV_PHB_MODEL_PHB3 && phb->regs)
   1412			pnv_pci_phb3_tce_invalidate(pe, shift,
   1413						    index, npages);
   1414		else
   1415			opal_pci_tce_kill(phb->opal_id,
   1416					  OPAL_PCI_TCE_KILL_PAGES,
   1417					  pe->pe_number, 1u << shift,
   1418					  index << shift, npages);
   1419	}
   1420}
   1421
   1422static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
   1423		long npages, unsigned long uaddr,
   1424		enum dma_data_direction direction,
   1425		unsigned long attrs)
   1426{
   1427	int ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
   1428			attrs);
   1429
   1430	if (!ret)
   1431		pnv_pci_ioda2_tce_invalidate(tbl, index, npages);
   1432
   1433	return ret;
   1434}
   1435
   1436static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
   1437		long npages)
   1438{
   1439	pnv_tce_free(tbl, index, npages);
   1440
   1441	pnv_pci_ioda2_tce_invalidate(tbl, index, npages);
   1442}
   1443
   1444static struct iommu_table_ops pnv_ioda2_iommu_ops = {
   1445	.set = pnv_ioda2_tce_build,
   1446#ifdef CONFIG_IOMMU_API
   1447	.xchg_no_kill = pnv_ioda_tce_xchg_no_kill,
   1448	.tce_kill = pnv_pci_ioda2_tce_invalidate,
   1449	.useraddrptr = pnv_tce_useraddrptr,
   1450#endif
   1451	.clear = pnv_ioda2_tce_free,
   1452	.get = pnv_tce_get,
   1453	.free = pnv_pci_ioda2_table_free_pages,
   1454};
   1455
   1456static int pnv_pci_ioda_dev_dma_weight(struct pci_dev *dev, void *data)
   1457{
   1458	unsigned int *weight = (unsigned int *)data;
   1459
   1460	/* This is quite simplistic. The "base" weight of a device
    1461	 * is 10. A weight of 0 means no DMA is accounted for the device.
   1462	 */
   1463	if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
   1464		return 0;
   1465
   1466	if (dev->class == PCI_CLASS_SERIAL_USB_UHCI ||
   1467	    dev->class == PCI_CLASS_SERIAL_USB_OHCI ||
   1468	    dev->class == PCI_CLASS_SERIAL_USB_EHCI)
   1469		*weight += 3;
   1470	else if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID)
   1471		*weight += 15;
   1472	else
   1473		*weight += 10;
   1474
   1475	return 0;
   1476}
   1477
   1478static unsigned int pnv_pci_ioda_pe_dma_weight(struct pnv_ioda_pe *pe)
   1479{
   1480	unsigned int weight = 0;
   1481
   1482	/* SRIOV VF has same DMA32 weight as its PF */
   1483#ifdef CONFIG_PCI_IOV
   1484	if ((pe->flags & PNV_IODA_PE_VF) && pe->parent_dev) {
   1485		pnv_pci_ioda_dev_dma_weight(pe->parent_dev, &weight);
   1486		return weight;
   1487	}
   1488#endif
   1489
   1490	if ((pe->flags & PNV_IODA_PE_DEV) && pe->pdev) {
   1491		pnv_pci_ioda_dev_dma_weight(pe->pdev, &weight);
   1492	} else if ((pe->flags & PNV_IODA_PE_BUS) && pe->pbus) {
   1493		struct pci_dev *pdev;
   1494
   1495		list_for_each_entry(pdev, &pe->pbus->devices, bus_list)
   1496			pnv_pci_ioda_dev_dma_weight(pdev, &weight);
   1497	} else if ((pe->flags & PNV_IODA_PE_BUS_ALL) && pe->pbus) {
   1498		pci_walk_bus(pe->pbus, pnv_pci_ioda_dev_dma_weight, &weight);
   1499	}
   1500
   1501	return weight;
   1502}
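
/*
 * Illustration of the weighting above (device mix assumed): a bus PE
 * containing one RAID controller (+15) and one ordinary NIC (+10) gets a
 * DMA weight of 25. pnv_pci_ioda1_setup_dma_pe() below then uses this to
 * size the PE's share of DMA32 segments:
 *
 *	segs = (weight * phb->ioda.dma32_count) / total_weight
 */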
   1503
   1504static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
   1505				       struct pnv_ioda_pe *pe)
   1506{
   1507
   1508	struct page *tce_mem = NULL;
   1509	struct iommu_table *tbl;
   1510	unsigned int weight, total_weight = 0;
   1511	unsigned int tce32_segsz, base, segs, avail, i;
   1512	int64_t rc;
   1513	void *addr;
   1514
   1515	/* XXX FIXME: Handle 64-bit only DMA devices */
   1516	/* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */
   1517	/* XXX FIXME: Allocate multi-level tables on PHB3 */
   1518	weight = pnv_pci_ioda_pe_dma_weight(pe);
   1519	if (!weight)
   1520		return;
   1521
   1522	pci_walk_bus(phb->hose->bus, pnv_pci_ioda_dev_dma_weight,
   1523		     &total_weight);
   1524	segs = (weight * phb->ioda.dma32_count) / total_weight;
   1525	if (!segs)
   1526		segs = 1;
   1527
   1528	/*
    1529	 * Allocate contiguous DMA32 segments. We begin with the expected
    1530	 * number of segments. On each retry, the number of DMA32 segments
    1531	 * to be allocated is decreased by one, until a single segment
    1532	 * is allocated successfully.
   1533	 */
   1534	do {
   1535		for (base = 0; base <= phb->ioda.dma32_count - segs; base++) {
   1536			for (avail = 0, i = base; i < base + segs; i++) {
   1537				if (phb->ioda.dma32_segmap[i] ==
   1538				    IODA_INVALID_PE)
   1539					avail++;
   1540			}
   1541
   1542			if (avail == segs)
   1543				goto found;
   1544		}
   1545	} while (--segs);
   1546
   1547	if (!segs) {
   1548		pe_warn(pe, "No available DMA32 segments\n");
   1549		return;
   1550	}
   1551
   1552found:
   1553	tbl = pnv_pci_table_alloc(phb->hose->node);
   1554	if (WARN_ON(!tbl))
   1555		return;
   1556
   1557	iommu_register_group(&pe->table_group, phb->hose->global_number,
   1558			pe->pe_number);
   1559	pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
   1560
   1561	/* Grab a 32-bit TCE table */
   1562	pe_info(pe, "DMA weight %d (%d), assigned (%d) %d DMA32 segments\n",
   1563		weight, total_weight, base, segs);
   1564	pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n",
   1565		base * PNV_IODA1_DMA32_SEGSIZE,
   1566		(base + segs) * PNV_IODA1_DMA32_SEGSIZE - 1);
   1567
   1568	/* XXX Currently, we allocate one big contiguous table for the
   1569	 * TCEs. We only really need one chunk per 256M of TCE space
    1570	 * (i.e. per segment), but that's an optimization for later; it
   1571	 * requires some added smarts with our get/put_tce implementation
   1572	 *
   1573	 * Each TCE page is 4KB in size and each TCE entry occupies 8
   1574	 * bytes
   1575	 */
   1576	tce32_segsz = PNV_IODA1_DMA32_SEGSIZE >> (IOMMU_PAGE_SHIFT_4K - 3);
   1577	tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL,
   1578				   get_order(tce32_segsz * segs));
   1579	if (!tce_mem) {
   1580		pe_err(pe, " Failed to allocate a 32-bit TCE memory\n");
   1581		goto fail;
   1582	}
   1583	addr = page_address(tce_mem);
   1584	memset(addr, 0, tce32_segsz * segs);
   1585
   1586	/* Configure HW */
   1587	for (i = 0; i < segs; i++) {
   1588		rc = opal_pci_map_pe_dma_window(phb->opal_id,
   1589					      pe->pe_number,
   1590					      base + i, 1,
   1591					      __pa(addr) + tce32_segsz * i,
   1592					      tce32_segsz, IOMMU_PAGE_SIZE_4K);
   1593		if (rc) {
   1594			pe_err(pe, " Failed to configure 32-bit TCE table, err %lld\n",
   1595			       rc);
   1596			goto fail;
   1597		}
   1598	}
   1599
   1600	/* Setup DMA32 segment mapping */
   1601	for (i = base; i < base + segs; i++)
   1602		phb->ioda.dma32_segmap[i] = pe->pe_number;
   1603
   1604	/* Setup linux iommu table */
   1605	pnv_pci_setup_iommu_table(tbl, addr, tce32_segsz * segs,
   1606				  base * PNV_IODA1_DMA32_SEGSIZE,
   1607				  IOMMU_PAGE_SHIFT_4K);
   1608
   1609	tbl->it_ops = &pnv_ioda1_iommu_ops;
   1610	pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift;
   1611	pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
   1612	if (!iommu_init_table(tbl, phb->hose->node, 0, 0))
   1613		panic("Failed to initialize iommu table");
   1614
   1615	pe->dma_setup_done = true;
   1616	return;
   1617 fail:
   1618	/* XXX Failure: Try to fallback to 64-bit only ? */
   1619	if (tce_mem)
   1620		__free_pages(tce_mem, get_order(tce32_segsz * segs));
   1621	if (tbl) {
   1622		pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
   1623		iommu_tce_table_put(tbl);
   1624	}
   1625}
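
/*
 * Worked example for the TCE sizing above (segs assumed): each 256MB
 * DMA32 segment needs 256MB / 4KB = 65536 TCEs of 8 bytes each, i.e.
 *
 *	tce32_segsz = 0x10000000 >> (12 - 3) = 512KB
 *
 * so a PE that is assigned segs = 2 segments allocates a contiguous 1MB
 * TCE table covering the 512MB of DMA space starting at
 * base * PNV_IODA1_DMA32_SEGSIZE.
 */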
   1626
   1627static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
   1628		int num, struct iommu_table *tbl)
   1629{
   1630	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
   1631			table_group);
   1632	struct pnv_phb *phb = pe->phb;
   1633	int64_t rc;
   1634	const unsigned long size = tbl->it_indirect_levels ?
   1635			tbl->it_level_size : tbl->it_size;
   1636	const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
   1637	const __u64 win_size = tbl->it_size << tbl->it_page_shift;
   1638
   1639	pe_info(pe, "Setting up window#%d %llx..%llx pg=%lx\n",
   1640		num, start_addr, start_addr + win_size - 1,
   1641		IOMMU_PAGE_SIZE(tbl));
   1642
   1643	/*
   1644	 * Map TCE table through TVT. The TVE index is the PE number
    1645	 * shifted by 1 bit for 32-bit DMA space.
   1646	 */
   1647	rc = opal_pci_map_pe_dma_window(phb->opal_id,
   1648			pe->pe_number,
   1649			(pe->pe_number << 1) + num,
   1650			tbl->it_indirect_levels + 1,
   1651			__pa(tbl->it_base),
   1652			size << 3,
   1653			IOMMU_PAGE_SIZE(tbl));
   1654	if (rc) {
   1655		pe_err(pe, "Failed to configure TCE table, err %lld\n", rc);
   1656		return rc;
   1657	}
   1658
   1659	pnv_pci_link_table_and_group(phb->hose->node, num,
   1660			tbl, &pe->table_group);
   1661	pnv_pci_ioda2_tce_invalidate_pe(pe);
   1662
   1663	return 0;
   1664}
   1665
   1666static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
   1667{
    1668	uint16_t window_id = (pe->pe_number << 1) + 1;
   1669	int64_t rc;
   1670
   1671	pe_info(pe, "%sabling 64-bit DMA bypass\n", enable ? "En" : "Dis");
   1672	if (enable) {
   1673		phys_addr_t top = memblock_end_of_DRAM();
   1674
   1675		top = roundup_pow_of_two(top);
   1676		rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
   1677						     pe->pe_number,
   1678						     window_id,
   1679						     pe->tce_bypass_base,
   1680						     top);
   1681	} else {
   1682		rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
   1683						     pe->pe_number,
   1684						     window_id,
   1685						     pe->tce_bypass_base,
   1686						     0);
   1687	}
   1688	if (rc)
   1689		pe_err(pe, "OPAL error %lld configuring bypass window\n", rc);
   1690	else
   1691		pe->tce_bypass_enabled = enable;
   1692}
   1693
   1694static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group,
   1695		int num, __u32 page_shift, __u64 window_size, __u32 levels,
   1696		bool alloc_userspace_copy, struct iommu_table **ptbl)
   1697{
   1698	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
   1699			table_group);
   1700	int nid = pe->phb->hose->node;
   1701	__u64 bus_offset = num ? pe->tce_bypass_base : table_group->tce32_start;
   1702	long ret;
   1703	struct iommu_table *tbl;
   1704
   1705	tbl = pnv_pci_table_alloc(nid);
   1706	if (!tbl)
   1707		return -ENOMEM;
   1708
   1709	tbl->it_ops = &pnv_ioda2_iommu_ops;
   1710
   1711	ret = pnv_pci_ioda2_table_alloc_pages(nid,
   1712			bus_offset, page_shift, window_size,
   1713			levels, alloc_userspace_copy, tbl);
   1714	if (ret) {
   1715		iommu_tce_table_put(tbl);
   1716		return ret;
   1717	}
   1718
   1719	*ptbl = tbl;
   1720
   1721	return 0;
   1722}
   1723
   1724static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
   1725{
   1726	struct iommu_table *tbl = NULL;
   1727	long rc;
   1728	unsigned long res_start, res_end;
   1729
   1730	/*
   1731	 * crashkernel= specifies the kdump kernel's maximum memory at
    1732	 * some offset and there is no guarantee that the result is a power
   1733	 * of 2, which will cause errors later.
   1734	 */
   1735	const u64 max_memory = __rounddown_pow_of_two(memory_hotplug_max());
   1736
   1737	/*
   1738	 * In memory constrained environments, e.g. kdump kernel, the
   1739	 * DMA window can be larger than available memory, which will
   1740	 * cause errors later.
   1741	 */
   1742	const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_ORDER - 1);
   1743
   1744	/*
   1745	 * We create the default window as big as we can. The constraint is
   1746	 * the max order of allocation possible. The TCE table is likely to
   1747	 * end up being multilevel and with on-demand allocation in place,
   1748	 * the initial use is not going to be huge as the default window aims
    1749	 * to support crippled devices (i.e. not fully 64-bit DMA-capable) only.
   1750	 */
   1751	/* iommu_table::it_map uses 1 bit per IOMMU page, hence 8 */
   1752	const u64 window_size = min((maxblock * 8) << PAGE_SHIFT, max_memory);
   1753	/* Each TCE level cannot exceed maxblock so go multilevel if needed */
   1754	unsigned long tces_order = ilog2(window_size >> PAGE_SHIFT);
   1755	unsigned long tcelevel_order = ilog2(maxblock >> 3);
   1756	unsigned int levels = tces_order / tcelevel_order;
   1757
   1758	if (tces_order % tcelevel_order)
   1759		levels += 1;
   1760	/*
   1761	 * We try to stick to default levels (which is >1 at the moment) in
    1762	 * order to save memory by relying on on-demand TCE level allocation.
   1763	 */
   1764	levels = max_t(unsigned int, levels, POWERNV_IOMMU_DEFAULT_LEVELS);
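        	/*
        	 * Worked example (illustrative values only): with 64K pages
        	 * (PAGE_SHIFT = 16) and MAX_ORDER = 9, maxblock = 1 << 24 (16MB),
        	 * so window_size = min(1 << 43, max_memory). For a full 8TB window,
        	 * tces_order = 27 and tcelevel_order = 21, giving levels = 2
        	 * (27 / 21 rounded up), which is then clamped by the max_t() above
        	 * against POWERNV_IOMMU_DEFAULT_LEVELS.
        	 */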
   1765
   1766	rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, PAGE_SHIFT,
   1767			window_size, levels, false, &tbl);
   1768	if (rc) {
   1769		pe_err(pe, "Failed to create 32-bit TCE table, err %ld",
   1770				rc);
   1771		return rc;
   1772	}
   1773
    1774	/* We use the top part of the 32-bit space for MMIO, so exclude it from DMA */
   1775	res_start = 0;
   1776	res_end = 0;
   1777	if (window_size > pe->phb->ioda.m32_pci_base) {
   1778		res_start = pe->phb->ioda.m32_pci_base >> tbl->it_page_shift;
   1779		res_end = min(window_size, SZ_4G) >> tbl->it_page_shift;
   1780	}
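        	/*
        	 * The reservation above means that, e.g. (illustrative values),
        	 * with m32_pci_base at 2GB and a window of 4GB or more, the IOMMU
        	 * pages covering bus addresses 2GB..4GB are held back so the DMA
        	 * window never hands out addresses that alias the MMIO space.
        	 */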
   1781
   1782	if (iommu_init_table(tbl, pe->phb->hose->node, res_start, res_end))
   1783		rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
   1784	else
   1785		rc = -ENOMEM;
   1786	if (rc) {
   1787		pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n", rc);
   1788		iommu_tce_table_put(tbl);
   1789		tbl = NULL; /* This clears iommu_table_base below */
   1790	}
   1791	if (!pnv_iommu_bypass_disabled)
   1792		pnv_pci_ioda2_set_bypass(pe, true);
   1793
   1794	/*
   1795	 * Set table base for the case of IOMMU DMA use. Usually this is done
   1796	 * from dma_dev_setup() which is not called when a device is returned
   1797	 * from VFIO so do it here.
   1798	 */
   1799	if (pe->pdev)
   1800		set_iommu_table_base(&pe->pdev->dev, tbl);
   1801
   1802	return 0;
   1803}
   1804
   1805static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
   1806		int num)
   1807{
   1808	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
   1809			table_group);
   1810	struct pnv_phb *phb = pe->phb;
   1811	long ret;
   1812
   1813	pe_info(pe, "Removing DMA window #%d\n", num);
   1814
   1815	ret = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
   1816			(pe->pe_number << 1) + num,
   1817			0/* levels */, 0/* table address */,
   1818			0/* table size */, 0/* page size */);
   1819	if (ret)
   1820		pe_warn(pe, "Unmapping failed, ret = %ld\n", ret);
   1821	else
   1822		pnv_pci_ioda2_tce_invalidate_pe(pe);
   1823
   1824	pnv_pci_unlink_table_and_group(table_group->tables[num], table_group);
   1825
   1826	return ret;
   1827}
   1828
   1829#ifdef CONFIG_IOMMU_API
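        /*
         * Estimate of the memory needed for the TCE table(s) backing a window.
         * Illustrative example: page_shift = 12 (4K IOMMU pages), window_size =
         * 4GB and levels = 1 give 2^20 TCEs of 8 bytes each, i.e. an 8MB table,
         * and the function returns 16MB to cover the HW table plus the
         * userspace copy.
         */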
   1830unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
   1831		__u64 window_size, __u32 levels)
   1832{
   1833	unsigned long bytes = 0;
   1834	const unsigned window_shift = ilog2(window_size);
   1835	unsigned entries_shift = window_shift - page_shift;
   1836	unsigned table_shift = entries_shift + 3;
   1837	unsigned long tce_table_size = max(0x1000UL, 1UL << table_shift);
   1838	unsigned long direct_table_size;
   1839
   1840	if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS) ||
   1841			!is_power_of_2(window_size))
   1842		return 0;
   1843
   1844	/* Calculate a direct table size from window_size and levels */
   1845	entries_shift = (entries_shift + levels - 1) / levels;
   1846	table_shift = entries_shift + 3;
   1847	table_shift = max_t(unsigned, table_shift, PAGE_SHIFT);
    1848	direct_table_size = 1UL << table_shift;
   1849
   1850	for ( ; levels; --levels) {
   1851		bytes += ALIGN(tce_table_size, direct_table_size);
   1852
   1853		tce_table_size /= direct_table_size;
   1854		tce_table_size <<= 3;
   1855		tce_table_size = max_t(unsigned long,
   1856				tce_table_size, direct_table_size);
   1857	}
   1858
   1859	return bytes + bytes; /* one for HW table, one for userspace copy */
   1860}
   1861
   1862static long pnv_pci_ioda2_create_table_userspace(
   1863		struct iommu_table_group *table_group,
   1864		int num, __u32 page_shift, __u64 window_size, __u32 levels,
   1865		struct iommu_table **ptbl)
   1866{
   1867	long ret = pnv_pci_ioda2_create_table(table_group,
   1868			num, page_shift, window_size, levels, true, ptbl);
   1869
   1870	if (!ret)
   1871		(*ptbl)->it_allocated_size = pnv_pci_ioda2_get_table_size(
   1872				page_shift, window_size, levels);
   1873	return ret;
   1874}
   1875
   1876static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
   1877{
   1878	struct pci_dev *dev;
   1879
   1880	list_for_each_entry(dev, &bus->devices, bus_list) {
   1881		set_iommu_table_base(&dev->dev, pe->table_group.tables[0]);
   1882		dev->dev.archdata.dma_offset = pe->tce_bypass_base;
   1883
   1884		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
   1885			pnv_ioda_setup_bus_dma(pe, dev->subordinate);
   1886	}
   1887}
   1888
   1889static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
   1890{
   1891	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
   1892						table_group);
   1893	/* Store @tbl as pnv_pci_ioda2_unset_window() resets it */
   1894	struct iommu_table *tbl = pe->table_group.tables[0];
   1895
   1896	pnv_pci_ioda2_set_bypass(pe, false);
   1897	pnv_pci_ioda2_unset_window(&pe->table_group, 0);
   1898	if (pe->pbus)
   1899		pnv_ioda_setup_bus_dma(pe, pe->pbus);
   1900	else if (pe->pdev)
   1901		set_iommu_table_base(&pe->pdev->dev, NULL);
   1902	iommu_tce_table_put(tbl);
   1903}
   1904
   1905static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
   1906{
   1907	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
   1908						table_group);
   1909
   1910	pnv_pci_ioda2_setup_default_config(pe);
   1911	if (pe->pbus)
   1912		pnv_ioda_setup_bus_dma(pe, pe->pbus);
   1913}
   1914
   1915static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
   1916	.get_table_size = pnv_pci_ioda2_get_table_size,
   1917	.create_table = pnv_pci_ioda2_create_table_userspace,
   1918	.set_window = pnv_pci_ioda2_set_window,
   1919	.unset_window = pnv_pci_ioda2_unset_window,
   1920	.take_ownership = pnv_ioda2_take_ownership,
   1921	.release_ownership = pnv_ioda2_release_ownership,
   1922};
   1923#endif
   1924
   1925void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
   1926				struct pnv_ioda_pe *pe)
   1927{
   1928	int64_t rc;
   1929
   1930	/* TVE #1 is selected by PCI address bit 59 */
   1931	pe->tce_bypass_base = 1ull << 59;
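        	/*
        	 * With the bypass window enabled, a bus address of
        	 * tce_bypass_base + P is expected to map straight to physical
        	 * address P (dma_offset is set to tce_bypass_base in
        	 * pnv_ioda_setup_bus_dma()).
        	 */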
   1932
    1933	/* The PE will reserve all possible 32-bit space */
   1934	pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n",
   1935		phb->ioda.m32_pci_base);
   1936
   1937	/* Setup linux iommu table */
   1938	pe->table_group.tce32_start = 0;
   1939	pe->table_group.tce32_size = phb->ioda.m32_pci_base;
   1940	pe->table_group.max_dynamic_windows_supported =
   1941			IOMMU_TABLE_GROUP_MAX_TABLES;
   1942	pe->table_group.max_levels = POWERNV_IOMMU_MAX_LEVELS;
   1943	pe->table_group.pgsizes = pnv_ioda_parse_tce_sizes(phb);
   1944
   1945	rc = pnv_pci_ioda2_setup_default_config(pe);
   1946	if (rc)
   1947		return;
   1948
   1949#ifdef CONFIG_IOMMU_API
   1950	pe->table_group.ops = &pnv_pci_ioda2_ops;
   1951	iommu_register_group(&pe->table_group, phb->hose->global_number,
   1952			     pe->pe_number);
   1953#endif
   1954	pe->dma_setup_done = true;
   1955}
   1956
   1957/*
   1958 * Called from KVM in real mode to EOI passthru interrupts. The ICP
   1959 * EOI is handled directly in KVM in kvmppc_deliver_irq_passthru().
   1960 *
   1961 * The IRQ data is mapped in the PCI-MSI domain and the EOI OPAL call
   1962 * needs an HW IRQ number mapped in the XICS IRQ domain. The HW IRQ
    1963 * numbers of the intermediate MSI domain are vector numbers, and that is
    1964 * good enough for OPAL. Use them.
   1965 */
   1966int64_t pnv_opal_pci_msi_eoi(struct irq_data *d)
   1967{
   1968	struct pci_controller *hose = irq_data_get_irq_chip_data(d->parent_data);
   1969	struct pnv_phb *phb = hose->private_data;
   1970
   1971	return opal_pci_msi_eoi(phb->opal_id, d->parent_data->hwirq);
   1972}
   1973
   1974/*
   1975 * The IRQ data is mapped in the XICS domain, with OPAL HW IRQ numbers
   1976 */
   1977static void pnv_ioda2_msi_eoi(struct irq_data *d)
   1978{
   1979	int64_t rc;
   1980	unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
   1981	struct pci_controller *hose = irq_data_get_irq_chip_data(d);
   1982	struct pnv_phb *phb = hose->private_data;
   1983
   1984	rc = opal_pci_msi_eoi(phb->opal_id, hw_irq);
   1985	WARN_ON_ONCE(rc);
   1986
   1987	icp_native_eoi(d);
   1988}
   1989
   1990/* P8/CXL only */
   1991void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq)
   1992{
   1993	struct irq_data *idata;
   1994	struct irq_chip *ichip;
   1995
   1996	/* The MSI EOI OPAL call is only needed on PHB3 */
   1997	if (phb->model != PNV_PHB_MODEL_PHB3)
   1998		return;
   1999
   2000	if (!phb->ioda.irq_chip_init) {
   2001		/*
    2002		 * The first time we set up an MSI IRQ, we need to set up the
    2003		 * corresponding IRQ chip so it routes correctly.
   2004		 */
   2005		idata = irq_get_irq_data(virq);
   2006		ichip = irq_data_get_irq_chip(idata);
   2007		phb->ioda.irq_chip_init = 1;
   2008		phb->ioda.irq_chip = *ichip;
   2009		phb->ioda.irq_chip.irq_eoi = pnv_ioda2_msi_eoi;
   2010	}
   2011	irq_set_chip(virq, &phb->ioda.irq_chip);
   2012	irq_set_chip_data(virq, phb->hose);
   2013}
   2014
   2015static struct irq_chip pnv_pci_msi_irq_chip;
   2016
   2017/*
   2018 * Returns true iff chip is something that we could call
   2019 * pnv_opal_pci_msi_eoi for.
   2020 */
   2021bool is_pnv_opal_msi(struct irq_chip *chip)
   2022{
   2023	return chip == &pnv_pci_msi_irq_chip;
   2024}
   2025EXPORT_SYMBOL_GPL(is_pnv_opal_msi);
   2026
   2027static int __pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
   2028				    unsigned int xive_num,
   2029				    unsigned int is_64, struct msi_msg *msg)
   2030{
   2031	struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
   2032	__be32 data;
   2033	int rc;
   2034
   2035	dev_dbg(&dev->dev, "%s: setup %s-bit MSI for vector #%d\n", __func__,
   2036		is_64 ? "64" : "32", xive_num);
   2037
   2038	/* No PE assigned ? bail out ... no MSI for you ! */
   2039	if (pe == NULL)
   2040		return -ENXIO;
   2041
   2042	/* Check if we have an MVE */
   2043	if (pe->mve_number < 0)
   2044		return -ENXIO;
   2045
   2046	/* Force 32-bit MSI on some broken devices */
   2047	if (dev->no_64bit_msi)
   2048		is_64 = 0;
   2049
   2050	/* Assign XIVE to PE */
   2051	rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num);
   2052	if (rc) {
   2053		pr_warn("%s: OPAL error %d setting XIVE %d PE\n",
   2054			pci_name(dev), rc, xive_num);
   2055		return -EIO;
   2056	}
   2057
   2058	if (is_64) {
   2059		__be64 addr64;
   2060
   2061		rc = opal_get_msi_64(phb->opal_id, pe->mve_number, xive_num, 1,
   2062				     &addr64, &data);
   2063		if (rc) {
   2064			pr_warn("%s: OPAL error %d getting 64-bit MSI data\n",
   2065				pci_name(dev), rc);
   2066			return -EIO;
   2067		}
   2068		msg->address_hi = be64_to_cpu(addr64) >> 32;
   2069		msg->address_lo = be64_to_cpu(addr64) & 0xfffffffful;
   2070	} else {
   2071		__be32 addr32;
   2072
   2073		rc = opal_get_msi_32(phb->opal_id, pe->mve_number, xive_num, 1,
   2074				     &addr32, &data);
   2075		if (rc) {
   2076			pr_warn("%s: OPAL error %d getting 32-bit MSI data\n",
   2077				pci_name(dev), rc);
   2078			return -EIO;
   2079		}
   2080		msg->address_hi = 0;
   2081		msg->address_lo = be32_to_cpu(addr32);
   2082	}
   2083	msg->data = be32_to_cpu(data);
   2084
   2085	return 0;
   2086}
   2087
   2088/*
   2089 * The msi_free() op is called before irq_domain_free_irqs_top() when
   2090 * the handler data is still available. Use that to clear the XIVE
   2091 * controller.
   2092 */
   2093static void pnv_msi_ops_msi_free(struct irq_domain *domain,
   2094				 struct msi_domain_info *info,
   2095				 unsigned int irq)
   2096{
   2097	if (xive_enabled())
   2098		xive_irq_free_data(irq);
   2099}
   2100
   2101static struct msi_domain_ops pnv_pci_msi_domain_ops = {
   2102	.msi_free	= pnv_msi_ops_msi_free,
   2103};
   2104
   2105static void pnv_msi_shutdown(struct irq_data *d)
   2106{
   2107	d = d->parent_data;
   2108	if (d->chip->irq_shutdown)
   2109		d->chip->irq_shutdown(d);
   2110}
   2111
   2112static void pnv_msi_mask(struct irq_data *d)
   2113{
   2114	pci_msi_mask_irq(d);
   2115	irq_chip_mask_parent(d);
   2116}
   2117
   2118static void pnv_msi_unmask(struct irq_data *d)
   2119{
   2120	pci_msi_unmask_irq(d);
   2121	irq_chip_unmask_parent(d);
   2122}
   2123
   2124static struct irq_chip pnv_pci_msi_irq_chip = {
   2125	.name		= "PNV-PCI-MSI",
   2126	.irq_shutdown	= pnv_msi_shutdown,
   2127	.irq_mask	= pnv_msi_mask,
   2128	.irq_unmask	= pnv_msi_unmask,
   2129	.irq_eoi	= irq_chip_eoi_parent,
   2130};
   2131
   2132static struct msi_domain_info pnv_msi_domain_info = {
   2133	.flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
   2134		  MSI_FLAG_MULTI_PCI_MSI  | MSI_FLAG_PCI_MSIX),
   2135	.ops   = &pnv_pci_msi_domain_ops,
   2136	.chip  = &pnv_pci_msi_irq_chip,
   2137};
   2138
   2139static void pnv_msi_compose_msg(struct irq_data *d, struct msi_msg *msg)
   2140{
   2141	struct msi_desc *entry = irq_data_get_msi_desc(d);
   2142	struct pci_dev *pdev = msi_desc_to_pci_dev(entry);
   2143	struct pci_controller *hose = irq_data_get_irq_chip_data(d);
   2144	struct pnv_phb *phb = hose->private_data;
   2145	int rc;
   2146
   2147	rc = __pnv_pci_ioda_msi_setup(phb, pdev, d->hwirq,
   2148				      entry->pci.msi_attrib.is_64, msg);
   2149	if (rc)
   2150		dev_err(&pdev->dev, "Failed to setup %s-bit MSI #%ld : %d\n",
   2151			entry->pci.msi_attrib.is_64 ? "64" : "32", d->hwirq, rc);
   2152}
   2153
   2154/*
   2155 * The IRQ data is mapped in the MSI domain in which HW IRQ numbers
   2156 * correspond to vector numbers.
   2157 */
   2158static void pnv_msi_eoi(struct irq_data *d)
   2159{
   2160	struct pci_controller *hose = irq_data_get_irq_chip_data(d);
   2161	struct pnv_phb *phb = hose->private_data;
   2162
   2163	if (phb->model == PNV_PHB_MODEL_PHB3) {
   2164		/*
   2165		 * The EOI OPAL call takes an OPAL HW IRQ number but
   2166		 * since it is translated into a vector number in
   2167		 * OPAL, use that directly.
   2168		 */
   2169		WARN_ON_ONCE(opal_pci_msi_eoi(phb->opal_id, d->hwirq));
   2170	}
   2171
   2172	irq_chip_eoi_parent(d);
   2173}
   2174
   2175static struct irq_chip pnv_msi_irq_chip = {
   2176	.name			= "PNV-MSI",
   2177	.irq_shutdown		= pnv_msi_shutdown,
   2178	.irq_mask		= irq_chip_mask_parent,
   2179	.irq_unmask		= irq_chip_unmask_parent,
   2180	.irq_eoi		= pnv_msi_eoi,
   2181	.irq_set_affinity	= irq_chip_set_affinity_parent,
   2182	.irq_compose_msi_msg	= pnv_msi_compose_msg,
   2183};
   2184
   2185static int pnv_irq_parent_domain_alloc(struct irq_domain *domain,
   2186				       unsigned int virq, int hwirq)
   2187{
   2188	struct irq_fwspec parent_fwspec;
   2189	int ret;
   2190
   2191	parent_fwspec.fwnode = domain->parent->fwnode;
   2192	parent_fwspec.param_count = 2;
   2193	parent_fwspec.param[0] = hwirq;
   2194	parent_fwspec.param[1] = IRQ_TYPE_EDGE_RISING;
   2195
   2196	ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &parent_fwspec);
   2197	if (ret)
   2198		return ret;
   2199
   2200	return 0;
   2201}
   2202
   2203static int pnv_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
   2204				unsigned int nr_irqs, void *arg)
   2205{
   2206	struct pci_controller *hose = domain->host_data;
   2207	struct pnv_phb *phb = hose->private_data;
   2208	msi_alloc_info_t *info = arg;
   2209	struct pci_dev *pdev = msi_desc_to_pci_dev(info->desc);
   2210	int hwirq;
   2211	int i, ret;
   2212
   2213	hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, nr_irqs);
   2214	if (hwirq < 0) {
   2215		dev_warn(&pdev->dev, "failed to find a free MSI\n");
   2216		return -ENOSPC;
   2217	}
   2218
   2219	dev_dbg(&pdev->dev, "%s bridge %pOF %d/%x #%d\n", __func__,
   2220		hose->dn, virq, hwirq, nr_irqs);
   2221
   2222	for (i = 0; i < nr_irqs; i++) {
   2223		ret = pnv_irq_parent_domain_alloc(domain, virq + i,
   2224						  phb->msi_base + hwirq + i);
   2225		if (ret)
   2226			goto out;
   2227
   2228		irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
   2229					      &pnv_msi_irq_chip, hose);
   2230	}
   2231
   2232	return 0;
   2233
   2234out:
    2235	irq_domain_free_irqs_parent(domain, virq, i);
   2236	msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, nr_irqs);
   2237	return ret;
   2238}
   2239
   2240static void pnv_irq_domain_free(struct irq_domain *domain, unsigned int virq,
   2241				unsigned int nr_irqs)
   2242{
   2243	struct irq_data *d = irq_domain_get_irq_data(domain, virq);
   2244	struct pci_controller *hose = irq_data_get_irq_chip_data(d);
   2245	struct pnv_phb *phb = hose->private_data;
   2246
   2247	pr_debug("%s bridge %pOF %d/%lx #%d\n", __func__, hose->dn,
   2248		 virq, d->hwirq, nr_irqs);
   2249
   2250	msi_bitmap_free_hwirqs(&phb->msi_bmp, d->hwirq, nr_irqs);
   2251	/* XIVE domain is cleared through ->msi_free() */
   2252}
   2253
   2254static const struct irq_domain_ops pnv_irq_domain_ops = {
   2255	.alloc  = pnv_irq_domain_alloc,
   2256	.free   = pnv_irq_domain_free,
   2257};
   2258
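        /*
         * Sketch of the IRQ domain hierarchy built below (one per PHB):
         *
         *   PCI-MSI domain (hose->msi_domain, chip "PNV-PCI-MSI")
         *     -> PNV-MSI domain (hose->dev_domain, chip "PNV-MSI")
         *       -> default host domain (typically the XICS or XIVE controller)
         */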
   2259static int __init pnv_msi_allocate_domains(struct pci_controller *hose, unsigned int count)
   2260{
   2261	struct pnv_phb *phb = hose->private_data;
   2262	struct irq_domain *parent = irq_get_default_host();
   2263
   2264	hose->fwnode = irq_domain_alloc_named_id_fwnode("PNV-MSI", phb->opal_id);
   2265	if (!hose->fwnode)
   2266		return -ENOMEM;
   2267
   2268	hose->dev_domain = irq_domain_create_hierarchy(parent, 0, count,
   2269						       hose->fwnode,
   2270						       &pnv_irq_domain_ops, hose);
   2271	if (!hose->dev_domain) {
   2272		pr_err("PCI: failed to create IRQ domain bridge %pOF (domain %d)\n",
   2273		       hose->dn, hose->global_number);
   2274		irq_domain_free_fwnode(hose->fwnode);
   2275		return -ENOMEM;
   2276	}
   2277
   2278	hose->msi_domain = pci_msi_create_irq_domain(of_node_to_fwnode(hose->dn),
   2279						     &pnv_msi_domain_info,
   2280						     hose->dev_domain);
   2281	if (!hose->msi_domain) {
   2282		pr_err("PCI: failed to create MSI IRQ domain bridge %pOF (domain %d)\n",
   2283		       hose->dn, hose->global_number);
   2284		irq_domain_free_fwnode(hose->fwnode);
   2285		irq_domain_remove(hose->dev_domain);
   2286		return -ENOMEM;
   2287	}
   2288
   2289	return 0;
   2290}
   2291
   2292static void __init pnv_pci_init_ioda_msis(struct pnv_phb *phb)
   2293{
   2294	unsigned int count;
   2295	const __be32 *prop = of_get_property(phb->hose->dn,
   2296					     "ibm,opal-msi-ranges", NULL);
   2297	if (!prop) {
   2298		/* BML Fallback */
   2299		prop = of_get_property(phb->hose->dn, "msi-ranges", NULL);
   2300	}
   2301	if (!prop)
   2302		return;
   2303
   2304	phb->msi_base = be32_to_cpup(prop);
   2305	count = be32_to_cpup(prop + 1);
   2306	if (msi_bitmap_alloc(&phb->msi_bmp, count, phb->hose->dn)) {
   2307		pr_err("PCI %d: Failed to allocate MSI bitmap !\n",
   2308		       phb->hose->global_number);
   2309		return;
   2310	}
   2311
   2312	pr_info("  Allocated bitmap for %d MSIs (base IRQ 0x%x)\n",
   2313		count, phb->msi_base);
   2314
   2315	pnv_msi_allocate_domains(phb->hose, count);
   2316}
   2317
   2318static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
   2319				  struct resource *res)
   2320{
   2321	struct pnv_phb *phb = pe->phb;
   2322	struct pci_bus_region region;
   2323	int index;
   2324	int64_t rc;
   2325
   2326	if (!res || !res->flags || res->start > res->end)
   2327		return;
   2328
   2329	if (res->flags & IORESOURCE_IO) {
   2330		region.start = res->start - phb->ioda.io_pci_base;
   2331		region.end   = res->end - phb->ioda.io_pci_base;
   2332		index = region.start / phb->ioda.io_segsize;
   2333
   2334		while (index < phb->ioda.total_pe_num &&
   2335		       region.start <= region.end) {
   2336			phb->ioda.io_segmap[index] = pe->pe_number;
   2337			rc = opal_pci_map_pe_mmio_window(phb->opal_id,
   2338				pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index);
   2339			if (rc != OPAL_SUCCESS) {
   2340				pr_err("%s: Error %lld mapping IO segment#%d to PE#%x\n",
   2341				       __func__, rc, index, pe->pe_number);
   2342				break;
   2343			}
   2344
   2345			region.start += phb->ioda.io_segsize;
   2346			index++;
   2347		}
   2348	} else if ((res->flags & IORESOURCE_MEM) &&
   2349		   !pnv_pci_is_m64(phb, res)) {
   2350		region.start = res->start -
   2351			       phb->hose->mem_offset[0] -
   2352			       phb->ioda.m32_pci_base;
   2353		region.end   = res->end -
   2354			       phb->hose->mem_offset[0] -
   2355			       phb->ioda.m32_pci_base;
   2356		index = region.start / phb->ioda.m32_segsize;
   2357
   2358		while (index < phb->ioda.total_pe_num &&
   2359		       region.start <= region.end) {
   2360			phb->ioda.m32_segmap[index] = pe->pe_number;
   2361			rc = opal_pci_map_pe_mmio_window(phb->opal_id,
   2362				pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index);
   2363			if (rc != OPAL_SUCCESS) {
    2364				pr_err("%s: Error %lld mapping M32 segment#%d to PE#%x\n",
   2365				       __func__, rc, index, pe->pe_number);
   2366				break;
   2367			}
   2368
   2369			region.start += phb->ioda.m32_segsize;
   2370			index++;
   2371		}
   2372	}
   2373}
   2374
   2375/*
    2376 * This function is supposed to be called on PEs from top to
    2377 * bottom, so the I/O or MMIO segment assigned to a parent PE
    2378 * can be overridden by its child PEs if necessary.
   2379 */
   2380static void pnv_ioda_setup_pe_seg(struct pnv_ioda_pe *pe)
   2381{
   2382	struct pci_dev *pdev;
   2383	int i;
   2384
   2385	/*
    2386	 * NOTE: We only care about PCI-bus-based PEs for now. How to
    2387	 * handle PCI-device-based PEs, for example SR-IOV VFs, should
    2388	 * be figured out later.
   2389	 */
   2390	BUG_ON(!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)));
   2391
   2392	list_for_each_entry(pdev, &pe->pbus->devices, bus_list) {
   2393		for (i = 0; i <= PCI_ROM_RESOURCE; i++)
   2394			pnv_ioda_setup_pe_res(pe, &pdev->resource[i]);
   2395
   2396		/*
   2397		 * If the PE contains all subordinate PCI buses, the
   2398		 * windows of the child bridges should be mapped to
   2399		 * the PE as well.
   2400		 */
   2401		if (!(pe->flags & PNV_IODA_PE_BUS_ALL) || !pci_is_bridge(pdev))
   2402			continue;
   2403		for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++)
   2404			pnv_ioda_setup_pe_res(pe,
   2405				&pdev->resource[PCI_BRIDGE_RESOURCES + i]);
   2406	}
   2407}
   2408
   2409#ifdef CONFIG_DEBUG_FS
   2410static int pnv_pci_diag_data_set(void *data, u64 val)
   2411{
   2412	struct pnv_phb *phb = data;
   2413	s64 ret;
   2414
   2415	/* Retrieve the diag data from firmware */
   2416	ret = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag_data,
   2417					  phb->diag_data_size);
   2418	if (ret != OPAL_SUCCESS)
   2419		return -EIO;
   2420
   2421	/* Print the diag data to the kernel log */
   2422	pnv_pci_dump_phb_diag_data(phb->hose, phb->diag_data);
   2423	return 0;
   2424}
   2425
   2426DEFINE_DEBUGFS_ATTRIBUTE(pnv_pci_diag_data_fops, NULL, pnv_pci_diag_data_set,
   2427			 "%llu\n");
   2428
   2429static int pnv_pci_ioda_pe_dump(void *data, u64 val)
   2430{
   2431	struct pnv_phb *phb = data;
   2432	int pe_num;
   2433
   2434	for (pe_num = 0; pe_num < phb->ioda.total_pe_num; pe_num++) {
   2435		struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_num];
   2436
   2437		if (!test_bit(pe_num, phb->ioda.pe_alloc))
   2438			continue;
   2439
   2440		pe_warn(pe, "rid: %04x dev count: %2d flags: %s%s%s%s%s%s\n",
   2441			pe->rid, pe->device_count,
   2442			(pe->flags & PNV_IODA_PE_DEV) ? "dev " : "",
   2443			(pe->flags & PNV_IODA_PE_BUS) ? "bus " : "",
   2444			(pe->flags & PNV_IODA_PE_BUS_ALL) ? "all " : "",
   2445			(pe->flags & PNV_IODA_PE_MASTER) ? "master " : "",
   2446			(pe->flags & PNV_IODA_PE_SLAVE) ? "slave " : "",
   2447			(pe->flags & PNV_IODA_PE_VF) ? "vf " : "");
   2448	}
   2449
   2450	return 0;
   2451}
   2452
   2453DEFINE_DEBUGFS_ATTRIBUTE(pnv_pci_ioda_pe_dump_fops, NULL,
   2454			 pnv_pci_ioda_pe_dump, "%llu\n");
   2455
   2456#endif /* CONFIG_DEBUG_FS */
   2457
   2458static void pnv_pci_ioda_create_dbgfs(void)
   2459{
   2460#ifdef CONFIG_DEBUG_FS
   2461	struct pci_controller *hose, *tmp;
   2462	struct pnv_phb *phb;
   2463	char name[16];
   2464
   2465	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
   2466		phb = hose->private_data;
   2467
   2468		sprintf(name, "PCI%04x", hose->global_number);
   2469		phb->dbgfs = debugfs_create_dir(name, arch_debugfs_dir);
   2470
   2471		debugfs_create_file_unsafe("dump_diag_regs", 0200, phb->dbgfs,
   2472					   phb, &pnv_pci_diag_data_fops);
   2473		debugfs_create_file_unsafe("dump_ioda_pe_state", 0200, phb->dbgfs,
   2474					   phb, &pnv_pci_ioda_pe_dump_fops);
   2475	}
   2476#endif /* CONFIG_DEBUG_FS */
   2477}
   2478
   2479static void pnv_pci_enable_bridge(struct pci_bus *bus)
   2480{
   2481	struct pci_dev *dev = bus->self;
   2482	struct pci_bus *child;
   2483
   2484	/* Empty bus ? bail */
   2485	if (list_empty(&bus->devices))
   2486		return;
   2487
   2488	/*
   2489	 * If there's a bridge associated with that bus enable it. This works
   2490	 * around races in the generic code if the enabling is done during
   2491	 * parallel probing. This can be removed once those races have been
   2492	 * fixed.
   2493	 */
   2494	if (dev) {
   2495		int rc = pci_enable_device(dev);
   2496		if (rc)
   2497			pci_err(dev, "Error enabling bridge (%d)\n", rc);
   2498		pci_set_master(dev);
   2499	}
   2500
    2501	/* Do the same for the child buses */
   2502	list_for_each_entry(child, &bus->children, node)
   2503		pnv_pci_enable_bridge(child);
   2504}
   2505
   2506static void pnv_pci_enable_bridges(void)
   2507{
   2508	struct pci_controller *hose;
   2509
   2510	list_for_each_entry(hose, &hose_list, list_node)
   2511		pnv_pci_enable_bridge(hose->bus);
   2512}
   2513
   2514static void pnv_pci_ioda_fixup(void)
   2515{
   2516	pnv_pci_ioda_create_dbgfs();
   2517
   2518	pnv_pci_enable_bridges();
   2519
   2520#ifdef CONFIG_EEH
   2521	pnv_eeh_post_init();
   2522#endif
   2523}
   2524
   2525/*
   2526 * Returns the alignment for I/O or memory windows for P2P
   2527 * bridges. That actually depends on how PEs are segmented.
    2528 * For now, we return the I/O or M32 segment size for PE-sensitive
   2529 * P2P bridges. Otherwise, the default values (4KiB for I/O,
   2530 * 1MiB for memory) will be returned.
   2531 *
   2532 * The current PCI bus might be put into one PE, which was
    2533 * created against the parent PCI bridge. In that case, we
   2534 * needn't enlarge the alignment so that we can save some
   2535 * resources.
   2536 */
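        /*
         * For example (illustrative segment size): with an m64_segsize of 256MB,
         * every 64-bit prefetchable bridge window is aligned up to 256MB so that
         * each bus can be given its own M64 segment, and hence its own PE.
         */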
   2537static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus,
   2538						unsigned long type)
   2539{
   2540	struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
   2541	int num_pci_bridges = 0;
   2542	struct pci_dev *bridge;
   2543
   2544	bridge = bus->self;
   2545	while (bridge) {
   2546		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) {
   2547			num_pci_bridges++;
   2548			if (num_pci_bridges >= 2)
   2549				return 1;
   2550		}
   2551
   2552		bridge = bridge->bus->self;
   2553	}
   2554
   2555	/*
   2556	 * We fall back to M32 if M64 isn't supported. We enforce the M64
    2557 * alignment for any 64-bit resource; PCIe doesn't care, and
   2558	 * bridges only do 64-bit prefetchable anyway.
   2559	 */
   2560	if (phb->ioda.m64_segsize && pnv_pci_is_m64_flags(type))
   2561		return phb->ioda.m64_segsize;
   2562	if (type & IORESOURCE_MEM)
   2563		return phb->ioda.m32_segsize;
   2564
   2565	return phb->ioda.io_segsize;
   2566}
   2567
   2568/*
    2569 * We update the root port, or the upstream port of the
    2570 * bridge behind the root port, with the PHB's windows in order
    2571 * to accommodate changes in the required resources during
    2572 * PCI (slot) hotplug. The hotplug slot is connected to either
    2573 * the root port or the downstream ports of the PCIe switch
    2574 * behind the root port.
   2575 */
   2576static void pnv_pci_fixup_bridge_resources(struct pci_bus *bus,
   2577					   unsigned long type)
   2578{
   2579	struct pci_controller *hose = pci_bus_to_host(bus);
   2580	struct pnv_phb *phb = hose->private_data;
   2581	struct pci_dev *bridge = bus->self;
   2582	struct resource *r, *w;
   2583	bool msi_region = false;
   2584	int i;
   2585
    2586	/* Check if we need to apply the fixup to the bridge's windows */
   2587	if (!pci_is_root_bus(bridge->bus) &&
   2588	    !pci_is_root_bus(bridge->bus->self->bus))
   2589		return;
   2590
   2591	/* Fixup the resources */
   2592	for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++) {
   2593		r = &bridge->resource[PCI_BRIDGE_RESOURCES + i];
   2594		if (!r->flags || !r->parent)
   2595			continue;
   2596
   2597		w = NULL;
   2598		if (r->flags & type & IORESOURCE_IO)
   2599			w = &hose->io_resource;
   2600		else if (pnv_pci_is_m64(phb, r) &&
   2601			 (type & IORESOURCE_PREFETCH) &&
   2602			 phb->ioda.m64_segsize)
   2603			w = &hose->mem_resources[1];
   2604		else if (r->flags & type & IORESOURCE_MEM) {
   2605			w = &hose->mem_resources[0];
   2606			msi_region = true;
   2607		}
   2608
   2609		r->start = w->start;
   2610		r->end = w->end;
   2611
    2612		/* The 64KB 32-bit MSI region shouldn't be included in
    2613		 * the 32-bit bridge window. Otherwise, we can see strange
    2614		 * issues, one of them being an EEH error observed on Garrison.
    2615		 *
    2616		 * Exclude the top 1MB region, which is the minimal alignment
    2617		 * of the 32-bit bridge window.
   2618		 */
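        		/*
        		 * Worked example (illustrative addresses): if the 32-bit
        		 * window reported by firmware ends at 0xbffeffff (top 64KB
        		 * already held back for MSIs), the adjustment below first
        		 * restores the true top (0xbfffffff) and then drops the top
        		 * 1MB, leaving r->end at 0xbfefffff.
        		 */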
   2619		if (msi_region) {
   2620			r->end += 0x10000;
   2621			r->end -= 0x100000;
   2622		}
   2623	}
   2624}
   2625
   2626static void pnv_pci_configure_bus(struct pci_bus *bus)
   2627{
   2628	struct pci_dev *bridge = bus->self;
   2629	struct pnv_ioda_pe *pe;
   2630	bool all = (bridge && pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE);
   2631
   2632	dev_info(&bus->dev, "Configuring PE for bus\n");
   2633
    2634	/* Don't assign a PE to a PCI bus that doesn't have subordinate devices */
   2635	if (WARN_ON(list_empty(&bus->devices)))
   2636		return;
   2637
   2638	/* Reserve PEs according to used M64 resources */
   2639	pnv_ioda_reserve_m64_pe(bus, NULL, all);
   2640
   2641	/*
   2642	 * Assign PE. We might run here because of partial hotplug.
    2643	 * In that case, we just pick up the existing PE and should
   2644	 * not allocate resources again.
   2645	 */
   2646	pe = pnv_ioda_setup_bus_PE(bus, all);
   2647	if (!pe)
   2648		return;
   2649
   2650	pnv_ioda_setup_pe_seg(pe);
   2651}
   2652
   2653static resource_size_t pnv_pci_default_alignment(void)
   2654{
   2655	return PAGE_SIZE;
   2656}
   2657
   2658/* Prevent enabling devices for which we couldn't properly
   2659 * assign a PE
   2660 */
   2661static bool pnv_pci_enable_device_hook(struct pci_dev *dev)
   2662{
   2663	struct pci_dn *pdn;
   2664
   2665	pdn = pci_get_pdn(dev);
   2666	if (!pdn || pdn->pe_number == IODA_INVALID_PE) {
   2667		pci_err(dev, "pci_enable_device() blocked, no PE assigned.\n");
   2668		return false;
   2669	}
   2670
   2671	return true;
   2672}
   2673
   2674static bool pnv_ocapi_enable_device_hook(struct pci_dev *dev)
   2675{
   2676	struct pci_dn *pdn;
   2677	struct pnv_ioda_pe *pe;
   2678
   2679	pdn = pci_get_pdn(dev);
   2680	if (!pdn)
   2681		return false;
   2682
   2683	if (pdn->pe_number == IODA_INVALID_PE) {
   2684		pe = pnv_ioda_setup_dev_PE(dev);
   2685		if (!pe)
   2686			return false;
   2687	}
   2688	return true;
   2689}
   2690
   2691static long pnv_pci_ioda1_unset_window(struct iommu_table_group *table_group,
   2692				       int num)
   2693{
   2694	struct pnv_ioda_pe *pe = container_of(table_group,
   2695					      struct pnv_ioda_pe, table_group);
   2696	struct pnv_phb *phb = pe->phb;
   2697	unsigned int idx;
   2698	long rc;
   2699
   2700	pe_info(pe, "Removing DMA window #%d\n", num);
   2701	for (idx = 0; idx < phb->ioda.dma32_count; idx++) {
   2702		if (phb->ioda.dma32_segmap[idx] != pe->pe_number)
   2703			continue;
   2704
   2705		rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
   2706						idx, 0, 0ul, 0ul, 0ul);
   2707		if (rc != OPAL_SUCCESS) {
   2708			pe_warn(pe, "Failure %ld unmapping DMA32 segment#%d\n",
   2709				rc, idx);
   2710			return rc;
   2711		}
   2712
   2713		phb->ioda.dma32_segmap[idx] = IODA_INVALID_PE;
   2714	}
   2715
   2716	pnv_pci_unlink_table_and_group(table_group->tables[num], table_group);
   2717	return OPAL_SUCCESS;
   2718}
   2719
   2720static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe)
   2721{
   2722	struct iommu_table *tbl = pe->table_group.tables[0];
   2723	int64_t rc;
   2724
   2725	if (!pe->dma_setup_done)
   2726		return;
   2727
   2728	rc = pnv_pci_ioda1_unset_window(&pe->table_group, 0);
   2729	if (rc != OPAL_SUCCESS)
   2730		return;
   2731
   2732	pnv_pci_p7ioc_tce_invalidate(tbl, tbl->it_offset, tbl->it_size);
   2733	if (pe->table_group.group) {
   2734		iommu_group_put(pe->table_group.group);
   2735		WARN_ON(pe->table_group.group);
   2736	}
   2737
   2738	free_pages(tbl->it_base, get_order(tbl->it_size << 3));
   2739	iommu_tce_table_put(tbl);
   2740}
   2741
   2742void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
   2743{
   2744	struct iommu_table *tbl = pe->table_group.tables[0];
   2745	int64_t rc;
   2746
   2747	if (!pe->dma_setup_done)
   2748		return;
   2749
   2750	rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
   2751	if (rc)
    2752		pe_warn(pe, "OPAL error %lld releasing DMA window\n", rc);
   2753
   2754	pnv_pci_ioda2_set_bypass(pe, false);
   2755	if (pe->table_group.group) {
   2756		iommu_group_put(pe->table_group.group);
   2757		WARN_ON(pe->table_group.group);
   2758	}
   2759
   2760	iommu_tce_table_put(tbl);
   2761}
   2762
   2763static void pnv_ioda_free_pe_seg(struct pnv_ioda_pe *pe,
   2764				 unsigned short win,
   2765				 unsigned int *map)
   2766{
   2767	struct pnv_phb *phb = pe->phb;
   2768	int idx;
   2769	int64_t rc;
   2770
   2771	for (idx = 0; idx < phb->ioda.total_pe_num; idx++) {
   2772		if (map[idx] != pe->pe_number)
   2773			continue;
   2774
   2775		rc = opal_pci_map_pe_mmio_window(phb->opal_id,
   2776				phb->ioda.reserved_pe_idx, win, 0, idx);
   2777
   2778		if (rc != OPAL_SUCCESS)
   2779			pe_warn(pe, "Error %lld unmapping (%d) segment#%d\n",
   2780				rc, win, idx);
   2781
   2782		map[idx] = IODA_INVALID_PE;
   2783	}
   2784}
   2785
   2786static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe)
   2787{
   2788	struct pnv_phb *phb = pe->phb;
   2789
   2790	if (phb->type == PNV_PHB_IODA1) {
   2791		pnv_ioda_free_pe_seg(pe, OPAL_IO_WINDOW_TYPE,
   2792				     phb->ioda.io_segmap);
   2793		pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE,
   2794				     phb->ioda.m32_segmap);
   2795		/* M64 is pre-configured by pnv_ioda1_init_m64() */
   2796	} else if (phb->type == PNV_PHB_IODA2) {
   2797		pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE,
   2798				     phb->ioda.m32_segmap);
   2799	}
   2800}
   2801
   2802static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe)
   2803{
   2804	struct pnv_phb *phb = pe->phb;
   2805	struct pnv_ioda_pe *slave, *tmp;
   2806
   2807	pe_info(pe, "Releasing PE\n");
   2808
   2809	mutex_lock(&phb->ioda.pe_list_mutex);
   2810	list_del(&pe->list);
   2811	mutex_unlock(&phb->ioda.pe_list_mutex);
   2812
   2813	switch (phb->type) {
   2814	case PNV_PHB_IODA1:
   2815		pnv_pci_ioda1_release_pe_dma(pe);
   2816		break;
   2817	case PNV_PHB_IODA2:
   2818		pnv_pci_ioda2_release_pe_dma(pe);
   2819		break;
   2820	case PNV_PHB_NPU_OCAPI:
   2821		break;
   2822	default:
   2823		WARN_ON(1);
   2824	}
   2825
   2826	pnv_ioda_release_pe_seg(pe);
   2827	pnv_ioda_deconfigure_pe(pe->phb, pe);
   2828
   2829	/* Release slave PEs in the compound PE */
   2830	if (pe->flags & PNV_IODA_PE_MASTER) {
   2831		list_for_each_entry_safe(slave, tmp, &pe->slaves, list) {
   2832			list_del(&slave->list);
   2833			pnv_ioda_free_pe(slave);
   2834		}
   2835	}
   2836
   2837	/*
    2838	 * The PE for the root bus can be removed because of hotplug during
    2839	 * EEH recovery for a fenced PHB error. We need to mark the PE dead so
    2840	 * that it can be populated again in the PCI hot add path. The PE
    2841	 * shouldn't be destroyed as it's the global reserved resource.
   2842	 */
   2843	if (phb->ioda.root_pe_idx == pe->pe_number)
   2844		return;
   2845
   2846	pnv_ioda_free_pe(pe);
   2847}
   2848
   2849static void pnv_pci_release_device(struct pci_dev *pdev)
   2850{
   2851	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
   2852	struct pci_dn *pdn = pci_get_pdn(pdev);
   2853	struct pnv_ioda_pe *pe;
   2854
   2855	/* The VF PE state is torn down when sriov_disable() is called */
   2856	if (pdev->is_virtfn)
   2857		return;
   2858
   2859	if (!pdn || pdn->pe_number == IODA_INVALID_PE)
   2860		return;
   2861
   2862#ifdef CONFIG_PCI_IOV
   2863	/*
    2864	 * FIXME: Try moving this to sriov_disable(). It's here because we
    2865	 * allocate the iov state at probe time, since we need to fiddle with the IOV
   2866	 * resources.
   2867	 */
   2868	if (pdev->is_physfn)
   2869		kfree(pdev->dev.archdata.iov_data);
   2870#endif
   2871
   2872	/*
   2873	 * PCI hotplug can happen as part of EEH error recovery. The @pdn
   2874	 * isn't removed and added afterwards in this scenario. We should
   2875	 * set the PE number in @pdn to an invalid one. Otherwise, the PE's
   2876	 * device count is decreased on removing devices while failing to
    2877	 * be increased on adding devices. This leads to an unbalanced PE device
    2878	 * count and eventually breaks the normal PCI hotplug path.
   2879	 */
   2880	pe = &phb->ioda.pe_array[pdn->pe_number];
   2881	pdn->pe_number = IODA_INVALID_PE;
   2882
   2883	WARN_ON(--pe->device_count < 0);
   2884	if (pe->device_count == 0)
   2885		pnv_ioda_release_pe(pe);
   2886}
   2887
   2888static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
   2889{
   2890	struct pnv_phb *phb = hose->private_data;
   2891
   2892	opal_pci_reset(phb->opal_id, OPAL_RESET_PCI_IODA_TABLE,
   2893		       OPAL_ASSERT_RESET);
   2894}
   2895
   2896static void pnv_pci_ioda_dma_bus_setup(struct pci_bus *bus)
   2897{
   2898	struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
   2899	struct pnv_ioda_pe *pe;
   2900
   2901	list_for_each_entry(pe, &phb->ioda.pe_list, list) {
   2902		if (!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)))
   2903			continue;
   2904
   2905		if (!pe->pbus)
   2906			continue;
   2907
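        		/* The upper byte of the RID is the PE's primary bus number */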
   2908		if (bus->number == ((pe->rid >> 8) & 0xFF)) {
   2909			pe->pbus = bus;
   2910			break;
   2911		}
   2912	}
   2913}
   2914
   2915static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
   2916	.dma_dev_setup		= pnv_pci_ioda_dma_dev_setup,
   2917	.dma_bus_setup		= pnv_pci_ioda_dma_bus_setup,
   2918	.iommu_bypass_supported	= pnv_pci_ioda_iommu_bypass_supported,
   2919	.enable_device_hook	= pnv_pci_enable_device_hook,
   2920	.release_device		= pnv_pci_release_device,
   2921	.window_alignment	= pnv_pci_window_alignment,
   2922	.setup_bridge		= pnv_pci_fixup_bridge_resources,
   2923	.reset_secondary_bus	= pnv_pci_reset_secondary_bus,
   2924	.shutdown		= pnv_pci_ioda_shutdown,
   2925};
   2926
   2927static const struct pci_controller_ops pnv_npu_ocapi_ioda_controller_ops = {
   2928	.enable_device_hook	= pnv_ocapi_enable_device_hook,
   2929	.release_device		= pnv_pci_release_device,
   2930	.window_alignment	= pnv_pci_window_alignment,
   2931	.reset_secondary_bus	= pnv_pci_reset_secondary_bus,
   2932	.shutdown		= pnv_pci_ioda_shutdown,
   2933};
   2934
   2935static void __init pnv_pci_init_ioda_phb(struct device_node *np,
   2936					 u64 hub_id, int ioda_type)
   2937{
   2938	struct pci_controller *hose;
   2939	struct pnv_phb *phb;
   2940	unsigned long size, m64map_off, m32map_off, pemap_off;
   2941	unsigned long iomap_off = 0, dma32map_off = 0;
   2942	struct pnv_ioda_pe *root_pe;
   2943	struct resource r;
   2944	const __be64 *prop64;
   2945	const __be32 *prop32;
   2946	int len;
   2947	unsigned int segno;
   2948	u64 phb_id;
   2949	void *aux;
   2950	long rc;
   2951
   2952	if (!of_device_is_available(np))
   2953		return;
   2954
    2955	pr_info("Initializing %s PHB (%pOF)\n", pnv_phb_names[ioda_type], np);
   2956
   2957	prop64 = of_get_property(np, "ibm,opal-phbid", NULL);
   2958	if (!prop64) {
   2959		pr_err("  Missing \"ibm,opal-phbid\" property !\n");
   2960		return;
   2961	}
   2962	phb_id = be64_to_cpup(prop64);
   2963	pr_debug("  PHB-ID  : 0x%016llx\n", phb_id);
   2964
   2965	phb = kzalloc(sizeof(*phb), GFP_KERNEL);
   2966	if (!phb)
   2967		panic("%s: Failed to allocate %zu bytes\n", __func__,
   2968		      sizeof(*phb));
   2969
   2970	/* Allocate PCI controller */
   2971	phb->hose = hose = pcibios_alloc_controller(np);
   2972	if (!phb->hose) {
   2973		pr_err("  Can't allocate PCI controller for %pOF\n",
   2974		       np);
    2975		kfree(phb);
   2976		return;
   2977	}
   2978
   2979	spin_lock_init(&phb->lock);
   2980	prop32 = of_get_property(np, "bus-range", &len);
   2981	if (prop32 && len == 8) {
   2982		hose->first_busno = be32_to_cpu(prop32[0]);
   2983		hose->last_busno = be32_to_cpu(prop32[1]);
   2984	} else {
   2985		pr_warn("  Broken <bus-range> on %pOF\n", np);
   2986		hose->first_busno = 0;
   2987		hose->last_busno = 0xff;
   2988	}
   2989	hose->private_data = phb;
   2990	phb->hub_id = hub_id;
   2991	phb->opal_id = phb_id;
   2992	phb->type = ioda_type;
   2993	mutex_init(&phb->ioda.pe_alloc_mutex);
   2994
   2995	/* Detect specific models for error handling */
   2996	if (of_device_is_compatible(np, "ibm,p7ioc-pciex"))
   2997		phb->model = PNV_PHB_MODEL_P7IOC;
   2998	else if (of_device_is_compatible(np, "ibm,power8-pciex"))
   2999		phb->model = PNV_PHB_MODEL_PHB3;
   3000	else
   3001		phb->model = PNV_PHB_MODEL_UNKNOWN;
   3002
   3003	/* Initialize diagnostic data buffer */
   3004	prop32 = of_get_property(np, "ibm,phb-diag-data-size", NULL);
   3005	if (prop32)
   3006		phb->diag_data_size = be32_to_cpup(prop32);
   3007	else
   3008		phb->diag_data_size = PNV_PCI_DIAG_BUF_SIZE;
   3009
   3010	phb->diag_data = kzalloc(phb->diag_data_size, GFP_KERNEL);
   3011	if (!phb->diag_data)
   3012		panic("%s: Failed to allocate %u bytes\n", __func__,
   3013		      phb->diag_data_size);
   3014
   3015	/* Parse 32-bit and IO ranges (if any) */
   3016	pci_process_bridge_OF_ranges(hose, np, !hose->global_number);
   3017
   3018	/* Get registers */
   3019	if (!of_address_to_resource(np, 0, &r)) {
   3020		phb->regs_phys = r.start;
   3021		phb->regs = ioremap(r.start, resource_size(&r));
   3022		if (phb->regs == NULL)
   3023			pr_err("  Failed to map registers !\n");
   3024	}
   3025
   3026	/* Initialize more IODA stuff */
   3027	phb->ioda.total_pe_num = 1;
   3028	prop32 = of_get_property(np, "ibm,opal-num-pes", NULL);
   3029	if (prop32)
   3030		phb->ioda.total_pe_num = be32_to_cpup(prop32);
   3031	prop32 = of_get_property(np, "ibm,opal-reserved-pe", NULL);
   3032	if (prop32)
   3033		phb->ioda.reserved_pe_idx = be32_to_cpup(prop32);
   3034
   3035	/* Invalidate RID to PE# mapping */
   3036	for (segno = 0; segno < ARRAY_SIZE(phb->ioda.pe_rmap); segno++)
   3037		phb->ioda.pe_rmap[segno] = IODA_INVALID_PE;
   3038
   3039	/* Parse 64-bit MMIO range */
   3040	pnv_ioda_parse_m64_window(phb);
   3041
   3042	phb->ioda.m32_size = resource_size(&hose->mem_resources[0]);
    3043	/* FW has already taken the top 64k of M32 space (MSI space) off; add it back */
   3044	phb->ioda.m32_size += 0x10000;
   3045
   3046	phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe_num;
   3047	phb->ioda.m32_pci_base = hose->mem_resources[0].start - hose->mem_offset[0];
   3048	phb->ioda.io_size = hose->pci_io_size;
   3049	phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe_num;
   3050	phb->ioda.io_pci_base = 0; /* XXX calculate this ? */
   3051
   3052	/* Calculate how many 32-bit TCE segments we have */
   3053	phb->ioda.dma32_count = phb->ioda.m32_pci_base /
   3054				PNV_IODA1_DMA32_SEGSIZE;
   3055
   3056	/* Allocate aux data & arrays. We don't have IO ports on PHB3 */
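        	/*
        	 * The single 'aux' allocation below is carved up as:
        	 *   [pe_alloc bitmap][m64_segmap][m32_segmap]
        	 *   [io_segmap][dma32_segmap]   (IODA1 only)
        	 *   [pe_array]
        	 */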
   3057	size = ALIGN(max_t(unsigned, phb->ioda.total_pe_num, 8) / 8,
   3058			sizeof(unsigned long));
   3059	m64map_off = size;
   3060	size += phb->ioda.total_pe_num * sizeof(phb->ioda.m64_segmap[0]);
   3061	m32map_off = size;
   3062	size += phb->ioda.total_pe_num * sizeof(phb->ioda.m32_segmap[0]);
   3063	if (phb->type == PNV_PHB_IODA1) {
   3064		iomap_off = size;
   3065		size += phb->ioda.total_pe_num * sizeof(phb->ioda.io_segmap[0]);
   3066		dma32map_off = size;
   3067		size += phb->ioda.dma32_count *
   3068			sizeof(phb->ioda.dma32_segmap[0]);
   3069	}
   3070	pemap_off = size;
   3071	size += phb->ioda.total_pe_num * sizeof(struct pnv_ioda_pe);
   3072	aux = kzalloc(size, GFP_KERNEL);
   3073	if (!aux)
   3074		panic("%s: Failed to allocate %lu bytes\n", __func__, size);
   3075
   3076	phb->ioda.pe_alloc = aux;
   3077	phb->ioda.m64_segmap = aux + m64map_off;
   3078	phb->ioda.m32_segmap = aux + m32map_off;
   3079	for (segno = 0; segno < phb->ioda.total_pe_num; segno++) {
   3080		phb->ioda.m64_segmap[segno] = IODA_INVALID_PE;
   3081		phb->ioda.m32_segmap[segno] = IODA_INVALID_PE;
   3082	}
   3083	if (phb->type == PNV_PHB_IODA1) {
   3084		phb->ioda.io_segmap = aux + iomap_off;
   3085		for (segno = 0; segno < phb->ioda.total_pe_num; segno++)
   3086			phb->ioda.io_segmap[segno] = IODA_INVALID_PE;
   3087
   3088		phb->ioda.dma32_segmap = aux + dma32map_off;
   3089		for (segno = 0; segno < phb->ioda.dma32_count; segno++)
   3090			phb->ioda.dma32_segmap[segno] = IODA_INVALID_PE;
   3091	}
   3092	phb->ioda.pe_array = aux + pemap_off;
   3093
   3094	/*
    3095	 * Choose a PE number for the root bus, which shouldn't have
    3096	 * M64 resources consumed by its child devices. Try to pick
    3097	 * the PE number adjacent to the reserved one if possible.
   3098	 */
   3099	pnv_ioda_reserve_pe(phb, phb->ioda.reserved_pe_idx);
   3100	if (phb->ioda.reserved_pe_idx == 0) {
   3101		phb->ioda.root_pe_idx = 1;
   3102		pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx);
   3103	} else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1)) {
   3104		phb->ioda.root_pe_idx = phb->ioda.reserved_pe_idx - 1;
   3105		pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx);
   3106	} else {
   3107		/* otherwise just allocate one */
   3108		root_pe = pnv_ioda_alloc_pe(phb, 1);
   3109		phb->ioda.root_pe_idx = root_pe->pe_number;
   3110	}
   3111
   3112	INIT_LIST_HEAD(&phb->ioda.pe_list);
   3113	mutex_init(&phb->ioda.pe_list_mutex);
   3114
   3115	/* Calculate how many 32-bit TCE segments we have */
   3116	phb->ioda.dma32_count = phb->ioda.m32_pci_base /
   3117				PNV_IODA1_DMA32_SEGSIZE;
   3118
   3119#if 0 /* We should really do that ... */
   3120	rc = opal_pci_set_phb_mem_window(opal->phb_id,
   3121					 window_type,
   3122					 window_num,
   3123					 starting_real_address,
   3124					 starting_pci_address,
   3125					 segment_size);
   3126#endif
   3127
   3128	pr_info("  %03d (%03d) PE's M32: 0x%x [segment=0x%x]\n",
   3129		phb->ioda.total_pe_num, phb->ioda.reserved_pe_idx,
   3130		phb->ioda.m32_size, phb->ioda.m32_segsize);
   3131	if (phb->ioda.m64_size)
   3132		pr_info("                 M64: 0x%lx [segment=0x%lx]\n",
   3133			phb->ioda.m64_size, phb->ioda.m64_segsize);
   3134	if (phb->ioda.io_size)
   3135		pr_info("                  IO: 0x%x [segment=0x%x]\n",
   3136			phb->ioda.io_size, phb->ioda.io_segsize);
   3137
   3138
   3139	phb->hose->ops = &pnv_pci_ops;
   3140	phb->get_pe_state = pnv_ioda_get_pe_state;
   3141	phb->freeze_pe = pnv_ioda_freeze_pe;
   3142	phb->unfreeze_pe = pnv_ioda_unfreeze_pe;
   3143
   3144	/* Setup MSI support */
   3145	pnv_pci_init_ioda_msis(phb);
   3146
   3147	/*
   3148	 * We pass the PCI probe flag PCI_REASSIGN_ALL_RSRC here
    3149	 * to let the PCI core do resource assignment. It's expected
    3150	 * that the PCI core will do correct I/O and MMIO alignment
    3151	 * for the P2P bridge BARs so that each PCI bus (excluding
    3152	 * the child P2P bridges) can form an individual PE.
   3153	 */
   3154	ppc_md.pcibios_fixup = pnv_pci_ioda_fixup;
   3155
   3156	switch (phb->type) {
   3157	case PNV_PHB_NPU_OCAPI:
   3158		hose->controller_ops = pnv_npu_ocapi_ioda_controller_ops;
   3159		break;
   3160	default:
   3161		hose->controller_ops = pnv_pci_ioda_controller_ops;
   3162	}
   3163
   3164	ppc_md.pcibios_default_alignment = pnv_pci_default_alignment;
   3165
   3166#ifdef CONFIG_PCI_IOV
   3167	ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov;
   3168	ppc_md.pcibios_iov_resource_alignment = pnv_pci_iov_resource_alignment;
   3169	ppc_md.pcibios_sriov_enable = pnv_pcibios_sriov_enable;
   3170	ppc_md.pcibios_sriov_disable = pnv_pcibios_sriov_disable;
   3171#endif
   3172
   3173	pci_add_flags(PCI_REASSIGN_ALL_RSRC);
   3174
   3175	/* Reset IODA tables to a clean state */
   3176	rc = opal_pci_reset(phb_id, OPAL_RESET_PCI_IODA_TABLE, OPAL_ASSERT_RESET);
   3177	if (rc)
   3178		pr_warn("  OPAL Error %ld performing IODA table reset !\n", rc);
   3179
   3180	/*
    3181	 * If we're running in a kdump kernel, the previous kernel never
    3182	 * shut down PCI devices correctly. We already have the IODA table
    3183	 * cleaned out, so we have to issue a PHB reset to stop all PCI
    3184	 * transactions from the previous kernel. The ppc_pci_reset_phbs
   3185	 * kernel parameter will force this reset too. Additionally,
   3186	 * if the IODA reset above failed then use a bigger hammer.
   3187	 * This can happen if we get a PHB fatal error in very early
   3188	 * boot.
   3189	 */
   3190	if (is_kdump_kernel() || pci_reset_phbs || rc) {
   3191		pr_info("  Issue PHB reset ...\n");
   3192		pnv_eeh_phb_reset(hose, EEH_RESET_FUNDAMENTAL);
   3193		pnv_eeh_phb_reset(hose, EEH_RESET_DEACTIVATE);
   3194	}
   3195
   3196	/* Remove M64 resource if we can't configure it successfully */
   3197	if (!phb->init_m64 || phb->init_m64(phb))
   3198		hose->mem_resources[1].flags = 0;
   3199
   3200	/* create pci_dn's for DT nodes under this PHB */
   3201	pci_devs_phb_init_dynamic(hose);
   3202}
   3203
   3204void __init pnv_pci_init_ioda2_phb(struct device_node *np)
   3205{
   3206	pnv_pci_init_ioda_phb(np, 0, PNV_PHB_IODA2);
   3207}
   3208
   3209void __init pnv_pci_init_npu2_opencapi_phb(struct device_node *np)
   3210{
   3211	pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU_OCAPI);
   3212}
   3213
   3214static void pnv_npu2_opencapi_cfg_size_fixup(struct pci_dev *dev)
   3215{
   3216	struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
   3217
   3218	if (!machine_is(powernv))
   3219		return;
   3220
   3221	if (phb->type == PNV_PHB_NPU_OCAPI)
   3222		dev->cfg_size = PCI_CFG_SPACE_EXP_SIZE;
   3223}
   3224DECLARE_PCI_FIXUP_EARLY(PCI_ANY_ID, PCI_ANY_ID, pnv_npu2_opencapi_cfg_size_fixup);
   3225
   3226void __init pnv_pci_init_ioda_hub(struct device_node *np)
   3227{
   3228	struct device_node *phbn;
   3229	const __be64 *prop64;
   3230	u64 hub_id;
   3231
   3232	pr_info("Probing IODA IO-Hub %pOF\n", np);
   3233
   3234	prop64 = of_get_property(np, "ibm,opal-hubid", NULL);
   3235	if (!prop64) {
   3236		pr_err(" Missing \"ibm,opal-hubid\" property !\n");
   3237		return;
   3238	}
   3239	hub_id = be64_to_cpup(prop64);
   3240	pr_devel(" HUB-ID : 0x%016llx\n", hub_id);
   3241
   3242	/* Count child PHBs */
   3243	for_each_child_of_node(np, phbn) {
   3244		/* Look for IODA1 PHBs */
   3245		if (of_device_is_compatible(phbn, "ibm,ioda-phb"))
   3246			pnv_pci_init_ioda_phb(phbn, hub_id, PNV_PHB_IODA1);
   3247	}
   3248}