cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

vmd.c (26883B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Volume Management Device driver
      4 * Copyright (c) 2015, Intel Corporation.
      5 */
      6
      7#include <linux/device.h>
      8#include <linux/interrupt.h>
      9#include <linux/irq.h>
     10#include <linux/kernel.h>
     11#include <linux/module.h>
     12#include <linux/msi.h>
     13#include <linux/pci.h>
     14#include <linux/pci-acpi.h>
     15#include <linux/pci-ecam.h>
     16#include <linux/srcu.h>
     17#include <linux/rculist.h>
     18#include <linux/rcupdate.h>
     19
     20#include <asm/irqdomain.h>
     21
     22#define VMD_CFGBAR	0
     23#define VMD_MEMBAR1	2
     24#define VMD_MEMBAR2	4
     25
     26#define PCI_REG_VMCAP		0x40
     27#define BUS_RESTRICT_CAP(vmcap)	(vmcap & 0x1)
     28#define PCI_REG_VMCONFIG	0x44
     29#define BUS_RESTRICT_CFG(vmcfg)	((vmcfg >> 8) & 0x3)
     30#define VMCONFIG_MSI_REMAP	0x2
     31#define PCI_REG_VMLOCK		0x70
     32#define MB2_SHADOW_EN(vmlock)	(vmlock & 0x2)
     33
     34#define MB2_SHADOW_OFFSET	0x2000
     35#define MB2_SHADOW_SIZE		16
     36
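/*
 * Editor's note (annotation, not part of the original file): a quick decode
 * of the vendor-specific registers used above, following the macros
 * themselves.  Assuming a hypothetical part reporting VMCAP = 0x0001 and
 * VMCONFIG = 0x0100:
 *
 *   BUS_RESTRICT_CAP(0x0001) = 1   -> bus-number restrictions are supported
 *   BUS_RESTRICT_CFG(0x0100) = 1   -> child buses start at 128 (see
 *                                     vmd_get_bus_number_start())
 *   VMCONFIG bit 1 (VMCONFIG_MSI_REMAP) set -> MSI-X remapping is disabled,
 *                                     i.e. bypass mode (see
 *                                     vmd_set_msi_remapping())
 *   VMLOCK bit 1 (MB2_SHADOW_EN)   -> the physical-address shadow registers
 *                                     at MEMBAR2 + MB2_SHADOW_OFFSET are valid
 */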
     37enum vmd_features {
     38	/*
     39	 * Device may contain registers which hint at the physical location of the
     40	 * membars, in order to allow proper address translation during
     41	 * resource assignment to enable guest virtualization
     42	 */
     43	VMD_FEAT_HAS_MEMBAR_SHADOW		= (1 << 0),
     44
     45	/*
     46	 * Device may provide root port configuration information which limits
     47	 * bus numbering
     48	 */
     49	VMD_FEAT_HAS_BUS_RESTRICTIONS		= (1 << 1),
     50
     51	/*
     52	 * Device contains physical location shadow registers in
     53	 * vendor-specific capability space
     54	 */
     55	VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP	= (1 << 2),
     56
     57	/*
     58	 * Device may use MSI-X vector 0 for software triggering, so vector 0
     59	 * will not be used for MSI remapping
     60	 */
     61	VMD_FEAT_OFFSET_FIRST_VECTOR		= (1 << 3),
     62
     63	/*
     64	 * Device can bypass remapping MSI-X transactions into its MSI-X table,
     65	 * avoiding the requirement of a VMD MSI domain for child device
     66	 * interrupt handling.
     67	 */
     68	VMD_FEAT_CAN_BYPASS_MSI_REMAP		= (1 << 4),
     69};
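/*
 * Editor's note: these feature bits are not probed from the hardware at
 * runtime; each entry in the vmd_ids[] table at the bottom of this file
 * carries its applicable set in driver_data, and vmd_probe() casts
 * id->driver_data back to an "unsigned long features" mask that is passed
 * to vmd_enable_domain().
 */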
     70
     71static DEFINE_IDA(vmd_instance_ida);
     72
     73/*
     74 * Lock for manipulating VMD IRQ lists.
     75 */
     76static DEFINE_RAW_SPINLOCK(list_lock);
     77
     78/**
     79 * struct vmd_irq - private data to map driver IRQ to the VMD shared vector
     80 * @node:	list item for parent traversal.
     81 * @irq:	back pointer to parent.
     82 * @enabled:	true if driver enabled IRQ
     83 * @virq:	the virtual IRQ value provided to the requesting driver.
     84 *
     85 * Every MSI/MSI-X IRQ requested for a device in a VMD domain will be mapped to
     86 * a VMD IRQ using this structure.
     87 */
     88struct vmd_irq {
     89	struct list_head	node;
     90	struct vmd_irq_list	*irq;
     91	bool			enabled;
     92	unsigned int		virq;
     93};
     94
     95/**
     96 * struct vmd_irq_list - list of driver requested IRQs mapping to a VMD vector
     97 * @irq_list:	the list of child IRQs the VMD vector demuxes to.
     98 * @srcu:	SRCU struct for local synchronization.
     99 * @count:	number of child IRQs assigned to this vector; used to track
    100 *		sharing.
    101 * @virq:	The underlying VMD Linux interrupt number
    102 */
    103struct vmd_irq_list {
    104	struct list_head	irq_list;
    105	struct srcu_struct	srcu;
    106	unsigned int		count;
    107	unsigned int		virq;
    108};
    109
    110struct vmd_dev {
    111	struct pci_dev		*dev;
    112
    113	spinlock_t		cfg_lock;
    114	void __iomem		*cfgbar;
    115
    116	int msix_count;
    117	struct vmd_irq_list	*irqs;
    118
    119	struct pci_sysdata	sysdata;
    120	struct resource		resources[3];
    121	struct irq_domain	*irq_domain;
    122	struct pci_bus		*bus;
    123	u8			busn_start;
    124	u8			first_vec;
    125	char			*name;
    126	int			instance;
    127};
    128
    129static inline struct vmd_dev *vmd_from_bus(struct pci_bus *bus)
    130{
    131	return container_of(bus->sysdata, struct vmd_dev, sysdata);
    132}
    133
    134static inline unsigned int index_from_irqs(struct vmd_dev *vmd,
    135					   struct vmd_irq_list *irqs)
    136{
    137	return irqs - vmd->irqs;
    138}
    139
    140/*
    141 * Drivers managing a device in a VMD domain allocate their own IRQs as before,
    142 * but the MSI entry for the hardware it's driving will be programmed with a
    143 * destination ID for the VMD MSI-X table.  The VMD muxes interrupts in its
    144 * domain into one of its own, and the VMD driver de-muxes these for the
    145 * handlers sharing that VMD IRQ.  The vmd irq_domain provides the operations
    146 * and irq_chip to set this up.
    147 */
    148static void vmd_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
    149{
    150	struct vmd_irq *vmdirq = data->chip_data;
    151	struct vmd_irq_list *irq = vmdirq->irq;
    152	struct vmd_dev *vmd = irq_data_get_irq_handler_data(data);
    153
    154	memset(msg, 0, sizeof(*msg));
    155	msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH;
    156	msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW;
    157	msg->arch_addr_lo.destid_0_7 = index_from_irqs(vmd, irq);
    158}
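/*
 * Editor's note: the "destination ID" composed above is not an APIC ID.  For
 * a device behind VMD it is the index of the VMD MSI-X vector picked by
 * vmd_next_irq(), so the child's MSI writes are steered into that VMD vector
 * and vmd_irq() later demultiplexes them back to the child's virq.  For
 * example, a child attached to vmd->irqs[3] gets destid_0_7 = 3.
 */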
    159
    160/*
    161 * We rely on MSI_FLAG_USE_DEF_CHIP_OPS to set the IRQ mask/unmask ops.
    162 */
    163static void vmd_irq_enable(struct irq_data *data)
    164{
    165	struct vmd_irq *vmdirq = data->chip_data;
    166	unsigned long flags;
    167
    168	raw_spin_lock_irqsave(&list_lock, flags);
    169	WARN_ON(vmdirq->enabled);
    170	list_add_tail_rcu(&vmdirq->node, &vmdirq->irq->irq_list);
    171	vmdirq->enabled = true;
    172	raw_spin_unlock_irqrestore(&list_lock, flags);
    173
    174	data->chip->irq_unmask(data);
    175}
    176
    177static void vmd_irq_disable(struct irq_data *data)
    178{
    179	struct vmd_irq *vmdirq = data->chip_data;
    180	unsigned long flags;
    181
    182	data->chip->irq_mask(data);
    183
    184	raw_spin_lock_irqsave(&list_lock, flags);
    185	if (vmdirq->enabled) {
    186		list_del_rcu(&vmdirq->node);
    187		vmdirq->enabled = false;
    188	}
    189	raw_spin_unlock_irqrestore(&list_lock, flags);
    190}
    191
    192/*
    193 * XXX: Stubbed until we develop an acceptable way to not create conflicts with
    194 * other devices sharing the same vector.
    195 */
    196static int vmd_irq_set_affinity(struct irq_data *data,
    197				const struct cpumask *dest, bool force)
    198{
    199	return -EINVAL;
    200}
    201
    202static struct irq_chip vmd_msi_controller = {
    203	.name			= "VMD-MSI",
    204	.irq_enable		= vmd_irq_enable,
    205	.irq_disable		= vmd_irq_disable,
    206	.irq_compose_msi_msg	= vmd_compose_msi_msg,
    207	.irq_set_affinity	= vmd_irq_set_affinity,
    208};
    209
    210static irq_hw_number_t vmd_get_hwirq(struct msi_domain_info *info,
    211				     msi_alloc_info_t *arg)
    212{
    213	return 0;
    214}
    215
    216/*
    217 * XXX: We can be even smarter selecting the best IRQ once we solve the
    218 * affinity problem.
    219 */
    220static struct vmd_irq_list *vmd_next_irq(struct vmd_dev *vmd, struct msi_desc *desc)
    221{
    222	unsigned long flags;
    223	int i, best;
    224
    225	if (vmd->msix_count == 1 + vmd->first_vec)
    226		return &vmd->irqs[vmd->first_vec];
    227
    228	/*
    229	 * White list for fast-interrupt handlers. All others will share the
    230	 * "slow" interrupt vector.
    231	 */
    232	switch (msi_desc_to_pci_dev(desc)->class) {
    233	case PCI_CLASS_STORAGE_EXPRESS:
    234		break;
    235	default:
    236		return &vmd->irqs[vmd->first_vec];
    237	}
    238
    239	raw_spin_lock_irqsave(&list_lock, flags);
    240	best = vmd->first_vec + 1;
    241	for (i = best; i < vmd->msix_count; i++)
    242		if (vmd->irqs[i].count < vmd->irqs[best].count)
    243			best = i;
    244	vmd->irqs[best].count++;
    245	raw_spin_unlock_irqrestore(&list_lock, flags);
    246
    247	return &vmd->irqs[best];
    248}
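/*
 * Editor's note: the policy above is two-tier.  NVMe devices
 * (PCI_CLASS_STORAGE_EXPRESS) are spread over the vectors after first_vec by
 * picking the one with the lowest sharing count, while every other device
 * class shares vmd->irqs[first_vec], the "slow" vector.  With a hypothetical
 * msix_count of 4 and first_vec of 1, NVMe interrupts are balanced across
 * irqs[2] and irqs[3] and everything else lands on irqs[1].
 */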
    249
    250static int vmd_msi_init(struct irq_domain *domain, struct msi_domain_info *info,
    251			unsigned int virq, irq_hw_number_t hwirq,
    252			msi_alloc_info_t *arg)
    253{
    254	struct msi_desc *desc = arg->desc;
    255	struct vmd_dev *vmd = vmd_from_bus(msi_desc_to_pci_dev(desc)->bus);
    256	struct vmd_irq *vmdirq = kzalloc(sizeof(*vmdirq), GFP_KERNEL);
    257
    258	if (!vmdirq)
    259		return -ENOMEM;
    260
    261	INIT_LIST_HEAD(&vmdirq->node);
    262	vmdirq->irq = vmd_next_irq(vmd, desc);
    263	vmdirq->virq = virq;
    264
    265	irq_domain_set_info(domain, virq, vmdirq->irq->virq, info->chip, vmdirq,
    266			    handle_untracked_irq, vmd, NULL);
    267	return 0;
    268}
    269
    270static void vmd_msi_free(struct irq_domain *domain,
    271			struct msi_domain_info *info, unsigned int virq)
    272{
    273	struct vmd_irq *vmdirq = irq_get_chip_data(virq);
    274	unsigned long flags;
    275
    276	synchronize_srcu(&vmdirq->irq->srcu);
    277
    278	/* XXX: Potential optimization to rebalance */
    279	raw_spin_lock_irqsave(&list_lock, flags);
    280	vmdirq->irq->count--;
    281	raw_spin_unlock_irqrestore(&list_lock, flags);
    282
    283	kfree(vmdirq);
    284}
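/*
 * Editor's note: synchronize_srcu() above pairs with the SRCU read-side
 * section in vmd_irq().  The vmd_irq entry was already unlinked RCU-safely
 * in vmd_irq_disable(), so waiting out the grace period guarantees that no
 * CPU can still be walking the list with a pointer to vmdirq when it is
 * kfree()d.
 */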
    285
    286static int vmd_msi_prepare(struct irq_domain *domain, struct device *dev,
    287			   int nvec, msi_alloc_info_t *arg)
    288{
    289	struct pci_dev *pdev = to_pci_dev(dev);
    290	struct vmd_dev *vmd = vmd_from_bus(pdev->bus);
    291
    292	if (nvec > vmd->msix_count)
    293		return vmd->msix_count;
    294
    295	memset(arg, 0, sizeof(*arg));
    296	return 0;
    297}
    298
    299static void vmd_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc)
    300{
    301	arg->desc = desc;
    302}
    303
    304static struct msi_domain_ops vmd_msi_domain_ops = {
    305	.get_hwirq	= vmd_get_hwirq,
    306	.msi_init	= vmd_msi_init,
    307	.msi_free	= vmd_msi_free,
    308	.msi_prepare	= vmd_msi_prepare,
    309	.set_desc	= vmd_set_desc,
    310};
    311
    312static struct msi_domain_info vmd_msi_domain_info = {
    313	.flags		= MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
    314			  MSI_FLAG_PCI_MSIX,
    315	.ops		= &vmd_msi_domain_ops,
    316	.chip		= &vmd_msi_controller,
    317};
    318
    319static void vmd_set_msi_remapping(struct vmd_dev *vmd, bool enable)
    320{
    321	u16 reg;
    322
    323	pci_read_config_word(vmd->dev, PCI_REG_VMCONFIG, &reg);
    324	reg = enable ? (reg & ~VMCONFIG_MSI_REMAP) :
    325		       (reg | VMCONFIG_MSI_REMAP);
    326	pci_write_config_word(vmd->dev, PCI_REG_VMCONFIG, reg);
    327}
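/*
 * Editor's note: the logic above is deliberately inverted because the
 * hardware bit acts as a disable.  Setting VMCONFIG_MSI_REMAP turns MSI-X
 * remapping off (bypass mode), so enable=true clears the bit and
 * enable=false sets it.
 */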
    328
    329static int vmd_create_irq_domain(struct vmd_dev *vmd)
    330{
    331	struct fwnode_handle *fn;
    332
    333	fn = irq_domain_alloc_named_id_fwnode("VMD-MSI", vmd->sysdata.domain);
    334	if (!fn)
    335		return -ENODEV;
    336
    337	vmd->irq_domain = pci_msi_create_irq_domain(fn, &vmd_msi_domain_info, NULL);
    338	if (!vmd->irq_domain) {
    339		irq_domain_free_fwnode(fn);
    340		return -ENODEV;
    341	}
    342
    343	return 0;
    344}
    345
    346static void vmd_remove_irq_domain(struct vmd_dev *vmd)
    347{
    348	/*
    349	 * Some production BIOS won't enable remapping between soft reboots.
    350	 * Ensure remapping is restored before unloading the driver.
    351	 */
    352	if (!vmd->msix_count)
    353		vmd_set_msi_remapping(vmd, true);
    354
    355	if (vmd->irq_domain) {
    356		struct fwnode_handle *fn = vmd->irq_domain->fwnode;
    357
    358		irq_domain_remove(vmd->irq_domain);
    359		irq_domain_free_fwnode(fn);
    360	}
    361}
    362
    363static void __iomem *vmd_cfg_addr(struct vmd_dev *vmd, struct pci_bus *bus,
    364				  unsigned int devfn, int reg, int len)
    365{
    366	unsigned int busnr_ecam = bus->number - vmd->busn_start;
    367	u32 offset = PCIE_ECAM_OFFSET(busnr_ecam, devfn, reg);
    368
    369	if (offset + len >= resource_size(&vmd->dev->resource[VMD_CFGBAR]))
    370		return NULL;
    371
    372	return vmd->cfgbar + offset;
    373}
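/*
 * Editor's note: CFGBAR is an ordinary ECAM window relocated behind the VMD
 * endpoint.  PCIE_ECAM_OFFSET(bus, devfn, reg) is effectively
 * (bus << 20) | (devfn << 12) | reg, so for a hypothetical child on bus 0x81
 * with busn_start 0x80, devfn 0x08 (device 1, function 0) and reg 0x10, the
 * access lands at offset 0x108010 within the BAR.
 */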
    374
    375/*
    376 * CPU may deadlock if config space is not serialized on some versions of this
    377 * hardware, so all config space access is done under a spinlock.
    378 */
    379static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg,
    380			int len, u32 *value)
    381{
    382	struct vmd_dev *vmd = vmd_from_bus(bus);
    383	void __iomem *addr = vmd_cfg_addr(vmd, bus, devfn, reg, len);
    384	unsigned long flags;
    385	int ret = 0;
    386
    387	if (!addr)
    388		return -EFAULT;
    389
    390	spin_lock_irqsave(&vmd->cfg_lock, flags);
    391	switch (len) {
    392	case 1:
    393		*value = readb(addr);
    394		break;
    395	case 2:
    396		*value = readw(addr);
    397		break;
    398	case 4:
    399		*value = readl(addr);
    400		break;
    401	default:
    402		ret = -EINVAL;
    403		break;
    404	}
    405	spin_unlock_irqrestore(&vmd->cfg_lock, flags);
    406	return ret;
    407}
    408
    409/*
    410 * VMD h/w converts non-posted config writes to posted memory writes. The
    411 * read-back in this function forces the completion so it returns only after
    412 * the config space was written, as expected.
    413 */
    414static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg,
    415			 int len, u32 value)
    416{
    417	struct vmd_dev *vmd = vmd_from_bus(bus);
    418	void __iomem *addr = vmd_cfg_addr(vmd, bus, devfn, reg, len);
    419	unsigned long flags;
    420	int ret = 0;
    421
    422	if (!addr)
    423		return -EFAULT;
    424
    425	spin_lock_irqsave(&vmd->cfg_lock, flags);
    426	switch (len) {
    427	case 1:
    428		writeb(value, addr);
    429		readb(addr);
    430		break;
    431	case 2:
    432		writew(value, addr);
    433		readw(addr);
    434		break;
    435	case 4:
    436		writel(value, addr);
    437		readl(addr);
    438		break;
    439	default:
    440		ret = -EINVAL;
    441		break;
    442	}
    443	spin_unlock_irqrestore(&vmd->cfg_lock, flags);
    444	return ret;
    445}
    446
    447static struct pci_ops vmd_ops = {
    448	.read		= vmd_pci_read,
    449	.write		= vmd_pci_write,
    450};
    451
    452#ifdef CONFIG_ACPI
    453static struct acpi_device *vmd_acpi_find_companion(struct pci_dev *pci_dev)
    454{
    455	struct pci_host_bridge *bridge;
    456	u32 busnr, addr;
    457
    458	if (pci_dev->bus->ops != &vmd_ops)
    459		return NULL;
    460
    461	bridge = pci_find_host_bridge(pci_dev->bus);
    462	busnr = pci_dev->bus->number - bridge->bus->number;
    463	/*
    464	 * The address computation below is only applicable to relative bus
    465	 * numbers below 32.
    466	 */
    467	if (busnr > 31)
    468		return NULL;
    469
    470	addr = (busnr << 24) | ((u32)pci_dev->devfn << 16) | 0x8000FFFFU;
    471
    472	dev_dbg(&pci_dev->dev, "Looking for ACPI companion (address 0x%x)\n",
    473		addr);
    474
    475	return acpi_find_child_device(ACPI_COMPANION(bridge->dev.parent), addr,
    476				      false);
    477}
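/*
 * Editor's note: the lookup key built above packs the relative bus number
 * into bits 28:24 (hence the busnr > 31 bail-out), devfn into bits 23:16,
 * and ORs in the fixed pattern 0x8000FFFF, presumably matching the address
 * encoding the platform firmware uses for ACPI companions of devices behind
 * VMD.  For bus 1, device 0, function 0 the key is 0x8100FFFF.
 */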
    478
    479static bool hook_installed;
    480
    481static void vmd_acpi_begin(void)
    482{
    483	if (pci_acpi_set_companion_lookup_hook(vmd_acpi_find_companion))
    484		return;
    485
    486	hook_installed = true;
    487}
    488
    489static void vmd_acpi_end(void)
    490{
    491	if (!hook_installed)
    492		return;
    493
    494	pci_acpi_clear_companion_lookup_hook();
    495	hook_installed = false;
    496}
    497#else
    498static inline void vmd_acpi_begin(void) { }
    499static inline void vmd_acpi_end(void) { }
    500#endif /* CONFIG_ACPI */
    501
    502static void vmd_domain_reset(struct vmd_dev *vmd)
    503{
    504	u16 bus, max_buses = resource_size(&vmd->resources[0]);
    505	u8 dev, functions, fn, hdr_type;
    506	char __iomem *base;
    507
    508	for (bus = 0; bus < max_buses; bus++) {
    509		for (dev = 0; dev < 32; dev++) {
    510			base = vmd->cfgbar + PCIE_ECAM_OFFSET(bus,
    511						PCI_DEVFN(dev, 0), 0);
    512
    513			hdr_type = readb(base + PCI_HEADER_TYPE) &
    514					 PCI_HEADER_TYPE_MASK;
    515
    516			functions = (hdr_type & 0x80) ? 8 : 1;
    517			for (fn = 0; fn < functions; fn++) {
    518				base = vmd->cfgbar + PCIE_ECAM_OFFSET(bus,
    519						PCI_DEVFN(dev, fn), 0);
    520
    521				hdr_type = readb(base + PCI_HEADER_TYPE) &
    522						PCI_HEADER_TYPE_MASK;
    523
    524				if (hdr_type != PCI_HEADER_TYPE_BRIDGE ||
    525				    (readw(base + PCI_CLASS_DEVICE) !=
    526				     PCI_CLASS_BRIDGE_PCI))
    527					continue;
    528
    529				memset_io(base + PCI_IO_BASE, 0,
    530					  PCI_ROM_ADDRESS1 - PCI_IO_BASE);
    531			}
    532		}
    533	}
    534}
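/*
 * Editor's note: the reset walks every possible device on every bus covered
 * by CFGBAR and, for anything that is a PCI-to-PCI bridge (the VMD root
 * ports), zeroes the config range from PCI_IO_BASE up to PCI_ROM_ADDRESS1,
 * i.e. the bridge's I/O, memory and prefetchable window registers, so that
 * window programming left behind by firmware does not leak into the
 * resource assignment done later in vmd_enable_domain().
 */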
    535
    536static void vmd_attach_resources(struct vmd_dev *vmd)
    537{
    538	vmd->dev->resource[VMD_MEMBAR1].child = &vmd->resources[1];
    539	vmd->dev->resource[VMD_MEMBAR2].child = &vmd->resources[2];
    540}
    541
    542static void vmd_detach_resources(struct vmd_dev *vmd)
    543{
    544	vmd->dev->resource[VMD_MEMBAR1].child = NULL;
    545	vmd->dev->resource[VMD_MEMBAR2].child = NULL;
    546}
    547
    548/*
    549 * VMD domains start at 0x10000 to not clash with ACPI _SEG domains.
    550 * Per ACPI r6.0, sec 6.5.6,  _SEG returns an integer, of which the lower
    551 * 16 bits are the PCI Segment Group (domain) number.  Other bits are
    552 * currently reserved.
    553 */
    554static int vmd_find_free_domain(void)
    555{
    556	int domain = 0xffff;
    557	struct pci_bus *bus = NULL;
    558
    559	while ((bus = pci_find_next_bus(bus)) != NULL)
    560		domain = max_t(int, domain, pci_domain_nr(bus));
    561	return domain + 1;
    562}
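/*
 * Editor's note: because the search starts at 0xffff, the first VMD instance
 * always receives domain 0x10000, one past the largest value _SEG can
 * express, and each further instance gets one more than the highest domain
 * number currently in use.
 */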
    563
    564static int vmd_get_phys_offsets(struct vmd_dev *vmd, bool native_hint,
    565				resource_size_t *offset1,
    566				resource_size_t *offset2)
    567{
    568	struct pci_dev *dev = vmd->dev;
    569	u64 phys1, phys2;
    570
    571	if (native_hint) {
    572		u32 vmlock;
    573		int ret;
    574
    575		ret = pci_read_config_dword(dev, PCI_REG_VMLOCK, &vmlock);
    576		if (ret || PCI_POSSIBLE_ERROR(vmlock))
    577			return -ENODEV;
    578
    579		if (MB2_SHADOW_EN(vmlock)) {
    580			void __iomem *membar2;
    581
    582			membar2 = pci_iomap(dev, VMD_MEMBAR2, 0);
    583			if (!membar2)
    584				return -ENOMEM;
    585			phys1 = readq(membar2 + MB2_SHADOW_OFFSET);
    586			phys2 = readq(membar2 + MB2_SHADOW_OFFSET + 8);
    587			pci_iounmap(dev, membar2);
    588		} else
    589			return 0;
    590	} else {
    591		/* Hypervisor-Emulated Vendor-Specific Capability */
    592		int pos = pci_find_capability(dev, PCI_CAP_ID_VNDR);
    593		u32 reg, regu;
    594
    595		pci_read_config_dword(dev, pos + 4, &reg);
    596
    597		/* "SHDW" */
    598		if (pos && reg == 0x53484457) {
    599			pci_read_config_dword(dev, pos + 8, &reg);
    600			pci_read_config_dword(dev, pos + 12, &regu);
    601			phys1 = (u64) regu << 32 | reg;
    602
    603			pci_read_config_dword(dev, pos + 16, &reg);
    604			pci_read_config_dword(dev, pos + 20, &regu);
    605			phys2 = (u64) regu << 32 | reg;
    606		} else
    607			return 0;
    608	}
    609
    610	*offset1 = dev->resource[VMD_MEMBAR1].start -
    611			(phys1 & PCI_BASE_ADDRESS_MEM_MASK);
    612	*offset2 = dev->resource[VMD_MEMBAR2].start -
    613			(phys2 & PCI_BASE_ADDRESS_MEM_MASK);
    614
    615	return 0;
    616}
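/*
 * Editor's note: two shadow mechanisms feed the same result.  With
 * native_hint the host physical bases of MEMBAR1/MEMBAR2 are read from
 * shadow registers the device exposes at MEMBAR2 + MB2_SHADOW_OFFSET (when
 * VMLOCK enables them); otherwise they come from a hypervisor-emulated
 * vendor-specific capability whose first dword is the "SHDW" tag.  Either
 * way the returned offsets are "assigned BAR minus host physical base",
 * which vmd_enable_domain() later applies via pci_add_resource_offset() so
 * that child BARs end up programmed with addresses the hardware actually
 * decodes.
 */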
    617
    618static int vmd_get_bus_number_start(struct vmd_dev *vmd)
    619{
    620	struct pci_dev *dev = vmd->dev;
    621	u16 reg;
    622
    623	pci_read_config_word(dev, PCI_REG_VMCAP, &reg);
    624	if (BUS_RESTRICT_CAP(reg)) {
    625		pci_read_config_word(dev, PCI_REG_VMCONFIG, &reg);
    626
    627		switch (BUS_RESTRICT_CFG(reg)) {
    628		case 0:
    629			vmd->busn_start = 0;
    630			break;
    631		case 1:
    632			vmd->busn_start = 128;
    633			break;
    634		case 2:
    635			vmd->busn_start = 224;
    636			break;
    637		default:
    638			pci_err(dev, "Unknown Bus Offset Setting (%d)\n",
    639				BUS_RESTRICT_CFG(reg));
    640			return -ENODEV;
    641		}
    642	}
    643
    644	return 0;
    645}
    646
    647static irqreturn_t vmd_irq(int irq, void *data)
    648{
    649	struct vmd_irq_list *irqs = data;
    650	struct vmd_irq *vmdirq;
    651	int idx;
    652
    653	idx = srcu_read_lock(&irqs->srcu);
    654	list_for_each_entry_rcu(vmdirq, &irqs->irq_list, node)
    655		generic_handle_irq(vmdirq->virq);
    656	srcu_read_unlock(&irqs->srcu, idx);
    657
    658	return IRQ_HANDLED;
    659}
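/*
 * Editor's note: this is the demultiplexer for a single VMD MSI-X vector.
 * Every child interrupt whose vmd_irq was linked onto this vmd_irq_list by
 * vmd_irq_enable() shows up here, and generic_handle_irq() re-injects it as
 * the child's own virq.  The walk is protected only by SRCU, so the hard-IRQ
 * path never has to take the raw list_lock.
 */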
    660
    661static int vmd_alloc_irqs(struct vmd_dev *vmd)
    662{
    663	struct pci_dev *dev = vmd->dev;
    664	int i, err;
    665
    666	vmd->msix_count = pci_msix_vec_count(dev);
    667	if (vmd->msix_count < 0)
    668		return -ENODEV;
    669
    670	vmd->msix_count = pci_alloc_irq_vectors(dev, vmd->first_vec + 1,
    671						vmd->msix_count, PCI_IRQ_MSIX);
    672	if (vmd->msix_count < 0)
    673		return vmd->msix_count;
    674
    675	vmd->irqs = devm_kcalloc(&dev->dev, vmd->msix_count, sizeof(*vmd->irqs),
    676				 GFP_KERNEL);
    677	if (!vmd->irqs)
    678		return -ENOMEM;
    679
    680	for (i = 0; i < vmd->msix_count; i++) {
    681		err = init_srcu_struct(&vmd->irqs[i].srcu);
    682		if (err)
    683			return err;
    684
    685		INIT_LIST_HEAD(&vmd->irqs[i].irq_list);
    686		vmd->irqs[i].virq = pci_irq_vector(dev, i);
    687		err = devm_request_irq(&dev->dev, vmd->irqs[i].virq,
    688				       vmd_irq, IRQF_NO_THREAD,
    689				       vmd->name, &vmd->irqs[i]);
    690		if (err)
    691			return err;
    692	}
    693
    694	return 0;
    695}
    696
    697/*
    698 * Since VMD is an aperture to regular PCIe root ports, only allow it to
    699 * control features that the OS is allowed to control on the physical PCI bus.
    700 */
    701static void vmd_copy_host_bridge_flags(struct pci_host_bridge *root_bridge,
    702				       struct pci_host_bridge *vmd_bridge)
    703{
    704	vmd_bridge->native_pcie_hotplug = root_bridge->native_pcie_hotplug;
    705	vmd_bridge->native_shpc_hotplug = root_bridge->native_shpc_hotplug;
    706	vmd_bridge->native_aer = root_bridge->native_aer;
    707	vmd_bridge->native_pme = root_bridge->native_pme;
    708	vmd_bridge->native_ltr = root_bridge->native_ltr;
    709	vmd_bridge->native_dpc = root_bridge->native_dpc;
    710}
    711
    712static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features)
    713{
    714	struct pci_sysdata *sd = &vmd->sysdata;
    715	struct resource *res;
    716	u32 upper_bits;
    717	unsigned long flags;
    718	LIST_HEAD(resources);
    719	resource_size_t offset[2] = {0};
    720	resource_size_t membar2_offset = 0x2000;
    721	struct pci_bus *child;
    722	int ret;
    723
    724	/*
    725	 * Shadow registers may exist in certain VMD device ids which allow
    726	 * guests to correctly assign host physical addresses to the root ports
    727	 * and child devices. These registers will either return the host value
    728	 * or 0, depending on an enable bit in the VMD device.
    729	 */
    730	if (features & VMD_FEAT_HAS_MEMBAR_SHADOW) {
    731		membar2_offset = MB2_SHADOW_OFFSET + MB2_SHADOW_SIZE;
    732		ret = vmd_get_phys_offsets(vmd, true, &offset[0], &offset[1]);
    733		if (ret)
    734			return ret;
    735	} else if (features & VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP) {
    736		ret = vmd_get_phys_offsets(vmd, false, &offset[0], &offset[1]);
    737		if (ret)
    738			return ret;
    739	}
    740
    741	/*
    742	 * Certain VMD devices may have a root port configuration option which
    743	 * limits the bus range to between 0-127, 128-255, or 224-255
    744	 */
    745	if (features & VMD_FEAT_HAS_BUS_RESTRICTIONS) {
    746		ret = vmd_get_bus_number_start(vmd);
    747		if (ret)
    748			return ret;
    749	}
    750
    751	res = &vmd->dev->resource[VMD_CFGBAR];
    752	vmd->resources[0] = (struct resource) {
    753		.name  = "VMD CFGBAR",
    754		.start = vmd->busn_start,
    755		.end   = vmd->busn_start + (resource_size(res) >> 20) - 1,
    756		.flags = IORESOURCE_BUS | IORESOURCE_PCI_FIXED,
    757	};
    758
    759	/*
    760	 * If the window is below 4GB, clear IORESOURCE_MEM_64 so we can
    761	 * put 32-bit resources in the window.
    762	 *
    763	 * There's no hardware reason why a 64-bit window *couldn't*
    764	 * contain a 32-bit resource, but pbus_size_mem() computes the
    765	 * bridge window size assuming a 64-bit window will contain no
    766	 * 32-bit resources.  __pci_assign_resource() enforces that
    767	 * artificial restriction to make sure everything will fit.
    768	 *
    769	 * The only way we could use a 64-bit non-prefetchable MEMBAR is
    770	 * if its address is <4GB so that we can convert it to a 32-bit
    771	 * resource.  To be visible to the host OS, all VMD endpoints must
    772	 * be initially configured by platform BIOS, which includes setting
    773	 * up these resources.  We can assume the device is configured
    774	 * according to the platform needs.
    775	 */
    776	res = &vmd->dev->resource[VMD_MEMBAR1];
    777	upper_bits = upper_32_bits(res->end);
    778	flags = res->flags & ~IORESOURCE_SIZEALIGN;
    779	if (!upper_bits)
    780		flags &= ~IORESOURCE_MEM_64;
    781	vmd->resources[1] = (struct resource) {
    782		.name  = "VMD MEMBAR1",
    783		.start = res->start,
    784		.end   = res->end,
    785		.flags = flags,
    786		.parent = res,
    787	};
    788
    789	res = &vmd->dev->resource[VMD_MEMBAR2];
    790	upper_bits = upper_32_bits(res->end);
    791	flags = res->flags & ~IORESOURCE_SIZEALIGN;
    792	if (!upper_bits)
    793		flags &= ~IORESOURCE_MEM_64;
    794	vmd->resources[2] = (struct resource) {
    795		.name  = "VMD MEMBAR2",
    796		.start = res->start + membar2_offset,
    797		.end   = res->end,
    798		.flags = flags,
    799		.parent = res,
    800	};
    801
    802	sd->vmd_dev = vmd->dev;
    803	sd->domain = vmd_find_free_domain();
    804	if (sd->domain < 0)
    805		return sd->domain;
    806
    807	sd->node = pcibus_to_node(vmd->dev->bus);
    808
    809	/*
    810	 * Currently MSI remapping must be enabled in guest passthrough mode
    811	 * due to some missing interrupt remapping plumbing. This is probably
    812	 * acceptable because the guest is usually CPU-limited and MSI
    813	 * remapping doesn't become a performance bottleneck.
    814	 */
    815	if (!(features & VMD_FEAT_CAN_BYPASS_MSI_REMAP) ||
    816	    offset[0] || offset[1]) {
    817		ret = vmd_alloc_irqs(vmd);
    818		if (ret)
    819			return ret;
    820
    821		vmd_set_msi_remapping(vmd, true);
    822
    823		ret = vmd_create_irq_domain(vmd);
    824		if (ret)
    825			return ret;
    826
    827		/*
    828		 * Override the IRQ domain bus token so the domain can be
    829		 * distinguished from a regular PCI/MSI domain.
    830		 */
    831		irq_domain_update_bus_token(vmd->irq_domain, DOMAIN_BUS_VMD_MSI);
    832	} else {
    833		vmd_set_msi_remapping(vmd, false);
    834	}
    835
    836	pci_add_resource(&resources, &vmd->resources[0]);
    837	pci_add_resource_offset(&resources, &vmd->resources[1], offset[0]);
    838	pci_add_resource_offset(&resources, &vmd->resources[2], offset[1]);
    839
    840	vmd->bus = pci_create_root_bus(&vmd->dev->dev, vmd->busn_start,
    841				       &vmd_ops, sd, &resources);
    842	if (!vmd->bus) {
    843		pci_free_resource_list(&resources);
    844		vmd_remove_irq_domain(vmd);
    845		return -ENODEV;
    846	}
    847
    848	vmd_copy_host_bridge_flags(pci_find_host_bridge(vmd->dev->bus),
    849				   to_pci_host_bridge(vmd->bus->bridge));
    850
    851	vmd_attach_resources(vmd);
    852	if (vmd->irq_domain)
    853		dev_set_msi_domain(&vmd->bus->dev, vmd->irq_domain);
    854	else
    855		dev_set_msi_domain(&vmd->bus->dev,
    856				   dev_get_msi_domain(&vmd->dev->dev));
    857
    858	vmd_acpi_begin();
    859
    860	pci_scan_child_bus(vmd->bus);
    861	vmd_domain_reset(vmd);
    862	list_for_each_entry(child, &vmd->bus->children, node)
    863		pci_reset_bus(child->self);
    864	pci_assign_unassigned_bus_resources(vmd->bus);
    865
    866	/*
    867	 * VMD root buses are virtual and don't return true on pci_is_pcie()
    868	 * and will fail pcie_bus_configure_settings() early. It can instead be
    869	 * run on each of the real root ports.
    870	 */
    871	list_for_each_entry(child, &vmd->bus->children, node)
    872		pcie_bus_configure_settings(child);
    873
    874	pci_bus_add_devices(vmd->bus);
    875
    876	vmd_acpi_end();
    877
    878	WARN(sysfs_create_link(&vmd->dev->dev.kobj, &vmd->bus->dev.kobj,
    879			       "domain"), "Can't create symlink to domain\n");
    880	return 0;
    881}
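/*
 * Editor's note on the MSI handling above: the driver only skips creating
 * its own MSI domain when the device advertises
 * VMD_FEAT_CAN_BYPASS_MSI_REMAP *and* both shadow offsets are zero.
 * Non-zero offsets indicate the device is passed through to a guest, where
 * (per the comment above) remapping must stay on and child interrupts
 * funnel through vmd_irq(); in bypass mode the children simply inherit the
 * parent's MSI domain instead.
 */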
    882
    883static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id)
    884{
    885	unsigned long features = (unsigned long) id->driver_data;
    886	struct vmd_dev *vmd;
    887	int err;
    888
    889	if (resource_size(&dev->resource[VMD_CFGBAR]) < (1 << 20))
    890		return -ENOMEM;
    891
    892	vmd = devm_kzalloc(&dev->dev, sizeof(*vmd), GFP_KERNEL);
    893	if (!vmd)
    894		return -ENOMEM;
    895
    896	vmd->dev = dev;
    897	vmd->instance = ida_simple_get(&vmd_instance_ida, 0, 0, GFP_KERNEL);
    898	if (vmd->instance < 0)
    899		return vmd->instance;
    900
    901	vmd->name = kasprintf(GFP_KERNEL, "vmd%d", vmd->instance);
    902	if (!vmd->name) {
    903		err = -ENOMEM;
    904		goto out_release_instance;
    905	}
    906
    907	err = pcim_enable_device(dev);
    908	if (err < 0)
    909		goto out_release_instance;
    910
    911	vmd->cfgbar = pcim_iomap(dev, VMD_CFGBAR, 0);
    912	if (!vmd->cfgbar) {
    913		err = -ENOMEM;
    914		goto out_release_instance;
    915	}
    916
    917	pci_set_master(dev);
    918	if (dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(64)) &&
    919	    dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(32))) {
    920		err = -ENODEV;
    921		goto out_release_instance;
    922	}
    923
    924	if (features & VMD_FEAT_OFFSET_FIRST_VECTOR)
    925		vmd->first_vec = 1;
    926
    927	spin_lock_init(&vmd->cfg_lock);
    928	pci_set_drvdata(dev, vmd);
    929	err = vmd_enable_domain(vmd, features);
    930	if (err)
    931		goto out_release_instance;
    932
    933	dev_info(&vmd->dev->dev, "Bound to PCI domain %04x\n",
    934		 vmd->sysdata.domain);
    935	return 0;
    936
    937 out_release_instance:
    938	ida_simple_remove(&vmd_instance_ida, vmd->instance);
    939	kfree(vmd->name);
    940	return err;
    941}
    942
    943static void vmd_cleanup_srcu(struct vmd_dev *vmd)
    944{
    945	int i;
    946
    947	for (i = 0; i < vmd->msix_count; i++)
    948		cleanup_srcu_struct(&vmd->irqs[i].srcu);
    949}
    950
    951static void vmd_remove(struct pci_dev *dev)
    952{
    953	struct vmd_dev *vmd = pci_get_drvdata(dev);
    954
    955	sysfs_remove_link(&vmd->dev->dev.kobj, "domain");
    956	pci_stop_root_bus(vmd->bus);
    957	pci_remove_root_bus(vmd->bus);
    958	vmd_cleanup_srcu(vmd);
    959	vmd_detach_resources(vmd);
    960	vmd_remove_irq_domain(vmd);
    961	ida_simple_remove(&vmd_instance_ida, vmd->instance);
    962	kfree(vmd->name);
    963}
    964
    965#ifdef CONFIG_PM_SLEEP
    966static int vmd_suspend(struct device *dev)
    967{
    968	struct pci_dev *pdev = to_pci_dev(dev);
    969	struct vmd_dev *vmd = pci_get_drvdata(pdev);
    970	int i;
    971
    972	for (i = 0; i < vmd->msix_count; i++)
    973		devm_free_irq(dev, vmd->irqs[i].virq, &vmd->irqs[i]);
    974
    975	return 0;
    976}
    977
    978static int vmd_resume(struct device *dev)
    979{
    980	struct pci_dev *pdev = to_pci_dev(dev);
    981	struct vmd_dev *vmd = pci_get_drvdata(pdev);
    982	int err, i;
    983
    984	for (i = 0; i < vmd->msix_count; i++) {
    985		err = devm_request_irq(dev, vmd->irqs[i].virq,
    986				       vmd_irq, IRQF_NO_THREAD,
    987				       vmd->name, &vmd->irqs[i]);
    988		if (err)
    989			return err;
    990	}
    991
    992	return 0;
    993}
    994#endif
    995static SIMPLE_DEV_PM_OPS(vmd_dev_pm_ops, vmd_suspend, vmd_resume);
    996
    997static const struct pci_device_id vmd_ids[] = {
    998	{PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_VMD_201D),
    999		.driver_data = VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP,},
   1000	{PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_VMD_28C0),
   1001		.driver_data = VMD_FEAT_HAS_MEMBAR_SHADOW |
   1002				VMD_FEAT_HAS_BUS_RESTRICTIONS |
   1003				VMD_FEAT_CAN_BYPASS_MSI_REMAP,},
   1004	{PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x467f),
   1005		.driver_data = VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP |
   1006				VMD_FEAT_HAS_BUS_RESTRICTIONS |
   1007				VMD_FEAT_OFFSET_FIRST_VECTOR,},
   1008	{PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x4c3d),
   1009		.driver_data = VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP |
   1010				VMD_FEAT_HAS_BUS_RESTRICTIONS |
   1011				VMD_FEAT_OFFSET_FIRST_VECTOR,},
   1012	{PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xa77f),
   1013		.driver_data = VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP |
   1014				VMD_FEAT_HAS_BUS_RESTRICTIONS |
   1015				VMD_FEAT_OFFSET_FIRST_VECTOR,},
   1016	{PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_VMD_9A0B),
   1017		.driver_data = VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP |
   1018				VMD_FEAT_HAS_BUS_RESTRICTIONS |
   1019				VMD_FEAT_OFFSET_FIRST_VECTOR,},
   1020	{0,}
   1021};
   1022MODULE_DEVICE_TABLE(pci, vmd_ids);
   1023
   1024static struct pci_driver vmd_drv = {
   1025	.name		= "vmd",
   1026	.id_table	= vmd_ids,
   1027	.probe		= vmd_probe,
   1028	.remove		= vmd_remove,
   1029	.driver		= {
   1030		.pm	= &vmd_dev_pm_ops,
   1031	},
   1032};
   1033module_pci_driver(vmd_drv);
   1034
   1035MODULE_AUTHOR("Intel Corporation");
   1036MODULE_LICENSE("GPL v2");
   1037MODULE_VERSION("0.6");