cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

iommu.c (91707B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
      4 * Author: Joerg Roedel <jroedel@suse.de>
      5 *         Leo Duran <leo.duran@amd.com>
      6 */
      7
      8#define pr_fmt(fmt)     "AMD-Vi: " fmt
      9#define dev_fmt(fmt)    pr_fmt(fmt)
     10
     11#include <linux/ratelimit.h>
     12#include <linux/pci.h>
     13#include <linux/acpi.h>
     14#include <linux/amba/bus.h>
     15#include <linux/platform_device.h>
     16#include <linux/pci-ats.h>
     17#include <linux/bitmap.h>
     18#include <linux/slab.h>
     19#include <linux/debugfs.h>
     20#include <linux/scatterlist.h>
     21#include <linux/dma-map-ops.h>
     22#include <linux/dma-direct.h>
     23#include <linux/dma-iommu.h>
     24#include <linux/iommu-helper.h>
     25#include <linux/delay.h>
     26#include <linux/amd-iommu.h>
     27#include <linux/notifier.h>
     28#include <linux/export.h>
     29#include <linux/irq.h>
     30#include <linux/msi.h>
     31#include <linux/irqdomain.h>
     32#include <linux/percpu.h>
     33#include <linux/io-pgtable.h>
     34#include <linux/cc_platform.h>
     35#include <asm/irq_remapping.h>
     36#include <asm/io_apic.h>
     37#include <asm/apic.h>
     38#include <asm/hw_irq.h>
     39#include <asm/proto.h>
     40#include <asm/iommu.h>
     41#include <asm/gart.h>
     42#include <asm/dma.h>
     43
     44#include "amd_iommu.h"
     45#include "../irq_remapping.h"
     46
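        /* Encode the command opcode into bits 31:28 of the command's second dword. */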
     47#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
     48
     49#define LOOP_TIMEOUT	100000
     50
     51/* IO virtual address start page frame number */
     52#define IOVA_START_PFN		(1)
     53#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
     54
     55/* Reserved IOVA ranges */
     56#define MSI_RANGE_START		(0xfee00000)
     57#define MSI_RANGE_END		(0xfeefffff)
     58#define HT_RANGE_START		(0xfd00000000ULL)
     59#define HT_RANGE_END		(0xffffffffffULL)
     60
     61#define DEFAULT_PGTABLE_LEVEL	PAGE_MODE_3_LEVEL
     62
     63static DEFINE_SPINLOCK(pd_bitmap_lock);
     64
     65LIST_HEAD(ioapic_map);
     66LIST_HEAD(hpet_map);
     67LIST_HEAD(acpihid_map);
     68
     69/*
     70 * Domain for untranslated devices - only allocated
      71 * if iommu=pt is passed on the kernel cmd line.
     72 */
     73const struct iommu_ops amd_iommu_ops;
     74
     75static ATOMIC_NOTIFIER_HEAD(ppr_notifier);
     76int amd_iommu_max_glx_val = -1;
     77
     78/*
      79 * general struct to manage commands sent to an IOMMU
     80 */
     81struct iommu_cmd {
     82	u32 data[4];
     83};
     84
     85struct kmem_cache *amd_iommu_irq_cache;
     86
     87static void detach_device(struct device *dev);
     88
     89/****************************************************************************
     90 *
     91 * Helper functions
     92 *
     93 ****************************************************************************/
     94
     95static inline int get_acpihid_device_id(struct device *dev,
     96					struct acpihid_map_entry **entry)
     97{
     98	struct acpi_device *adev = ACPI_COMPANION(dev);
     99	struct acpihid_map_entry *p;
    100
    101	if (!adev)
    102		return -ENODEV;
    103
    104	list_for_each_entry(p, &acpihid_map, list) {
    105		if (acpi_dev_hid_uid_match(adev, p->hid,
    106					   p->uid[0] ? p->uid : NULL)) {
    107			if (entry)
    108				*entry = p;
    109			return p->devid;
    110		}
    111	}
    112	return -EINVAL;
    113}
    114
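        /*
         * Return the segment:bus:device.function identifier for @dev, taken
         * from PCI config space for PCI devices or from the ACPI HID map
         * otherwise.  A negative errno is returned if the device is unknown.
         */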
    115static inline int get_device_sbdf_id(struct device *dev)
    116{
    117	int sbdf;
    118
    119	if (dev_is_pci(dev))
    120		sbdf = get_pci_sbdf_id(to_pci_dev(dev));
    121	else
    122		sbdf = get_acpihid_device_id(dev, NULL);
    123
    124	return sbdf;
    125}
    126
    127struct dev_table_entry *get_dev_table(struct amd_iommu *iommu)
    128{
    129	struct dev_table_entry *dev_table;
    130	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
    131
    132	BUG_ON(pci_seg == NULL);
    133	dev_table = pci_seg->dev_table;
    134	BUG_ON(dev_table == NULL);
    135
    136	return dev_table;
    137}
    138
    139static inline u16 get_device_segment(struct device *dev)
    140{
    141	u16 seg;
    142
    143	if (dev_is_pci(dev)) {
    144		struct pci_dev *pdev = to_pci_dev(dev);
    145
    146		seg = pci_domain_nr(pdev->bus);
    147	} else {
    148		u32 devid = get_acpihid_device_id(dev, NULL);
    149
    150		seg = PCI_SBDF_TO_SEGID(devid);
    151	}
    152
    153	return seg;
    154}
    155
    156/* Writes the specific IOMMU for a device into the PCI segment rlookup table */
    157void amd_iommu_set_rlookup_table(struct amd_iommu *iommu, u16 devid)
    158{
    159	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
    160
    161	pci_seg->rlookup_table[devid] = iommu;
    162}
    163
    164static struct amd_iommu *__rlookup_amd_iommu(u16 seg, u16 devid)
    165{
    166	struct amd_iommu_pci_seg *pci_seg;
    167
    168	for_each_pci_segment(pci_seg) {
    169		if (pci_seg->id == seg)
    170			return pci_seg->rlookup_table[devid];
    171	}
    172	return NULL;
    173}
    174
    175static struct amd_iommu *rlookup_amd_iommu(struct device *dev)
    176{
    177	u16 seg = get_device_segment(dev);
    178	int devid = get_device_sbdf_id(dev);
    179
    180	if (devid < 0)
    181		return NULL;
    182	return __rlookup_amd_iommu(seg, PCI_SBDF_TO_DEVID(devid));
    183}
    184
    185static struct protection_domain *to_pdomain(struct iommu_domain *dom)
    186{
    187	return container_of(dom, struct protection_domain, domain);
    188}
    189
    190static struct iommu_dev_data *alloc_dev_data(struct amd_iommu *iommu, u16 devid)
    191{
    192	struct iommu_dev_data *dev_data;
    193	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
    194
    195	dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
    196	if (!dev_data)
    197		return NULL;
    198
    199	spin_lock_init(&dev_data->lock);
    200	dev_data->devid = devid;
    201	ratelimit_default_init(&dev_data->rs);
    202
    203	llist_add(&dev_data->dev_data_list, &pci_seg->dev_data_list);
    204	return dev_data;
    205}
    206
    207static struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid)
    208{
    209	struct iommu_dev_data *dev_data;
    210	struct llist_node *node;
    211	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
    212
    213	if (llist_empty(&pci_seg->dev_data_list))
    214		return NULL;
    215
    216	node = pci_seg->dev_data_list.first;
    217	llist_for_each_entry(dev_data, node, dev_data_list) {
    218		if (dev_data->devid == devid)
    219			return dev_data;
    220	}
    221
    222	return NULL;
    223}
    224
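        /*
         * pci_for_each_dma_alias() callback: copy the device table entry of
         * the real device into the entry of @alias so that both requester IDs
         * use the same translation.
         */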
    225static int clone_alias(struct pci_dev *pdev, u16 alias, void *data)
    226{
    227	struct amd_iommu *iommu;
    228	struct dev_table_entry *dev_table;
    229	u16 devid = pci_dev_id(pdev);
    230
    231	if (devid == alias)
    232		return 0;
    233
    234	iommu = rlookup_amd_iommu(&pdev->dev);
    235	if (!iommu)
    236		return 0;
    237
    238	amd_iommu_set_rlookup_table(iommu, alias);
    239	dev_table = get_dev_table(iommu);
    240	memcpy(dev_table[alias].data,
    241	       dev_table[devid].data,
    242	       sizeof(dev_table[alias].data));
    243
    244	return 0;
    245}
    246
    247static void clone_aliases(struct amd_iommu *iommu, struct device *dev)
    248{
    249	struct pci_dev *pdev;
    250
    251	if (!dev_is_pci(dev))
    252		return;
    253	pdev = to_pci_dev(dev);
    254
    255	/*
    256	 * The IVRS alias stored in the alias table may not be
     257	 * part of the PCI DMA aliases if its bus differs
    258	 * from the original device.
    259	 */
    260	clone_alias(pdev, iommu->pci_seg->alias_table[pci_dev_id(pdev)], NULL);
    261
    262	pci_for_each_dma_alias(pdev, clone_alias, NULL);
    263}
    264
    265static void setup_aliases(struct amd_iommu *iommu, struct device *dev)
    266{
    267	struct pci_dev *pdev = to_pci_dev(dev);
    268	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
    269	u16 ivrs_alias;
    270
    271	/* For ACPI HID devices, there are no aliases */
    272	if (!dev_is_pci(dev))
    273		return;
    274
    275	/*
    276	 * Add the IVRS alias to the pci aliases if it is on the same
    277	 * bus. The IVRS table may know about a quirk that we don't.
    278	 */
    279	ivrs_alias = pci_seg->alias_table[pci_dev_id(pdev)];
    280	if (ivrs_alias != pci_dev_id(pdev) &&
    281	    PCI_BUS_NUM(ivrs_alias) == pdev->bus->number)
    282		pci_add_dma_alias(pdev, ivrs_alias & 0xff, 1);
    283
    284	clone_aliases(iommu, dev);
    285}
    286
    287static struct iommu_dev_data *find_dev_data(struct amd_iommu *iommu, u16 devid)
    288{
    289	struct iommu_dev_data *dev_data;
    290
    291	dev_data = search_dev_data(iommu, devid);
    292
    293	if (dev_data == NULL) {
    294		dev_data = alloc_dev_data(iommu, devid);
    295		if (!dev_data)
    296			return NULL;
    297
    298		if (translation_pre_enabled(iommu))
    299			dev_data->defer_attach = true;
    300	}
    301
    302	return dev_data;
    303}
    304
     305/*
     306 * Find or create an IOMMU group for an acpihid device.
     307 */
    308static struct iommu_group *acpihid_device_group(struct device *dev)
    309{
    310	struct acpihid_map_entry *p, *entry = NULL;
    311	int devid;
    312
    313	devid = get_acpihid_device_id(dev, &entry);
    314	if (devid < 0)
    315		return ERR_PTR(devid);
    316
    317	list_for_each_entry(p, &acpihid_map, list) {
    318		if ((devid == p->devid) && p->group)
    319			entry->group = p->group;
    320	}
    321
    322	if (!entry->group)
    323		entry->group = generic_device_group(dev);
    324	else
    325		iommu_group_ref_get(entry->group);
    326
    327	return entry->group;
    328}
    329
    330static bool pci_iommuv2_capable(struct pci_dev *pdev)
    331{
    332	static const int caps[] = {
    333		PCI_EXT_CAP_ID_PRI,
    334		PCI_EXT_CAP_ID_PASID,
    335	};
    336	int i, pos;
    337
    338	if (!pci_ats_supported(pdev))
    339		return false;
    340
    341	for (i = 0; i < 2; ++i) {
    342		pos = pci_find_ext_capability(pdev, caps[i]);
    343		if (pos == 0)
    344			return false;
    345	}
    346
    347	return true;
    348}
    349
    350/*
    351 * This function checks if the driver got a valid device from the caller to
    352 * avoid dereferencing invalid pointers.
    353 */
    354static bool check_device(struct device *dev)
    355{
    356	struct amd_iommu_pci_seg *pci_seg;
    357	struct amd_iommu *iommu;
    358	int devid, sbdf;
    359
    360	if (!dev)
    361		return false;
    362
    363	sbdf = get_device_sbdf_id(dev);
    364	if (sbdf < 0)
    365		return false;
    366	devid = PCI_SBDF_TO_DEVID(sbdf);
    367
    368	iommu = rlookup_amd_iommu(dev);
    369	if (!iommu)
    370		return false;
    371
    372	/* Out of our scope? */
    373	pci_seg = iommu->pci_seg;
    374	if (devid > pci_seg->last_bdf)
    375		return false;
    376
    377	return true;
    378}
    379
    380static int iommu_init_device(struct amd_iommu *iommu, struct device *dev)
    381{
    382	struct iommu_dev_data *dev_data;
    383	int devid, sbdf;
    384
    385	if (dev_iommu_priv_get(dev))
    386		return 0;
    387
    388	sbdf = get_device_sbdf_id(dev);
    389	if (sbdf < 0)
    390		return sbdf;
    391
    392	devid = PCI_SBDF_TO_DEVID(sbdf);
    393	dev_data = find_dev_data(iommu, devid);
    394	if (!dev_data)
    395		return -ENOMEM;
    396
    397	dev_data->dev = dev;
    398	setup_aliases(iommu, dev);
    399
    400	/*
     401	 * By default we use passthrough mode for IOMMUv2-capable devices.
    402	 * But if amd_iommu=force_isolation is set (e.g. to debug DMA to
    403	 * invalid address), we ignore the capability for the device so
    404	 * it'll be forced to go into translation mode.
    405	 */
    406	if ((iommu_default_passthrough() || !amd_iommu_force_isolation) &&
    407	    dev_is_pci(dev) && pci_iommuv2_capable(to_pci_dev(dev))) {
    408		dev_data->iommu_v2 = iommu->is_iommu_v2;
    409	}
    410
    411	dev_iommu_priv_set(dev, dev_data);
    412
    413	return 0;
    414}
    415
    416static void iommu_ignore_device(struct amd_iommu *iommu, struct device *dev)
    417{
    418	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
    419	struct dev_table_entry *dev_table = get_dev_table(iommu);
    420	int devid, sbdf;
    421
    422	sbdf = get_device_sbdf_id(dev);
    423	if (sbdf < 0)
    424		return;
    425
    426	devid = PCI_SBDF_TO_DEVID(sbdf);
    427	pci_seg->rlookup_table[devid] = NULL;
    428	memset(&dev_table[devid], 0, sizeof(struct dev_table_entry));
    429
    430	setup_aliases(iommu, dev);
    431}
    432
    433static void amd_iommu_uninit_device(struct device *dev)
    434{
    435	struct iommu_dev_data *dev_data;
    436
    437	dev_data = dev_iommu_priv_get(dev);
    438	if (!dev_data)
    439		return;
    440
    441	if (dev_data->domain)
    442		detach_device(dev);
    443
    444	dev_iommu_priv_set(dev, NULL);
    445
    446	/*
    447	 * We keep dev_data around for unplugged devices and reuse it when the
    448	 * device is re-plugged - not doing so would introduce a ton of races.
    449	 */
    450}
    451
    452/****************************************************************************
    453 *
    454 * Interrupt handling functions
    455 *
    456 ****************************************************************************/
    457
    458static void dump_dte_entry(struct amd_iommu *iommu, u16 devid)
    459{
    460	int i;
    461	struct dev_table_entry *dev_table = get_dev_table(iommu);
    462
    463	for (i = 0; i < 4; ++i)
    464		pr_err("DTE[%d]: %016llx\n", i, dev_table[devid].data[i]);
    465}
    466
    467static void dump_command(unsigned long phys_addr)
    468{
    469	struct iommu_cmd *cmd = iommu_phys_to_virt(phys_addr);
    470	int i;
    471
    472	for (i = 0; i < 4; ++i)
    473		pr_err("CMD[%d]: %08x\n", i, cmd->data[i]);
    474}
    475
    476static void amd_iommu_report_rmp_hw_error(struct amd_iommu *iommu, volatile u32 *event)
    477{
    478	struct iommu_dev_data *dev_data = NULL;
    479	int devid, vmg_tag, flags;
    480	struct pci_dev *pdev;
    481	u64 spa;
    482
    483	devid   = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
    484	vmg_tag = (event[1]) & 0xFFFF;
    485	flags   = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
    486	spa     = ((u64)event[3] << 32) | (event[2] & 0xFFFFFFF8);
    487
    488	pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid),
    489					   devid & 0xff);
    490	if (pdev)
    491		dev_data = dev_iommu_priv_get(&pdev->dev);
    492
    493	if (dev_data) {
    494		if (__ratelimit(&dev_data->rs)) {
    495			pci_err(pdev, "Event logged [RMP_HW_ERROR vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n",
    496				vmg_tag, spa, flags);
    497		}
    498	} else {
    499		pr_err_ratelimited("Event logged [RMP_HW_ERROR device=%04x:%02x:%02x.%x, vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n",
    500			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
    501			vmg_tag, spa, flags);
    502	}
    503
    504	if (pdev)
    505		pci_dev_put(pdev);
    506}
    507
    508static void amd_iommu_report_rmp_fault(struct amd_iommu *iommu, volatile u32 *event)
    509{
    510	struct iommu_dev_data *dev_data = NULL;
    511	int devid, flags_rmp, vmg_tag, flags;
    512	struct pci_dev *pdev;
    513	u64 gpa;
    514
    515	devid     = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
    516	flags_rmp = (event[0] >> EVENT_FLAGS_SHIFT) & 0xFF;
    517	vmg_tag   = (event[1]) & 0xFFFF;
    518	flags     = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
    519	gpa       = ((u64)event[3] << 32) | event[2];
    520
    521	pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid),
    522					   devid & 0xff);
    523	if (pdev)
    524		dev_data = dev_iommu_priv_get(&pdev->dev);
    525
    526	if (dev_data) {
    527		if (__ratelimit(&dev_data->rs)) {
    528			pci_err(pdev, "Event logged [RMP_PAGE_FAULT vmg_tag=0x%04x, gpa=0x%llx, flags_rmp=0x%04x, flags=0x%04x]\n",
    529				vmg_tag, gpa, flags_rmp, flags);
    530		}
    531	} else {
    532		pr_err_ratelimited("Event logged [RMP_PAGE_FAULT device=%04x:%02x:%02x.%x, vmg_tag=0x%04x, gpa=0x%llx, flags_rmp=0x%04x, flags=0x%04x]\n",
    533			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
    534			vmg_tag, gpa, flags_rmp, flags);
    535	}
    536
    537	if (pdev)
    538		pci_dev_put(pdev);
    539}
    540
    541#define IS_IOMMU_MEM_TRANSACTION(flags)		\
    542	(((flags) & EVENT_FLAG_I) == 0)
    543
    544#define IS_WRITE_REQUEST(flags)			\
    545	((flags) & EVENT_FLAG_RW)
    546
    547static void amd_iommu_report_page_fault(struct amd_iommu *iommu,
    548					u16 devid, u16 domain_id,
    549					u64 address, int flags)
    550{
    551	struct iommu_dev_data *dev_data = NULL;
    552	struct pci_dev *pdev;
    553
    554	pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid),
    555					   devid & 0xff);
    556	if (pdev)
    557		dev_data = dev_iommu_priv_get(&pdev->dev);
    558
    559	if (dev_data) {
    560		/*
    561		 * If this is a DMA fault (for which the I(nterrupt)
    562		 * bit will be unset), allow report_iommu_fault() to
    563		 * prevent logging it.
    564		 */
    565		if (IS_IOMMU_MEM_TRANSACTION(flags)) {
    566			if (!report_iommu_fault(&dev_data->domain->domain,
    567						&pdev->dev, address,
    568						IS_WRITE_REQUEST(flags) ?
    569							IOMMU_FAULT_WRITE :
    570							IOMMU_FAULT_READ))
    571				goto out;
    572		}
    573
    574		if (__ratelimit(&dev_data->rs)) {
    575			pci_err(pdev, "Event logged [IO_PAGE_FAULT domain=0x%04x address=0x%llx flags=0x%04x]\n",
    576				domain_id, address, flags);
    577		}
    578	} else {
    579		pr_err_ratelimited("Event logged [IO_PAGE_FAULT device=%04x:%02x:%02x.%x domain=0x%04x address=0x%llx flags=0x%04x]\n",
    580			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
    581			domain_id, address, flags);
    582	}
    583
    584out:
    585	if (pdev)
    586		pci_dev_put(pdev);
    587}
    588
    589static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
    590{
    591	struct device *dev = iommu->iommu.dev;
    592	int type, devid, flags, tag;
    593	volatile u32 *event = __evt;
    594	int count = 0;
    595	u64 address;
    596	u32 pasid;
    597
    598retry:
    599	type    = (event[1] >> EVENT_TYPE_SHIFT)  & EVENT_TYPE_MASK;
    600	devid   = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
    601	pasid   = (event[0] & EVENT_DOMID_MASK_HI) |
    602		  (event[1] & EVENT_DOMID_MASK_LO);
    603	flags   = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
    604	address = (u64)(((u64)event[3]) << 32) | event[2];
    605
    606	if (type == 0) {
    607		/* Did we hit the erratum? */
    608		if (++count == LOOP_TIMEOUT) {
    609			pr_err("No event written to event log\n");
    610			return;
    611		}
    612		udelay(1);
    613		goto retry;
    614	}
    615
    616	if (type == EVENT_TYPE_IO_FAULT) {
    617		amd_iommu_report_page_fault(iommu, devid, pasid, address, flags);
    618		return;
    619	}
    620
    621	switch (type) {
    622	case EVENT_TYPE_ILL_DEV:
    623		dev_err(dev, "Event logged [ILLEGAL_DEV_TABLE_ENTRY device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n",
    624			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
    625			pasid, address, flags);
    626		dump_dte_entry(iommu, devid);
    627		break;
    628	case EVENT_TYPE_DEV_TAB_ERR:
    629		dev_err(dev, "Event logged [DEV_TAB_HARDWARE_ERROR device=%04x:%02x:%02x.%x "
    630			"address=0x%llx flags=0x%04x]\n",
    631			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
    632			address, flags);
    633		break;
    634	case EVENT_TYPE_PAGE_TAB_ERR:
    635		dev_err(dev, "Event logged [PAGE_TAB_HARDWARE_ERROR device=%04x:%02x:%02x.%x pasid=0x%04x address=0x%llx flags=0x%04x]\n",
    636			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
    637			pasid, address, flags);
    638		break;
    639	case EVENT_TYPE_ILL_CMD:
    640		dev_err(dev, "Event logged [ILLEGAL_COMMAND_ERROR address=0x%llx]\n", address);
    641		dump_command(address);
    642		break;
    643	case EVENT_TYPE_CMD_HARD_ERR:
    644		dev_err(dev, "Event logged [COMMAND_HARDWARE_ERROR address=0x%llx flags=0x%04x]\n",
    645			address, flags);
    646		break;
    647	case EVENT_TYPE_IOTLB_INV_TO:
    648		dev_err(dev, "Event logged [IOTLB_INV_TIMEOUT device=%04x:%02x:%02x.%x address=0x%llx]\n",
    649			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
    650			address);
    651		break;
    652	case EVENT_TYPE_INV_DEV_REQ:
    653		dev_err(dev, "Event logged [INVALID_DEVICE_REQUEST device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n",
    654			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
    655			pasid, address, flags);
    656		break;
    657	case EVENT_TYPE_RMP_FAULT:
    658		amd_iommu_report_rmp_fault(iommu, event);
    659		break;
    660	case EVENT_TYPE_RMP_HW_ERR:
    661		amd_iommu_report_rmp_hw_error(iommu, event);
    662		break;
    663	case EVENT_TYPE_INV_PPR_REQ:
    664		pasid = PPR_PASID(*((u64 *)__evt));
    665		tag = event[1] & 0x03FF;
    666		dev_err(dev, "Event logged [INVALID_PPR_REQUEST device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x tag=0x%03x]\n",
    667			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
    668			pasid, address, flags, tag);
    669		break;
    670	default:
    671		dev_err(dev, "Event logged [UNKNOWN event[0]=0x%08x event[1]=0x%08x event[2]=0x%08x event[3]=0x%08x\n",
    672			event[0], event[1], event[2], event[3]);
    673	}
    674
    675	memset(__evt, 0, 4 * sizeof(u32));
    676}
    677
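        /* Drain and print all pending entries of the hardware event log. */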
    678static void iommu_poll_events(struct amd_iommu *iommu)
    679{
    680	u32 head, tail;
    681
    682	head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
    683	tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
    684
    685	while (head != tail) {
    686		iommu_print_event(iommu, iommu->evt_buf + head);
    687		head = (head + EVENT_ENTRY_SIZE) % EVT_BUFFER_SIZE;
    688	}
    689
    690	writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
    691}
    692
    693static void iommu_handle_ppr_entry(struct amd_iommu *iommu, u64 *raw)
    694{
    695	struct amd_iommu_fault fault;
    696
    697	if (PPR_REQ_TYPE(raw[0]) != PPR_REQ_FAULT) {
    698		pr_err_ratelimited("Unknown PPR request received\n");
    699		return;
    700	}
    701
    702	fault.address   = raw[1];
    703	fault.pasid     = PPR_PASID(raw[0]);
    704	fault.sbdf      = PCI_SEG_DEVID_TO_SBDF(iommu->pci_seg->id, PPR_DEVID(raw[0]));
    705	fault.tag       = PPR_TAG(raw[0]);
    706	fault.flags     = PPR_FLAGS(raw[0]);
    707
    708	atomic_notifier_call_chain(&ppr_notifier, 0, &fault);
    709}
    710
    711static void iommu_poll_ppr_log(struct amd_iommu *iommu)
    712{
    713	u32 head, tail;
    714
    715	if (iommu->ppr_log == NULL)
    716		return;
    717
    718	head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
    719	tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
    720
    721	while (head != tail) {
    722		volatile u64 *raw;
    723		u64 entry[2];
    724		int i;
    725
    726		raw = (u64 *)(iommu->ppr_log + head);
    727
    728		/*
    729		 * Hardware bug: Interrupt may arrive before the entry is
     730		 * written to memory. If this happens, we need to wait for the
    731		 * entry to arrive.
    732		 */
    733		for (i = 0; i < LOOP_TIMEOUT; ++i) {
    734			if (PPR_REQ_TYPE(raw[0]) != 0)
    735				break;
    736			udelay(1);
    737		}
    738
    739		/* Avoid memcpy function-call overhead */
    740		entry[0] = raw[0];
    741		entry[1] = raw[1];
    742
    743		/*
    744		 * To detect the hardware bug we need to clear the entry
    745		 * back to zero.
    746		 */
    747		raw[0] = raw[1] = 0UL;
    748
    749		/* Update head pointer of hardware ring-buffer */
    750		head = (head + PPR_ENTRY_SIZE) % PPR_LOG_SIZE;
    751		writel(head, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
    752
    753		/* Handle PPR entry */
    754		iommu_handle_ppr_entry(iommu, entry);
    755
    756		/* Refresh ring-buffer information */
    757		head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
    758		tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
    759	}
    760}
    761
    762#ifdef CONFIG_IRQ_REMAP
    763static int (*iommu_ga_log_notifier)(u32);
    764
    765int amd_iommu_register_ga_log_notifier(int (*notifier)(u32))
    766{
    767	iommu_ga_log_notifier = notifier;
    768
    769	return 0;
    770}
    771EXPORT_SYMBOL(amd_iommu_register_ga_log_notifier);
    772
    773static void iommu_poll_ga_log(struct amd_iommu *iommu)
    774{
    775	u32 head, tail, cnt = 0;
    776
    777	if (iommu->ga_log == NULL)
    778		return;
    779
    780	head = readl(iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
    781	tail = readl(iommu->mmio_base + MMIO_GA_TAIL_OFFSET);
    782
    783	while (head != tail) {
    784		volatile u64 *raw;
    785		u64 log_entry;
    786
    787		raw = (u64 *)(iommu->ga_log + head);
    788		cnt++;
    789
    790		/* Avoid memcpy function-call overhead */
    791		log_entry = *raw;
    792
    793		/* Update head pointer of hardware ring-buffer */
    794		head = (head + GA_ENTRY_SIZE) % GA_LOG_SIZE;
    795		writel(head, iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
    796
    797		/* Handle GA entry */
    798		switch (GA_REQ_TYPE(log_entry)) {
    799		case GA_GUEST_NR:
    800			if (!iommu_ga_log_notifier)
    801				break;
    802
    803			pr_debug("%s: devid=%#x, ga_tag=%#x\n",
    804				 __func__, GA_DEVID(log_entry),
    805				 GA_TAG(log_entry));
    806
    807			if (iommu_ga_log_notifier(GA_TAG(log_entry)) != 0)
    808				pr_err("GA log notifier failed.\n");
    809			break;
    810		default:
    811			break;
    812		}
    813	}
    814}
    815
    816static void
    817amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu)
    818{
    819	if (!irq_remapping_enabled || !dev_is_pci(dev) ||
    820	    pci_dev_has_special_msi_domain(to_pci_dev(dev)))
    821		return;
    822
    823	dev_set_msi_domain(dev, iommu->msi_domain);
    824}
    825
    826#else /* CONFIG_IRQ_REMAP */
    827static inline void
    828amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu) { }
    829#endif /* !CONFIG_IRQ_REMAP */
    830
    831#define AMD_IOMMU_INT_MASK	\
    832	(MMIO_STATUS_EVT_OVERFLOW_INT_MASK | \
    833	 MMIO_STATUS_EVT_INT_MASK | \
    834	 MMIO_STATUS_PPR_INT_MASK | \
    835	 MMIO_STATUS_GALOG_INT_MASK)
    836
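        /*
         * Threaded interrupt handler: re-arm the interrupt sources and drain
         * the event, PPR and (with IRQ remapping) GA logs until the status
         * register reads back clear (see the ERBT1312 workaround below).
         */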
    837irqreturn_t amd_iommu_int_thread(int irq, void *data)
    838{
    839	struct amd_iommu *iommu = (struct amd_iommu *) data;
    840	u32 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
    841
    842	while (status & AMD_IOMMU_INT_MASK) {
    843		/* Enable interrupt sources again */
    844		writel(AMD_IOMMU_INT_MASK,
    845			iommu->mmio_base + MMIO_STATUS_OFFSET);
    846
    847		if (status & MMIO_STATUS_EVT_INT_MASK) {
    848			pr_devel("Processing IOMMU Event Log\n");
    849			iommu_poll_events(iommu);
    850		}
    851
    852		if (status & MMIO_STATUS_PPR_INT_MASK) {
    853			pr_devel("Processing IOMMU PPR Log\n");
    854			iommu_poll_ppr_log(iommu);
    855		}
    856
    857#ifdef CONFIG_IRQ_REMAP
    858		if (status & MMIO_STATUS_GALOG_INT_MASK) {
    859			pr_devel("Processing IOMMU GA Log\n");
    860			iommu_poll_ga_log(iommu);
    861		}
    862#endif
    863
    864		if (status & MMIO_STATUS_EVT_OVERFLOW_INT_MASK) {
    865			pr_info_ratelimited("IOMMU event log overflow\n");
    866			amd_iommu_restart_event_logging(iommu);
    867		}
    868
    869		/*
    870		 * Hardware bug: ERBT1312
     871		 * When re-enabling the interrupt (by writing 1
    872		 * to clear the bit), the hardware might also try to set
    873		 * the interrupt bit in the event status register.
     874		 * In this scenario, the bit stays set and disables
    875		 * subsequent interrupts.
    876		 *
    877		 * Workaround: The IOMMU driver should read back the
    878		 * status register and check if the interrupt bits are cleared.
     879		 * If not, the driver needs to go through the interrupt handler
     880		 * again and re-clear the bits.
    881		 */
    882		status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
    883	}
    884	return IRQ_HANDLED;
    885}
    886
    887irqreturn_t amd_iommu_int_handler(int irq, void *data)
    888{
    889	return IRQ_WAKE_THREAD;
    890}
    891
    892/****************************************************************************
    893 *
    894 * IOMMU command queuing functions
    895 *
    896 ****************************************************************************/
    897
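        /* Busy-wait until the completion-wait semaphore reaches @data, or time out. */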
    898static int wait_on_sem(struct amd_iommu *iommu, u64 data)
    899{
    900	int i = 0;
    901
    902	while (*iommu->cmd_sem != data && i < LOOP_TIMEOUT) {
    903		udelay(1);
    904		i += 1;
    905	}
    906
    907	if (i == LOOP_TIMEOUT) {
    908		pr_alert("Completion-Wait loop timed out\n");
    909		return -EIO;
    910	}
    911
    912	return 0;
    913}
    914
    915static void copy_cmd_to_buffer(struct amd_iommu *iommu,
    916			       struct iommu_cmd *cmd)
    917{
    918	u8 *target;
    919	u32 tail;
    920
    921	/* Copy command to buffer */
    922	tail = iommu->cmd_buf_tail;
    923	target = iommu->cmd_buf + tail;
    924	memcpy(target, cmd, sizeof(*cmd));
    925
    926	tail = (tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
    927	iommu->cmd_buf_tail = tail;
    928
    929	/* Tell the IOMMU about it */
    930	writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
    931}
    932
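        /*
         * Build a COMPLETION_WAIT command that makes the IOMMU store @data to
         * the per-IOMMU semaphore once all preceding commands have completed;
         * paired with wait_on_sem() above.
         */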
    933static void build_completion_wait(struct iommu_cmd *cmd,
    934				  struct amd_iommu *iommu,
    935				  u64 data)
    936{
    937	u64 paddr = iommu_virt_to_phys((void *)iommu->cmd_sem);
    938
    939	memset(cmd, 0, sizeof(*cmd));
    940	cmd->data[0] = lower_32_bits(paddr) | CMD_COMPL_WAIT_STORE_MASK;
    941	cmd->data[1] = upper_32_bits(paddr);
    942	cmd->data[2] = data;
    943	CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
    944}
    945
    946static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
    947{
    948	memset(cmd, 0, sizeof(*cmd));
    949	cmd->data[0] = devid;
    950	CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
    951}
    952
    953/*
    954 * Builds an invalidation address which is suitable for one page or multiple
     955 * pages. Sets the size bit (S) as needed if more than one page is flushed.
    956 */
    957static inline u64 build_inv_address(u64 address, size_t size)
    958{
    959	u64 pages, end, msb_diff;
    960
    961	pages = iommu_num_pages(address, size, PAGE_SIZE);
    962
    963	if (pages == 1)
    964		return address & PAGE_MASK;
    965
    966	end = address + size - 1;
    967
    968	/*
     969	 * msb_diff holds the index of the most significant bit that
    970	 * flipped between the start and end.
    971	 */
    972	msb_diff = fls64(end ^ address) - 1;
    973
    974	/*
    975	 * Bits 63:52 are sign extended. If for some reason bit 51 is different
    976	 * between the start and the end, invalidate everything.
    977	 */
    978	if (unlikely(msb_diff > 51)) {
    979		address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
    980	} else {
    981		/*
    982		 * The msb-bit must be clear on the address. Just set all the
    983		 * lower bits.
    984		 */
    985		address |= (1ull << msb_diff) - 1;
    986	}
    987
    988	/* Clear bits 11:0 */
    989	address &= PAGE_MASK;
    990
    991	/* Set the size bit - we flush more than one 4kb page */
    992	return address | CMD_INV_IOMMU_PAGES_SIZE_MASK;
    993}
    994
    995static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
    996				  size_t size, u16 domid, int pde)
    997{
    998	u64 inv_address = build_inv_address(address, size);
    999
   1000	memset(cmd, 0, sizeof(*cmd));
   1001	cmd->data[1] |= domid;
   1002	cmd->data[2]  = lower_32_bits(inv_address);
   1003	cmd->data[3]  = upper_32_bits(inv_address);
   1004	CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
   1005	if (pde) /* PDE bit - we want to flush everything, not only the PTEs */
   1006		cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
   1007}
   1008
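        /*
         * Build an INVALIDATE_IOTLB_PAGES command targeting the on-device
         * (ATS) TLB of @devid, using the device's ATS queue depth @qdep.
         */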
   1009static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
   1010				  u64 address, size_t size)
   1011{
   1012	u64 inv_address = build_inv_address(address, size);
   1013
   1014	memset(cmd, 0, sizeof(*cmd));
   1015	cmd->data[0]  = devid;
   1016	cmd->data[0] |= (qdep & 0xff) << 24;
   1017	cmd->data[1]  = devid;
   1018	cmd->data[2]  = lower_32_bits(inv_address);
   1019	cmd->data[3]  = upper_32_bits(inv_address);
   1020	CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
   1021}
   1022
   1023static void build_inv_iommu_pasid(struct iommu_cmd *cmd, u16 domid, u32 pasid,
   1024				  u64 address, bool size)
   1025{
   1026	memset(cmd, 0, sizeof(*cmd));
   1027
   1028	address &= ~(0xfffULL);
   1029
   1030	cmd->data[0]  = pasid;
   1031	cmd->data[1]  = domid;
   1032	cmd->data[2]  = lower_32_bits(address);
   1033	cmd->data[3]  = upper_32_bits(address);
   1034	cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
   1035	cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
   1036	if (size)
   1037		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
   1038	CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
   1039}
   1040
   1041static void build_inv_iotlb_pasid(struct iommu_cmd *cmd, u16 devid, u32 pasid,
   1042				  int qdep, u64 address, bool size)
   1043{
   1044	memset(cmd, 0, sizeof(*cmd));
   1045
   1046	address &= ~(0xfffULL);
   1047
   1048	cmd->data[0]  = devid;
   1049	cmd->data[0] |= ((pasid >> 8) & 0xff) << 16;
   1050	cmd->data[0] |= (qdep  & 0xff) << 24;
   1051	cmd->data[1]  = devid;
   1052	cmd->data[1] |= (pasid & 0xff) << 16;
   1053	cmd->data[2]  = lower_32_bits(address);
   1054	cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
   1055	cmd->data[3]  = upper_32_bits(address);
   1056	if (size)
   1057		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
   1058	CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
   1059}
   1060
   1061static void build_complete_ppr(struct iommu_cmd *cmd, u16 devid, u32 pasid,
   1062			       int status, int tag, bool gn)
   1063{
   1064	memset(cmd, 0, sizeof(*cmd));
   1065
   1066	cmd->data[0]  = devid;
   1067	if (gn) {
   1068		cmd->data[1]  = pasid;
   1069		cmd->data[2]  = CMD_INV_IOMMU_PAGES_GN_MASK;
   1070	}
   1071	cmd->data[3]  = tag & 0x1ff;
   1072	cmd->data[3] |= (status & PPR_STATUS_MASK) << PPR_STATUS_SHIFT;
   1073
   1074	CMD_SET_TYPE(cmd, CMD_COMPLETE_PPR);
   1075}
   1076
   1077static void build_inv_all(struct iommu_cmd *cmd)
   1078{
   1079	memset(cmd, 0, sizeof(*cmd));
   1080	CMD_SET_TYPE(cmd, CMD_INV_ALL);
   1081}
   1082
   1083static void build_inv_irt(struct iommu_cmd *cmd, u16 devid)
   1084{
   1085	memset(cmd, 0, sizeof(*cmd));
   1086	cmd->data[0] = devid;
   1087	CMD_SET_TYPE(cmd, CMD_INV_IRT);
   1088}
   1089
   1090/*
    1091 * Writes the command to the IOMMU's command buffer and informs the
   1092 * hardware about the new command.
   1093 */
   1094static int __iommu_queue_command_sync(struct amd_iommu *iommu,
   1095				      struct iommu_cmd *cmd,
   1096				      bool sync)
   1097{
   1098	unsigned int count = 0;
   1099	u32 left, next_tail;
   1100
   1101	next_tail = (iommu->cmd_buf_tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
   1102again:
   1103	left      = (iommu->cmd_buf_head - next_tail) % CMD_BUFFER_SIZE;
   1104
   1105	if (left <= 0x20) {
   1106		/* Skip udelay() the first time around */
   1107		if (count++) {
   1108			if (count == LOOP_TIMEOUT) {
   1109				pr_err("Command buffer timeout\n");
   1110				return -EIO;
   1111			}
   1112
   1113			udelay(1);
   1114		}
   1115
   1116		/* Update head and recheck remaining space */
   1117		iommu->cmd_buf_head = readl(iommu->mmio_base +
   1118					    MMIO_CMD_HEAD_OFFSET);
   1119
   1120		goto again;
   1121	}
   1122
   1123	copy_cmd_to_buffer(iommu, cmd);
   1124
   1125	/* Do we need to make sure all commands are processed? */
   1126	iommu->need_sync = sync;
   1127
   1128	return 0;
   1129}
   1130
   1131static int iommu_queue_command_sync(struct amd_iommu *iommu,
   1132				    struct iommu_cmd *cmd,
   1133				    bool sync)
   1134{
   1135	unsigned long flags;
   1136	int ret;
   1137
   1138	raw_spin_lock_irqsave(&iommu->lock, flags);
   1139	ret = __iommu_queue_command_sync(iommu, cmd, sync);
   1140	raw_spin_unlock_irqrestore(&iommu->lock, flags);
   1141
   1142	return ret;
   1143}
   1144
   1145static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
   1146{
   1147	return iommu_queue_command_sync(iommu, cmd, true);
   1148}
   1149
   1150/*
   1151 * This function queues a completion wait command into the command
   1152 * buffer of an IOMMU
   1153 */
   1154static int iommu_completion_wait(struct amd_iommu *iommu)
   1155{
   1156	struct iommu_cmd cmd;
   1157	unsigned long flags;
   1158	int ret;
   1159	u64 data;
   1160
   1161	if (!iommu->need_sync)
   1162		return 0;
   1163
   1164	raw_spin_lock_irqsave(&iommu->lock, flags);
   1165
   1166	data = ++iommu->cmd_sem_val;
   1167	build_completion_wait(&cmd, iommu, data);
   1168
   1169	ret = __iommu_queue_command_sync(iommu, &cmd, false);
   1170	if (ret)
   1171		goto out_unlock;
   1172
   1173	ret = wait_on_sem(iommu, data);
   1174
   1175out_unlock:
   1176	raw_spin_unlock_irqrestore(&iommu->lock, flags);
   1177
   1178	return ret;
   1179}
   1180
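        /*
         * Typical submission pattern for the build_* helpers above: build a
         * command, queue it, then wait for the IOMMU to process it, e.g.
         *
         *	struct iommu_cmd cmd;
         *
         *	build_inv_dte(&cmd, devid);
         *	iommu_queue_command(iommu, &cmd);
         *	iommu_completion_wait(iommu);
         */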
   1181static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
   1182{
   1183	struct iommu_cmd cmd;
   1184
   1185	build_inv_dte(&cmd, devid);
   1186
   1187	return iommu_queue_command(iommu, &cmd);
   1188}
   1189
   1190static void amd_iommu_flush_dte_all(struct amd_iommu *iommu)
   1191{
   1192	u32 devid;
   1193	u16 last_bdf = iommu->pci_seg->last_bdf;
   1194
   1195	for (devid = 0; devid <= last_bdf; ++devid)
   1196		iommu_flush_dte(iommu, devid);
   1197
   1198	iommu_completion_wait(iommu);
   1199}
   1200
   1201/*
   1202 * This function uses heavy locking and may disable irqs for some time. But
   1203 * this is no issue because it is only called during resume.
   1204 */
   1205static void amd_iommu_flush_tlb_all(struct amd_iommu *iommu)
   1206{
   1207	u32 dom_id;
   1208	u16 last_bdf = iommu->pci_seg->last_bdf;
   1209
   1210	for (dom_id = 0; dom_id <= last_bdf; ++dom_id) {
   1211		struct iommu_cmd cmd;
   1212		build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
   1213				      dom_id, 1);
   1214		iommu_queue_command(iommu, &cmd);
   1215	}
   1216
   1217	iommu_completion_wait(iommu);
   1218}
   1219
   1220static void amd_iommu_flush_tlb_domid(struct amd_iommu *iommu, u32 dom_id)
   1221{
   1222	struct iommu_cmd cmd;
   1223
   1224	build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
   1225			      dom_id, 1);
   1226	iommu_queue_command(iommu, &cmd);
   1227
   1228	iommu_completion_wait(iommu);
   1229}
   1230
   1231static void amd_iommu_flush_all(struct amd_iommu *iommu)
   1232{
   1233	struct iommu_cmd cmd;
   1234
   1235	build_inv_all(&cmd);
   1236
   1237	iommu_queue_command(iommu, &cmd);
   1238	iommu_completion_wait(iommu);
   1239}
   1240
   1241static void iommu_flush_irt(struct amd_iommu *iommu, u16 devid)
   1242{
   1243	struct iommu_cmd cmd;
   1244
   1245	build_inv_irt(&cmd, devid);
   1246
   1247	iommu_queue_command(iommu, &cmd);
   1248}
   1249
   1250static void amd_iommu_flush_irt_all(struct amd_iommu *iommu)
   1251{
   1252	u32 devid;
   1253	u16 last_bdf = iommu->pci_seg->last_bdf;
   1254
   1255	for (devid = 0; devid <= last_bdf; devid++)
   1256		iommu_flush_irt(iommu, devid);
   1257
   1258	iommu_completion_wait(iommu);
   1259}
   1260
   1261void iommu_flush_all_caches(struct amd_iommu *iommu)
   1262{
   1263	if (iommu_feature(iommu, FEATURE_IA)) {
   1264		amd_iommu_flush_all(iommu);
   1265	} else {
   1266		amd_iommu_flush_dte_all(iommu);
   1267		amd_iommu_flush_irt_all(iommu);
   1268		amd_iommu_flush_tlb_all(iommu);
   1269	}
   1270}
   1271
   1272/*
   1273 * Command send function for flushing on-device TLB
   1274 */
   1275static int device_flush_iotlb(struct iommu_dev_data *dev_data,
   1276			      u64 address, size_t size)
   1277{
   1278	struct amd_iommu *iommu;
   1279	struct iommu_cmd cmd;
   1280	int qdep;
   1281
   1282	qdep     = dev_data->ats.qdep;
   1283	iommu    = rlookup_amd_iommu(dev_data->dev);
   1284	if (!iommu)
   1285		return -EINVAL;
   1286
   1287	build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, address, size);
   1288
   1289	return iommu_queue_command(iommu, &cmd);
   1290}
   1291
   1292static int device_flush_dte_alias(struct pci_dev *pdev, u16 alias, void *data)
   1293{
   1294	struct amd_iommu *iommu = data;
   1295
   1296	return iommu_flush_dte(iommu, alias);
   1297}
   1298
   1299/*
   1300 * Command send function for invalidating a device table entry
   1301 */
   1302static int device_flush_dte(struct iommu_dev_data *dev_data)
   1303{
   1304	struct amd_iommu *iommu;
   1305	struct pci_dev *pdev = NULL;
   1306	struct amd_iommu_pci_seg *pci_seg;
   1307	u16 alias;
   1308	int ret;
   1309
   1310	iommu = rlookup_amd_iommu(dev_data->dev);
   1311	if (!iommu)
   1312		return -EINVAL;
   1313
   1314	if (dev_is_pci(dev_data->dev))
   1315		pdev = to_pci_dev(dev_data->dev);
   1316
   1317	if (pdev)
   1318		ret = pci_for_each_dma_alias(pdev,
   1319					     device_flush_dte_alias, iommu);
   1320	else
   1321		ret = iommu_flush_dte(iommu, dev_data->devid);
   1322	if (ret)
   1323		return ret;
   1324
   1325	pci_seg = iommu->pci_seg;
   1326	alias = pci_seg->alias_table[dev_data->devid];
   1327	if (alias != dev_data->devid) {
   1328		ret = iommu_flush_dte(iommu, alias);
   1329		if (ret)
   1330			return ret;
   1331	}
   1332
   1333	if (dev_data->ats.enabled)
   1334		ret = device_flush_iotlb(dev_data, 0, ~0UL);
   1335
   1336	return ret;
   1337}
   1338
   1339/*
   1340 * TLB invalidation function which is called from the mapping functions.
   1341 * It invalidates a single PTE if the range to flush is within a single
   1342 * page. Otherwise it flushes the whole TLB of the IOMMU.
   1343 */
   1344static void __domain_flush_pages(struct protection_domain *domain,
   1345				 u64 address, size_t size, int pde)
   1346{
   1347	struct iommu_dev_data *dev_data;
   1348	struct iommu_cmd cmd;
   1349	int ret = 0, i;
   1350
   1351	build_inv_iommu_pages(&cmd, address, size, domain->id, pde);
   1352
   1353	for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
   1354		if (!domain->dev_iommu[i])
   1355			continue;
   1356
   1357		/*
   1358		 * Devices of this domain are behind this IOMMU
   1359		 * We need a TLB flush
   1360		 */
   1361		ret |= iommu_queue_command(amd_iommus[i], &cmd);
   1362	}
   1363
   1364	list_for_each_entry(dev_data, &domain->dev_list, list) {
   1365
   1366		if (!dev_data->ats.enabled)
   1367			continue;
   1368
   1369		ret |= device_flush_iotlb(dev_data, address, size);
   1370	}
   1371
   1372	WARN_ON(ret);
   1373}
   1374
   1375static void domain_flush_pages(struct protection_domain *domain,
   1376			       u64 address, size_t size, int pde)
   1377{
   1378	if (likely(!amd_iommu_np_cache)) {
   1379		__domain_flush_pages(domain, address, size, pde);
   1380		return;
   1381	}
   1382
   1383	/*
   1384	 * When NpCache is on, we infer that we run in a VM and use a vIOMMU.
   1385	 * In such setups it is best to avoid flushes of ranges which are not
   1386	 * naturally aligned, since it would lead to flushes of unmodified
   1387	 * PTEs. Such flushes would require the hypervisor to do more work than
   1388	 * necessary. Therefore, perform repeated flushes of aligned ranges
    1389	 * until the whole range is covered. Each iteration flushes the smaller
    1390	 * of the natural alignment of the address that we flush and the
   1391	 * greatest naturally aligned region that fits in the range.
   1392	 */
   1393	while (size != 0) {
   1394		int addr_alignment = __ffs(address);
   1395		int size_alignment = __fls(size);
   1396		int min_alignment;
   1397		size_t flush_size;
   1398
   1399		/*
   1400		 * size is always non-zero, but address might be zero, causing
   1401		 * addr_alignment to be negative. As the casting of the
   1402		 * argument in __ffs(address) to long might trim the high bits
   1403		 * of the address on x86-32, cast to long when doing the check.
   1404		 */
   1405		if (likely((unsigned long)address != 0))
   1406			min_alignment = min(addr_alignment, size_alignment);
   1407		else
   1408			min_alignment = size_alignment;
   1409
   1410		flush_size = 1ul << min_alignment;
   1411
   1412		__domain_flush_pages(domain, address, flush_size, pde);
   1413		address += flush_size;
   1414		size -= flush_size;
   1415	}
   1416}
   1417
   1418/* Flush the whole IO/TLB for a given protection domain - including PDE */
   1419void amd_iommu_domain_flush_tlb_pde(struct protection_domain *domain)
   1420{
   1421	domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
   1422}
   1423
   1424void amd_iommu_domain_flush_complete(struct protection_domain *domain)
   1425{
   1426	int i;
   1427
   1428	for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
   1429		if (domain && !domain->dev_iommu[i])
   1430			continue;
   1431
   1432		/*
   1433		 * Devices of this domain are behind this IOMMU
   1434		 * We need to wait for completion of all commands.
   1435		 */
   1436		iommu_completion_wait(amd_iommus[i]);
   1437	}
   1438}
   1439
   1440/* Flush the not present cache if it exists */
   1441static void domain_flush_np_cache(struct protection_domain *domain,
   1442		dma_addr_t iova, size_t size)
   1443{
   1444	if (unlikely(amd_iommu_np_cache)) {
   1445		unsigned long flags;
   1446
   1447		spin_lock_irqsave(&domain->lock, flags);
   1448		domain_flush_pages(domain, iova, size, 1);
   1449		amd_iommu_domain_flush_complete(domain);
   1450		spin_unlock_irqrestore(&domain->lock, flags);
   1451	}
   1452}
   1453
   1454
   1455/*
    1456 * This function flushes the DTEs for all devices in the domain
   1457 */
   1458static void domain_flush_devices(struct protection_domain *domain)
   1459{
   1460	struct iommu_dev_data *dev_data;
   1461
   1462	list_for_each_entry(dev_data, &domain->dev_list, list)
   1463		device_flush_dte(dev_data);
   1464}
   1465
   1466/****************************************************************************
   1467 *
   1468 * The next functions belong to the domain allocation. A domain is
   1469 * allocated for every IOMMU as the default domain. If device isolation
    1470 * is enabled, every device gets its own domain. The most important thing
   1471 * about domains is the page table mapping the DMA address space they
   1472 * contain.
   1473 *
   1474 ****************************************************************************/
   1475
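        /*
         * Allocate a protection domain ID from the global bitmap.  ID 0 is
         * reserved; 0 is returned when no free ID is available.
         */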
   1476static u16 domain_id_alloc(void)
   1477{
   1478	int id;
   1479
   1480	spin_lock(&pd_bitmap_lock);
   1481	id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
   1482	BUG_ON(id == 0);
   1483	if (id > 0 && id < MAX_DOMAIN_ID)
   1484		__set_bit(id, amd_iommu_pd_alloc_bitmap);
   1485	else
   1486		id = 0;
   1487	spin_unlock(&pd_bitmap_lock);
   1488
   1489	return id;
   1490}
   1491
   1492static void domain_id_free(int id)
   1493{
   1494	spin_lock(&pd_bitmap_lock);
   1495	if (id > 0 && id < MAX_DOMAIN_ID)
   1496		__clear_bit(id, amd_iommu_pd_alloc_bitmap);
   1497	spin_unlock(&pd_bitmap_lock);
   1498}
   1499
   1500static void free_gcr3_tbl_level1(u64 *tbl)
   1501{
   1502	u64 *ptr;
   1503	int i;
   1504
   1505	for (i = 0; i < 512; ++i) {
   1506		if (!(tbl[i] & GCR3_VALID))
   1507			continue;
   1508
   1509		ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK);
   1510
   1511		free_page((unsigned long)ptr);
   1512	}
   1513}
   1514
   1515static void free_gcr3_tbl_level2(u64 *tbl)
   1516{
   1517	u64 *ptr;
   1518	int i;
   1519
   1520	for (i = 0; i < 512; ++i) {
   1521		if (!(tbl[i] & GCR3_VALID))
   1522			continue;
   1523
   1524		ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK);
   1525
   1526		free_gcr3_tbl_level1(ptr);
   1527	}
   1528}
   1529
   1530static void free_gcr3_table(struct protection_domain *domain)
   1531{
   1532	if (domain->glx == 2)
   1533		free_gcr3_tbl_level2(domain->gcr3_tbl);
   1534	else if (domain->glx == 1)
   1535		free_gcr3_tbl_level1(domain->gcr3_tbl);
   1536	else
   1537		BUG_ON(domain->glx != 0);
   1538
   1539	free_page((unsigned long)domain->gcr3_tbl);
   1540}
   1541
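        /*
         * Write the device table entry for @devid: page-table root and mode,
         * permission bits, domain ID and, when applicable, the ATS, PPR and
         * GCR3 (IOMMUv2) fields.  If the entry previously carried a different
         * domain ID, the stale translations for that ID are flushed.
         */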
   1542static void set_dte_entry(struct amd_iommu *iommu, u16 devid,
   1543			  struct protection_domain *domain, bool ats, bool ppr)
   1544{
   1545	u64 pte_root = 0;
   1546	u64 flags = 0;
   1547	u32 old_domid;
   1548	struct dev_table_entry *dev_table = get_dev_table(iommu);
   1549
   1550	if (domain->iop.mode != PAGE_MODE_NONE)
   1551		pte_root = iommu_virt_to_phys(domain->iop.root);
   1552
   1553	pte_root |= (domain->iop.mode & DEV_ENTRY_MODE_MASK)
   1554		    << DEV_ENTRY_MODE_SHIFT;
   1555
   1556	pte_root |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V;
   1557
   1558	/*
    1559	 * When SNP is enabled, only set the TV bit when IOMMU
   1560	 * page translation is in use.
   1561	 */
   1562	if (!amd_iommu_snp_en || (domain->id != 0))
   1563		pte_root |= DTE_FLAG_TV;
   1564
   1565	flags = dev_table[devid].data[1];
   1566
   1567	if (ats)
   1568		flags |= DTE_FLAG_IOTLB;
   1569
   1570	if (ppr) {
   1571		if (iommu_feature(iommu, FEATURE_EPHSUP))
   1572			pte_root |= 1ULL << DEV_ENTRY_PPR;
   1573	}
   1574
   1575	if (domain->flags & PD_IOMMUV2_MASK) {
   1576		u64 gcr3 = iommu_virt_to_phys(domain->gcr3_tbl);
   1577		u64 glx  = domain->glx;
   1578		u64 tmp;
   1579
   1580		pte_root |= DTE_FLAG_GV;
   1581		pte_root |= (glx & DTE_GLX_MASK) << DTE_GLX_SHIFT;
   1582
   1583		/* First mask out possible old values for GCR3 table */
   1584		tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B;
   1585		flags    &= ~tmp;
   1586
   1587		tmp = DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C;
   1588		flags    &= ~tmp;
   1589
   1590		/* Encode GCR3 table into DTE */
   1591		tmp = DTE_GCR3_VAL_A(gcr3) << DTE_GCR3_SHIFT_A;
   1592		pte_root |= tmp;
   1593
   1594		tmp = DTE_GCR3_VAL_B(gcr3) << DTE_GCR3_SHIFT_B;
   1595		flags    |= tmp;
   1596
   1597		tmp = DTE_GCR3_VAL_C(gcr3) << DTE_GCR3_SHIFT_C;
   1598		flags    |= tmp;
   1599	}
   1600
   1601	flags &= ~DEV_DOMID_MASK;
   1602	flags |= domain->id;
   1603
   1604	old_domid = dev_table[devid].data[1] & DEV_DOMID_MASK;
   1605	dev_table[devid].data[1]  = flags;
   1606	dev_table[devid].data[0]  = pte_root;
   1607
   1608	/*
   1609	 * A kdump kernel might be replacing a domain ID that was copied from
   1610	 * the previous kernel--if so, it needs to flush the translation cache
   1611	 * entries for the old domain ID that is being overwritten
   1612	 */
   1613	if (old_domid) {
   1614		amd_iommu_flush_tlb_domid(iommu, old_domid);
   1615	}
   1616}
   1617
   1618static void clear_dte_entry(struct amd_iommu *iommu, u16 devid)
   1619{
   1620	struct dev_table_entry *dev_table = get_dev_table(iommu);
   1621
   1622	/* remove entry from the device table seen by the hardware */
   1623	dev_table[devid].data[0]  = DTE_FLAG_V;
   1624
   1625	if (!amd_iommu_snp_en)
   1626		dev_table[devid].data[0] |= DTE_FLAG_TV;
   1627
   1628	dev_table[devid].data[1] &= DTE_FLAG_MASK;
   1629
   1630	amd_iommu_apply_erratum_63(iommu, devid);
   1631}
   1632
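        /*
         * Low-level attach: link @dev_data to @domain, take the per-IOMMU and
         * per-domain references, program the DTE (including aliases) and
         * flush it.  Callers hold domain->lock and dev_data->lock.
         */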
   1633static void do_attach(struct iommu_dev_data *dev_data,
   1634		      struct protection_domain *domain)
   1635{
   1636	struct amd_iommu *iommu;
   1637	bool ats;
   1638
   1639	iommu = rlookup_amd_iommu(dev_data->dev);
   1640	if (!iommu)
   1641		return;
   1642	ats   = dev_data->ats.enabled;
   1643
   1644	/* Update data structures */
   1645	dev_data->domain = domain;
   1646	list_add(&dev_data->list, &domain->dev_list);
   1647
   1648	/* Do reference counting */
   1649	domain->dev_iommu[iommu->index] += 1;
   1650	domain->dev_cnt                 += 1;
   1651
   1652	/* Update device table */
   1653	set_dte_entry(iommu, dev_data->devid, domain,
   1654		      ats, dev_data->iommu_v2);
   1655	clone_aliases(iommu, dev_data->dev);
   1656
   1657	device_flush_dte(dev_data);
   1658}
   1659
   1660static void do_detach(struct iommu_dev_data *dev_data)
   1661{
   1662	struct protection_domain *domain = dev_data->domain;
   1663	struct amd_iommu *iommu;
   1664
   1665	iommu = rlookup_amd_iommu(dev_data->dev);
   1666	if (!iommu)
   1667		return;
   1668
   1669	/* Update data structures */
   1670	dev_data->domain = NULL;
   1671	list_del(&dev_data->list);
   1672	clear_dte_entry(iommu, dev_data->devid);
   1673	clone_aliases(iommu, dev_data->dev);
   1674
   1675	/* Flush the DTE entry */
   1676	device_flush_dte(dev_data);
   1677
   1678	/* Flush IOTLB */
   1679	amd_iommu_domain_flush_tlb_pde(domain);
   1680
   1681	/* Wait for the flushes to finish */
   1682	amd_iommu_domain_flush_complete(domain);
   1683
   1684	/* decrease reference counters - needs to happen after the flushes */
   1685	domain->dev_iommu[iommu->index] -= 1;
   1686	domain->dev_cnt                 -= 1;
   1687}
   1688
   1689static void pdev_iommuv2_disable(struct pci_dev *pdev)
   1690{
   1691	pci_disable_ats(pdev);
   1692	pci_disable_pri(pdev);
   1693	pci_disable_pasid(pdev);
   1694}
   1695
   1696static int pdev_iommuv2_enable(struct pci_dev *pdev)
   1697{
   1698	int ret;
   1699
   1700	/* Only allow access to user-accessible pages */
   1701	ret = pci_enable_pasid(pdev, 0);
   1702	if (ret)
   1703		goto out_err;
   1704
   1705	/* First reset the PRI state of the device */
   1706	ret = pci_reset_pri(pdev);
   1707	if (ret)
   1708		goto out_err;
   1709
   1710	/* Enable PRI */
   1711	/* FIXME: Hardcode number of outstanding requests for now */
   1712	ret = pci_enable_pri(pdev, 32);
   1713	if (ret)
   1714		goto out_err;
   1715
   1716	ret = pci_enable_ats(pdev, PAGE_SHIFT);
   1717	if (ret)
   1718		goto out_err;
   1719
   1720	return 0;
   1721
   1722out_err:
   1723	pci_disable_pri(pdev);
   1724	pci_disable_pasid(pdev);
   1725
   1726	return ret;
   1727}
   1728
   1729/*
   1730 * If a device is not yet associated with a domain, this function makes the
   1731 * device visible in the domain
   1732 */
   1733static int attach_device(struct device *dev,
   1734			 struct protection_domain *domain)
   1735{
   1736	struct iommu_dev_data *dev_data;
   1737	struct pci_dev *pdev;
   1738	unsigned long flags;
   1739	int ret;
   1740
   1741	spin_lock_irqsave(&domain->lock, flags);
   1742
   1743	dev_data = dev_iommu_priv_get(dev);
   1744
   1745	spin_lock(&dev_data->lock);
   1746
   1747	ret = -EBUSY;
   1748	if (dev_data->domain != NULL)
   1749		goto out;
   1750
   1751	if (!dev_is_pci(dev))
   1752		goto skip_ats_check;
   1753
   1754	pdev = to_pci_dev(dev);
   1755	if (domain->flags & PD_IOMMUV2_MASK) {
   1756		struct iommu_domain *def_domain = iommu_get_dma_domain(dev);
   1757
   1758		ret = -EINVAL;
   1759		if (def_domain->type != IOMMU_DOMAIN_IDENTITY)
   1760			goto out;
   1761
   1762		if (dev_data->iommu_v2) {
   1763			if (pdev_iommuv2_enable(pdev) != 0)
   1764				goto out;
   1765
   1766			dev_data->ats.enabled = true;
   1767			dev_data->ats.qdep    = pci_ats_queue_depth(pdev);
   1768			dev_data->pri_tlp     = pci_prg_resp_pasid_required(pdev);
   1769		}
   1770	} else if (amd_iommu_iotlb_sup &&
   1771		   pci_enable_ats(pdev, PAGE_SHIFT) == 0) {
   1772		dev_data->ats.enabled = true;
   1773		dev_data->ats.qdep    = pci_ats_queue_depth(pdev);
   1774	}
   1775
   1776skip_ats_check:
   1777	ret = 0;
   1778
   1779	do_attach(dev_data, domain);
   1780
   1781	/*
   1782	 * We might boot into a crash-kernel here. The crashed kernel
   1783	 * left the caches in the IOMMU dirty. So we have to flush
   1784	 * here to evict all dirty stuff.
   1785	 */
   1786	amd_iommu_domain_flush_tlb_pde(domain);
   1787
   1788	amd_iommu_domain_flush_complete(domain);
   1789
   1790out:
   1791	spin_unlock(&dev_data->lock);
   1792
   1793	spin_unlock_irqrestore(&domain->lock, flags);
   1794
   1795	return ret;
   1796}
   1797
   1798/*
   1799 * Removes a device from a protection domain (with devtable_lock held)
   1800 */
   1801static void detach_device(struct device *dev)
   1802{
   1803	struct protection_domain *domain;
   1804	struct iommu_dev_data *dev_data;
   1805	unsigned long flags;
   1806
   1807	dev_data = dev_iommu_priv_get(dev);
   1808	domain   = dev_data->domain;
   1809
   1810	spin_lock_irqsave(&domain->lock, flags);
   1811
   1812	spin_lock(&dev_data->lock);
   1813
   1814	/*
   1815	 * First check if the device is still attached. It might already
   1816	 * be detached from its domain because the generic
   1817	 * iommu_detach_group code detached it and we try again here in
   1818	 * our alias handling.
   1819	 */
   1820	if (WARN_ON(!dev_data->domain))
   1821		goto out;
   1822
   1823	do_detach(dev_data);
   1824
   1825	if (!dev_is_pci(dev))
   1826		goto out;
   1827
   1828	if (domain->flags & PD_IOMMUV2_MASK && dev_data->iommu_v2)
   1829		pdev_iommuv2_disable(to_pci_dev(dev));
   1830	else if (dev_data->ats.enabled)
   1831		pci_disable_ats(to_pci_dev(dev));
   1832
   1833	dev_data->ats.enabled = false;
   1834
   1835out:
   1836	spin_unlock(&dev_data->lock);
   1837
   1838	spin_unlock_irqrestore(&domain->lock, flags);
   1839}
   1840
   1841static struct iommu_device *amd_iommu_probe_device(struct device *dev)
   1842{
   1843	struct iommu_device *iommu_dev;
   1844	struct amd_iommu *iommu;
   1845	int ret;
   1846
   1847	if (!check_device(dev))
   1848		return ERR_PTR(-ENODEV);
   1849
   1850	iommu = rlookup_amd_iommu(dev);
   1851	if (!iommu)
   1852		return ERR_PTR(-ENODEV);
   1853
   1854	if (dev_iommu_priv_get(dev))
   1855		return &iommu->iommu;
   1856
   1857	ret = iommu_init_device(iommu, dev);
   1858	if (ret) {
   1859		if (ret != -ENOTSUPP)
   1860			dev_err(dev, "Failed to initialize - trying to proceed anyway\n");
   1861		iommu_dev = ERR_PTR(ret);
   1862		iommu_ignore_device(iommu, dev);
   1863	} else {
   1864		amd_iommu_set_pci_msi_domain(dev, iommu);
   1865		iommu_dev = &iommu->iommu;
   1866	}
   1867
   1868	iommu_completion_wait(iommu);
   1869
   1870	return iommu_dev;
   1871}
   1872
   1873static void amd_iommu_probe_finalize(struct device *dev)
   1874{
    1875	/* Domains are initialized for this device - have a look at what we ended up with */
   1876	set_dma_ops(dev, NULL);
   1877	iommu_setup_dma_ops(dev, 0, U64_MAX);
   1878}
   1879
   1880static void amd_iommu_release_device(struct device *dev)
   1881{
   1882	struct amd_iommu *iommu;
   1883
   1884	if (!check_device(dev))
   1885		return;
   1886
   1887	iommu = rlookup_amd_iommu(dev);
   1888	if (!iommu)
   1889		return;
   1890
   1891	amd_iommu_uninit_device(dev);
   1892	iommu_completion_wait(iommu);
   1893}
   1894
   1895static struct iommu_group *amd_iommu_device_group(struct device *dev)
   1896{
   1897	if (dev_is_pci(dev))
   1898		return pci_device_group(dev);
   1899
   1900	return acpihid_device_group(dev);
   1901}
   1902
   1903/*****************************************************************************
   1904 *
   1905 * The next functions belong to the dma_ops mapping/unmapping code.
   1906 *
   1907 *****************************************************************************/
   1908
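        /*
         * Rewrite the DTE of every device attached to the domain and
         * propagate the update to all PCI aliases of each device.
         */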
   1909static void update_device_table(struct protection_domain *domain)
   1910{
   1911	struct iommu_dev_data *dev_data;
   1912
   1913	list_for_each_entry(dev_data, &domain->dev_list, list) {
   1914		struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev);
   1915
   1916		if (!iommu)
   1917			continue;
   1918		set_dte_entry(iommu, dev_data->devid, domain,
   1919			      dev_data->ats.enabled, dev_data->iommu_v2);
   1920		clone_aliases(iommu, dev_data->dev);
   1921	}
   1922}
   1923
   1924void amd_iommu_update_and_flush_device_table(struct protection_domain *domain)
   1925{
   1926	update_device_table(domain);
   1927	domain_flush_devices(domain);
   1928}
   1929
   1930void amd_iommu_domain_update(struct protection_domain *domain)
   1931{
   1932	/* Update device table */
   1933	amd_iommu_update_and_flush_device_table(domain);
   1934
   1935	/* Flush domain TLB(s) and wait for completion */
   1936	amd_iommu_domain_flush_tlb_pde(domain);
   1937	amd_iommu_domain_flush_complete(domain);
   1938}
   1939
   1940int __init amd_iommu_init_api(void)
   1941{
   1942	int err;
   1943
   1944	err = bus_set_iommu(&pci_bus_type, &amd_iommu_ops);
   1945	if (err)
   1946		return err;
   1947#ifdef CONFIG_ARM_AMBA
   1948	err = bus_set_iommu(&amba_bustype, &amd_iommu_ops);
   1949	if (err)
   1950		return err;
   1951#endif
   1952	err = bus_set_iommu(&platform_bus_type, &amd_iommu_ops);
   1953	if (err)
   1954		return err;
   1955
   1956	return 0;
   1957}
   1958
   1959/*****************************************************************************
   1960 *
   1961 * The following functions belong to the exported interface of AMD IOMMU
   1962 *
    1963 * This interface allows access to lower-level functions of the IOMMU,
    1964 * like protection domain handling and assignment of devices to domains,
    1965 * which is not possible with the dma_ops interface.
   1966 *
   1967 *****************************************************************************/
   1968
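        /*
         * Detach all devices that are still attached to the domain. Used
         * when the domain is freed while devices remain on its dev_list.
         */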
   1969static void cleanup_domain(struct protection_domain *domain)
   1970{
   1971	struct iommu_dev_data *entry;
   1972	unsigned long flags;
   1973
   1974	spin_lock_irqsave(&domain->lock, flags);
   1975
   1976	while (!list_empty(&domain->dev_list)) {
   1977		entry = list_first_entry(&domain->dev_list,
   1978					 struct iommu_dev_data, list);
   1979		BUG_ON(!entry->domain);
   1980		do_detach(entry);
   1981	}
   1982
   1983	spin_unlock_irqrestore(&domain->lock, flags);
   1984}
   1985
   1986static void protection_domain_free(struct protection_domain *domain)
   1987{
   1988	if (!domain)
   1989		return;
   1990
   1991	if (domain->id)
   1992		domain_id_free(domain->id);
   1993
   1994	if (domain->iop.pgtbl_cfg.tlb)
   1995		free_io_pgtable_ops(&domain->iop.iop.ops);
   1996
   1997	kfree(domain);
   1998}
   1999
   2000static int protection_domain_init_v1(struct protection_domain *domain, int mode)
   2001{
   2002	u64 *pt_root = NULL;
   2003
   2004	BUG_ON(mode < PAGE_MODE_NONE || mode > PAGE_MODE_6_LEVEL);
   2005
   2006	spin_lock_init(&domain->lock);
   2007	domain->id = domain_id_alloc();
   2008	if (!domain->id)
   2009		return -ENOMEM;
   2010	INIT_LIST_HEAD(&domain->dev_list);
   2011
   2012	if (mode != PAGE_MODE_NONE) {
   2013		pt_root = (void *)get_zeroed_page(GFP_KERNEL);
   2014		if (!pt_root)
   2015			return -ENOMEM;
   2016	}
   2017
   2018	amd_iommu_domain_set_pgtable(domain, pt_root, mode);
   2019
   2020	return 0;
   2021}
   2022
   2023static struct protection_domain *protection_domain_alloc(unsigned int type)
   2024{
   2025	struct io_pgtable_ops *pgtbl_ops;
   2026	struct protection_domain *domain;
   2027	int pgtable = amd_iommu_pgtable;
   2028	int mode = DEFAULT_PGTABLE_LEVEL;
   2029	int ret;
   2030
   2031	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
   2032	if (!domain)
   2033		return NULL;
   2034
   2035	/*
    2036	 * Force an IOMMU v1 page table when iommu=pt is used and
    2037	 * when allocating a domain for pass-through devices.
   2038	 */
   2039	if (type == IOMMU_DOMAIN_IDENTITY) {
   2040		pgtable = AMD_IOMMU_V1;
   2041		mode = PAGE_MODE_NONE;
   2042	} else if (type == IOMMU_DOMAIN_UNMANAGED) {
   2043		pgtable = AMD_IOMMU_V1;
   2044	}
   2045
   2046	switch (pgtable) {
   2047	case AMD_IOMMU_V1:
   2048		ret = protection_domain_init_v1(domain, mode);
   2049		break;
   2050	default:
   2051		ret = -EINVAL;
   2052	}
   2053
   2054	if (ret)
   2055		goto out_err;
   2056
   2057	pgtbl_ops = alloc_io_pgtable_ops(pgtable, &domain->iop.pgtbl_cfg, domain);
   2058	if (!pgtbl_ops)
   2059		goto out_err;
   2060
   2061	return domain;
   2062out_err:
   2063	kfree(domain);
   2064	return NULL;
   2065}
   2066
   2067static struct iommu_domain *amd_iommu_domain_alloc(unsigned type)
   2068{
   2069	struct protection_domain *domain;
   2070
   2071	/*
    2072	 * Since DTE[Mode]=0 is prohibited on SNP-enabled systems,
    2073	 * default to IOMMU_DOMAIN_DMA[_FQ].
   2074	 */
   2075	if (WARN_ONCE(amd_iommu_snp_en && (type == IOMMU_DOMAIN_IDENTITY),
   2076		      "Cannot allocate identity domain due to SNP\n"))
   2077		return NULL;
   2078
   2079	domain = protection_domain_alloc(type);
   2080	if (!domain)
   2081		return NULL;
   2082
   2083	domain->domain.geometry.aperture_start = 0;
   2084	domain->domain.geometry.aperture_end   = ~0ULL;
   2085	domain->domain.geometry.force_aperture = true;
   2086
   2087	return &domain->domain;
   2088}
   2089
   2090static void amd_iommu_domain_free(struct iommu_domain *dom)
   2091{
   2092	struct protection_domain *domain;
   2093
    2094	if (!dom)
    2095		return;
    2096
    2097	domain = to_pdomain(dom);
    2098
    2099	if (domain->dev_cnt > 0)
    2100		cleanup_domain(domain);
    2101
    2102	BUG_ON(domain->dev_cnt != 0);
   2103
   2104	if (domain->flags & PD_IOMMUV2_MASK)
   2105		free_gcr3_table(domain);
   2106
   2107	protection_domain_free(domain);
   2108}
   2109
   2110static void amd_iommu_detach_device(struct iommu_domain *dom,
   2111				    struct device *dev)
   2112{
   2113	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
   2114	struct amd_iommu *iommu;
   2115
   2116	if (!check_device(dev))
   2117		return;
   2118
   2119	if (dev_data->domain != NULL)
   2120		detach_device(dev);
   2121
   2122	iommu = rlookup_amd_iommu(dev);
   2123	if (!iommu)
   2124		return;
   2125
   2126#ifdef CONFIG_IRQ_REMAP
   2127	if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) &&
   2128	    (dom->type == IOMMU_DOMAIN_UNMANAGED))
   2129		dev_data->use_vapic = 0;
   2130#endif
   2131
   2132	iommu_completion_wait(iommu);
   2133}
   2134
   2135static int amd_iommu_attach_device(struct iommu_domain *dom,
   2136				   struct device *dev)
   2137{
   2138	struct protection_domain *domain = to_pdomain(dom);
   2139	struct iommu_dev_data *dev_data;
   2140	struct amd_iommu *iommu;
   2141	int ret;
   2142
   2143	if (!check_device(dev))
   2144		return -EINVAL;
   2145
   2146	dev_data = dev_iommu_priv_get(dev);
   2147	dev_data->defer_attach = false;
   2148
   2149	iommu = rlookup_amd_iommu(dev);
   2150	if (!iommu)
   2151		return -EINVAL;
   2152
   2153	if (dev_data->domain)
   2154		detach_device(dev);
   2155
   2156	ret = attach_device(dev, domain);
   2157
   2158#ifdef CONFIG_IRQ_REMAP
   2159	if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
   2160		if (dom->type == IOMMU_DOMAIN_UNMANAGED)
   2161			dev_data->use_vapic = 1;
   2162		else
   2163			dev_data->use_vapic = 0;
   2164	}
   2165#endif
   2166
   2167	iommu_completion_wait(iommu);
   2168
   2169	return ret;
   2170}
   2171
   2172static void amd_iommu_iotlb_sync_map(struct iommu_domain *dom,
   2173				     unsigned long iova, size_t size)
   2174{
   2175	struct protection_domain *domain = to_pdomain(dom);
   2176	struct io_pgtable_ops *ops = &domain->iop.iop.ops;
   2177
   2178	if (ops->map)
   2179		domain_flush_np_cache(domain, iova, size);
   2180}
   2181
   2182static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
   2183			 phys_addr_t paddr, size_t page_size, int iommu_prot,
   2184			 gfp_t gfp)
   2185{
   2186	struct protection_domain *domain = to_pdomain(dom);
   2187	struct io_pgtable_ops *ops = &domain->iop.iop.ops;
   2188	int prot = 0;
   2189	int ret = -EINVAL;
   2190
   2191	if ((amd_iommu_pgtable == AMD_IOMMU_V1) &&
   2192	    (domain->iop.mode == PAGE_MODE_NONE))
   2193		return -EINVAL;
   2194
   2195	if (iommu_prot & IOMMU_READ)
   2196		prot |= IOMMU_PROT_IR;
   2197	if (iommu_prot & IOMMU_WRITE)
   2198		prot |= IOMMU_PROT_IW;
   2199
   2200	if (ops->map)
   2201		ret = ops->map(ops, iova, paddr, page_size, prot, gfp);
   2202
   2203	return ret;
   2204}
   2205
   2206static void amd_iommu_iotlb_gather_add_page(struct iommu_domain *domain,
   2207					    struct iommu_iotlb_gather *gather,
   2208					    unsigned long iova, size_t size)
   2209{
   2210	/*
   2211	 * AMD's IOMMU can flush as many pages as necessary in a single flush.
    2212	 * Unless we run in a virtual machine, which can be inferred from
    2213	 * whether the "non-present cache" is on, it is probably best to prefer
    2214	 * (potentially) too extensive TLB flushing (i.e., more misses) over
    2215	 * multiple TLB flushes (i.e., more flushes). For virtual machines the
   2216	 * hypervisor needs to synchronize the host IOMMU PTEs with those of
   2217	 * the guest, and the trade-off is different: unnecessary TLB flushes
   2218	 * should be avoided.
   2219	 */
   2220	if (amd_iommu_np_cache &&
   2221	    iommu_iotlb_gather_is_disjoint(gather, iova, size))
   2222		iommu_iotlb_sync(domain, gather);
   2223
   2224	iommu_iotlb_gather_add_range(gather, iova, size);
   2225}
   2226
   2227static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
   2228			      size_t page_size,
   2229			      struct iommu_iotlb_gather *gather)
   2230{
   2231	struct protection_domain *domain = to_pdomain(dom);
   2232	struct io_pgtable_ops *ops = &domain->iop.iop.ops;
   2233	size_t r;
   2234
   2235	if ((amd_iommu_pgtable == AMD_IOMMU_V1) &&
   2236	    (domain->iop.mode == PAGE_MODE_NONE))
   2237		return 0;
   2238
   2239	r = (ops->unmap) ? ops->unmap(ops, iova, page_size, gather) : 0;
   2240
   2241	amd_iommu_iotlb_gather_add_page(dom, gather, iova, page_size);
   2242
   2243	return r;
   2244}
   2245
   2246static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
   2247					  dma_addr_t iova)
   2248{
   2249	struct protection_domain *domain = to_pdomain(dom);
   2250	struct io_pgtable_ops *ops = &domain->iop.iop.ops;
   2251
   2252	return ops->iova_to_phys(ops, iova);
   2253}
   2254
   2255static bool amd_iommu_capable(enum iommu_cap cap)
   2256{
   2257	switch (cap) {
   2258	case IOMMU_CAP_CACHE_COHERENCY:
   2259		return true;
   2260	case IOMMU_CAP_INTR_REMAP:
   2261		return (irq_remapping_enabled == 1);
   2262	case IOMMU_CAP_NOEXEC:
   2263		return false;
   2264	case IOMMU_CAP_PRE_BOOT_PROTECTION:
   2265		return amdr_ivrs_remap_support;
   2266	default:
   2267		break;
   2268	}
   2269
   2270	return false;
   2271}
   2272
   2273static void amd_iommu_get_resv_regions(struct device *dev,
   2274				       struct list_head *head)
   2275{
   2276	struct iommu_resv_region *region;
   2277	struct unity_map_entry *entry;
   2278	struct amd_iommu *iommu;
   2279	struct amd_iommu_pci_seg *pci_seg;
   2280	int devid, sbdf;
   2281
   2282	sbdf = get_device_sbdf_id(dev);
   2283	if (sbdf < 0)
   2284		return;
   2285
   2286	devid = PCI_SBDF_TO_DEVID(sbdf);
   2287	iommu = rlookup_amd_iommu(dev);
   2288	if (!iommu)
   2289		return;
   2290	pci_seg = iommu->pci_seg;
   2291
   2292	list_for_each_entry(entry, &pci_seg->unity_map, list) {
   2293		int type, prot = 0;
   2294		size_t length;
   2295
   2296		if (devid < entry->devid_start || devid > entry->devid_end)
   2297			continue;
   2298
   2299		type   = IOMMU_RESV_DIRECT;
   2300		length = entry->address_end - entry->address_start;
   2301		if (entry->prot & IOMMU_PROT_IR)
   2302			prot |= IOMMU_READ;
   2303		if (entry->prot & IOMMU_PROT_IW)
   2304			prot |= IOMMU_WRITE;
   2305		if (entry->prot & IOMMU_UNITY_MAP_FLAG_EXCL_RANGE)
   2306			/* Exclusion range */
   2307			type = IOMMU_RESV_RESERVED;
   2308
   2309		region = iommu_alloc_resv_region(entry->address_start,
   2310						 length, prot, type);
   2311		if (!region) {
   2312			dev_err(dev, "Out of memory allocating dm-regions\n");
   2313			return;
   2314		}
   2315		list_add_tail(&region->list, head);
   2316	}
   2317
   2318	region = iommu_alloc_resv_region(MSI_RANGE_START,
   2319					 MSI_RANGE_END - MSI_RANGE_START + 1,
   2320					 0, IOMMU_RESV_MSI);
   2321	if (!region)
   2322		return;
   2323	list_add_tail(&region->list, head);
   2324
   2325	region = iommu_alloc_resv_region(HT_RANGE_START,
   2326					 HT_RANGE_END - HT_RANGE_START + 1,
   2327					 0, IOMMU_RESV_RESERVED);
   2328	if (!region)
   2329		return;
   2330	list_add_tail(&region->list, head);
   2331}
   2332
   2333bool amd_iommu_is_attach_deferred(struct device *dev)
   2334{
   2335	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
   2336
   2337	return dev_data->defer_attach;
   2338}
   2339EXPORT_SYMBOL_GPL(amd_iommu_is_attach_deferred);
   2340
   2341static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
   2342{
   2343	struct protection_domain *dom = to_pdomain(domain);
   2344	unsigned long flags;
   2345
   2346	spin_lock_irqsave(&dom->lock, flags);
   2347	amd_iommu_domain_flush_tlb_pde(dom);
   2348	amd_iommu_domain_flush_complete(dom);
   2349	spin_unlock_irqrestore(&dom->lock, flags);
   2350}
   2351
   2352static void amd_iommu_iotlb_sync(struct iommu_domain *domain,
   2353				 struct iommu_iotlb_gather *gather)
   2354{
   2355	struct protection_domain *dom = to_pdomain(domain);
   2356	unsigned long flags;
   2357
   2358	spin_lock_irqsave(&dom->lock, flags);
   2359	domain_flush_pages(dom, gather->start, gather->end - gather->start, 1);
   2360	amd_iommu_domain_flush_complete(dom);
   2361	spin_unlock_irqrestore(&dom->lock, flags);
   2362}
   2363
   2364static int amd_iommu_def_domain_type(struct device *dev)
   2365{
   2366	struct iommu_dev_data *dev_data;
   2367
   2368	dev_data = dev_iommu_priv_get(dev);
   2369	if (!dev_data)
   2370		return 0;
   2371
   2372	/*
    2373	 * Do not identity map IOMMUv2-capable devices when memory encryption is
    2374	 * active, because some of those devices (AMD GPUs) don't have the
    2375	 * encryption bit in their DMA mask and require remapping.
   2376	 */
   2377	if (!cc_platform_has(CC_ATTR_MEM_ENCRYPT) && dev_data->iommu_v2)
   2378		return IOMMU_DOMAIN_IDENTITY;
   2379
   2380	return 0;
   2381}
   2382
   2383static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain)
   2384{
   2385	/* IOMMU_PTE_FC is always set */
   2386	return true;
   2387}
   2388
   2389const struct iommu_ops amd_iommu_ops = {
   2390	.capable = amd_iommu_capable,
   2391	.domain_alloc = amd_iommu_domain_alloc,
   2392	.probe_device = amd_iommu_probe_device,
   2393	.release_device = amd_iommu_release_device,
   2394	.probe_finalize = amd_iommu_probe_finalize,
   2395	.device_group = amd_iommu_device_group,
   2396	.get_resv_regions = amd_iommu_get_resv_regions,
   2397	.put_resv_regions = generic_iommu_put_resv_regions,
   2398	.is_attach_deferred = amd_iommu_is_attach_deferred,
   2399	.pgsize_bitmap	= AMD_IOMMU_PGSIZES,
   2400	.def_domain_type = amd_iommu_def_domain_type,
   2401	.default_domain_ops = &(const struct iommu_domain_ops) {
   2402		.attach_dev	= amd_iommu_attach_device,
   2403		.detach_dev	= amd_iommu_detach_device,
   2404		.map		= amd_iommu_map,
   2405		.unmap		= amd_iommu_unmap,
   2406		.iotlb_sync_map	= amd_iommu_iotlb_sync_map,
   2407		.iova_to_phys	= amd_iommu_iova_to_phys,
   2408		.flush_iotlb_all = amd_iommu_flush_iotlb_all,
   2409		.iotlb_sync	= amd_iommu_iotlb_sync,
   2410		.free		= amd_iommu_domain_free,
   2411		.enforce_cache_coherency = amd_iommu_enforce_cache_coherency,
   2412	}
   2413};
   2414
   2415/*****************************************************************************
   2416 *
    2417 * The next functions do a basic initialization of the IOMMU for pass-through
    2418 * mode.
    2419 *
    2420 * In pass-through mode the IOMMU is initialized and enabled but not used for
   2421 * DMA-API translation.
   2422 *
   2423 *****************************************************************************/
   2424
   2425/* IOMMUv2 specific functions */
   2426int amd_iommu_register_ppr_notifier(struct notifier_block *nb)
   2427{
   2428	return atomic_notifier_chain_register(&ppr_notifier, nb);
   2429}
   2430EXPORT_SYMBOL(amd_iommu_register_ppr_notifier);
   2431
   2432int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb)
   2433{
   2434	return atomic_notifier_chain_unregister(&ppr_notifier, nb);
   2435}
   2436EXPORT_SYMBOL(amd_iommu_unregister_ppr_notifier);
   2437
   2438void amd_iommu_domain_direct_map(struct iommu_domain *dom)
   2439{
   2440	struct protection_domain *domain = to_pdomain(dom);
   2441	unsigned long flags;
   2442
   2443	spin_lock_irqsave(&domain->lock, flags);
   2444
   2445	if (domain->iop.pgtbl_cfg.tlb)
   2446		free_io_pgtable_ops(&domain->iop.iop.ops);
   2447
   2448	spin_unlock_irqrestore(&domain->lock, flags);
   2449}
   2450EXPORT_SYMBOL(amd_iommu_domain_direct_map);
   2451
   2452int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids)
   2453{
   2454	struct protection_domain *domain = to_pdomain(dom);
   2455	unsigned long flags;
   2456	int levels, ret;
   2457
   2458	/* Number of GCR3 table levels required */
   2459	for (levels = 0; (pasids - 1) & ~0x1ff; pasids >>= 9)
   2460		levels += 1;
   2461
   2462	if (levels > amd_iommu_max_glx_val)
   2463		return -EINVAL;
   2464
   2465	spin_lock_irqsave(&domain->lock, flags);
   2466
   2467	/*
    2468	 * Spare us the sanity checks of whether the devices already in the
    2469	 * domain support IOMMUv2. Just require that the domain has no
    2470	 * devices attached when it is switched into IOMMUv2 mode.
   2471	 */
   2472	ret = -EBUSY;
   2473	if (domain->dev_cnt > 0 || domain->flags & PD_IOMMUV2_MASK)
   2474		goto out;
   2475
   2476	ret = -ENOMEM;
   2477	domain->gcr3_tbl = (void *)get_zeroed_page(GFP_ATOMIC);
   2478	if (domain->gcr3_tbl == NULL)
   2479		goto out;
   2480
   2481	domain->glx      = levels;
   2482	domain->flags   |= PD_IOMMUV2_MASK;
   2483
   2484	amd_iommu_domain_update(domain);
   2485
   2486	ret = 0;
   2487
   2488out:
   2489	spin_unlock_irqrestore(&domain->lock, flags);
   2490
   2491	return ret;
   2492}
   2493EXPORT_SYMBOL(amd_iommu_domain_enable_v2);
   2494
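        /*
         * Flush the given PASID: first from the IOMMU TLBs of all IOMMUs that
         * serve this domain, then from the device TLBs of all ATS-enabled
         * devices attached to it.
         */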
   2495static int __flush_pasid(struct protection_domain *domain, u32 pasid,
   2496			 u64 address, bool size)
   2497{
   2498	struct iommu_dev_data *dev_data;
   2499	struct iommu_cmd cmd;
   2500	int i, ret;
   2501
   2502	if (!(domain->flags & PD_IOMMUV2_MASK))
   2503		return -EINVAL;
   2504
   2505	build_inv_iommu_pasid(&cmd, domain->id, pasid, address, size);
   2506
   2507	/*
    2508	 * The IOMMU TLB needs to be flushed before the device TLB to
    2509	 * prevent a device TLB refill from the IOMMU TLB.
   2510	 */
   2511	for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
   2512		if (domain->dev_iommu[i] == 0)
   2513			continue;
   2514
   2515		ret = iommu_queue_command(amd_iommus[i], &cmd);
   2516		if (ret != 0)
   2517			goto out;
   2518	}
   2519
   2520	/* Wait until IOMMU TLB flushes are complete */
   2521	amd_iommu_domain_flush_complete(domain);
   2522
   2523	/* Now flush device TLBs */
   2524	list_for_each_entry(dev_data, &domain->dev_list, list) {
   2525		struct amd_iommu *iommu;
   2526		int qdep;
   2527
   2528		/*
    2529		 * There might be non-IOMMUv2-capable devices in an IOMMUv2
   2530		 * domain.
   2531		 */
   2532		if (!dev_data->ats.enabled)
   2533			continue;
   2534
   2535		qdep  = dev_data->ats.qdep;
   2536		iommu = rlookup_amd_iommu(dev_data->dev);
   2537		if (!iommu)
   2538			continue;
   2539		build_inv_iotlb_pasid(&cmd, dev_data->devid, pasid,
   2540				      qdep, address, size);
   2541
   2542		ret = iommu_queue_command(iommu, &cmd);
   2543		if (ret != 0)
   2544			goto out;
   2545	}
   2546
   2547	/* Wait until all device TLBs are flushed */
   2548	amd_iommu_domain_flush_complete(domain);
   2549
   2550	ret = 0;
   2551
   2552out:
   2553
   2554	return ret;
   2555}
   2556
   2557static int __amd_iommu_flush_page(struct protection_domain *domain, u32 pasid,
   2558				  u64 address)
   2559{
   2560	return __flush_pasid(domain, pasid, address, false);
   2561}
   2562
   2563int amd_iommu_flush_page(struct iommu_domain *dom, u32 pasid,
   2564			 u64 address)
   2565{
   2566	struct protection_domain *domain = to_pdomain(dom);
   2567	unsigned long flags;
   2568	int ret;
   2569
   2570	spin_lock_irqsave(&domain->lock, flags);
   2571	ret = __amd_iommu_flush_page(domain, pasid, address);
   2572	spin_unlock_irqrestore(&domain->lock, flags);
   2573
   2574	return ret;
   2575}
   2576EXPORT_SYMBOL(amd_iommu_flush_page);
   2577
   2578static int __amd_iommu_flush_tlb(struct protection_domain *domain, u32 pasid)
   2579{
   2580	return __flush_pasid(domain, pasid, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
   2581			     true);
   2582}
   2583
   2584int amd_iommu_flush_tlb(struct iommu_domain *dom, u32 pasid)
   2585{
   2586	struct protection_domain *domain = to_pdomain(dom);
   2587	unsigned long flags;
   2588	int ret;
   2589
   2590	spin_lock_irqsave(&domain->lock, flags);
   2591	ret = __amd_iommu_flush_tlb(domain, pasid);
   2592	spin_unlock_irqrestore(&domain->lock, flags);
   2593
   2594	return ret;
   2595}
   2596EXPORT_SYMBOL(amd_iommu_flush_tlb);
   2597
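        /*
         * Walk the GCR3 table and return a pointer to the level-0 entry for
         * @pasid. If @alloc is true, missing intermediate levels are allocated
         * on the way down; otherwise NULL is returned when a level is not
         * present.
         */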
   2598static u64 *__get_gcr3_pte(u64 *root, int level, u32 pasid, bool alloc)
   2599{
   2600	int index;
   2601	u64 *pte;
   2602
   2603	while (true) {
   2604
   2605		index = (pasid >> (9 * level)) & 0x1ff;
   2606		pte   = &root[index];
   2607
   2608		if (level == 0)
   2609			break;
   2610
   2611		if (!(*pte & GCR3_VALID)) {
   2612			if (!alloc)
   2613				return NULL;
   2614
   2615			root = (void *)get_zeroed_page(GFP_ATOMIC);
   2616			if (root == NULL)
   2617				return NULL;
   2618
   2619			*pte = iommu_virt_to_phys(root) | GCR3_VALID;
   2620		}
   2621
   2622		root = iommu_phys_to_virt(*pte & PAGE_MASK);
   2623
   2624		level -= 1;
   2625	}
   2626
   2627	return pte;
   2628}
   2629
   2630static int __set_gcr3(struct protection_domain *domain, u32 pasid,
   2631		      unsigned long cr3)
   2632{
   2633	u64 *pte;
   2634
   2635	if (domain->iop.mode != PAGE_MODE_NONE)
   2636		return -EINVAL;
   2637
   2638	pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, true);
   2639	if (pte == NULL)
   2640		return -ENOMEM;
   2641
   2642	*pte = (cr3 & PAGE_MASK) | GCR3_VALID;
   2643
   2644	return __amd_iommu_flush_tlb(domain, pasid);
   2645}
   2646
   2647static int __clear_gcr3(struct protection_domain *domain, u32 pasid)
   2648{
   2649	u64 *pte;
   2650
   2651	if (domain->iop.mode != PAGE_MODE_NONE)
   2652		return -EINVAL;
   2653
   2654	pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, false);
   2655	if (pte == NULL)
   2656		return 0;
   2657
   2658	*pte = 0;
   2659
   2660	return __amd_iommu_flush_tlb(domain, pasid);
   2661}
   2662
   2663int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, u32 pasid,
   2664			      unsigned long cr3)
   2665{
   2666	struct protection_domain *domain = to_pdomain(dom);
   2667	unsigned long flags;
   2668	int ret;
   2669
   2670	spin_lock_irqsave(&domain->lock, flags);
   2671	ret = __set_gcr3(domain, pasid, cr3);
   2672	spin_unlock_irqrestore(&domain->lock, flags);
   2673
   2674	return ret;
   2675}
   2676EXPORT_SYMBOL(amd_iommu_domain_set_gcr3);
   2677
   2678int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, u32 pasid)
   2679{
   2680	struct protection_domain *domain = to_pdomain(dom);
   2681	unsigned long flags;
   2682	int ret;
   2683
   2684	spin_lock_irqsave(&domain->lock, flags);
   2685	ret = __clear_gcr3(domain, pasid);
   2686	spin_unlock_irqrestore(&domain->lock, flags);
   2687
   2688	return ret;
   2689}
   2690EXPORT_SYMBOL(amd_iommu_domain_clear_gcr3);
   2691
   2692int amd_iommu_complete_ppr(struct pci_dev *pdev, u32 pasid,
   2693			   int status, int tag)
   2694{
   2695	struct iommu_dev_data *dev_data;
   2696	struct amd_iommu *iommu;
   2697	struct iommu_cmd cmd;
   2698
   2699	dev_data = dev_iommu_priv_get(&pdev->dev);
   2700	iommu    = rlookup_amd_iommu(&pdev->dev);
   2701	if (!iommu)
   2702		return -ENODEV;
   2703
   2704	build_complete_ppr(&cmd, dev_data->devid, pasid, status,
   2705			   tag, dev_data->pri_tlp);
   2706
   2707	return iommu_queue_command(iommu, &cmd);
   2708}
   2709EXPORT_SYMBOL(amd_iommu_complete_ppr);
   2710
   2711int amd_iommu_device_info(struct pci_dev *pdev,
   2712                          struct amd_iommu_device_info *info)
   2713{
   2714	int max_pasids;
   2715	int pos;
   2716
   2717	if (pdev == NULL || info == NULL)
   2718		return -EINVAL;
   2719
   2720	if (!amd_iommu_v2_supported())
   2721		return -EINVAL;
   2722
   2723	memset(info, 0, sizeof(*info));
   2724
   2725	if (pci_ats_supported(pdev))
   2726		info->flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP;
   2727
   2728	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
   2729	if (pos)
   2730		info->flags |= AMD_IOMMU_DEVICE_FLAG_PRI_SUP;
   2731
   2732	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID);
   2733	if (pos) {
   2734		int features;
   2735
   2736		max_pasids = 1 << (9 * (amd_iommu_max_glx_val + 1));
   2737		max_pasids = min(max_pasids, (1 << 20));
   2738
   2739		info->flags |= AMD_IOMMU_DEVICE_FLAG_PASID_SUP;
   2740		info->max_pasids = min(pci_max_pasids(pdev), max_pasids);
   2741
   2742		features = pci_pasid_features(pdev);
   2743		if (features & PCI_PASID_CAP_EXEC)
   2744			info->flags |= AMD_IOMMU_DEVICE_FLAG_EXEC_SUP;
   2745		if (features & PCI_PASID_CAP_PRIV)
   2746			info->flags |= AMD_IOMMU_DEVICE_FLAG_PRIV_SUP;
   2747	}
   2748
   2749	return 0;
   2750}
   2751EXPORT_SYMBOL(amd_iommu_device_info);
   2752
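        /*
         * Illustrative usage sketch (not taken from this driver): a device
         * driver probing for PASID/ATS support might call
         * amd_iommu_device_info() roughly like this:
         *
         *	struct amd_iommu_device_info info;
         *
         *	if (!amd_iommu_device_info(pdev, &info) &&
         *	    (info.flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP))
         *		setup_pasids(pdev, info.max_pasids);
         *
         * Here setup_pasids() is a hypothetical driver helper, not an API
         * provided by this file; the flags and max_pasids fields are the ones
         * filled in above.
         */
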
   2753#ifdef CONFIG_IRQ_REMAP
   2754
   2755/*****************************************************************************
   2756 *
   2757 * Interrupt Remapping Implementation
   2758 *
   2759 *****************************************************************************/
   2760
   2761static struct irq_chip amd_ir_chip;
   2762static DEFINE_SPINLOCK(iommu_table_lock);
   2763
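        /*
         * Point the interrupt-remapping fields of the device's DTE at the
         * given remap table and enable IRQ remapping for the device.
         */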
   2764static void set_dte_irq_entry(struct amd_iommu *iommu, u16 devid,
   2765			      struct irq_remap_table *table)
   2766{
   2767	u64 dte;
   2768	struct dev_table_entry *dev_table = get_dev_table(iommu);
   2769
   2770	dte	= dev_table[devid].data[2];
   2771	dte	&= ~DTE_IRQ_PHYS_ADDR_MASK;
   2772	dte	|= iommu_virt_to_phys(table->table);
   2773	dte	|= DTE_IRQ_REMAP_INTCTL;
   2774	dte	|= DTE_INTTABLEN;
   2775	dte	|= DTE_IRQ_REMAP_ENABLE;
   2776
   2777	dev_table[devid].data[2] = dte;
   2778}
   2779
   2780static struct irq_remap_table *get_irq_table(struct amd_iommu *iommu, u16 devid)
   2781{
   2782	struct irq_remap_table *table;
   2783	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
   2784
   2785	if (WARN_ONCE(!pci_seg->rlookup_table[devid],
   2786		      "%s: no iommu for devid %x:%x\n",
   2787		      __func__, pci_seg->id, devid))
   2788		return NULL;
   2789
   2790	table = pci_seg->irq_lookup_table[devid];
   2791	if (WARN_ONCE(!table, "%s: no table for devid %x:%x\n",
   2792		      __func__, pci_seg->id, devid))
   2793		return NULL;
   2794
   2795	return table;
   2796}
   2797
   2798static struct irq_remap_table *__alloc_irq_table(void)
   2799{
   2800	struct irq_remap_table *table;
   2801
   2802	table = kzalloc(sizeof(*table), GFP_KERNEL);
   2803	if (!table)
   2804		return NULL;
   2805
   2806	table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_KERNEL);
   2807	if (!table->table) {
   2808		kfree(table);
   2809		return NULL;
   2810	}
   2811	raw_spin_lock_init(&table->lock);
   2812
   2813	if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
   2814		memset(table->table, 0,
   2815		       MAX_IRQS_PER_TABLE * sizeof(u32));
   2816	else
   2817		memset(table->table, 0,
   2818		       (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2)));
   2819	return table;
   2820}
   2821
   2822static void set_remap_table_entry(struct amd_iommu *iommu, u16 devid,
   2823				  struct irq_remap_table *table)
   2824{
   2825	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
   2826
   2827	pci_seg->irq_lookup_table[devid] = table;
   2828	set_dte_irq_entry(iommu, devid, table);
   2829	iommu_flush_dte(iommu, devid);
   2830}
   2831
   2832static int set_remap_table_entry_alias(struct pci_dev *pdev, u16 alias,
   2833				       void *data)
   2834{
   2835	struct irq_remap_table *table = data;
   2836	struct amd_iommu_pci_seg *pci_seg;
   2837	struct amd_iommu *iommu = rlookup_amd_iommu(&pdev->dev);
   2838
   2839	if (!iommu)
   2840		return -EINVAL;
   2841
   2842	pci_seg = iommu->pci_seg;
   2843	pci_seg->irq_lookup_table[alias] = table;
   2844	set_dte_irq_entry(iommu, alias, table);
   2845	iommu_flush_dte(pci_seg->rlookup_table[alias], alias);
   2846
   2847	return 0;
   2848}
   2849
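        /*
         * Return the IRQ remapping table for @devid, allocating a new table
         * (shared with any PCI alias of the device) if none exists yet.
         */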
   2850static struct irq_remap_table *alloc_irq_table(struct amd_iommu *iommu,
   2851					       u16 devid, struct pci_dev *pdev)
   2852{
   2853	struct irq_remap_table *table = NULL;
   2854	struct irq_remap_table *new_table = NULL;
   2855	struct amd_iommu_pci_seg *pci_seg;
   2856	unsigned long flags;
   2857	u16 alias;
   2858
   2859	spin_lock_irqsave(&iommu_table_lock, flags);
   2860
   2861	pci_seg = iommu->pci_seg;
   2862	table = pci_seg->irq_lookup_table[devid];
   2863	if (table)
   2864		goto out_unlock;
   2865
   2866	alias = pci_seg->alias_table[devid];
   2867	table = pci_seg->irq_lookup_table[alias];
   2868	if (table) {
   2869		set_remap_table_entry(iommu, devid, table);
   2870		goto out_wait;
   2871	}
   2872	spin_unlock_irqrestore(&iommu_table_lock, flags);
   2873
   2874	/* Nothing there yet, allocate new irq remapping table */
   2875	new_table = __alloc_irq_table();
   2876	if (!new_table)
   2877		return NULL;
   2878
   2879	spin_lock_irqsave(&iommu_table_lock, flags);
   2880
   2881	table = pci_seg->irq_lookup_table[devid];
   2882	if (table)
   2883		goto out_unlock;
   2884
   2885	table = pci_seg->irq_lookup_table[alias];
   2886	if (table) {
   2887		set_remap_table_entry(iommu, devid, table);
   2888		goto out_wait;
   2889	}
   2890
   2891	table = new_table;
   2892	new_table = NULL;
   2893
   2894	if (pdev)
   2895		pci_for_each_dma_alias(pdev, set_remap_table_entry_alias,
   2896				       table);
   2897	else
   2898		set_remap_table_entry(iommu, devid, table);
   2899
   2900	if (devid != alias)
   2901		set_remap_table_entry(iommu, alias, table);
   2902
   2903out_wait:
   2904	iommu_completion_wait(iommu);
   2905
   2906out_unlock:
   2907	spin_unlock_irqrestore(&iommu_table_lock, flags);
   2908
   2909	if (new_table) {
   2910		kmem_cache_free(amd_iommu_irq_cache, new_table->table);
   2911		kfree(new_table);
   2912	}
   2913	return table;
   2914}
   2915
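        /*
         * Find @count consecutive free entries in the device's IRQ remapping
         * table (optionally aligned to a power of two), mark them as allocated
         * and return the first index, or a negative error code on failure.
         */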
   2916static int alloc_irq_index(struct amd_iommu *iommu, u16 devid, int count,
   2917			   bool align, struct pci_dev *pdev)
   2918{
   2919	struct irq_remap_table *table;
   2920	int index, c, alignment = 1;
   2921	unsigned long flags;
   2922
   2923	table = alloc_irq_table(iommu, devid, pdev);
   2924	if (!table)
   2925		return -ENODEV;
   2926
   2927	if (align)
   2928		alignment = roundup_pow_of_two(count);
   2929
   2930	raw_spin_lock_irqsave(&table->lock, flags);
   2931
   2932	/* Scan table for free entries */
   2933	for (index = ALIGN(table->min_index, alignment), c = 0;
   2934	     index < MAX_IRQS_PER_TABLE;) {
   2935		if (!iommu->irte_ops->is_allocated(table, index)) {
   2936			c += 1;
   2937		} else {
   2938			c     = 0;
   2939			index = ALIGN(index + 1, alignment);
   2940			continue;
   2941		}
   2942
   2943		if (c == count)	{
   2944			for (; c != 0; --c)
   2945				iommu->irte_ops->set_allocated(table, index - c + 1);
   2946
   2947			index -= count - 1;
   2948			goto out;
   2949		}
   2950
   2951		index++;
   2952	}
   2953
   2954	index = -ENOSPC;
   2955
   2956out:
   2957	raw_spin_unlock_irqrestore(&table->lock, flags);
   2958
   2959	return index;
   2960}
   2961
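        /*
         * Atomically replace the 128-bit IRTE at @index and flush the
         * interrupt remapping table cache for @devid.
         */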
   2962static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
   2963			  struct irte_ga *irte, struct amd_ir_data *data)
   2964{
   2965	bool ret;
   2966	struct irq_remap_table *table;
   2967	unsigned long flags;
   2968	struct irte_ga *entry;
   2969
   2970	table = get_irq_table(iommu, devid);
   2971	if (!table)
   2972		return -ENOMEM;
   2973
   2974	raw_spin_lock_irqsave(&table->lock, flags);
   2975
   2976	entry = (struct irte_ga *)table->table;
   2977	entry = &entry[index];
   2978
   2979	ret = cmpxchg_double(&entry->lo.val, &entry->hi.val,
   2980			     entry->lo.val, entry->hi.val,
   2981			     irte->lo.val, irte->hi.val);
   2982	/*
    2983	 * We use a 128-bit cmpxchg to atomically update the IRTE, and the
    2984	 * entry cannot be updated by the hardware or other processors
    2985	 * behind us, so the exchange is expected to always succeed; warn
    2986	 * if it does not.
   2987	 */
   2988	WARN_ON(!ret);
   2989
   2990	if (data)
   2991		data->ref = entry;
   2992
   2993	raw_spin_unlock_irqrestore(&table->lock, flags);
   2994
   2995	iommu_flush_irt(iommu, devid);
   2996	iommu_completion_wait(iommu);
   2997
   2998	return 0;
   2999}
   3000
   3001static int modify_irte(struct amd_iommu *iommu,
   3002		       u16 devid, int index, union irte *irte)
   3003{
   3004	struct irq_remap_table *table;
   3005	unsigned long flags;
   3006
   3007	table = get_irq_table(iommu, devid);
   3008	if (!table)
   3009		return -ENOMEM;
   3010
   3011	raw_spin_lock_irqsave(&table->lock, flags);
   3012	table->table[index] = irte->val;
   3013	raw_spin_unlock_irqrestore(&table->lock, flags);
   3014
   3015	iommu_flush_irt(iommu, devid);
   3016	iommu_completion_wait(iommu);
   3017
   3018	return 0;
   3019}
   3020
   3021static void free_irte(struct amd_iommu *iommu, u16 devid, int index)
   3022{
   3023	struct irq_remap_table *table;
   3024	unsigned long flags;
   3025
   3026	table = get_irq_table(iommu, devid);
   3027	if (!table)
   3028		return;
   3029
   3030	raw_spin_lock_irqsave(&table->lock, flags);
   3031	iommu->irte_ops->clear_allocated(table, index);
   3032	raw_spin_unlock_irqrestore(&table->lock, flags);
   3033
   3034	iommu_flush_irt(iommu, devid);
   3035	iommu_completion_wait(iommu);
   3036}
   3037
   3038static void irte_prepare(void *entry,
   3039			 u32 delivery_mode, bool dest_mode,
   3040			 u8 vector, u32 dest_apicid, int devid)
   3041{
   3042	union irte *irte = (union irte *) entry;
   3043
   3044	irte->val                = 0;
   3045	irte->fields.vector      = vector;
   3046	irte->fields.int_type    = delivery_mode;
   3047	irte->fields.destination = dest_apicid;
   3048	irte->fields.dm          = dest_mode;
   3049	irte->fields.valid       = 1;
   3050}
   3051
   3052static void irte_ga_prepare(void *entry,
   3053			    u32 delivery_mode, bool dest_mode,
   3054			    u8 vector, u32 dest_apicid, int devid)
   3055{
   3056	struct irte_ga *irte = (struct irte_ga *) entry;
   3057
   3058	irte->lo.val                      = 0;
   3059	irte->hi.val                      = 0;
   3060	irte->lo.fields_remap.int_type    = delivery_mode;
   3061	irte->lo.fields_remap.dm          = dest_mode;
   3062	irte->hi.fields.vector            = vector;
   3063	irte->lo.fields_remap.destination = APICID_TO_IRTE_DEST_LO(dest_apicid);
   3064	irte->hi.fields.destination       = APICID_TO_IRTE_DEST_HI(dest_apicid);
   3065	irte->lo.fields_remap.valid       = 1;
   3066}
   3067
   3068static void irte_activate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
   3069{
   3070	union irte *irte = (union irte *) entry;
   3071
   3072	irte->fields.valid = 1;
   3073	modify_irte(iommu, devid, index, irte);
   3074}
   3075
   3076static void irte_ga_activate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
   3077{
   3078	struct irte_ga *irte = (struct irte_ga *) entry;
   3079
   3080	irte->lo.fields_remap.valid = 1;
   3081	modify_irte_ga(iommu, devid, index, irte, NULL);
   3082}
   3083
   3084static void irte_deactivate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
   3085{
   3086	union irte *irte = (union irte *) entry;
   3087
   3088	irte->fields.valid = 0;
   3089	modify_irte(iommu, devid, index, irte);
   3090}
   3091
   3092static void irte_ga_deactivate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
   3093{
   3094	struct irte_ga *irte = (struct irte_ga *) entry;
   3095
   3096	irte->lo.fields_remap.valid = 0;
   3097	modify_irte_ga(iommu, devid, index, irte, NULL);
   3098}
   3099
   3100static void irte_set_affinity(struct amd_iommu *iommu, void *entry, u16 devid, u16 index,
   3101			      u8 vector, u32 dest_apicid)
   3102{
   3103	union irte *irte = (union irte *) entry;
   3104
   3105	irte->fields.vector = vector;
   3106	irte->fields.destination = dest_apicid;
   3107	modify_irte(iommu, devid, index, irte);
   3108}
   3109
   3110static void irte_ga_set_affinity(struct amd_iommu *iommu, void *entry, u16 devid, u16 index,
   3111				 u8 vector, u32 dest_apicid)
   3112{
   3113	struct irte_ga *irte = (struct irte_ga *) entry;
   3114
   3115	if (!irte->lo.fields_remap.guest_mode) {
   3116		irte->hi.fields.vector = vector;
   3117		irte->lo.fields_remap.destination =
   3118					APICID_TO_IRTE_DEST_LO(dest_apicid);
   3119		irte->hi.fields.destination =
   3120					APICID_TO_IRTE_DEST_HI(dest_apicid);
   3121		modify_irte_ga(iommu, devid, index, irte, NULL);
   3122	}
   3123}
   3124
   3125#define IRTE_ALLOCATED (~1U)
   3126static void irte_set_allocated(struct irq_remap_table *table, int index)
   3127{
   3128	table->table[index] = IRTE_ALLOCATED;
   3129}
   3130
   3131static void irte_ga_set_allocated(struct irq_remap_table *table, int index)
   3132{
   3133	struct irte_ga *ptr = (struct irte_ga *)table->table;
   3134	struct irte_ga *irte = &ptr[index];
   3135
   3136	memset(&irte->lo.val, 0, sizeof(u64));
   3137	memset(&irte->hi.val, 0, sizeof(u64));
   3138	irte->hi.fields.vector = 0xff;
   3139}
   3140
   3141static bool irte_is_allocated(struct irq_remap_table *table, int index)
   3142{
   3143	union irte *ptr = (union irte *)table->table;
   3144	union irte *irte = &ptr[index];
   3145
   3146	return irte->val != 0;
   3147}
   3148
   3149static bool irte_ga_is_allocated(struct irq_remap_table *table, int index)
   3150{
   3151	struct irte_ga *ptr = (struct irte_ga *)table->table;
   3152	struct irte_ga *irte = &ptr[index];
   3153
   3154	return irte->hi.fields.vector != 0;
   3155}
   3156
   3157static void irte_clear_allocated(struct irq_remap_table *table, int index)
   3158{
   3159	table->table[index] = 0;
   3160}
   3161
   3162static void irte_ga_clear_allocated(struct irq_remap_table *table, int index)
   3163{
   3164	struct irte_ga *ptr = (struct irte_ga *)table->table;
   3165	struct irte_ga *irte = &ptr[index];
   3166
   3167	memset(&irte->lo.val, 0, sizeof(u64));
   3168	memset(&irte->hi.val, 0, sizeof(u64));
   3169}
   3170
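        /*
         * Return the device id (SBDF for MSI sources) used for IRQ remapping
         * lookups for the interrupt source described by @info, or -1 if the
         * allocation type is not handled.
         */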
   3171static int get_devid(struct irq_alloc_info *info)
   3172{
   3173	switch (info->type) {
   3174	case X86_IRQ_ALLOC_TYPE_IOAPIC:
   3175		return get_ioapic_devid(info->devid);
   3176	case X86_IRQ_ALLOC_TYPE_HPET:
   3177		return get_hpet_devid(info->devid);
   3178	case X86_IRQ_ALLOC_TYPE_PCI_MSI:
   3179	case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
   3180		return get_device_sbdf_id(msi_desc_to_dev(info->desc));
   3181	default:
   3182		WARN_ON_ONCE(1);
   3183		return -1;
   3184	}
   3185}
   3186
   3187struct irq_remap_ops amd_iommu_irq_ops = {
   3188	.prepare		= amd_iommu_prepare,
   3189	.enable			= amd_iommu_enable,
   3190	.disable		= amd_iommu_disable,
   3191	.reenable		= amd_iommu_reenable,
   3192	.enable_faulting	= amd_iommu_enable_faulting,
   3193};
   3194
   3195static void fill_msi_msg(struct msi_msg *msg, u32 index)
   3196{
   3197	msg->data = index;
   3198	msg->address_lo = 0;
   3199	msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW;
   3200	msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH;
   3201}
   3202
   3203static void irq_remapping_prepare_irte(struct amd_ir_data *data,
   3204				       struct irq_cfg *irq_cfg,
   3205				       struct irq_alloc_info *info,
   3206				       int devid, int index, int sub_handle)
   3207{
   3208	struct irq_2_irte *irte_info = &data->irq_2_irte;
   3209	struct amd_iommu *iommu = data->iommu;
   3210
   3211	if (!iommu)
   3212		return;
   3213
   3214	data->irq_2_irte.devid = devid;
   3215	data->irq_2_irte.index = index + sub_handle;
   3216	iommu->irte_ops->prepare(data->entry, apic->delivery_mode,
   3217				 apic->dest_mode_logical, irq_cfg->vector,
   3218				 irq_cfg->dest_apicid, devid);
   3219
   3220	switch (info->type) {
   3221	case X86_IRQ_ALLOC_TYPE_IOAPIC:
   3222	case X86_IRQ_ALLOC_TYPE_HPET:
   3223	case X86_IRQ_ALLOC_TYPE_PCI_MSI:
   3224	case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
   3225		fill_msi_msg(&data->msi_entry, irte_info->index);
   3226		break;
   3227
   3228	default:
   3229		BUG_ON(1);
   3230		break;
   3231	}
   3232}
   3233
   3234struct amd_irte_ops irte_32_ops = {
   3235	.prepare = irte_prepare,
   3236	.activate = irte_activate,
   3237	.deactivate = irte_deactivate,
   3238	.set_affinity = irte_set_affinity,
   3239	.set_allocated = irte_set_allocated,
   3240	.is_allocated = irte_is_allocated,
   3241	.clear_allocated = irte_clear_allocated,
   3242};
   3243
   3244struct amd_irte_ops irte_128_ops = {
   3245	.prepare = irte_ga_prepare,
   3246	.activate = irte_ga_activate,
   3247	.deactivate = irte_ga_deactivate,
   3248	.set_affinity = irte_ga_set_affinity,
   3249	.set_allocated = irte_ga_set_allocated,
   3250	.is_allocated = irte_ga_is_allocated,
   3251	.clear_allocated = irte_ga_clear_allocated,
   3252};
   3253
   3254static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
   3255			       unsigned int nr_irqs, void *arg)
   3256{
   3257	struct irq_alloc_info *info = arg;
   3258	struct irq_data *irq_data;
   3259	struct amd_ir_data *data = NULL;
   3260	struct amd_iommu *iommu;
   3261	struct irq_cfg *cfg;
   3262	int i, ret, devid, seg, sbdf;
   3263	int index;
   3264
   3265	if (!info)
   3266		return -EINVAL;
   3267	if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_PCI_MSI &&
   3268	    info->type != X86_IRQ_ALLOC_TYPE_PCI_MSIX)
   3269		return -EINVAL;
   3270
   3271	/*
    3272	 * With IRQ remapping enabled, we don't need contiguous CPU vectors
   3273	 * to support multiple MSI interrupts.
   3274	 */
   3275	if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI)
   3276		info->flags &= ~X86_IRQ_ALLOC_CONTIGUOUS_VECTORS;
   3277
   3278	sbdf = get_devid(info);
   3279	if (sbdf < 0)
   3280		return -EINVAL;
   3281
   3282	seg = PCI_SBDF_TO_SEGID(sbdf);
   3283	devid = PCI_SBDF_TO_DEVID(sbdf);
   3284	iommu = __rlookup_amd_iommu(seg, devid);
   3285	if (!iommu)
   3286		return -EINVAL;
   3287
   3288	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
   3289	if (ret < 0)
   3290		return ret;
   3291
   3292	if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) {
   3293		struct irq_remap_table *table;
   3294
   3295		table = alloc_irq_table(iommu, devid, NULL);
   3296		if (table) {
   3297			if (!table->min_index) {
   3298				/*
   3299				 * Keep the first 32 indexes free for IOAPIC
   3300				 * interrupts.
   3301				 */
   3302				table->min_index = 32;
   3303				for (i = 0; i < 32; ++i)
   3304					iommu->irte_ops->set_allocated(table, i);
   3305			}
   3306			WARN_ON(table->min_index != 32);
   3307			index = info->ioapic.pin;
   3308		} else {
   3309			index = -ENOMEM;
   3310		}
   3311	} else if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI ||
   3312		   info->type == X86_IRQ_ALLOC_TYPE_PCI_MSIX) {
   3313		bool align = (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI);
   3314
   3315		index = alloc_irq_index(iommu, devid, nr_irqs, align,
   3316					msi_desc_to_pci_dev(info->desc));
   3317	} else {
   3318		index = alloc_irq_index(iommu, devid, nr_irqs, false, NULL);
   3319	}
   3320
   3321	if (index < 0) {
   3322		pr_warn("Failed to allocate IRTE\n");
   3323		ret = index;
   3324		goto out_free_parent;
   3325	}
   3326
   3327	for (i = 0; i < nr_irqs; i++) {
   3328		irq_data = irq_domain_get_irq_data(domain, virq + i);
   3329		cfg = irq_data ? irqd_cfg(irq_data) : NULL;
   3330		if (!cfg) {
   3331			ret = -EINVAL;
   3332			goto out_free_data;
   3333		}
   3334
   3335		ret = -ENOMEM;
   3336		data = kzalloc(sizeof(*data), GFP_KERNEL);
   3337		if (!data)
   3338			goto out_free_data;
   3339
   3340		if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
   3341			data->entry = kzalloc(sizeof(union irte), GFP_KERNEL);
   3342		else
   3343			data->entry = kzalloc(sizeof(struct irte_ga),
   3344						     GFP_KERNEL);
   3345		if (!data->entry) {
   3346			kfree(data);
   3347			goto out_free_data;
   3348		}
   3349
   3350		data->iommu = iommu;
   3351		irq_data->hwirq = (devid << 16) + i;
   3352		irq_data->chip_data = data;
   3353		irq_data->chip = &amd_ir_chip;
   3354		irq_remapping_prepare_irte(data, cfg, info, devid, index, i);
   3355		irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT);
   3356	}
   3357
   3358	return 0;
   3359
   3360out_free_data:
   3361	for (i--; i >= 0; i--) {
   3362		irq_data = irq_domain_get_irq_data(domain, virq + i);
   3363		if (irq_data)
   3364			kfree(irq_data->chip_data);
   3365	}
   3366	for (i = 0; i < nr_irqs; i++)
   3367		free_irte(iommu, devid, index + i);
   3368out_free_parent:
   3369	irq_domain_free_irqs_common(domain, virq, nr_irqs);
   3370	return ret;
   3371}
   3372
   3373static void irq_remapping_free(struct irq_domain *domain, unsigned int virq,
   3374			       unsigned int nr_irqs)
   3375{
   3376	struct irq_2_irte *irte_info;
   3377	struct irq_data *irq_data;
   3378	struct amd_ir_data *data;
   3379	int i;
   3380
   3381	for (i = 0; i < nr_irqs; i++) {
   3382		irq_data = irq_domain_get_irq_data(domain, virq  + i);
   3383		if (irq_data && irq_data->chip_data) {
   3384			data = irq_data->chip_data;
   3385			irte_info = &data->irq_2_irte;
   3386			free_irte(data->iommu, irte_info->devid, irte_info->index);
   3387			kfree(data->entry);
   3388			kfree(data);
   3389		}
   3390	}
   3391	irq_domain_free_irqs_common(domain, virq, nr_irqs);
   3392}
   3393
   3394static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu,
   3395			       struct amd_ir_data *ir_data,
   3396			       struct irq_2_irte *irte_info,
   3397			       struct irq_cfg *cfg);
   3398
   3399static int irq_remapping_activate(struct irq_domain *domain,
   3400				  struct irq_data *irq_data, bool reserve)
   3401{
   3402	struct amd_ir_data *data = irq_data->chip_data;
   3403	struct irq_2_irte *irte_info = &data->irq_2_irte;
   3404	struct amd_iommu *iommu = data->iommu;
   3405	struct irq_cfg *cfg = irqd_cfg(irq_data);
   3406
   3407	if (!iommu)
   3408		return 0;
   3409
   3410	iommu->irte_ops->activate(iommu, data->entry, irte_info->devid,
   3411				  irte_info->index);
   3412	amd_ir_update_irte(irq_data, iommu, data, irte_info, cfg);
   3413	return 0;
   3414}
   3415
   3416static void irq_remapping_deactivate(struct irq_domain *domain,
   3417				     struct irq_data *irq_data)
   3418{
   3419	struct amd_ir_data *data = irq_data->chip_data;
   3420	struct irq_2_irte *irte_info = &data->irq_2_irte;
   3421	struct amd_iommu *iommu = data->iommu;
   3422
   3423	if (iommu)
   3424		iommu->irte_ops->deactivate(iommu, data->entry, irte_info->devid,
   3425					    irte_info->index);
   3426}
   3427
   3428static int irq_remapping_select(struct irq_domain *d, struct irq_fwspec *fwspec,
   3429				enum irq_domain_bus_token bus_token)
   3430{
   3431	struct amd_iommu *iommu;
   3432	int devid = -1;
   3433
   3434	if (!amd_iommu_irq_remap)
   3435		return 0;
   3436
   3437	if (x86_fwspec_is_ioapic(fwspec))
   3438		devid = get_ioapic_devid(fwspec->param[0]);
   3439	else if (x86_fwspec_is_hpet(fwspec))
   3440		devid = get_hpet_devid(fwspec->param[0]);
   3441
   3442	if (devid < 0)
   3443		return 0;
   3444	iommu = __rlookup_amd_iommu((devid >> 16), (devid & 0xffff));
   3445
   3446	return iommu && iommu->ir_domain == d;
   3447}
   3448
   3449static const struct irq_domain_ops amd_ir_domain_ops = {
   3450	.select = irq_remapping_select,
   3451	.alloc = irq_remapping_alloc,
   3452	.free = irq_remapping_free,
   3453	.activate = irq_remapping_activate,
   3454	.deactivate = irq_remapping_deactivate,
   3455};
   3456
   3457int amd_iommu_activate_guest_mode(void *data)
   3458{
   3459	struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
   3460	struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
   3461	u64 valid;
   3462
   3463	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
   3464	    !entry || entry->lo.fields_vapic.guest_mode)
   3465		return 0;
   3466
   3467	valid = entry->lo.fields_vapic.valid;
   3468
   3469	entry->lo.val = 0;
   3470	entry->hi.val = 0;
   3471
   3472	entry->lo.fields_vapic.valid       = valid;
   3473	entry->lo.fields_vapic.guest_mode  = 1;
   3474	entry->lo.fields_vapic.ga_log_intr = 1;
   3475	entry->hi.fields.ga_root_ptr       = ir_data->ga_root_ptr;
   3476	entry->hi.fields.vector            = ir_data->ga_vector;
   3477	entry->lo.fields_vapic.ga_tag      = ir_data->ga_tag;
   3478
   3479	return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
   3480			      ir_data->irq_2_irte.index, entry, ir_data);
   3481}
   3482EXPORT_SYMBOL(amd_iommu_activate_guest_mode);
   3483
   3484int amd_iommu_deactivate_guest_mode(void *data)
   3485{
   3486	struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
   3487	struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
   3488	struct irq_cfg *cfg = ir_data->cfg;
   3489	u64 valid;
   3490
   3491	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
   3492	    !entry || !entry->lo.fields_vapic.guest_mode)
   3493		return 0;
   3494
   3495	valid = entry->lo.fields_remap.valid;
   3496
   3497	entry->lo.val = 0;
   3498	entry->hi.val = 0;
   3499
   3500	entry->lo.fields_remap.valid       = valid;
   3501	entry->lo.fields_remap.dm          = apic->dest_mode_logical;
   3502	entry->lo.fields_remap.int_type    = apic->delivery_mode;
   3503	entry->hi.fields.vector            = cfg->vector;
   3504	entry->lo.fields_remap.destination =
   3505				APICID_TO_IRTE_DEST_LO(cfg->dest_apicid);
   3506	entry->hi.fields.destination =
   3507				APICID_TO_IRTE_DEST_HI(cfg->dest_apicid);
   3508
   3509	return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
   3510			      ir_data->irq_2_irte.index, entry, ir_data);
   3511}
   3512EXPORT_SYMBOL(amd_iommu_deactivate_guest_mode);
   3513
   3514static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
   3515{
   3516	int ret;
   3517	struct amd_iommu_pi_data *pi_data = vcpu_info;
   3518	struct vcpu_data *vcpu_pi_info = pi_data->vcpu_data;
   3519	struct amd_ir_data *ir_data = data->chip_data;
   3520	struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
   3521	struct iommu_dev_data *dev_data;
   3522
   3523	if (ir_data->iommu == NULL)
   3524		return -EINVAL;
   3525
   3526	dev_data = search_dev_data(ir_data->iommu, irte_info->devid);
   3527
   3528	/* Note:
    3529	 * This device has never been set up for guest mode,
    3530	 * so we should not modify the IRTE.
   3531	 */
   3532	if (!dev_data || !dev_data->use_vapic)
   3533		return 0;
   3534
   3535	ir_data->cfg = irqd_cfg(data);
   3536	pi_data->ir_data = ir_data;
   3537
   3538	/* Note:
    3539	 * SVM tries to set up VAPIC mode, but we are in
    3540	 * legacy mode, so we force legacy mode instead.
   3541	 */
   3542	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
   3543		pr_debug("%s: Fall back to using intr legacy remap\n",
   3544			 __func__);
   3545		pi_data->is_guest_mode = false;
   3546	}
   3547
   3548	pi_data->prev_ga_tag = ir_data->cached_ga_tag;
   3549	if (pi_data->is_guest_mode) {
   3550		ir_data->ga_root_ptr = (pi_data->base >> 12);
   3551		ir_data->ga_vector = vcpu_pi_info->vector;
   3552		ir_data->ga_tag = pi_data->ga_tag;
   3553		ret = amd_iommu_activate_guest_mode(ir_data);
   3554		if (!ret)
   3555			ir_data->cached_ga_tag = pi_data->ga_tag;
   3556	} else {
   3557		ret = amd_iommu_deactivate_guest_mode(ir_data);
   3558
   3559		/*
   3560		 * This communicates the ga_tag back to the caller
   3561		 * so that it can do all the necessary clean up.
   3562		 */
   3563		if (!ret)
   3564			ir_data->cached_ga_tag = 0;
   3565	}
   3566
   3567	return ret;
   3568}
   3569
   3570
   3571static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu,
   3572			       struct amd_ir_data *ir_data,
   3573			       struct irq_2_irte *irte_info,
   3574			       struct irq_cfg *cfg)
   3575{
   3576
   3577	/*
    3578	 * Atomically update the IRTE with the new destination and
    3579	 * vector, then flush the interrupt entry cache.
   3580	 */
   3581	iommu->irte_ops->set_affinity(iommu, ir_data->entry, irte_info->devid,
   3582				      irte_info->index, cfg->vector,
   3583				      cfg->dest_apicid);
   3584}
   3585
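/*
 * irq_set_affinity callback: let the parent vector domain pick the new
 * destination first, then mirror it into the IRTE and clean up the old
 * vector allocation.
 */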
   3586static int amd_ir_set_affinity(struct irq_data *data,
   3587			       const struct cpumask *mask, bool force)
   3588{
   3589	struct amd_ir_data *ir_data = data->chip_data;
   3590	struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
   3591	struct irq_cfg *cfg = irqd_cfg(data);
   3592	struct irq_data *parent = data->parent_data;
   3593	struct amd_iommu *iommu = ir_data->iommu;
   3594	int ret;
   3595
   3596	if (!iommu)
   3597		return -ENODEV;
   3598
   3599	ret = parent->chip->irq_set_affinity(parent, mask, force);
   3600	if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
   3601		return ret;
   3602
   3603	amd_ir_update_irte(data, iommu, ir_data, irte_info, cfg);
   3604	/*
   3605	 * After this point, all the interrupts will start arriving
    3606	 * at the new destination. So, it is time to clean up the
    3607	 * previous vector allocation.
   3608	 */
   3609	send_cleanup_vector(cfg);
   3610
   3611	return IRQ_SET_MASK_OK_DONE;
   3612}
   3613
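/* Hand back the MSI message that was cached when the IRTE was prepared. */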
   3614static void ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg)
   3615{
   3616	struct amd_ir_data *ir_data = irq_data->chip_data;
   3617
   3618	*msg = ir_data->msi_entry;
   3619}
   3620
   3621static struct irq_chip amd_ir_chip = {
   3622	.name			= "AMD-IR",
   3623	.irq_ack		= apic_ack_irq,
   3624	.irq_set_affinity	= amd_ir_set_affinity,
   3625	.irq_set_vcpu_affinity	= amd_ir_set_vcpu_affinity,
   3626	.irq_compose_msi_msg	= ir_compose_msi_msg,
   3627};
   3628
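/*
 * Create the per-IOMMU interrupt remapping domain and the MSI domain
 * stacked on top of it; devices behind this IOMMU allocate their
 * interrupts from these domains.
 */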
   3629int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
   3630{
   3631	struct fwnode_handle *fn;
   3632
   3633	fn = irq_domain_alloc_named_id_fwnode("AMD-IR", iommu->index);
   3634	if (!fn)
   3635		return -ENOMEM;
   3636	iommu->ir_domain = irq_domain_create_tree(fn, &amd_ir_domain_ops, iommu);
   3637	if (!iommu->ir_domain) {
   3638		irq_domain_free_fwnode(fn);
   3639		return -ENOMEM;
   3640	}
   3641
   3642	iommu->ir_domain->parent = arch_get_ir_parent_domain();
   3643	iommu->msi_domain = arch_create_remap_msi_irq_domain(iommu->ir_domain,
   3644							     "AMD-IR-MSI",
   3645							     iommu->index);
   3646	return 0;
   3647}
   3648
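/*
 * amd_iommu_update_ga - update the guest-mode IRTE hints for a vCPU
 *
 * Updates the live IRTE's destination to the given APIC ID (if @cpu >= 0)
 * and its is_run hint in place, then flushes the interrupt remapping table
 * entry. @data is the amd_ir_data pointer handed back to the hypervisor
 * through struct amd_iommu_pi_data::ir_data.
 *
 * Illustrative usage (hypervisor side, e.g. KVM's AVIC; names other than
 * amd_iommu_update_ga() are placeholders):
 *
 *	amd_iommu_update_ga(vcpu_apicid, true, ir_data);    vCPU scheduled in
 *	amd_iommu_update_ga(-1, false, ir_data);            vCPU scheduled out
 */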
   3649int amd_iommu_update_ga(int cpu, bool is_run, void *data)
   3650{
   3651	unsigned long flags;
   3652	struct amd_iommu *iommu;
   3653	struct irq_remap_table *table;
   3654	struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
   3655	int devid = ir_data->irq_2_irte.devid;
   3656	struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
   3657	struct irte_ga *ref = (struct irte_ga *) ir_data->ref;
   3658
   3659	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
   3660	    !ref || !entry || !entry->lo.fields_vapic.guest_mode)
   3661		return 0;
   3662
   3663	iommu = ir_data->iommu;
   3664	if (!iommu)
   3665		return -ENODEV;
   3666
   3667	table = get_irq_table(iommu, devid);
   3668	if (!table)
   3669		return -ENODEV;
   3670
   3671	raw_spin_lock_irqsave(&table->lock, flags);
   3672
   3673	if (ref->lo.fields_vapic.guest_mode) {
   3674		if (cpu >= 0) {
   3675			ref->lo.fields_vapic.destination =
   3676						APICID_TO_IRTE_DEST_LO(cpu);
   3677			ref->hi.fields.destination =
   3678						APICID_TO_IRTE_DEST_HI(cpu);
   3679		}
   3680		ref->lo.fields_vapic.is_run = is_run;
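		/* Make sure the compiler does not reorder the IRTE updates above. */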
   3681		barrier();
   3682	}
   3683
   3684	raw_spin_unlock_irqrestore(&table->lock, flags);
   3685
   3686	iommu_flush_irt(iommu, devid);
   3687	iommu_completion_wait(iommu);
   3688	return 0;
   3689}
   3690EXPORT_SYMBOL(amd_iommu_update_ga);
   3691#endif