cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

iommu.c (132570B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * Copyright © 2006-2014 Intel Corporation.
      4 *
      5 * Authors: David Woodhouse <dwmw2@infradead.org>,
      6 *          Ashok Raj <ashok.raj@intel.com>,
      7 *          Shaohua Li <shaohua.li@intel.com>,
      8 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
      9 *          Fenghua Yu <fenghua.yu@intel.com>
     10 *          Joerg Roedel <jroedel@suse.de>
     11 */
     12
     13#define pr_fmt(fmt)     "DMAR: " fmt
     14#define dev_fmt(fmt)    pr_fmt(fmt)
     15
     16#include <linux/crash_dump.h>
     17#include <linux/dma-direct.h>
     18#include <linux/dma-iommu.h>
     19#include <linux/dmi.h>
     20#include <linux/intel-iommu.h>
     21#include <linux/intel-svm.h>
     22#include <linux/memory.h>
     23#include <linux/pci.h>
     24#include <linux/pci-ats.h>
     25#include <linux/spinlock.h>
     26#include <linux/syscore_ops.h>
     27#include <linux/tboot.h>
     28
     29#include "../irq_remapping.h"
     30#include "../iommu-sva-lib.h"
     31#include "pasid.h"
     32#include "cap_audit.h"
     33
     34#define ROOT_SIZE		VTD_PAGE_SIZE
     35#define CONTEXT_SIZE		VTD_PAGE_SIZE
     36
     37#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
     38#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
     39#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
     40#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
     41
     42#define IOAPIC_RANGE_START	(0xfee00000)
     43#define IOAPIC_RANGE_END	(0xfeefffff)
     44#define IOVA_START_ADDR		(0x1000)
     45
     46#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
     47
     48#define MAX_AGAW_WIDTH 64
     49#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
     50
     51#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
     52#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
     53
     54/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
     55   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
     56#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
     57				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
     58#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
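        /*
         * Illustrative example (follows from the definitions above): for
         * gaw == 48, __DOMAIN_MAX_PFN(48) is 2^36 - 1 and DOMAIN_MAX_ADDR(48)
         * is that pfn shifted left by VTD_PAGE_SHIFT, i.e. the base of the
         * highest addressable 4KiB page below 2^48.
         */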
     59
     60/* IO virtual address start page frame number */
     61#define IOVA_START_PFN		(1)
     62
     63#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
     64
     65/* page table handling */
     66#define LEVEL_STRIDE		(9)
     67#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
     68
     69static inline int agaw_to_level(int agaw)
     70{
     71	return agaw + 2;
     72}
     73
     74static inline int agaw_to_width(int agaw)
     75{
     76	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
     77}
     78
     79static inline int width_to_agaw(int width)
     80{
     81	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
     82}
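        /*
         * Illustrative mapping (values follow from LEVEL_STRIDE == 9):
         *   agaw 1 -> 39-bit address width -> 3-level page table
         *   agaw 2 -> 48-bit address width -> 4-level page table
         *   agaw 3 -> 57-bit address width -> 5-level page table
         * e.g. agaw_to_width(2) == 30 + 2 * 9 == 48 and agaw_to_level(2) == 4.
         */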
     83
     84static inline unsigned int level_to_offset_bits(int level)
     85{
     86	return (level - 1) * LEVEL_STRIDE;
     87}
     88
     89static inline int pfn_level_offset(u64 pfn, int level)
     90{
     91	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
     92}
     93
     94static inline u64 level_mask(int level)
     95{
     96	return -1ULL << level_to_offset_bits(level);
     97}
     98
     99static inline u64 level_size(int level)
    100{
    101	return 1ULL << level_to_offset_bits(level);
    102}
    103
    104static inline u64 align_to_level(u64 pfn, int level)
    105{
    106	return (pfn + level_size(level) - 1) & level_mask(level);
    107}
    108
    109static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
    110{
    111	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
    112}
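        /*
         * Illustrative values (derived from the helpers above): at level 1,
         * pfn_level_offset() selects pfn bits 0-8; at level 2, bits 9-17.
         * level_size(2) == 512, so lvl_to_nr_pages(2) == 512 4KiB pages
         * (one 2MiB superpage) and lvl_to_nr_pages(3) covers 1GiB.
         */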
    113
    114/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
    115   are never going to work. */
    116static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
    117{
    118	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
    119}
    120static inline unsigned long page_to_dma_pfn(struct page *pg)
    121{
    122	return mm_to_dma_pfn(page_to_pfn(pg));
    123}
    124static inline unsigned long virt_to_dma_pfn(void *p)
    125{
    126	return page_to_dma_pfn(virt_to_page(p));
    127}
    128
    129/* global iommu list, set NULL for ignored DMAR units */
    130static struct intel_iommu **g_iommus;
    131
    132static void __init check_tylersburg_isoch(void);
    133static int rwbf_quirk;
    134static inline struct device_domain_info *
    135dmar_search_domain_by_dev_info(int segment, int bus, int devfn);
    136
    137/*
     138 * set to 1 to panic the kernel if VT-d can't be successfully enabled
     139 * (used when the kernel is launched w/ TXT)
    140 */
    141static int force_on = 0;
    142static int intel_iommu_tboot_noforce;
    143static int no_platform_optin;
    144
    145#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
    146
    147/*
    148 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
    149 * if marked present.
    150 */
    151static phys_addr_t root_entry_lctp(struct root_entry *re)
    152{
    153	if (!(re->lo & 1))
    154		return 0;
    155
    156	return re->lo & VTD_PAGE_MASK;
    157}
    158
    159/*
    160 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
    161 * if marked present.
    162 */
    163static phys_addr_t root_entry_uctp(struct root_entry *re)
    164{
    165	if (!(re->hi & 1))
    166		return 0;
    167
    168	return re->hi & VTD_PAGE_MASK;
    169}
    170
    171static inline void context_clear_pasid_enable(struct context_entry *context)
    172{
    173	context->lo &= ~(1ULL << 11);
    174}
    175
    176static inline bool context_pasid_enabled(struct context_entry *context)
    177{
    178	return !!(context->lo & (1ULL << 11));
    179}
    180
    181static inline void context_set_copied(struct context_entry *context)
    182{
    183	context->hi |= (1ull << 3);
    184}
    185
    186static inline bool context_copied(struct context_entry *context)
    187{
    188	return !!(context->hi & (1ULL << 3));
    189}
    190
    191static inline bool __context_present(struct context_entry *context)
    192{
    193	return (context->lo & 1);
    194}
    195
    196bool context_present(struct context_entry *context)
    197{
    198	return context_pasid_enabled(context) ?
    199	     __context_present(context) :
    200	     __context_present(context) && !context_copied(context);
    201}
    202
    203static inline void context_set_present(struct context_entry *context)
    204{
    205	context->lo |= 1;
    206}
    207
    208static inline void context_set_fault_enable(struct context_entry *context)
    209{
    210	context->lo &= (((u64)-1) << 2) | 1;
    211}
    212
    213static inline void context_set_translation_type(struct context_entry *context,
    214						unsigned long value)
    215{
    216	context->lo &= (((u64)-1) << 4) | 3;
    217	context->lo |= (value & 3) << 2;
    218}
    219
    220static inline void context_set_address_root(struct context_entry *context,
    221					    unsigned long value)
    222{
    223	context->lo &= ~VTD_PAGE_MASK;
    224	context->lo |= value & VTD_PAGE_MASK;
    225}
    226
    227static inline void context_set_address_width(struct context_entry *context,
    228					     unsigned long value)
    229{
    230	context->hi |= value & 7;
    231}
    232
    233static inline void context_set_domain_id(struct context_entry *context,
    234					 unsigned long value)
    235{
    236	context->hi |= (value & ((1 << 16) - 1)) << 8;
    237}
    238
    239static inline int context_domain_id(struct context_entry *c)
    240{
    241	return((c->hi >> 8) & 0xffff);
    242}
    243
    244static inline void context_clear_entry(struct context_entry *context)
    245{
    246	context->lo = 0;
    247	context->hi = 0;
    248}
    249
    250/*
     251 * This domain is a static identity mapping domain.
     252 *	1. This domain creates a static 1:1 mapping to all usable memory.
     253 * 	2. It maps to each iommu if successful.
     254 *	3. Each iommu maps to this domain if successful.
    255 */
    256static struct dmar_domain *si_domain;
    257static int hw_pass_through = 1;
    258
    259#define for_each_domain_iommu(idx, domain)			\
    260	for (idx = 0; idx < g_num_of_iommus; idx++)		\
    261		if (domain->iommu_refcnt[idx])
    262
    263struct dmar_rmrr_unit {
    264	struct list_head list;		/* list of rmrr units	*/
    265	struct acpi_dmar_header *hdr;	/* ACPI header		*/
    266	u64	base_address;		/* reserved base address*/
    267	u64	end_address;		/* reserved end address */
    268	struct dmar_dev_scope *devices;	/* target devices */
    269	int	devices_cnt;		/* target device count */
    270};
    271
    272struct dmar_atsr_unit {
    273	struct list_head list;		/* list of ATSR units */
    274	struct acpi_dmar_header *hdr;	/* ACPI header */
    275	struct dmar_dev_scope *devices;	/* target devices */
    276	int devices_cnt;		/* target device count */
    277	u8 include_all:1;		/* include all ports */
    278};
    279
    280struct dmar_satc_unit {
    281	struct list_head list;		/* list of SATC units */
    282	struct acpi_dmar_header *hdr;	/* ACPI header */
    283	struct dmar_dev_scope *devices;	/* target devices */
    284	struct intel_iommu *iommu;	/* the corresponding iommu */
    285	int devices_cnt;		/* target device count */
    286	u8 atc_required:1;		/* ATS is required */
    287};
    288
    289static LIST_HEAD(dmar_atsr_units);
    290static LIST_HEAD(dmar_rmrr_units);
    291static LIST_HEAD(dmar_satc_units);
    292
    293#define for_each_rmrr_units(rmrr) \
    294	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
    295
     296/* number of IOMMUs, used to size the g_iommus array */
    297static int g_num_of_iommus;
    298
    299static void domain_remove_dev_info(struct dmar_domain *domain);
    300static void dmar_remove_one_dev_info(struct device *dev);
    301static void __dmar_remove_one_dev_info(struct device_domain_info *info);
    302
    303int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
    304int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
    305
    306int intel_iommu_enabled = 0;
    307EXPORT_SYMBOL_GPL(intel_iommu_enabled);
    308
    309static int dmar_map_gfx = 1;
    310static int intel_iommu_superpage = 1;
    311static int iommu_identity_mapping;
    312static int iommu_skip_te_disable;
    313
    314#define IDENTMAP_GFX		2
    315#define IDENTMAP_AZALIA		4
    316
    317int intel_iommu_gfx_mapped;
    318EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
    319
    320DEFINE_SPINLOCK(device_domain_lock);
    321static LIST_HEAD(device_domain_list);
    322
    323const struct iommu_ops intel_iommu_ops;
    324
    325static bool translation_pre_enabled(struct intel_iommu *iommu)
    326{
    327	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
    328}
    329
    330static void clear_translation_pre_enabled(struct intel_iommu *iommu)
    331{
    332	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
    333}
    334
    335static void init_translation_status(struct intel_iommu *iommu)
    336{
    337	u32 gsts;
    338
    339	gsts = readl(iommu->reg + DMAR_GSTS_REG);
    340	if (gsts & DMA_GSTS_TES)
    341		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
    342}
    343
    344static int __init intel_iommu_setup(char *str)
    345{
    346	if (!str)
    347		return -EINVAL;
    348
    349	while (*str) {
    350		if (!strncmp(str, "on", 2)) {
    351			dmar_disabled = 0;
    352			pr_info("IOMMU enabled\n");
    353		} else if (!strncmp(str, "off", 3)) {
    354			dmar_disabled = 1;
    355			no_platform_optin = 1;
    356			pr_info("IOMMU disabled\n");
    357		} else if (!strncmp(str, "igfx_off", 8)) {
    358			dmar_map_gfx = 0;
    359			pr_info("Disable GFX device mapping\n");
    360		} else if (!strncmp(str, "forcedac", 8)) {
    361			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
    362			iommu_dma_forcedac = true;
    363		} else if (!strncmp(str, "strict", 6)) {
    364			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
    365			iommu_set_dma_strict();
    366		} else if (!strncmp(str, "sp_off", 6)) {
    367			pr_info("Disable supported super page\n");
    368			intel_iommu_superpage = 0;
    369		} else if (!strncmp(str, "sm_on", 5)) {
    370			pr_info("Enable scalable mode if hardware supports\n");
    371			intel_iommu_sm = 1;
    372		} else if (!strncmp(str, "sm_off", 6)) {
    373			pr_info("Scalable mode is disallowed\n");
    374			intel_iommu_sm = 0;
    375		} else if (!strncmp(str, "tboot_noforce", 13)) {
    376			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
    377			intel_iommu_tboot_noforce = 1;
    378		} else {
    379			pr_notice("Unknown option - '%s'\n", str);
    380		}
    381
    382		str += strcspn(str, ",");
    383		while (*str == ',')
    384			str++;
    385	}
    386
    387	return 1;
    388}
    389__setup("intel_iommu=", intel_iommu_setup);
    390
    391void *alloc_pgtable_page(int node)
    392{
    393	struct page *page;
    394	void *vaddr = NULL;
    395
    396	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
    397	if (page)
    398		vaddr = page_address(page);
    399	return vaddr;
    400}
    401
    402void free_pgtable_page(void *vaddr)
    403{
    404	free_page((unsigned long)vaddr);
    405}
    406
    407static inline int domain_type_is_si(struct dmar_domain *domain)
    408{
    409	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
    410}
    411
    412static inline bool domain_use_first_level(struct dmar_domain *domain)
    413{
    414	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
    415}
    416
    417static inline int domain_pfn_supported(struct dmar_domain *domain,
    418				       unsigned long pfn)
    419{
    420	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
    421
    422	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
    423}
    424
    425static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
    426{
    427	unsigned long sagaw;
    428	int agaw;
    429
    430	sagaw = cap_sagaw(iommu->cap);
    431	for (agaw = width_to_agaw(max_gaw);
    432	     agaw >= 0; agaw--) {
    433		if (test_bit(agaw, &sagaw))
    434			break;
    435	}
    436
    437	return agaw;
    438}
    439
    440/*
    441 * Calculate max SAGAW for each iommu.
    442 */
    443int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
    444{
    445	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
    446}
    447
    448/*
     449 * Calculate agaw for each iommu.
     450 * "SAGAW" may be different across iommus; use the default agaw and
     451 * fall back to a lesser supported agaw for iommus that don't support the default.
    452 */
    453int iommu_calculate_agaw(struct intel_iommu *iommu)
    454{
    455	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
    456}
    457
     458/* This function only returns a single iommu in a domain */
    459struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
    460{
    461	int iommu_id;
    462
    463	/* si_domain and vm domain should not get here. */
    464	if (WARN_ON(!iommu_is_dma_domain(&domain->domain)))
    465		return NULL;
    466
    467	for_each_domain_iommu(iommu_id, domain)
    468		break;
    469
    470	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
    471		return NULL;
    472
    473	return g_iommus[iommu_id];
    474}
    475
    476static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
    477{
    478	return sm_supported(iommu) ?
    479			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
    480}
    481
    482static void domain_update_iommu_coherency(struct dmar_domain *domain)
    483{
    484	struct dmar_drhd_unit *drhd;
    485	struct intel_iommu *iommu;
    486	bool found = false;
    487	int i;
    488
    489	domain->iommu_coherency = true;
    490
    491	for_each_domain_iommu(i, domain) {
    492		found = true;
    493		if (!iommu_paging_structure_coherency(g_iommus[i])) {
    494			domain->iommu_coherency = false;
    495			break;
    496		}
    497	}
    498	if (found)
    499		return;
    500
    501	/* No hardware attached; use lowest common denominator */
    502	rcu_read_lock();
    503	for_each_active_iommu(iommu, drhd) {
    504		if (!iommu_paging_structure_coherency(iommu)) {
    505			domain->iommu_coherency = false;
    506			break;
    507		}
    508	}
    509	rcu_read_unlock();
    510}
    511
    512static int domain_update_iommu_superpage(struct dmar_domain *domain,
    513					 struct intel_iommu *skip)
    514{
    515	struct dmar_drhd_unit *drhd;
    516	struct intel_iommu *iommu;
    517	int mask = 0x3;
    518
    519	if (!intel_iommu_superpage)
    520		return 0;
    521
    522	/* set iommu_superpage to the smallest common denominator */
    523	rcu_read_lock();
    524	for_each_active_iommu(iommu, drhd) {
    525		if (iommu != skip) {
    526			if (domain && domain_use_first_level(domain)) {
    527				if (!cap_fl1gp_support(iommu->cap))
    528					mask = 0x1;
    529			} else {
    530				mask &= cap_super_page_val(iommu->cap);
    531			}
    532
    533			if (!mask)
    534				break;
    535		}
    536	}
    537	rcu_read_unlock();
    538
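        	/*
        	 * Illustrative note: fls() turns the surviving capability bits into
        	 * a superpage level: mask 0x3 -> 2 (2MiB and 1GiB), mask 0x1 -> 1
        	 * (2MiB only), mask 0 -> 0 (no superpage support). The result feeds
        	 * domain_super_pgsize_bitmap() further down in this file.
        	 */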
    539	return fls(mask);
    540}
    541
    542static int domain_update_device_node(struct dmar_domain *domain)
    543{
    544	struct device_domain_info *info;
    545	int nid = NUMA_NO_NODE;
    546
    547	assert_spin_locked(&device_domain_lock);
    548
    549	if (list_empty(&domain->devices))
    550		return NUMA_NO_NODE;
    551
    552	list_for_each_entry(info, &domain->devices, link) {
    553		if (!info->dev)
    554			continue;
    555
    556		/*
     557		 * There could be multiple device NUMA nodes, as devices within
     558		 * the same domain may sit behind different IOMMUs. There is no
     559		 * perfect answer in such a situation, so we use a first-come,
     560		 * first-served policy.
    561		 */
    562		nid = dev_to_node(info->dev);
    563		if (nid != NUMA_NO_NODE)
    564			break;
    565	}
    566
    567	return nid;
    568}
    569
    570static void domain_update_iotlb(struct dmar_domain *domain);
    571
    572/* Return the super pagesize bitmap if supported. */
    573static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
    574{
    575	unsigned long bitmap = 0;
    576
    577	/*
    578	 * 1-level super page supports page size of 2MiB, 2-level super page
    579	 * supports page size of both 2MiB and 1GiB.
    580	 */
    581	if (domain->iommu_superpage == 1)
    582		bitmap |= SZ_2M;
    583	else if (domain->iommu_superpage == 2)
    584		bitmap |= SZ_2M | SZ_1G;
    585
    586	return bitmap;
    587}
    588
    589/* Some capabilities may be different across iommus */
    590static void domain_update_iommu_cap(struct dmar_domain *domain)
    591{
    592	domain_update_iommu_coherency(domain);
    593	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
    594
    595	/*
    596	 * If RHSA is missing, we should default to the device numa domain
    597	 * as fall back.
    598	 */
    599	if (domain->nid == NUMA_NO_NODE)
    600		domain->nid = domain_update_device_node(domain);
    601
    602	/*
    603	 * First-level translation restricts the input-address to a
    604	 * canonical address (i.e., address bits 63:N have the same
    605	 * value as address bit [N-1], where N is 48-bits with 4-level
    606	 * paging and 57-bits with 5-level paging). Hence, skip bit
    607	 * [N-1].
    608	 */
    609	if (domain_use_first_level(domain))
    610		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
    611	else
    612		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
    613
    614	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
    615	domain_update_iotlb(domain);
    616}
    617
    618struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
    619					 u8 devfn, int alloc)
    620{
    621	struct root_entry *root = &iommu->root_entry[bus];
    622	struct context_entry *context;
    623	u64 *entry;
    624
    625	entry = &root->lo;
    626	if (sm_supported(iommu)) {
    627		if (devfn >= 0x80) {
    628			devfn -= 0x80;
    629			entry = &root->hi;
    630		}
    631		devfn *= 2;
    632	}
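        	/*
        	 * Illustrative example: in scalable mode each root entry holds two
        	 * context-table pointers, lo for devfn 0x00-0x7f and hi for devfn
        	 * 0x80-0xff, and scalable-mode context entries are twice the legacy
        	 * size, hence the folding and doubling of devfn above; e.g. devfn
        	 * 0x85 selects root->hi and slot (0x85 - 0x80) * 2 == 10.
        	 */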
    633	if (*entry & 1)
    634		context = phys_to_virt(*entry & VTD_PAGE_MASK);
    635	else {
    636		unsigned long phy_addr;
    637		if (!alloc)
    638			return NULL;
    639
    640		context = alloc_pgtable_page(iommu->node);
    641		if (!context)
    642			return NULL;
    643
    644		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
    645		phy_addr = virt_to_phys((void *)context);
    646		*entry = phy_addr | 1;
    647		__iommu_flush_cache(iommu, entry, sizeof(*entry));
    648	}
    649	return &context[devfn];
    650}
    651
    652/**
    653 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
    654 *				 sub-hierarchy of a candidate PCI-PCI bridge
    655 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
    656 * @bridge: the candidate PCI-PCI bridge
    657 *
    658 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
    659 */
    660static bool
    661is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
    662{
    663	struct pci_dev *pdev, *pbridge;
    664
    665	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
    666		return false;
    667
    668	pdev = to_pci_dev(dev);
    669	pbridge = to_pci_dev(bridge);
    670
    671	if (pbridge->subordinate &&
    672	    pbridge->subordinate->number <= pdev->bus->number &&
    673	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
    674		return true;
    675
    676	return false;
    677}
    678
    679static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
    680{
    681	struct dmar_drhd_unit *drhd;
    682	u32 vtbar;
    683	int rc;
    684
    685	/* We know that this device on this chipset has its own IOMMU.
    686	 * If we find it under a different IOMMU, then the BIOS is lying
    687	 * to us. Hope that the IOMMU for this device is actually
    688	 * disabled, and it needs no translation...
    689	 */
    690	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
    691	if (rc) {
    692		/* "can't" happen */
    693		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
    694		return false;
    695	}
    696	vtbar &= 0xffff0000;
    697
     698	/* we know that this iommu should be at offset 0xa000 from vtbar */
    699	drhd = dmar_find_matched_drhd_unit(pdev);
    700	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
    701		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
    702		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
    703		return true;
    704	}
    705
    706	return false;
    707}
    708
    709static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
    710{
    711	if (!iommu || iommu->drhd->ignored)
    712		return true;
    713
    714	if (dev_is_pci(dev)) {
    715		struct pci_dev *pdev = to_pci_dev(dev);
    716
    717		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
    718		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
    719		    quirk_ioat_snb_local_iommu(pdev))
    720			return true;
    721	}
    722
    723	return false;
    724}
    725
    726struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
    727{
    728	struct dmar_drhd_unit *drhd = NULL;
    729	struct pci_dev *pdev = NULL;
    730	struct intel_iommu *iommu;
    731	struct device *tmp;
    732	u16 segment = 0;
    733	int i;
    734
    735	if (!dev)
    736		return NULL;
    737
    738	if (dev_is_pci(dev)) {
    739		struct pci_dev *pf_pdev;
    740
    741		pdev = pci_real_dma_dev(to_pci_dev(dev));
    742
    743		/* VFs aren't listed in scope tables; we need to look up
    744		 * the PF instead to find the IOMMU. */
    745		pf_pdev = pci_physfn(pdev);
    746		dev = &pf_pdev->dev;
    747		segment = pci_domain_nr(pdev->bus);
    748	} else if (has_acpi_companion(dev))
    749		dev = &ACPI_COMPANION(dev)->dev;
    750
    751	rcu_read_lock();
    752	for_each_iommu(iommu, drhd) {
    753		if (pdev && segment != drhd->segment)
    754			continue;
    755
    756		for_each_active_dev_scope(drhd->devices,
    757					  drhd->devices_cnt, i, tmp) {
    758			if (tmp == dev) {
    759				/* For a VF use its original BDF# not that of the PF
    760				 * which we used for the IOMMU lookup. Strictly speaking
    761				 * we could do this for all PCI devices; we only need to
    762				 * get the BDF# from the scope table for ACPI matches. */
    763				if (pdev && pdev->is_virtfn)
    764					goto got_pdev;
    765
    766				if (bus && devfn) {
    767					*bus = drhd->devices[i].bus;
    768					*devfn = drhd->devices[i].devfn;
    769				}
    770				goto out;
    771			}
    772
    773			if (is_downstream_to_pci_bridge(dev, tmp))
    774				goto got_pdev;
    775		}
    776
    777		if (pdev && drhd->include_all) {
    778got_pdev:
    779			if (bus && devfn) {
    780				*bus = pdev->bus->number;
    781				*devfn = pdev->devfn;
    782			}
    783			goto out;
    784		}
    785	}
    786	iommu = NULL;
    787out:
    788	if (iommu_is_dummy(iommu, dev))
    789		iommu = NULL;
    790
    791	rcu_read_unlock();
    792
    793	return iommu;
    794}
    795
    796static void domain_flush_cache(struct dmar_domain *domain,
    797			       void *addr, int size)
    798{
    799	if (!domain->iommu_coherency)
    800		clflush_cache_range(addr, size);
    801}
    802
    803static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
    804{
    805	struct context_entry *context;
    806	int ret = 0;
    807	unsigned long flags;
    808
    809	spin_lock_irqsave(&iommu->lock, flags);
    810	context = iommu_context_addr(iommu, bus, devfn, 0);
    811	if (context)
    812		ret = context_present(context);
    813	spin_unlock_irqrestore(&iommu->lock, flags);
    814	return ret;
    815}
    816
    817static void free_context_table(struct intel_iommu *iommu)
    818{
    819	int i;
    820	unsigned long flags;
    821	struct context_entry *context;
    822
    823	spin_lock_irqsave(&iommu->lock, flags);
    824	if (!iommu->root_entry) {
    825		goto out;
    826	}
    827	for (i = 0; i < ROOT_ENTRY_NR; i++) {
    828		context = iommu_context_addr(iommu, i, 0, 0);
    829		if (context)
    830			free_pgtable_page(context);
    831
    832		if (!sm_supported(iommu))
    833			continue;
    834
    835		context = iommu_context_addr(iommu, i, 0x80, 0);
    836		if (context)
    837			free_pgtable_page(context);
    838
    839	}
    840	free_pgtable_page(iommu->root_entry);
    841	iommu->root_entry = NULL;
    842out:
    843	spin_unlock_irqrestore(&iommu->lock, flags);
    844}
    845
    846#ifdef CONFIG_DMAR_DEBUG
    847static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, u8 bus, u8 devfn)
    848{
    849	struct device_domain_info *info;
    850	struct dma_pte *parent, *pte;
    851	struct dmar_domain *domain;
    852	int offset, level;
    853
    854	info = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
    855	if (!info || !info->domain) {
    856		pr_info("device [%02x:%02x.%d] not probed\n",
    857			bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
    858		return;
    859	}
    860
    861	domain = info->domain;
    862	level = agaw_to_level(domain->agaw);
    863	parent = domain->pgd;
    864	if (!parent) {
    865		pr_info("no page table setup\n");
    866		return;
    867	}
    868
    869	while (1) {
    870		offset = pfn_level_offset(pfn, level);
    871		pte = &parent[offset];
    872		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
    873			pr_info("PTE not present at level %d\n", level);
    874			break;
    875		}
    876
    877		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
    878
    879		if (level == 1)
    880			break;
    881
    882		parent = phys_to_virt(dma_pte_addr(pte));
    883		level--;
    884	}
    885}
    886
    887void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
    888			  unsigned long long addr, u32 pasid)
    889{
    890	struct pasid_dir_entry *dir, *pde;
    891	struct pasid_entry *entries, *pte;
    892	struct context_entry *ctx_entry;
    893	struct root_entry *rt_entry;
    894	u8 devfn = source_id & 0xff;
    895	u8 bus = source_id >> 8;
    896	int i, dir_index, index;
    897
    898	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
    899
    900	/* root entry dump */
    901	rt_entry = &iommu->root_entry[bus];
    902	if (!rt_entry) {
    903		pr_info("root table entry is not present\n");
    904		return;
    905	}
    906
    907	if (sm_supported(iommu))
    908		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
    909			rt_entry->hi, rt_entry->lo);
    910	else
     911		pr_info("root entry: 0x%016llx\n", rt_entry->lo);
    912
    913	/* context entry dump */
    914	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
    915	if (!ctx_entry) {
    916		pr_info("context table entry is not present\n");
    917		return;
    918	}
    919
    920	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
    921		ctx_entry->hi, ctx_entry->lo);
    922
    923	/* legacy mode does not require PASID entries */
    924	if (!sm_supported(iommu))
    925		goto pgtable_walk;
    926
    927	/* get the pointer to pasid directory entry */
    928	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
    929	if (!dir) {
    930		pr_info("pasid directory entry is not present\n");
    931		return;
    932	}
    933	/* For request-without-pasid, get the pasid from context entry */
    934	if (intel_iommu_sm && pasid == INVALID_IOASID)
    935		pasid = PASID_RID2PASID;
    936
    937	dir_index = pasid >> PASID_PDE_SHIFT;
    938	pde = &dir[dir_index];
    939	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
    940
    941	/* get the pointer to the pasid table entry */
    942	entries = get_pasid_table_from_pde(pde);
    943	if (!entries) {
    944		pr_info("pasid table entry is not present\n");
    945		return;
    946	}
    947	index = pasid & PASID_PTE_MASK;
    948	pte = &entries[index];
    949	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
    950		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
    951
    952pgtable_walk:
    953	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn);
    954}
    955#endif
    956
    957static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
    958				      unsigned long pfn, int *target_level)
    959{
    960	struct dma_pte *parent, *pte;
    961	int level = agaw_to_level(domain->agaw);
    962	int offset;
    963
    964	BUG_ON(!domain->pgd);
    965
    966	if (!domain_pfn_supported(domain, pfn))
    967		/* Address beyond IOMMU's addressing capabilities. */
    968		return NULL;
    969
    970	parent = domain->pgd;
    971
    972	while (1) {
    973		void *tmp_page;
    974
    975		offset = pfn_level_offset(pfn, level);
    976		pte = &parent[offset];
    977		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
    978			break;
    979		if (level == *target_level)
    980			break;
    981
    982		if (!dma_pte_present(pte)) {
    983			uint64_t pteval;
    984
    985			tmp_page = alloc_pgtable_page(domain->nid);
    986
    987			if (!tmp_page)
    988				return NULL;
    989
    990			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
    991			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
    992			if (domain_use_first_level(domain)) {
    993				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
    994				if (iommu_is_dma_domain(&domain->domain))
    995					pteval |= DMA_FL_PTE_ACCESS;
    996			}
    997			if (cmpxchg64(&pte->val, 0ULL, pteval))
    998				/* Someone else set it while we were thinking; use theirs. */
    999				free_pgtable_page(tmp_page);
   1000			else
   1001				domain_flush_cache(domain, pte, sizeof(*pte));
   1002		}
   1003		if (level == 1)
   1004			break;
   1005
   1006		parent = phys_to_virt(dma_pte_addr(pte));
   1007		level--;
   1008	}
   1009
   1010	if (!*target_level)
   1011		*target_level = level;
   1012
   1013	return pte;
   1014}
   1015
    1016/* return the address's pte at a specific level */
   1017static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
   1018					 unsigned long pfn,
   1019					 int level, int *large_page)
   1020{
   1021	struct dma_pte *parent, *pte;
   1022	int total = agaw_to_level(domain->agaw);
   1023	int offset;
   1024
   1025	parent = domain->pgd;
   1026	while (level <= total) {
   1027		offset = pfn_level_offset(pfn, total);
   1028		pte = &parent[offset];
   1029		if (level == total)
   1030			return pte;
   1031
   1032		if (!dma_pte_present(pte)) {
   1033			*large_page = total;
   1034			break;
   1035		}
   1036
   1037		if (dma_pte_superpage(pte)) {
   1038			*large_page = total;
   1039			return pte;
   1040		}
   1041
   1042		parent = phys_to_virt(dma_pte_addr(pte));
   1043		total--;
   1044	}
   1045	return NULL;
   1046}
   1047
    1048/* clear last level pte; a tlb flush must follow */
   1049static void dma_pte_clear_range(struct dmar_domain *domain,
   1050				unsigned long start_pfn,
   1051				unsigned long last_pfn)
   1052{
   1053	unsigned int large_page;
   1054	struct dma_pte *first_pte, *pte;
   1055
   1056	BUG_ON(!domain_pfn_supported(domain, start_pfn));
   1057	BUG_ON(!domain_pfn_supported(domain, last_pfn));
   1058	BUG_ON(start_pfn > last_pfn);
   1059
   1060	/* we don't need lock here; nobody else touches the iova range */
   1061	do {
   1062		large_page = 1;
   1063		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
   1064		if (!pte) {
   1065			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
   1066			continue;
   1067		}
   1068		do {
   1069			dma_clear_pte(pte);
   1070			start_pfn += lvl_to_nr_pages(large_page);
   1071			pte++;
   1072		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
   1073
   1074		domain_flush_cache(domain, first_pte,
   1075				   (void *)pte - (void *)first_pte);
   1076
   1077	} while (start_pfn && start_pfn <= last_pfn);
   1078}
   1079
   1080static void dma_pte_free_level(struct dmar_domain *domain, int level,
   1081			       int retain_level, struct dma_pte *pte,
   1082			       unsigned long pfn, unsigned long start_pfn,
   1083			       unsigned long last_pfn)
   1084{
   1085	pfn = max(start_pfn, pfn);
   1086	pte = &pte[pfn_level_offset(pfn, level)];
   1087
   1088	do {
   1089		unsigned long level_pfn;
   1090		struct dma_pte *level_pte;
   1091
   1092		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
   1093			goto next;
   1094
   1095		level_pfn = pfn & level_mask(level);
   1096		level_pte = phys_to_virt(dma_pte_addr(pte));
   1097
   1098		if (level > 2) {
   1099			dma_pte_free_level(domain, level - 1, retain_level,
   1100					   level_pte, level_pfn, start_pfn,
   1101					   last_pfn);
   1102		}
   1103
   1104		/*
   1105		 * Free the page table if we're below the level we want to
   1106		 * retain and the range covers the entire table.
   1107		 */
   1108		if (level < retain_level && !(start_pfn > level_pfn ||
   1109		      last_pfn < level_pfn + level_size(level) - 1)) {
   1110			dma_clear_pte(pte);
   1111			domain_flush_cache(domain, pte, sizeof(*pte));
   1112			free_pgtable_page(level_pte);
   1113		}
   1114next:
   1115		pfn += level_size(level);
   1116	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
   1117}
   1118
   1119/*
   1120 * clear last level (leaf) ptes and free page table pages below the
   1121 * level we wish to keep intact.
   1122 */
   1123static void dma_pte_free_pagetable(struct dmar_domain *domain,
   1124				   unsigned long start_pfn,
   1125				   unsigned long last_pfn,
   1126				   int retain_level)
   1127{
   1128	dma_pte_clear_range(domain, start_pfn, last_pfn);
   1129
   1130	/* We don't need lock here; nobody else touches the iova range */
   1131	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
   1132			   domain->pgd, 0, start_pfn, last_pfn);
   1133
   1134	/* free pgd */
   1135	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
   1136		free_pgtable_page(domain->pgd);
   1137		domain->pgd = NULL;
   1138	}
   1139}
   1140
   1141/* When a page at a given level is being unlinked from its parent, we don't
   1142   need to *modify* it at all. All we need to do is make a list of all the
   1143   pages which can be freed just as soon as we've flushed the IOTLB and we
   1144   know the hardware page-walk will no longer touch them.
   1145   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   1146   be freed. */
   1147static void dma_pte_list_pagetables(struct dmar_domain *domain,
   1148				    int level, struct dma_pte *pte,
   1149				    struct list_head *freelist)
   1150{
   1151	struct page *pg;
   1152
   1153	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
   1154	list_add_tail(&pg->lru, freelist);
   1155
   1156	if (level == 1)
   1157		return;
   1158
   1159	pte = page_address(pg);
   1160	do {
   1161		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
   1162			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
   1163		pte++;
   1164	} while (!first_pte_in_page(pte));
   1165}
   1166
   1167static void dma_pte_clear_level(struct dmar_domain *domain, int level,
   1168				struct dma_pte *pte, unsigned long pfn,
   1169				unsigned long start_pfn, unsigned long last_pfn,
   1170				struct list_head *freelist)
   1171{
   1172	struct dma_pte *first_pte = NULL, *last_pte = NULL;
   1173
   1174	pfn = max(start_pfn, pfn);
   1175	pte = &pte[pfn_level_offset(pfn, level)];
   1176
   1177	do {
   1178		unsigned long level_pfn = pfn & level_mask(level);
   1179
   1180		if (!dma_pte_present(pte))
   1181			goto next;
   1182
   1183		/* If range covers entire pagetable, free it */
   1184		if (start_pfn <= level_pfn &&
   1185		    last_pfn >= level_pfn + level_size(level) - 1) {
    1186			/* These subordinate page tables are going away entirely. Don't
   1187			   bother to clear them; we're just going to *free* them. */
   1188			if (level > 1 && !dma_pte_superpage(pte))
   1189				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
   1190
   1191			dma_clear_pte(pte);
   1192			if (!first_pte)
   1193				first_pte = pte;
   1194			last_pte = pte;
   1195		} else if (level > 1) {
   1196			/* Recurse down into a level that isn't *entirely* obsolete */
   1197			dma_pte_clear_level(domain, level - 1,
   1198					    phys_to_virt(dma_pte_addr(pte)),
   1199					    level_pfn, start_pfn, last_pfn,
   1200					    freelist);
   1201		}
   1202next:
   1203		pfn = level_pfn + level_size(level);
   1204	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
   1205
   1206	if (first_pte)
   1207		domain_flush_cache(domain, first_pte,
   1208				   (void *)++last_pte - (void *)first_pte);
   1209}
   1210
   1211/* We can't just free the pages because the IOMMU may still be walking
   1212   the page tables, and may have cached the intermediate levels. The
   1213   pages can only be freed after the IOTLB flush has been done. */
   1214static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
   1215			 unsigned long last_pfn, struct list_head *freelist)
   1216{
   1217	BUG_ON(!domain_pfn_supported(domain, start_pfn));
   1218	BUG_ON(!domain_pfn_supported(domain, last_pfn));
   1219	BUG_ON(start_pfn > last_pfn);
   1220
   1221	/* we don't need lock here; nobody else touches the iova range */
   1222	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
   1223			    domain->pgd, 0, start_pfn, last_pfn, freelist);
   1224
   1225	/* free pgd */
   1226	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
   1227		struct page *pgd_page = virt_to_page(domain->pgd);
   1228		list_add_tail(&pgd_page->lru, freelist);
   1229		domain->pgd = NULL;
   1230	}
   1231}
   1232
   1233/* iommu handling */
   1234static int iommu_alloc_root_entry(struct intel_iommu *iommu)
   1235{
   1236	struct root_entry *root;
   1237	unsigned long flags;
   1238
   1239	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
   1240	if (!root) {
   1241		pr_err("Allocating root entry for %s failed\n",
   1242			iommu->name);
   1243		return -ENOMEM;
   1244	}
   1245
   1246	__iommu_flush_cache(iommu, root, ROOT_SIZE);
   1247
   1248	spin_lock_irqsave(&iommu->lock, flags);
   1249	iommu->root_entry = root;
   1250	spin_unlock_irqrestore(&iommu->lock, flags);
   1251
   1252	return 0;
   1253}
   1254
   1255static void iommu_set_root_entry(struct intel_iommu *iommu)
   1256{
   1257	u64 addr;
   1258	u32 sts;
   1259	unsigned long flag;
   1260
   1261	addr = virt_to_phys(iommu->root_entry);
   1262	if (sm_supported(iommu))
   1263		addr |= DMA_RTADDR_SMT;
   1264
   1265	raw_spin_lock_irqsave(&iommu->register_lock, flag);
   1266	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
   1267
   1268	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
   1269
    1270	/* Make sure hardware completes it */
   1271	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
   1272		      readl, (sts & DMA_GSTS_RTPS), sts);
   1273
   1274	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
   1275
   1276	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
   1277	if (sm_supported(iommu))
   1278		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
   1279	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
   1280}
   1281
   1282void iommu_flush_write_buffer(struct intel_iommu *iommu)
   1283{
   1284	u32 val;
   1285	unsigned long flag;
   1286
   1287	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
   1288		return;
   1289
   1290	raw_spin_lock_irqsave(&iommu->register_lock, flag);
   1291	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
   1292
    1293	/* Make sure hardware completes it */
   1294	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
   1295		      readl, (!(val & DMA_GSTS_WBFS)), val);
   1296
   1297	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
   1298}
   1299
    1300/* return value determines if we need a write buffer flush */
   1301static void __iommu_flush_context(struct intel_iommu *iommu,
   1302				  u16 did, u16 source_id, u8 function_mask,
   1303				  u64 type)
   1304{
   1305	u64 val = 0;
   1306	unsigned long flag;
   1307
   1308	switch (type) {
   1309	case DMA_CCMD_GLOBAL_INVL:
   1310		val = DMA_CCMD_GLOBAL_INVL;
   1311		break;
   1312	case DMA_CCMD_DOMAIN_INVL:
   1313		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
   1314		break;
   1315	case DMA_CCMD_DEVICE_INVL:
   1316		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
   1317			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
   1318		break;
   1319	default:
   1320		BUG();
   1321	}
   1322	val |= DMA_CCMD_ICC;
   1323
   1324	raw_spin_lock_irqsave(&iommu->register_lock, flag);
   1325	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
   1326
    1327	/* Make sure hardware completes it */
   1328	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
   1329		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
   1330
   1331	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
   1332}
   1333
    1334/* return value determines if we need a write buffer flush */
   1335static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
   1336				u64 addr, unsigned int size_order, u64 type)
   1337{
   1338	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
   1339	u64 val = 0, val_iva = 0;
   1340	unsigned long flag;
   1341
   1342	switch (type) {
   1343	case DMA_TLB_GLOBAL_FLUSH:
    1344		/* global flush doesn't need to set IVA_REG */
   1345		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
   1346		break;
   1347	case DMA_TLB_DSI_FLUSH:
   1348		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
   1349		break;
   1350	case DMA_TLB_PSI_FLUSH:
   1351		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
   1352		/* IH bit is passed in as part of address */
   1353		val_iva = size_order | addr;
   1354		break;
   1355	default:
   1356		BUG();
   1357	}
   1358	/* Note: set drain read/write */
   1359#if 0
   1360	/*
    1361	 * This is probably only here to be extra safe; it looks like we
    1362	 * can ignore it without any impact.
   1363	 */
   1364	if (cap_read_drain(iommu->cap))
   1365		val |= DMA_TLB_READ_DRAIN;
   1366#endif
   1367	if (cap_write_drain(iommu->cap))
   1368		val |= DMA_TLB_WRITE_DRAIN;
   1369
   1370	raw_spin_lock_irqsave(&iommu->register_lock, flag);
   1371	/* Note: Only uses first TLB reg currently */
   1372	if (val_iva)
   1373		dmar_writeq(iommu->reg + tlb_offset, val_iva);
   1374	dmar_writeq(iommu->reg + tlb_offset + 8, val);
   1375
    1376	/* Make sure hardware completes it */
   1377	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
   1378		dmar_readq, (!(val & DMA_TLB_IVT)), val);
   1379
   1380	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
   1381
   1382	/* check IOTLB invalidation granularity */
   1383	if (DMA_TLB_IAIG(val) == 0)
   1384		pr_err("Flush IOTLB failed\n");
   1385	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
   1386		pr_debug("TLB flush request %Lx, actual %Lx\n",
   1387			(unsigned long long)DMA_TLB_IIRG(type),
   1388			(unsigned long long)DMA_TLB_IAIG(val));
   1389}
   1390
   1391static struct device_domain_info *
    1392iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
   1393			 u8 bus, u8 devfn)
   1394{
   1395	struct device_domain_info *info;
   1396
   1397	assert_spin_locked(&device_domain_lock);
   1398
   1399	if (!iommu->qi)
   1400		return NULL;
   1401
   1402	list_for_each_entry(info, &domain->devices, link)
   1403		if (info->iommu == iommu && info->bus == bus &&
   1404		    info->devfn == devfn) {
   1405			if (info->ats_supported && info->dev)
   1406				return info;
   1407			break;
   1408		}
   1409
   1410	return NULL;
   1411}
   1412
   1413static void domain_update_iotlb(struct dmar_domain *domain)
   1414{
   1415	struct device_domain_info *info;
   1416	bool has_iotlb_device = false;
   1417
   1418	assert_spin_locked(&device_domain_lock);
   1419
   1420	list_for_each_entry(info, &domain->devices, link)
   1421		if (info->ats_enabled) {
   1422			has_iotlb_device = true;
   1423			break;
   1424		}
   1425
   1426	domain->has_iotlb_device = has_iotlb_device;
   1427}
   1428
   1429static void iommu_enable_dev_iotlb(struct device_domain_info *info)
   1430{
   1431	struct pci_dev *pdev;
   1432
   1433	assert_spin_locked(&device_domain_lock);
   1434
   1435	if (!info || !dev_is_pci(info->dev))
   1436		return;
   1437
   1438	pdev = to_pci_dev(info->dev);
    1439	/* For an IOMMU that supports device IOTLB throttling (DIT), we assign
    1440	 * the PFSID to the invalidation desc of a VF so that the IOMMU HW can
    1441	 * gauge queue depth at the PF level. If DIT is not set, PFSID is
    1442	 * treated as reserved and should be set to 0.
    1443	 */
   1444	if (!ecap_dit(info->iommu->ecap))
   1445		info->pfsid = 0;
   1446	else {
   1447		struct pci_dev *pf_pdev;
   1448
   1449		/* pdev will be returned if device is not a vf */
   1450		pf_pdev = pci_physfn(pdev);
   1451		info->pfsid = pci_dev_id(pf_pdev);
   1452	}
   1453
   1454#ifdef CONFIG_INTEL_IOMMU_SVM
   1455	/* The PCIe spec, in its wisdom, declares that the behaviour of
   1456	   the device if you enable PASID support after ATS support is
   1457	   undefined. So always enable PASID support on devices which
   1458	   have it, even if we can't yet know if we're ever going to
   1459	   use it. */
   1460	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
   1461		info->pasid_enabled = 1;
   1462
   1463	if (info->pri_supported &&
   1464	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
   1465	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
   1466		info->pri_enabled = 1;
   1467#endif
   1468	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
   1469	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
   1470		info->ats_enabled = 1;
   1471		domain_update_iotlb(info->domain);
   1472		info->ats_qdep = pci_ats_queue_depth(pdev);
   1473	}
   1474}
   1475
   1476static void iommu_disable_dev_iotlb(struct device_domain_info *info)
   1477{
   1478	struct pci_dev *pdev;
   1479
   1480	assert_spin_locked(&device_domain_lock);
   1481
   1482	if (!dev_is_pci(info->dev))
   1483		return;
   1484
   1485	pdev = to_pci_dev(info->dev);
   1486
   1487	if (info->ats_enabled) {
   1488		pci_disable_ats(pdev);
   1489		info->ats_enabled = 0;
   1490		domain_update_iotlb(info->domain);
   1491	}
   1492#ifdef CONFIG_INTEL_IOMMU_SVM
   1493	if (info->pri_enabled) {
   1494		pci_disable_pri(pdev);
   1495		info->pri_enabled = 0;
   1496	}
   1497	if (info->pasid_enabled) {
   1498		pci_disable_pasid(pdev);
   1499		info->pasid_enabled = 0;
   1500	}
   1501#endif
   1502}
   1503
   1504static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
   1505				    u64 addr, unsigned int mask)
   1506{
   1507	u16 sid, qdep;
   1508
   1509	if (!info || !info->ats_enabled)
   1510		return;
   1511
   1512	sid = info->bus << 8 | info->devfn;
   1513	qdep = info->ats_qdep;
   1514	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
   1515			   qdep, addr, mask);
   1516}
   1517
   1518static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
   1519				  u64 addr, unsigned mask)
   1520{
   1521	unsigned long flags;
   1522	struct device_domain_info *info;
   1523
   1524	if (!domain->has_iotlb_device)
   1525		return;
   1526
   1527	spin_lock_irqsave(&device_domain_lock, flags);
   1528	list_for_each_entry(info, &domain->devices, link)
   1529		__iommu_flush_dev_iotlb(info, addr, mask);
   1530
   1531	spin_unlock_irqrestore(&device_domain_lock, flags);
   1532}
   1533
   1534static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
   1535				  struct dmar_domain *domain,
   1536				  unsigned long pfn, unsigned int pages,
   1537				  int ih, int map)
   1538{
   1539	unsigned int aligned_pages = __roundup_pow_of_two(pages);
   1540	unsigned int mask = ilog2(aligned_pages);
   1541	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
   1542	u16 did = domain->iommu_did[iommu->seq_id];
   1543
   1544	BUG_ON(pages == 0);
   1545
   1546	if (ih)
   1547		ih = 1 << 6;
   1548
   1549	if (domain_use_first_level(domain)) {
   1550		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih);
   1551	} else {
   1552		unsigned long bitmask = aligned_pages - 1;
   1553
   1554		/*
   1555		 * PSI masks the low order bits of the base address. If the
   1556		 * address isn't aligned to the mask, then compute a mask value
   1557		 * needed to ensure the target range is flushed.
   1558		 */
   1559		if (unlikely(bitmask & pfn)) {
   1560			unsigned long end_pfn = pfn + pages - 1, shared_bits;
   1561
   1562			/*
   1563			 * Since end_pfn <= pfn + bitmask, the only way bits
   1564			 * higher than bitmask can differ in pfn and end_pfn is
   1565			 * by carrying. This means after masking out bitmask,
   1566			 * high bits starting with the first set bit in
   1567			 * shared_bits are all equal in both pfn and end_pfn.
   1568			 */
   1569			shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
   1570			mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
   1571		}
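        		/*
        		 * Worked example (illustrative values): for pfn == 0x103 and
        		 * pages == 4, aligned_pages == 4 and bitmask == 3, so the base
        		 * is misaligned. end_pfn == 0x106, pfn ^ end_pfn == 0x5,
        		 * shared_bits == ~0x5 & ~0x3, __ffs(shared_bits) == 3, and a
        		 * mask of 3 flushes pfns 0x100-0x107, covering the whole range.
        		 */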
   1572
   1573		/*
   1574		 * Fallback to domain selective flush if no PSI support or
   1575		 * the size is too big.
   1576		 */
   1577		if (!cap_pgsel_inv(iommu->cap) ||
   1578		    mask > cap_max_amask_val(iommu->cap))
   1579			iommu->flush.flush_iotlb(iommu, did, 0, 0,
   1580							DMA_TLB_DSI_FLUSH);
   1581		else
   1582			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
   1583							DMA_TLB_PSI_FLUSH);
   1584	}
   1585
   1586	/*
   1587	 * In caching mode, changes of pages from non-present to present require
   1588	 * flush. However, device IOTLB doesn't need to be flushed in this case.
   1589	 */
   1590	if (!cap_caching_mode(iommu->cap) || !map)
   1591		iommu_flush_dev_iotlb(domain, addr, mask);
   1592}
   1593
   1594/* Notification for newly created mappings */
   1595static inline void __mapping_notify_one(struct intel_iommu *iommu,
   1596					struct dmar_domain *domain,
   1597					unsigned long pfn, unsigned int pages)
   1598{
   1599	/*
   1600	 * It's a non-present to present mapping. Only flush if caching mode
   1601	 * and second level.
   1602	 */
   1603	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
   1604		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
   1605	else
   1606		iommu_flush_write_buffer(iommu);
   1607}
   1608
   1609static void intel_flush_iotlb_all(struct iommu_domain *domain)
   1610{
   1611	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
   1612	int idx;
   1613
   1614	for_each_domain_iommu(idx, dmar_domain) {
   1615		struct intel_iommu *iommu = g_iommus[idx];
   1616		u16 did = dmar_domain->iommu_did[iommu->seq_id];
   1617
   1618		if (domain_use_first_level(dmar_domain))
   1619			qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0);
   1620		else
   1621			iommu->flush.flush_iotlb(iommu, did, 0, 0,
   1622						 DMA_TLB_DSI_FLUSH);
   1623
   1624		if (!cap_caching_mode(iommu->cap))
   1625			iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
   1626	}
   1627}
   1628
   1629static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
   1630{
   1631	u32 pmen;
   1632	unsigned long flags;
   1633
   1634	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
   1635		return;
   1636
   1637	raw_spin_lock_irqsave(&iommu->register_lock, flags);
   1638	pmen = readl(iommu->reg + DMAR_PMEN_REG);
   1639	pmen &= ~DMA_PMEN_EPM;
   1640	writel(pmen, iommu->reg + DMAR_PMEN_REG);
   1641
   1642	/* wait for the protected region status bit to clear */
   1643	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
   1644		readl, !(pmen & DMA_PMEN_PRS), pmen);
   1645
   1646	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
   1647}
   1648
   1649static void iommu_enable_translation(struct intel_iommu *iommu)
   1650{
   1651	u32 sts;
   1652	unsigned long flags;
   1653
   1654	raw_spin_lock_irqsave(&iommu->register_lock, flags);
   1655	iommu->gcmd |= DMA_GCMD_TE;
   1656	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
   1657
    1658	/* Make sure hardware completes it */
   1659	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
   1660		      readl, (sts & DMA_GSTS_TES), sts);
   1661
   1662	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
   1663}
   1664
   1665static void iommu_disable_translation(struct intel_iommu *iommu)
   1666{
   1667	u32 sts;
   1668	unsigned long flag;
   1669
   1670	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
   1671	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
   1672		return;
   1673
   1674	raw_spin_lock_irqsave(&iommu->register_lock, flag);
   1675	iommu->gcmd &= ~DMA_GCMD_TE;
   1676	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
   1677
    1678	/* Make sure hardware completes it */
   1679	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
   1680		      readl, (!(sts & DMA_GSTS_TES)), sts);
   1681
   1682	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
   1683}
   1684
   1685static int iommu_init_domains(struct intel_iommu *iommu)
   1686{
   1687	u32 ndomains;
   1688
   1689	ndomains = cap_ndoms(iommu->cap);
   1690	pr_debug("%s: Number of Domains supported <%d>\n",
   1691		 iommu->name, ndomains);
   1692
   1693	spin_lock_init(&iommu->lock);
   1694
   1695	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
   1696	if (!iommu->domain_ids)
   1697		return -ENOMEM;
   1698
   1699	/*
   1700	 * If Caching mode is set, then invalid translations are tagged
   1701	 * with domain-id 0, hence we need to pre-allocate it. We also
   1702	 * use domain-id 0 as a marker for non-allocated domain-id, so
   1703	 * make sure it is not used for a real domain.
   1704	 */
   1705	set_bit(0, iommu->domain_ids);
   1706
   1707	/*
   1708	 * The VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
   1709	 * entry for first-level or pass-through translation modes be
   1710	 * programmed with a domain id different from those used for
   1711	 * second-level or nested translation. We reserve a domain id for
   1712	 * this purpose.
   1713	 */
   1714	if (sm_supported(iommu))
   1715		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
   1716
   1717	return 0;
   1718}
   1719
   1720static void disable_dmar_iommu(struct intel_iommu *iommu)
   1721{
   1722	struct device_domain_info *info, *tmp;
   1723	unsigned long flags;
   1724
   1725	if (!iommu->domain_ids)
   1726		return;
   1727
   1728	spin_lock_irqsave(&device_domain_lock, flags);
   1729	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
   1730		if (info->iommu != iommu)
   1731			continue;
   1732
   1733		if (!info->dev || !info->domain)
   1734			continue;
   1735
   1736		__dmar_remove_one_dev_info(info);
   1737	}
   1738	spin_unlock_irqrestore(&device_domain_lock, flags);
   1739
   1740	if (iommu->gcmd & DMA_GCMD_TE)
   1741		iommu_disable_translation(iommu);
   1742}
   1743
   1744static void free_dmar_iommu(struct intel_iommu *iommu)
   1745{
   1746	if (iommu->domain_ids) {
   1747		bitmap_free(iommu->domain_ids);
   1748		iommu->domain_ids = NULL;
   1749	}
   1750
   1751	g_iommus[iommu->seq_id] = NULL;
   1752
   1753	/* free context mapping */
   1754	free_context_table(iommu);
   1755
   1756#ifdef CONFIG_INTEL_IOMMU_SVM
   1757	if (pasid_supported(iommu)) {
   1758		if (ecap_prs(iommu->ecap))
   1759			intel_svm_finish_prq(iommu);
   1760	}
   1761	if (vccap_pasid(iommu->vccap))
   1762		ioasid_unregister_allocator(&iommu->pasid_allocator);
   1763
   1764#endif
   1765}
   1766
   1767/*
   1768 * Check and return whether first level is used by default for
   1769 * DMA translation.
   1770 */
   1771static bool first_level_by_default(unsigned int type)
   1772{
   1773	/* Only SL is available in legacy mode */
   1774	if (!scalable_mode_support())
   1775		return false;
   1776
   1777	/* Only one level (either FL or SL) is available, just use it */
   1778	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
   1779		return intel_cap_flts_sanity();
   1780
   1781	/* Both levels are available, decide it based on domain type */
   1782	return type != IOMMU_DOMAIN_UNMANAGED;
   1783}
   1784
   1785static struct dmar_domain *alloc_domain(unsigned int type)
   1786{
   1787	struct dmar_domain *domain;
   1788
   1789	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
   1790	if (!domain)
   1791		return NULL;
   1792
   1793	domain->nid = NUMA_NO_NODE;
   1794	if (first_level_by_default(type))
   1795		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
   1796	domain->has_iotlb_device = false;
   1797	INIT_LIST_HEAD(&domain->devices);
   1798
   1799	return domain;
   1800}
   1801
   1802/* Must be called with device_domain_lock and iommu->lock held */
   1803static int domain_attach_iommu(struct dmar_domain *domain,
   1804			       struct intel_iommu *iommu)
   1805{
   1806	unsigned long ndomains;
   1807	int num;
   1808
   1809	assert_spin_locked(&device_domain_lock);
   1810	assert_spin_locked(&iommu->lock);
   1811
   1812	domain->iommu_refcnt[iommu->seq_id] += 1;
   1813	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
   1814		ndomains = cap_ndoms(iommu->cap);
   1815		num      = find_first_zero_bit(iommu->domain_ids, ndomains);
   1816
   1817		if (num >= ndomains) {
   1818			pr_err("%s: No free domain ids\n", iommu->name);
   1819			domain->iommu_refcnt[iommu->seq_id] -= 1;
   1820			return -ENOSPC;
   1821		}
   1822
   1823		set_bit(num, iommu->domain_ids);
   1824		domain->iommu_did[iommu->seq_id] = num;
   1825		domain->nid			 = iommu->node;
   1826		domain_update_iommu_cap(domain);
   1827	}
   1828
   1829	return 0;
   1830}
   1831
   1832static void domain_detach_iommu(struct dmar_domain *domain,
   1833				struct intel_iommu *iommu)
   1834{
   1835	int num;
   1836
   1837	assert_spin_locked(&device_domain_lock);
   1838	assert_spin_locked(&iommu->lock);
   1839
   1840	domain->iommu_refcnt[iommu->seq_id] -= 1;
   1841	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
   1842		num = domain->iommu_did[iommu->seq_id];
   1843		clear_bit(num, iommu->domain_ids);
   1844		domain_update_iommu_cap(domain);
   1845		domain->iommu_did[iommu->seq_id] = 0;
   1846	}
   1847}
   1848
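       /*
        * Round a guest address width up to the next width the page-table
        * levels can cover: (width - 12) must be a multiple of the 9-bit
        * level stride. The result is capped at 64 bits.
        */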
   1849static inline int guestwidth_to_adjustwidth(int gaw)
   1850{
   1851	int agaw;
   1852	int r = (gaw - 12) % 9;
   1853
   1854	if (r == 0)
   1855		agaw = gaw;
   1856	else
   1857		agaw = gaw + 9 - r;
   1858	if (agaw > 64)
   1859		agaw = 64;
   1860	return agaw;
   1861}
   1862
   1863static void domain_exit(struct dmar_domain *domain)
   1864{
   1865
   1866	/* Remove associated devices and clear attached or cached domains */
   1867	domain_remove_dev_info(domain);
   1868
   1869	if (domain->pgd) {
   1870		LIST_HEAD(freelist);
   1871
   1872		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
   1873		put_pages_list(&freelist);
   1874	}
   1875
   1876	kfree(domain);
   1877}
   1878
   1879/*
   1880 * Get the PASID directory size for scalable mode context entry.
   1881 * Value of X in the PDTS field of a scalable mode context entry
   1882 * indicates a PASID directory with 2^(X + 7) entries.
   1883 */
   1884static inline unsigned long context_get_sm_pds(struct pasid_table *table)
   1885{
   1886	unsigned long pds, max_pde;
   1887
   1888	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
   1889	pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
   1890	if (pds < 7)
   1891		return 0;
   1892
   1893	return pds - 7;
   1894}
   1895
   1896/*
   1897 * Set the RID_PASID field of a scalable mode context entry. The
   1898 * IOMMU hardware will use the PASID value set in this field for
   1899 * translating DMA requests that arrive without a PASID.
   1900 */
   1901static inline void
   1902context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
   1903{
   1904	context->hi |= pasid & ((1 << 20) - 1);
   1905}
   1906
   1907/*
   1908 * Set the DTE(Device-TLB Enable) field of a scalable mode context
   1909 * entry.
   1910 */
   1911static inline void context_set_sm_dte(struct context_entry *context)
   1912{
   1913	context->lo |= (1 << 2);
   1914}
   1915
   1916/*
   1917 * Set the PRE(Page Request Enable) field of a scalable mode context
   1918 * entry.
   1919 */
   1920static inline void context_set_sm_pre(struct context_entry *context)
   1921{
   1922	context->lo |= (1 << 4);
   1923}
   1924
   1925/* Convert value to context PASID directory size field coding. */
   1926#define context_pdts(pds)	(((pds) & 0x7) << 9)
   1927
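       /*
        * Install the context entry for (bus, devfn) on @iommu. In scalable
        * mode the entry points at the device's PASID directory; otherwise it
        * points at @domain's second-level page tables or selects pass-through.
        * The required context/IOTLB cache flushes are issued afterwards.
        */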
   1928static int domain_context_mapping_one(struct dmar_domain *domain,
   1929				      struct intel_iommu *iommu,
   1930				      struct pasid_table *table,
   1931				      u8 bus, u8 devfn)
   1932{
   1933	u16 did = domain->iommu_did[iommu->seq_id];
   1934	int translation = CONTEXT_TT_MULTI_LEVEL;
   1935	struct device_domain_info *info = NULL;
   1936	struct context_entry *context;
   1937	unsigned long flags;
   1938	int ret;
   1939
   1940	WARN_ON(did == 0);
   1941
   1942	if (hw_pass_through && domain_type_is_si(domain))
   1943		translation = CONTEXT_TT_PASS_THROUGH;
   1944
   1945	pr_debug("Set context mapping for %02x:%02x.%d\n",
   1946		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
   1947
   1948	BUG_ON(!domain->pgd);
   1949
   1950	spin_lock_irqsave(&device_domain_lock, flags);
   1951	spin_lock(&iommu->lock);
   1952
   1953	ret = -ENOMEM;
   1954	context = iommu_context_addr(iommu, bus, devfn, 1);
   1955	if (!context)
   1956		goto out_unlock;
   1957
   1958	ret = 0;
   1959	if (context_present(context))
   1960		goto out_unlock;
   1961
   1962	/*
   1963	 * For kdump cases, old valid entries may be cached due to the
   1964	 * in-flight DMA and copied pgtable, but there is no unmapping
   1965	 * behaviour for them, thus we need an explicit cache flush for
   1966	 * the newly-mapped device. For kdump, at this point, the device
   1967	 * is supposed to have finished its reset at driver probe stage, so
   1968	 * no in-flight DMA will exist, and we don't need to worry about it
   1969	 * hereafter.
   1970	 */
   1971	if (context_copied(context)) {
   1972		u16 did_old = context_domain_id(context);
   1973
   1974		if (did_old < cap_ndoms(iommu->cap)) {
   1975			iommu->flush.flush_context(iommu, did_old,
   1976						   (((u16)bus) << 8) | devfn,
   1977						   DMA_CCMD_MASK_NOBIT,
   1978						   DMA_CCMD_DEVICE_INVL);
   1979			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
   1980						 DMA_TLB_DSI_FLUSH);
   1981		}
   1982	}
   1983
   1984	context_clear_entry(context);
   1985
   1986	if (sm_supported(iommu)) {
   1987		unsigned long pds;
   1988
   1989		WARN_ON(!table);
   1990
   1991		/* Setup the PASID DIR pointer: */
   1992		pds = context_get_sm_pds(table);
   1993		context->lo = (u64)virt_to_phys(table->table) |
   1994				context_pdts(pds);
   1995
   1996		/* Setup the RID_PASID field: */
   1997		context_set_sm_rid2pasid(context, PASID_RID2PASID);
   1998
   1999		/*
   2000		 * Setup the Device-TLB enable bit and Page request
   2001		 * Enable bit:
   2002		 */
   2003		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
   2004		if (info && info->ats_supported)
   2005			context_set_sm_dte(context);
   2006		if (info && info->pri_supported)
   2007			context_set_sm_pre(context);
   2008	} else {
   2009		struct dma_pte *pgd = domain->pgd;
   2010		int agaw;
   2011
   2012		context_set_domain_id(context, did);
   2013
   2014		if (translation != CONTEXT_TT_PASS_THROUGH) {
   2015			/*
   2016			 * Skip top levels of page tables for iommu which has
   2017			 * less agaw than default. Unnecessary for PT mode.
   2018			 */
   2019			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
   2020				ret = -ENOMEM;
   2021				pgd = phys_to_virt(dma_pte_addr(pgd));
   2022				if (!dma_pte_present(pgd))
   2023					goto out_unlock;
   2024			}
   2025
   2026			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
   2027			if (info && info->ats_supported)
   2028				translation = CONTEXT_TT_DEV_IOTLB;
   2029			else
   2030				translation = CONTEXT_TT_MULTI_LEVEL;
   2031
   2032			context_set_address_root(context, virt_to_phys(pgd));
   2033			context_set_address_width(context, agaw);
   2034		} else {
   2035			/*
   2036			 * In pass through mode, AW must be programmed to
   2037			 * indicate the largest AGAW value supported by
   2038			 * hardware. And ASR is ignored by hardware.
   2039			 */
   2040			context_set_address_width(context, iommu->msagaw);
   2041		}
   2042
   2043		context_set_translation_type(context, translation);
   2044	}
   2045
   2046	context_set_fault_enable(context);
   2047	context_set_present(context);
   2048	if (!ecap_coherent(iommu->ecap))
   2049		clflush_cache_range(context, sizeof(*context));
   2050
   2051	/*
   2052	 * It's a non-present to present mapping. If hardware doesn't cache
   2053	 * non-present entries we only need to flush the write-buffer. If it
   2054	 * _does_ cache non-present entries, then it does so in the special
   2055	 * domain #0, which we have to flush:
   2056	 */
   2057	if (cap_caching_mode(iommu->cap)) {
   2058		iommu->flush.flush_context(iommu, 0,
   2059					   (((u16)bus) << 8) | devfn,
   2060					   DMA_CCMD_MASK_NOBIT,
   2061					   DMA_CCMD_DEVICE_INVL);
   2062		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
   2063	} else {
   2064		iommu_flush_write_buffer(iommu);
   2065	}
   2066	iommu_enable_dev_iotlb(info);
   2067
   2068	ret = 0;
   2069
   2070out_unlock:
   2071	spin_unlock(&iommu->lock);
   2072	spin_unlock_irqrestore(&device_domain_lock, flags);
   2073
   2074	return ret;
   2075}
   2076
   2077struct domain_context_mapping_data {
   2078	struct dmar_domain *domain;
   2079	struct intel_iommu *iommu;
   2080	struct pasid_table *table;
   2081};
   2082
   2083static int domain_context_mapping_cb(struct pci_dev *pdev,
   2084				     u16 alias, void *opaque)
   2085{
   2086	struct domain_context_mapping_data *data = opaque;
   2087
   2088	return domain_context_mapping_one(data->domain, data->iommu,
   2089					  data->table, PCI_BUS_NUM(alias),
   2090					  alias & 0xff);
   2091}
   2092
   2093static int
   2094domain_context_mapping(struct dmar_domain *domain, struct device *dev)
   2095{
   2096	struct domain_context_mapping_data data;
   2097	struct pasid_table *table;
   2098	struct intel_iommu *iommu;
   2099	u8 bus, devfn;
   2100
   2101	iommu = device_to_iommu(dev, &bus, &devfn);
   2102	if (!iommu)
   2103		return -ENODEV;
   2104
   2105	table = intel_pasid_get_table(dev);
   2106
   2107	if (!dev_is_pci(dev))
   2108		return domain_context_mapping_one(domain, iommu, table,
   2109						  bus, devfn);
   2110
   2111	data.domain = domain;
   2112	data.iommu = iommu;
   2113	data.table = table;
   2114
   2115	return pci_for_each_dma_alias(to_pci_dev(dev),
   2116				      &domain_context_mapping_cb, &data);
   2117}
   2118
   2119static int domain_context_mapped_cb(struct pci_dev *pdev,
   2120				    u16 alias, void *opaque)
   2121{
   2122	struct intel_iommu *iommu = opaque;
   2123
   2124	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
   2125}
   2126
   2127static int domain_context_mapped(struct device *dev)
   2128{
   2129	struct intel_iommu *iommu;
   2130	u8 bus, devfn;
   2131
   2132	iommu = device_to_iommu(dev, &bus, &devfn);
   2133	if (!iommu)
   2134		return -ENODEV;
   2135
   2136	if (!dev_is_pci(dev))
   2137		return device_context_mapped(iommu, bus, devfn);
   2138
   2139	return !pci_for_each_dma_alias(to_pci_dev(dev),
   2140				       domain_context_mapped_cb, iommu);
   2141}
   2142
   2143/* Returns the number of VT-d pages, but aligned to the MM page size */
   2144static inline unsigned long aligned_nrpages(unsigned long host_addr,
   2145					    size_t size)
   2146{
   2147	host_addr &= ~PAGE_MASK;
   2148	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
   2149}
   2150
   2151/* Return largest possible superpage level for a given mapping */
   2152static inline int hardware_largepage_caps(struct dmar_domain *domain,
   2153					  unsigned long iov_pfn,
   2154					  unsigned long phy_pfn,
   2155					  unsigned long pages)
   2156{
   2157	int support, level = 1;
   2158	unsigned long pfnmerge;
   2159
   2160	support = domain->iommu_superpage;
   2161
   2162	/* To use a large page, the virtual *and* physical addresses
   2163	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
   2164	   of them will mean we have to use smaller pages. So just
   2165	   merge them and check both at once. */
   2166	pfnmerge = iov_pfn | phy_pfn;
   2167
   2168	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
   2169		pages >>= VTD_STRIDE_SHIFT;
   2170		if (!pages)
   2171			break;
   2172		pfnmerge >>= VTD_STRIDE_SHIFT;
   2173		level++;
   2174		support--;
   2175	}
   2176	return level;
   2177}
   2178
   2179/*
   2180 * Ensure that old small page tables are removed to make room for superpage(s).
   2181 * We're going to add new large pages, so make sure we don't remove their parent
   2182 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
   2183 */
   2184static void switch_to_super_page(struct dmar_domain *domain,
   2185				 unsigned long start_pfn,
   2186				 unsigned long end_pfn, int level)
   2187{
   2188	unsigned long lvl_pages = lvl_to_nr_pages(level);
   2189	struct dma_pte *pte = NULL;
   2190	int i;
   2191
   2192	while (start_pfn <= end_pfn) {
   2193		if (!pte)
   2194			pte = pfn_to_dma_pte(domain, start_pfn, &level);
   2195
   2196		if (dma_pte_present(pte)) {
   2197			dma_pte_free_pagetable(domain, start_pfn,
   2198					       start_pfn + lvl_pages - 1,
   2199					       level + 1);
   2200
   2201			for_each_domain_iommu(i, domain)
   2202				iommu_flush_iotlb_psi(g_iommus[i], domain,
   2203						      start_pfn, lvl_pages,
   2204						      0, 0);
   2205		}
   2206
   2207		pte++;
   2208		start_pfn += lvl_pages;
   2209		if (first_pte_in_page(pte))
   2210			pte = NULL;
   2211	}
   2212}
   2213
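       /*
        * Map @nr_pages contiguous pages starting at @phys_pfn to the IOVA
        * range beginning at @iov_pfn in @domain, using superpages whenever
        * alignment, remaining size and hardware support allow.
        */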
   2214static int
   2215__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
   2216		 unsigned long phys_pfn, unsigned long nr_pages, int prot)
   2217{
   2218	struct dma_pte *first_pte = NULL, *pte = NULL;
   2219	unsigned int largepage_lvl = 0;
   2220	unsigned long lvl_pages = 0;
   2221	phys_addr_t pteval;
   2222	u64 attr;
   2223
   2224	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
   2225
   2226	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
   2227		return -EINVAL;
   2228
   2229	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
   2230	attr |= DMA_FL_PTE_PRESENT;
   2231	if (domain_use_first_level(domain)) {
   2232		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
   2233		if (prot & DMA_PTE_WRITE)
   2234			attr |= DMA_FL_PTE_DIRTY;
   2235	}
   2236
   2237	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
   2238
   2239	while (nr_pages > 0) {
   2240		uint64_t tmp;
   2241
   2242		if (!pte) {
   2243			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
   2244					phys_pfn, nr_pages);
   2245
   2246			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
   2247			if (!pte)
   2248				return -ENOMEM;
   2249			first_pte = pte;
   2250
   2251			lvl_pages = lvl_to_nr_pages(largepage_lvl);
   2252
   2253			/* It is a large page */
   2254			if (largepage_lvl > 1) {
   2255				unsigned long end_pfn;
   2256				unsigned long pages_to_remove;
   2257
   2258				pteval |= DMA_PTE_LARGE_PAGE;
   2259				pages_to_remove = min_t(unsigned long, nr_pages,
   2260							nr_pte_to_next_page(pte) * lvl_pages);
   2261				end_pfn = iov_pfn + pages_to_remove - 1;
   2262				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
   2263			} else {
   2264				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
   2265			}
   2266
   2267		}
   2268		/* We don't need a lock here; nobody else
   2269		 * touches this IOVA range.
   2270		 */
   2271		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
   2272		if (tmp) {
   2273			static int dumps = 5;
   2274			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
   2275				iov_pfn, tmp, (unsigned long long)pteval);
   2276			if (dumps) {
   2277				dumps--;
   2278				debug_dma_dump_mappings(NULL);
   2279			}
   2280			WARN_ON(1);
   2281		}
   2282
   2283		nr_pages -= lvl_pages;
   2284		iov_pfn += lvl_pages;
   2285		phys_pfn += lvl_pages;
   2286		pteval += lvl_pages * VTD_PAGE_SIZE;
   2287
   2288		/* If the next PTE would be the first in a new page, then we
   2289		 * need to flush the cache on the entries we've just written.
   2290		 * And then we'll need to recalculate 'pte', so clear it and
   2291		 * let it get set again in the if (!pte) block above.
   2292		 *
   2293		 * If we're done (!nr_pages) we need to flush the cache too.
   2294		 *
   2295		 * Also if we've been setting superpages, we may need to
   2296		 * recalculate 'pte' and switch back to smaller pages for the
   2297		 * end of the mapping, if the trailing size is not enough to
   2298		 * use another superpage (i.e. nr_pages < lvl_pages).
   2299		 */
   2300		pte++;
   2301		if (!nr_pages || first_pte_in_page(pte) ||
   2302		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
   2303			domain_flush_cache(domain, first_pte,
   2304					   (void *)pte - (void *)first_pte);
   2305			pte = NULL;
   2306		}
   2307	}
   2308
   2309	return 0;
   2310}
   2311
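       /*
        * Tear down the context entry for (bus, devfn) and invalidate the
        * context, PASID, IOTLB and device-IOTLB caches that may still
        * reference it.
        */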
   2312static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
   2313{
   2314	struct intel_iommu *iommu = info->iommu;
   2315	struct context_entry *context;
   2316	unsigned long flags;
   2317	u16 did_old;
   2318
   2319	if (!iommu)
   2320		return;
   2321
   2322	spin_lock_irqsave(&iommu->lock, flags);
   2323	context = iommu_context_addr(iommu, bus, devfn, 0);
   2324	if (!context) {
   2325		spin_unlock_irqrestore(&iommu->lock, flags);
   2326		return;
   2327	}
   2328
   2329	if (sm_supported(iommu)) {
   2330		if (hw_pass_through && domain_type_is_si(info->domain))
   2331			did_old = FLPT_DEFAULT_DID;
   2332		else
   2333			did_old = info->domain->iommu_did[iommu->seq_id];
   2334	} else {
   2335		did_old = context_domain_id(context);
   2336	}
   2337
   2338	context_clear_entry(context);
   2339	__iommu_flush_cache(iommu, context, sizeof(*context));
   2340	spin_unlock_irqrestore(&iommu->lock, flags);
   2341	iommu->flush.flush_context(iommu,
   2342				   did_old,
   2343				   (((u16)bus) << 8) | devfn,
   2344				   DMA_CCMD_MASK_NOBIT,
   2345				   DMA_CCMD_DEVICE_INVL);
   2346
   2347	if (sm_supported(iommu))
   2348		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
   2349
   2350	iommu->flush.flush_iotlb(iommu,
   2351				 did_old,
   2352				 0,
   2353				 0,
   2354				 DMA_TLB_DSI_FLUSH);
   2355
   2356	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
   2357}
   2358
   2359static void domain_remove_dev_info(struct dmar_domain *domain)
   2360{
   2361	struct device_domain_info *info, *tmp;
   2362	unsigned long flags;
   2363
   2364	spin_lock_irqsave(&device_domain_lock, flags);
   2365	list_for_each_entry_safe(info, tmp, &domain->devices, link)
   2366		__dmar_remove_one_dev_info(info);
   2367	spin_unlock_irqrestore(&device_domain_lock, flags);
   2368}
   2369
   2370static inline struct device_domain_info *
   2371dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
   2372{
   2373	struct device_domain_info *info;
   2374
   2375	list_for_each_entry(info, &device_domain_list, global)
   2376		if (info->segment == segment && info->bus == bus &&
   2377		    info->devfn == devfn)
   2378			return info;
   2379
   2380	return NULL;
   2381}
   2382
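       /*
        * Set up first-level translation for @dev/@pasid: skip page-table
        * levels beyond what @iommu supports, then program the PASID entry
        * for 4- or 5-level paging as appropriate.
        */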
   2383static int domain_setup_first_level(struct intel_iommu *iommu,
   2384				    struct dmar_domain *domain,
   2385				    struct device *dev,
   2386				    u32 pasid)
   2387{
   2388	struct dma_pte *pgd = domain->pgd;
   2389	int agaw, level;
   2390	int flags = 0;
   2391
   2392	/*
   2393	 * Skip top levels of page tables for iommu which has
   2394	 * less agaw than default. Unnecessary for PT mode.
   2395	 */
   2396	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
   2397		pgd = phys_to_virt(dma_pte_addr(pgd));
   2398		if (!dma_pte_present(pgd))
   2399			return -ENOMEM;
   2400	}
   2401
   2402	level = agaw_to_level(agaw);
   2403	if (level != 4 && level != 5)
   2404		return -EINVAL;
   2405
   2406	if (pasid != PASID_RID2PASID)
   2407		flags |= PASID_FLAG_SUPERVISOR_MODE;
   2408	if (level == 5)
   2409		flags |= PASID_FLAG_FL5LP;
   2410
   2411	if (domain->force_snooping)
   2412		flags |= PASID_FLAG_PAGE_SNOOP;
   2413
   2414	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
   2415					     domain->iommu_did[iommu->seq_id],
   2416					     flags);
   2417}
   2418
   2419static bool dev_is_real_dma_subdevice(struct device *dev)
   2420{
   2421	return dev && dev_is_pci(dev) &&
   2422	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
   2423}
   2424
   2425static int iommu_domain_identity_map(struct dmar_domain *domain,
   2426				     unsigned long first_vpfn,
   2427				     unsigned long last_vpfn)
   2428{
   2429	/*
   2430	 * The RMRR range might overlap with the physical memory range,
   2431	 * so clear it first.
   2432	 */
   2433	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
   2434
   2435	return __domain_mapping(domain, first_vpfn,
   2436				first_vpfn, last_vpfn - first_vpfn + 1,
   2437				DMA_PTE_READ|DMA_PTE_WRITE);
   2438}
   2439
   2440static int md_domain_init(struct dmar_domain *domain, int guest_width);
   2441
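       /*
        * Initialize the static identity domain (si_domain). Unless hardware
        * pass-through is used, identity-map all usable physical memory and
        * the RMRR regions into it.
        */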
   2442static int __init si_domain_init(int hw)
   2443{
   2444	struct dmar_rmrr_unit *rmrr;
   2445	struct device *dev;
   2446	int i, nid, ret;
   2447
   2448	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
   2449	if (!si_domain)
   2450		return -EFAULT;
   2451
   2452	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
   2453		domain_exit(si_domain);
   2454		return -EFAULT;
   2455	}
   2456
   2457	if (hw)
   2458		return 0;
   2459
   2460	for_each_online_node(nid) {
   2461		unsigned long start_pfn, end_pfn;
   2462		int i;
   2463
   2464		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
   2465			ret = iommu_domain_identity_map(si_domain,
   2466					mm_to_dma_pfn(start_pfn),
   2467					mm_to_dma_pfn(end_pfn));
   2468			if (ret)
   2469				return ret;
   2470		}
   2471	}
   2472
   2473	/*
   2474	 * Identity map the RMRRs so that devices with RMRRs can also use
   2475	 * the si_domain.
   2476	 */
   2477	for_each_rmrr_units(rmrr) {
   2478		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
   2479					  i, dev) {
   2480			unsigned long long start = rmrr->base_address;
   2481			unsigned long long end = rmrr->end_address;
   2482
   2483			if (WARN_ON(end < start ||
   2484				    end >> agaw_to_width(si_domain->agaw)))
   2485				continue;
   2486
   2487			ret = iommu_domain_identity_map(si_domain,
   2488					mm_to_dma_pfn(start >> PAGE_SHIFT),
   2489					mm_to_dma_pfn(end >> PAGE_SHIFT));
   2490			if (ret)
   2491				return ret;
   2492		}
   2493	}
   2494
   2495	return 0;
   2496}
   2497
   2498static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
   2499{
   2500	struct device_domain_info *info = dev_iommu_priv_get(dev);
   2501	struct intel_iommu *iommu;
   2502	unsigned long flags;
   2503	u8 bus, devfn;
   2504	int ret;
   2505
   2506	iommu = device_to_iommu(dev, &bus, &devfn);
   2507	if (!iommu)
   2508		return -ENODEV;
   2509
   2510	spin_lock_irqsave(&device_domain_lock, flags);
   2511	info->domain = domain;
   2512	spin_lock(&iommu->lock);
   2513	ret = domain_attach_iommu(domain, iommu);
   2514	spin_unlock(&iommu->lock);
   2515	if (ret) {
   2516		spin_unlock_irqrestore(&device_domain_lock, flags);
   2517		return ret;
   2518	}
   2519	list_add(&info->link, &domain->devices);
   2520	spin_unlock_irqrestore(&device_domain_lock, flags);
   2521
   2522	/* PASID table is mandatory for a PCI device in scalable mode. */
   2523	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
   2524		ret = intel_pasid_alloc_table(dev);
   2525		if (ret) {
   2526			dev_err(dev, "PASID table allocation failed\n");
   2527			dmar_remove_one_dev_info(dev);
   2528			return ret;
   2529		}
   2530
   2531		/* Setup the PASID entry for requests without PASID: */
   2532		spin_lock_irqsave(&iommu->lock, flags);
   2533		if (hw_pass_through && domain_type_is_si(domain))
   2534			ret = intel_pasid_setup_pass_through(iommu, domain,
   2535					dev, PASID_RID2PASID);
   2536		else if (domain_use_first_level(domain))
   2537			ret = domain_setup_first_level(iommu, domain, dev,
   2538					PASID_RID2PASID);
   2539		else
   2540			ret = intel_pasid_setup_second_level(iommu, domain,
   2541					dev, PASID_RID2PASID);
   2542		spin_unlock_irqrestore(&iommu->lock, flags);
   2543		if (ret) {
   2544			dev_err(dev, "Setup RID2PASID failed\n");
   2545			dmar_remove_one_dev_info(dev);
   2546			return ret;
   2547		}
   2548	}
   2549
   2550	ret = domain_context_mapping(domain, dev);
   2551	if (ret) {
   2552		dev_err(dev, "Domain context map failed\n");
   2553		dmar_remove_one_dev_info(dev);
   2554		return ret;
   2555	}
   2556
   2557	return 0;
   2558}
   2559
   2560static bool device_has_rmrr(struct device *dev)
   2561{
   2562	struct dmar_rmrr_unit *rmrr;
   2563	struct device *tmp;
   2564	int i;
   2565
   2566	rcu_read_lock();
   2567	for_each_rmrr_units(rmrr) {
   2568		/*
   2569		 * Return TRUE if this RMRR contains the device that
   2570		 * is passed in.
   2571		 */
   2572		for_each_active_dev_scope(rmrr->devices,
   2573					  rmrr->devices_cnt, i, tmp)
   2574			if (tmp == dev ||
   2575			    is_downstream_to_pci_bridge(dev, tmp)) {
   2576				rcu_read_unlock();
   2577				return true;
   2578			}
   2579	}
   2580	rcu_read_unlock();
   2581	return false;
   2582}
   2583
   2584/**
   2585 * device_rmrr_is_relaxable - Test whether the RMRR of this device
   2586 * is relaxable (i.e. is allowed to be not enforced under some conditions)
   2587 * @dev: device handle
   2588 *
   2589 * We assume that PCI USB devices with RMRRs have them largely
   2590 * for historical reasons and that the RMRR space is not actively used post
   2591 * boot.  This exclusion may change if vendors begin to abuse it.
   2592 *
   2593 * The same exception is made for graphics devices, with the requirement that
   2594 * any use of the RMRR regions will be torn down before assigning the device
   2595 * to a guest.
   2596 *
   2597 * Return: true if the RMRR is relaxable, false otherwise
   2598 */
   2599static bool device_rmrr_is_relaxable(struct device *dev)
   2600{
   2601	struct pci_dev *pdev;
   2602
   2603	if (!dev_is_pci(dev))
   2604		return false;
   2605
   2606	pdev = to_pci_dev(dev);
   2607	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
   2608		return true;
   2609	else
   2610		return false;
   2611}
   2612
   2613/*
   2614 * There are a couple of cases where we need to restrict the functionality of
   2615 * devices associated with RMRRs.  The first is when evaluating a device for
   2616 * identity mapping because problems exist when devices are moved in and out
   2617 * of domains and their respective RMRR information is lost.  This means that
   2618 * a device with associated RMRRs will never be in a "passthrough" domain.
   2619 * The second is use of the device through the IOMMU API.  This interface
   2620 * expects to have full control of the IOVA space for the device.  We cannot
   2621 * satisfy both the requirement that RMRR access is maintained and have an
   2622 * unencumbered IOVA space.  We also have no ability to quiesce the device's
   2623 * use of the RMRR space or even inform the IOMMU API user of the restriction.
   2624 * We therefore prevent devices associated with an RMRR from participating in
   2625 * the IOMMU API, which eliminates them from device assignment.
   2626 *
   2627 * In both cases, devices which have relaxable RMRRs are not concerned by this
   2628 * restriction. See device_rmrr_is_relaxable comment.
   2629 */
   2630static bool device_is_rmrr_locked(struct device *dev)
   2631{
   2632	if (!device_has_rmrr(dev))
   2633		return false;
   2634
   2635	if (device_rmrr_is_relaxable(dev))
   2636		return false;
   2637
   2638	return true;
   2639}
   2640
   2641/*
   2642 * Return the required default domain type for a specific device.
   2643 *
   2644 * @dev: the device in question
   2646 *
   2647 * Returns:
   2648 *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
   2649 *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
   2650 *  - 0: both identity and dynamic domains work for this device
   2651 */
   2652static int device_def_domain_type(struct device *dev)
   2653{
   2654	if (dev_is_pci(dev)) {
   2655		struct pci_dev *pdev = to_pci_dev(dev);
   2656
   2657		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
   2658			return IOMMU_DOMAIN_IDENTITY;
   2659
   2660		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
   2661			return IOMMU_DOMAIN_IDENTITY;
   2662	}
   2663
   2664	return 0;
   2665}
   2666
   2667static void intel_iommu_init_qi(struct intel_iommu *iommu)
   2668{
   2669	/*
   2670	 * Start from the sane iommu hardware state.
   2671	 * If the queued invalidation is already initialized by us
   2672	 * (for example, while enabling interrupt-remapping) then
   2673	 * we already have things rolling from a sane state.
   2674	 */
   2675	if (!iommu->qi) {
   2676		/*
   2677		 * Clear any previous faults.
   2678		 */
   2679		dmar_fault(-1, iommu);
   2680		/*
   2681		 * Disable queued invalidation if supported and already enabled
   2682		 * before OS handover.
   2683		 */
   2684		dmar_disable_qi(iommu);
   2685	}
   2686
   2687	if (dmar_enable_qi(iommu)) {
   2688		/*
   2689		 * Queued invalidation is not enabled, use register-based invalidation
   2690		 */
   2691		iommu->flush.flush_context = __iommu_flush_context;
   2692		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
   2693		pr_info("%s: Using Register based invalidation\n",
   2694			iommu->name);
   2695	} else {
   2696		iommu->flush.flush_context = qi_flush_context;
   2697		iommu->flush.flush_iotlb = qi_flush_iotlb;
   2698		pr_info("%s: Using Queued invalidation\n", iommu->name);
   2699	}
   2700}
   2701
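       /*
        * Copy one bus's context table from the previous kernel (kdump case),
        * recording the domain ids found there and marking every copied entry
        * so it can be recognized later.
        */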
   2702static int copy_context_table(struct intel_iommu *iommu,
   2703			      struct root_entry *old_re,
   2704			      struct context_entry **tbl,
   2705			      int bus, bool ext)
   2706{
   2707	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
   2708	struct context_entry *new_ce = NULL, ce;
   2709	struct context_entry *old_ce = NULL;
   2710	struct root_entry re;
   2711	phys_addr_t old_ce_phys;
   2712
   2713	tbl_idx = ext ? bus * 2 : bus;
   2714	memcpy(&re, old_re, sizeof(re));
   2715
   2716	for (devfn = 0; devfn < 256; devfn++) {
   2717		/* First calculate the correct index */
   2718		idx = (ext ? devfn * 2 : devfn) % 256;
   2719
   2720		if (idx == 0) {
   2721			/* First save what we may have and clean up */
   2722			if (new_ce) {
   2723				tbl[tbl_idx] = new_ce;
   2724				__iommu_flush_cache(iommu, new_ce,
   2725						    VTD_PAGE_SIZE);
   2726				pos = 1;
   2727			}
   2728
   2729			if (old_ce)
   2730				memunmap(old_ce);
   2731
   2732			ret = 0;
   2733			if (devfn < 0x80)
   2734				old_ce_phys = root_entry_lctp(&re);
   2735			else
   2736				old_ce_phys = root_entry_uctp(&re);
   2737
   2738			if (!old_ce_phys) {
   2739				if (ext && devfn == 0) {
   2740					/* No LCTP, try UCTP */
   2741					devfn = 0x7f;
   2742					continue;
   2743				} else {
   2744					goto out;
   2745				}
   2746			}
   2747
   2748			ret = -ENOMEM;
   2749			old_ce = memremap(old_ce_phys, PAGE_SIZE,
   2750					MEMREMAP_WB);
   2751			if (!old_ce)
   2752				goto out;
   2753
   2754			new_ce = alloc_pgtable_page(iommu->node);
   2755			if (!new_ce)
   2756				goto out_unmap;
   2757
   2758			ret = 0;
   2759		}
   2760
   2761		/* Now copy the context entry */
   2762		memcpy(&ce, old_ce + idx, sizeof(ce));
   2763
   2764		if (!__context_present(&ce))
   2765			continue;
   2766
   2767		did = context_domain_id(&ce);
   2768		if (did >= 0 && did < cap_ndoms(iommu->cap))
   2769			set_bit(did, iommu->domain_ids);
   2770
   2771		/*
   2772		 * We need a marker for copied context entries. This
   2773		 * marker needs to work for the old format as well as
   2774		 * for extended context entries.
   2775		 *
   2776		 * Bit 67 of the context entry is used. In the old
   2777		 * format this bit is available to software, in the
   2778		 * extended format it is the PGE bit, but PGE is ignored
   2779		 * by HW if PASIDs are disabled (and thus still
   2780		 * available).
   2781		 *
   2782		 * So disable PASIDs first and then mark the entry
   2783		 * copied. This means that we don't copy PASID
   2784		 * translations from the old kernel, but this is fine as
   2785		 * faults there are not fatal.
   2786		 */
   2787		context_clear_pasid_enable(&ce);
   2788		context_set_copied(&ce);
   2789
   2790		new_ce[idx] = ce;
   2791	}
   2792
   2793	tbl[tbl_idx + pos] = new_ce;
   2794
   2795	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
   2796
   2797out_unmap:
   2798	memunmap(old_ce);
   2799
   2800out:
   2801	return ret;
   2802}
   2803
   2804static int copy_translation_tables(struct intel_iommu *iommu)
   2805{
   2806	struct context_entry **ctxt_tbls;
   2807	struct root_entry *old_rt;
   2808	phys_addr_t old_rt_phys;
   2809	int ctxt_table_entries;
   2810	unsigned long flags;
   2811	u64 rtaddr_reg;
   2812	int bus, ret;
   2813	bool new_ext, ext;
   2814
   2815	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
   2816	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
   2817	new_ext    = !!ecap_ecs(iommu->ecap);
   2818
   2819	/*
   2820	 * The RTT bit can only be changed when translation is disabled,
   2821	 * but disabling translation would open a window for data
   2822	 * corruption. So bail out and don't copy anything if we would
   2823	 * have to change the bit.
   2824	 */
   2825	if (new_ext != ext)
   2826		return -EINVAL;
   2827
   2828	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
   2829	if (!old_rt_phys)
   2830		return -EINVAL;
   2831
   2832	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
   2833	if (!old_rt)
   2834		return -ENOMEM;
   2835
   2836	/* This is too big for the stack - allocate it from slab */
   2837	ctxt_table_entries = ext ? 512 : 256;
   2838	ret = -ENOMEM;
   2839	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
   2840	if (!ctxt_tbls)
   2841		goto out_unmap;
   2842
   2843	for (bus = 0; bus < 256; bus++) {
   2844		ret = copy_context_table(iommu, &old_rt[bus],
   2845					 ctxt_tbls, bus, ext);
   2846		if (ret) {
   2847			pr_err("%s: Failed to copy context table for bus %d\n",
   2848				iommu->name, bus);
   2849			continue;
   2850		}
   2851	}
   2852
   2853	spin_lock_irqsave(&iommu->lock, flags);
   2854
   2855	/* Context tables are copied, now write them to the root_entry table */
   2856	for (bus = 0; bus < 256; bus++) {
   2857		int idx = ext ? bus * 2 : bus;
   2858		u64 val;
   2859
   2860		if (ctxt_tbls[idx]) {
   2861			val = virt_to_phys(ctxt_tbls[idx]) | 1;
   2862			iommu->root_entry[bus].lo = val;
   2863		}
   2864
   2865		if (!ext || !ctxt_tbls[idx + 1])
   2866			continue;
   2867
   2868		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
   2869		iommu->root_entry[bus].hi = val;
   2870	}
   2871
   2872	spin_unlock_irqrestore(&iommu->lock, flags);
   2873
   2874	kfree(ctxt_tbls);
   2875
   2876	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
   2877
   2878	ret = 0;
   2879
   2880out_unmap:
   2881	memunmap(old_rt);
   2882
   2883	return ret;
   2884}
   2885
   2886#ifdef CONFIG_INTEL_IOMMU_SVM
   2887static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
   2888{
   2889	struct intel_iommu *iommu = data;
   2890	ioasid_t ioasid;
   2891
   2892	if (!iommu)
   2893		return INVALID_IOASID;
   2894	/*
   2895	 * The VT-d virtual command interface always uses the full 20-bit
   2896	 * PASID range. The host can partition the guest PASID range based
   2897	 * on policies, but that is out of the guest's control.
   2898	 */
   2899	if (min < PASID_MIN || max > intel_pasid_max_id)
   2900		return INVALID_IOASID;
   2901
   2902	if (vcmd_alloc_pasid(iommu, &ioasid))
   2903		return INVALID_IOASID;
   2904
   2905	return ioasid;
   2906}
   2907
   2908static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
   2909{
   2910	struct intel_iommu *iommu = data;
   2911
   2912	if (!iommu)
   2913		return;
   2914	/*
   2915	 * The sanity check of the ioasid owner is done at the upper layer, e.g. VFIO.
   2916	 * We can only free the PASID when all the devices are unbound.
   2917	 */
   2918	if (ioasid_find(NULL, ioasid, NULL)) {
   2919		pr_alert("Cannot free active IOASID %d\n", ioasid);
   2920		return;
   2921	}
   2922	vcmd_free_pasid(iommu, ioasid);
   2923}
   2924
   2925static void register_pasid_allocator(struct intel_iommu *iommu)
   2926{
   2927	/*
   2928	 * If we are running in the host, there is no need for a custom
   2929	 * allocator since PASIDs are allocated host system-wide.
   2930	 */
   2931	if (!cap_caching_mode(iommu->cap))
   2932		return;
   2933
   2934	if (!sm_supported(iommu)) {
   2935		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
   2936		return;
   2937	}
   2938
   2939	/*
   2940	 * Register a custom PASID allocator if we are running in a guest;
   2941	 * guest PASIDs must be obtained via the virtual command interface.
   2942	 * There can be multiple vIOMMUs in each guest but only one allocator
   2943	 * is active. All vIOMMU allocators will eventually be calling the same
   2944	 * host allocator.
   2945	 */
   2946	if (!vccap_pasid(iommu->vccap))
   2947		return;
   2948
   2949	pr_info("Register custom PASID allocator\n");
   2950	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
   2951	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
   2952	iommu->pasid_allocator.pdata = (void *)iommu;
   2953	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
   2954		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
   2955		/*
   2956		 * Disable scalable mode on this IOMMU if there
   2957		 * is no custom allocator. Mixing SM-capable and
   2958		 * non-SM vIOMMUs is not supported.
   2959		 */
   2960		intel_iommu_sm = 0;
   2961	}
   2962}
   2963#endif
   2964
   2965static int __init init_dmars(void)
   2966{
   2967	struct dmar_drhd_unit *drhd;
   2968	struct intel_iommu *iommu;
   2969	int ret;
   2970
   2971	/*
   2972	 * for each drhd
   2973	 *    allocate root
   2974	 *    initialize and program root entry to not present
   2975	 * endfor
   2976	 */
   2977	for_each_drhd_unit(drhd) {
   2978		/*
   2979		 * lock not needed as this is only incremented in the
   2980		 * single-threaded kernel __init code path; all other
   2981		 * accesses are read-only
   2982		 */
   2983		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
   2984			g_num_of_iommus++;
   2985			continue;
   2986		}
   2987		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
   2988	}
   2989
   2990	/* Preallocate enough resources for IOMMU hot-addition */
   2991	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
   2992		g_num_of_iommus = DMAR_UNITS_SUPPORTED;
   2993
   2994	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
   2995			GFP_KERNEL);
   2996	if (!g_iommus) {
   2997		ret = -ENOMEM;
   2998		goto error;
   2999	}
   3000
   3001	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
   3002	if (ret)
   3003		goto free_iommu;
   3004
   3005	for_each_iommu(iommu, drhd) {
   3006		if (drhd->ignored) {
   3007			iommu_disable_translation(iommu);
   3008			continue;
   3009		}
   3010
   3011		/*
   3012		 * Find the max PASID size of all IOMMUs in the system.
   3013		 * We need to ensure the system PASID table is no bigger
   3014		 * than the smallest supported size.
   3015		 */
   3016		if (pasid_supported(iommu)) {
   3017			u32 temp = 2 << ecap_pss(iommu->ecap);
   3018
   3019			intel_pasid_max_id = min_t(u32, temp,
   3020						   intel_pasid_max_id);
   3021		}
   3022
   3023		g_iommus[iommu->seq_id] = iommu;
   3024
   3025		intel_iommu_init_qi(iommu);
   3026
   3027		ret = iommu_init_domains(iommu);
   3028		if (ret)
   3029			goto free_iommu;
   3030
   3031		init_translation_status(iommu);
   3032
   3033		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
   3034			iommu_disable_translation(iommu);
   3035			clear_translation_pre_enabled(iommu);
   3036			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
   3037				iommu->name);
   3038		}
   3039
   3040		/*
   3041		 * TBD:
   3042		 * we could share the same root & context tables
   3043		 * among all IOMMUs. Need to split it later.
   3044		 */
   3045		ret = iommu_alloc_root_entry(iommu);
   3046		if (ret)
   3047			goto free_iommu;
   3048
   3049		if (translation_pre_enabled(iommu)) {
   3050			pr_info("Translation already enabled - trying to copy translation structures\n");
   3051
   3052			ret = copy_translation_tables(iommu);
   3053			if (ret) {
   3054				/*
   3055				 * We found the IOMMU with translation
   3056				 * enabled - but failed to copy over the
   3057				 * old root-entry table. Try to proceed
   3058				 * by disabling translation now and
   3059				 * allocating a clean root-entry table.
   3060				 * This might cause DMAR faults, but
   3061				 * probably the dump will still succeed.
   3062				 */
   3063				pr_err("Failed to copy translation tables from previous kernel for %s\n",
   3064				       iommu->name);
   3065				iommu_disable_translation(iommu);
   3066				clear_translation_pre_enabled(iommu);
   3067			} else {
   3068				pr_info("Copied translation tables from previous kernel for %s\n",
   3069					iommu->name);
   3070			}
   3071		}
   3072
   3073		if (!ecap_pass_through(iommu->ecap))
   3074			hw_pass_through = 0;
   3075		intel_svm_check(iommu);
   3076	}
   3077
   3078	/*
   3079	 * Now that qi is enabled on all iommus, set the root entry and flush
   3080	 * caches. This is required on some Intel X58 chipsets, otherwise the
   3081	 * flush_context function will loop forever and the boot hangs.
   3082	 */
   3083	for_each_active_iommu(iommu, drhd) {
   3084		iommu_flush_write_buffer(iommu);
   3085#ifdef CONFIG_INTEL_IOMMU_SVM
   3086		register_pasid_allocator(iommu);
   3087#endif
   3088		iommu_set_root_entry(iommu);
   3089	}
   3090
   3091#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
   3092	dmar_map_gfx = 0;
   3093#endif
   3094
   3095	if (!dmar_map_gfx)
   3096		iommu_identity_mapping |= IDENTMAP_GFX;
   3097
   3098	check_tylersburg_isoch();
   3099
   3100	ret = si_domain_init(hw_pass_through);
   3101	if (ret)
   3102		goto free_iommu;
   3103
   3104	/*
   3105	 * for each drhd
   3106	 *   enable fault log
   3107	 *   global invalidate context cache
   3108	 *   global invalidate iotlb
   3109	 *   enable translation
   3110	 */
   3111	for_each_iommu(iommu, drhd) {
   3112		if (drhd->ignored) {
   3113			/*
   3114			 * we always have to disable PMRs or DMA may fail on
   3115			 * this device
   3116			 */
   3117			if (force_on)
   3118				iommu_disable_protect_mem_regions(iommu);
   3119			continue;
   3120		}
   3121
   3122		iommu_flush_write_buffer(iommu);
   3123
   3124#ifdef CONFIG_INTEL_IOMMU_SVM
   3125		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
   3126			/*
   3127			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
   3128			 * could cause a lock race condition.
   3129			 */
   3130			up_write(&dmar_global_lock);
   3131			ret = intel_svm_enable_prq(iommu);
   3132			down_write(&dmar_global_lock);
   3133			if (ret)
   3134				goto free_iommu;
   3135		}
   3136#endif
   3137		ret = dmar_set_interrupt(iommu);
   3138		if (ret)
   3139			goto free_iommu;
   3140	}
   3141
   3142	return 0;
   3143
   3144free_iommu:
   3145	for_each_active_iommu(iommu, drhd) {
   3146		disable_dmar_iommu(iommu);
   3147		free_dmar_iommu(iommu);
   3148	}
   3149
   3150	kfree(g_iommus);
   3151
   3152error:
   3153	return ret;
   3154}
   3155
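       /*
        * Mark DRHD units that cover no devices as ignored, and likewise units
        * that cover only graphics devices when gfx mapping is disabled.
        */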
   3156static void __init init_no_remapping_devices(void)
   3157{
   3158	struct dmar_drhd_unit *drhd;
   3159	struct device *dev;
   3160	int i;
   3161
   3162	for_each_drhd_unit(drhd) {
   3163		if (!drhd->include_all) {
   3164			for_each_active_dev_scope(drhd->devices,
   3165						  drhd->devices_cnt, i, dev)
   3166				break;
   3167			/* ignore DMAR unit if no devices exist */
   3168			if (i == drhd->devices_cnt)
   3169				drhd->ignored = 1;
   3170		}
   3171	}
   3172
   3173	for_each_active_drhd_unit(drhd) {
   3174		if (drhd->include_all)
   3175			continue;
   3176
   3177		for_each_active_dev_scope(drhd->devices,
   3178					  drhd->devices_cnt, i, dev)
   3179			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
   3180				break;
   3181		if (i < drhd->devices_cnt)
   3182			continue;
   3183
   3184		/* This IOMMU has *only* gfx devices. Either bypass it or
   3185		   set the gfx_dedicated flag, as appropriate */
   3186		drhd->gfx_dedicated = 1;
   3187		if (!dmar_map_gfx)
   3188			drhd->ignored = 1;
   3189	}
   3190}
   3191
   3192#ifdef CONFIG_SUSPEND
   3193static int init_iommu_hw(void)
   3194{
   3195	struct dmar_drhd_unit *drhd;
   3196	struct intel_iommu *iommu = NULL;
   3197
   3198	for_each_active_iommu(iommu, drhd)
   3199		if (iommu->qi)
   3200			dmar_reenable_qi(iommu);
   3201
   3202	for_each_iommu(iommu, drhd) {
   3203		if (drhd->ignored) {
   3204			/*
   3205			 * we always have to disable PMRs or DMA may fail on
   3206			 * this device
   3207			 */
   3208			if (force_on)
   3209				iommu_disable_protect_mem_regions(iommu);
   3210			continue;
   3211		}
   3212
   3213		iommu_flush_write_buffer(iommu);
   3214		iommu_set_root_entry(iommu);
   3215		iommu_enable_translation(iommu);
   3216		iommu_disable_protect_mem_regions(iommu);
   3217	}
   3218
   3219	return 0;
   3220}
   3221
   3222static void iommu_flush_all(void)
   3223{
   3224	struct dmar_drhd_unit *drhd;
   3225	struct intel_iommu *iommu;
   3226
   3227	for_each_active_iommu(iommu, drhd) {
   3228		iommu->flush.flush_context(iommu, 0, 0, 0,
   3229					   DMA_CCMD_GLOBAL_INVL);
   3230		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
   3231					 DMA_TLB_GLOBAL_FLUSH);
   3232	}
   3233}
   3234
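       /*
        * Save each active IOMMU's fault-event registers and disable
        * translation before entering a sleep state; iommu_resume() restores
        * the registers once init_iommu_hw() has re-enabled translation.
        */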
   3235static int iommu_suspend(void)
   3236{
   3237	struct dmar_drhd_unit *drhd;
   3238	struct intel_iommu *iommu = NULL;
   3239	unsigned long flag;
   3240
   3241	for_each_active_iommu(iommu, drhd) {
   3242		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
   3243					     GFP_KERNEL);
   3244		if (!iommu->iommu_state)
   3245			goto nomem;
   3246	}
   3247
   3248	iommu_flush_all();
   3249
   3250	for_each_active_iommu(iommu, drhd) {
   3251		iommu_disable_translation(iommu);
   3252
   3253		raw_spin_lock_irqsave(&iommu->register_lock, flag);
   3254
   3255		iommu->iommu_state[SR_DMAR_FECTL_REG] =
   3256			readl(iommu->reg + DMAR_FECTL_REG);
   3257		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
   3258			readl(iommu->reg + DMAR_FEDATA_REG);
   3259		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
   3260			readl(iommu->reg + DMAR_FEADDR_REG);
   3261		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
   3262			readl(iommu->reg + DMAR_FEUADDR_REG);
   3263
   3264		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
   3265	}
   3266	return 0;
   3267
   3268nomem:
   3269	for_each_active_iommu(iommu, drhd)
   3270		kfree(iommu->iommu_state);
   3271
   3272	return -ENOMEM;
   3273}
   3274
   3275static void iommu_resume(void)
   3276{
   3277	struct dmar_drhd_unit *drhd;
   3278	struct intel_iommu *iommu = NULL;
   3279	unsigned long flag;
   3280
   3281	if (init_iommu_hw()) {
   3282		if (force_on)
   3283			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
   3284		else
   3285			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
   3286		return;
   3287	}
   3288
   3289	for_each_active_iommu(iommu, drhd) {
   3290
   3291		raw_spin_lock_irqsave(&iommu->register_lock, flag);
   3292
   3293		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
   3294			iommu->reg + DMAR_FECTL_REG);
   3295		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
   3296			iommu->reg + DMAR_FEDATA_REG);
   3297		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
   3298			iommu->reg + DMAR_FEADDR_REG);
   3299		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
   3300			iommu->reg + DMAR_FEUADDR_REG);
   3301
   3302		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
   3303	}
   3304
   3305	for_each_active_iommu(iommu, drhd)
   3306		kfree(iommu->iommu_state);
   3307}
   3308
   3309static struct syscore_ops iommu_syscore_ops = {
   3310	.resume		= iommu_resume,
   3311	.suspend	= iommu_suspend,
   3312};
   3313
   3314static void __init init_iommu_pm_ops(void)
   3315{
   3316	register_syscore_ops(&iommu_syscore_ops);
   3317}
   3318
   3319#else
   3320static inline void init_iommu_pm_ops(void) {}
   3321#endif	/* CONFIG_SUSPEND */
   3322
   3323static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
   3324{
   3325	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
   3326	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
   3327	    rmrr->end_address <= rmrr->base_address ||
   3328	    arch_rmrr_sanity_check(rmrr))
   3329		return -EINVAL;
   3330
   3331	return 0;
   3332}
   3333
   3334int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
   3335{
   3336	struct acpi_dmar_reserved_memory *rmrr;
   3337	struct dmar_rmrr_unit *rmrru;
   3338
   3339	rmrr = (struct acpi_dmar_reserved_memory *)header;
   3340	if (rmrr_sanity_check(rmrr)) {
   3341		pr_warn(FW_BUG
   3342			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
   3343			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
   3344			   rmrr->base_address, rmrr->end_address,
   3345			   dmi_get_system_info(DMI_BIOS_VENDOR),
   3346			   dmi_get_system_info(DMI_BIOS_VERSION),
   3347			   dmi_get_system_info(DMI_PRODUCT_VERSION));
   3348		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
   3349	}
   3350
   3351	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
   3352	if (!rmrru)
   3353		goto out;
   3354
   3355	rmrru->hdr = header;
   3356
   3357	rmrru->base_address = rmrr->base_address;
   3358	rmrru->end_address = rmrr->end_address;
   3359
   3360	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
   3361				((void *)rmrr) + rmrr->header.length,
   3362				&rmrru->devices_cnt);
   3363	if (rmrru->devices_cnt && rmrru->devices == NULL)
   3364		goto free_rmrru;
   3365
   3366	list_add(&rmrru->list, &dmar_rmrr_units);
   3367
   3368	return 0;
   3369free_rmrru:
   3370	kfree(rmrru);
   3371out:
   3372	return -ENOMEM;
   3373}
   3374
   3375static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
   3376{
   3377	struct dmar_atsr_unit *atsru;
   3378	struct acpi_dmar_atsr *tmp;
   3379
   3380	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
   3381				dmar_rcu_check()) {
   3382		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
   3383		if (atsr->segment != tmp->segment)
   3384			continue;
   3385		if (atsr->header.length != tmp->header.length)
   3386			continue;
   3387		if (memcmp(atsr, tmp, atsr->header.length) == 0)
   3388			return atsru;
   3389	}
   3390
   3391	return NULL;
   3392}
   3393
   3394int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
   3395{
   3396	struct acpi_dmar_atsr *atsr;
   3397	struct dmar_atsr_unit *atsru;
   3398
   3399	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
   3400		return 0;
   3401
   3402	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
   3403	atsru = dmar_find_atsr(atsr);
   3404	if (atsru)
   3405		return 0;
   3406
   3407	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
   3408	if (!atsru)
   3409		return -ENOMEM;
   3410
   3411	/*
   3412	 * If memory is allocated from slab by ACPI _DSM method, we need to
   3413	 * copy the memory content because the memory buffer will be freed
   3414	 * on return.
   3415	 */
   3416	atsru->hdr = (void *)(atsru + 1);
   3417	memcpy(atsru->hdr, hdr, hdr->length);
   3418	atsru->include_all = atsr->flags & 0x1;
   3419	if (!atsru->include_all) {
   3420		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
   3421				(void *)atsr + atsr->header.length,
   3422				&atsru->devices_cnt);
   3423		if (atsru->devices_cnt && atsru->devices == NULL) {
   3424			kfree(atsru);
   3425			return -ENOMEM;
   3426		}
   3427	}
   3428
   3429	list_add_rcu(&atsru->list, &dmar_atsr_units);
   3430
   3431	return 0;
   3432}
   3433
   3434static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
   3435{
   3436	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
   3437	kfree(atsru);
   3438}
   3439
   3440int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
   3441{
   3442	struct acpi_dmar_atsr *atsr;
   3443	struct dmar_atsr_unit *atsru;
   3444
   3445	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
   3446	atsru = dmar_find_atsr(atsr);
   3447	if (atsru) {
   3448		list_del_rcu(&atsru->list);
   3449		synchronize_rcu();
   3450		intel_iommu_free_atsr(atsru);
   3451	}
   3452
   3453	return 0;
   3454}
   3455
   3456int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
   3457{
   3458	int i;
   3459	struct device *dev;
   3460	struct acpi_dmar_atsr *atsr;
   3461	struct dmar_atsr_unit *atsru;
   3462
   3463	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
   3464	atsru = dmar_find_atsr(atsr);
   3465	if (!atsru)
   3466		return 0;
   3467
   3468	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
   3469		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
   3470					  i, dev)
   3471			return -EBUSY;
   3472	}
   3473
   3474	return 0;
   3475}
   3476
   3477static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
   3478{
   3479	struct dmar_satc_unit *satcu;
   3480	struct acpi_dmar_satc *tmp;
   3481
   3482	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
   3483				dmar_rcu_check()) {
   3484		tmp = (struct acpi_dmar_satc *)satcu->hdr;
   3485		if (satc->segment != tmp->segment)
   3486			continue;
   3487		if (satc->header.length != tmp->header.length)
   3488			continue;
   3489		if (memcmp(satc, tmp, satc->header.length) == 0)
   3490			return satcu;
   3491	}
   3492
   3493	return NULL;
   3494}
   3495
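        /*
         * Parse one SATC entry from the DMAR table: copy the ACPI structure,
         * record whether ATC is required (flags bit 0), parse the device
         * scope and add the unit to dmar_satc_units.
         */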
   3496int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
   3497{
   3498	struct acpi_dmar_satc *satc;
   3499	struct dmar_satc_unit *satcu;
   3500
   3501	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
   3502		return 0;
   3503
   3504	satc = container_of(hdr, struct acpi_dmar_satc, header);
   3505	satcu = dmar_find_satc(satc);
   3506	if (satcu)
   3507		return 0;
   3508
   3509	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
   3510	if (!satcu)
   3511		return -ENOMEM;
   3512
   3513	satcu->hdr = (void *)(satcu + 1);
   3514	memcpy(satcu->hdr, hdr, hdr->length);
   3515	satcu->atc_required = satc->flags & 0x1;
   3516	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
   3517					      (void *)satc + satc->header.length,
   3518					      &satcu->devices_cnt);
   3519	if (satcu->devices_cnt && !satcu->devices) {
   3520		kfree(satcu);
   3521		return -ENOMEM;
   3522	}
   3523	list_add_rcu(&satcu->list, &dmar_satc_units);
   3524
   3525	return 0;
   3526}
   3527
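        /*
         * Bring up a hot-added DMAR unit: audit its capabilities against the
         * running configuration, allocate domain IDs and the root entry, set
         * up invalidation queueing and interrupts, and finally enable
         * translation. On failure the unit is torn down again.
         */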
   3528static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
   3529{
   3530	int sp, ret;
   3531	struct intel_iommu *iommu = dmaru->iommu;
   3532
   3533	if (g_iommus[iommu->seq_id])
   3534		return 0;
   3535
   3536	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
   3537	if (ret)
   3538		goto out;
   3539
   3540	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
   3541		pr_warn("%s: Doesn't support hardware pass through.\n",
   3542			iommu->name);
   3543		return -ENXIO;
   3544	}
   3545
   3546	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
   3547	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
   3548		pr_warn("%s: Doesn't support large page.\n",
   3549			iommu->name);
   3550		return -ENXIO;
   3551	}
   3552
   3553	/*
   3554	 * Disable translation if already enabled prior to OS handover.
   3555	 */
   3556	if (iommu->gcmd & DMA_GCMD_TE)
   3557		iommu_disable_translation(iommu);
   3558
   3559	g_iommus[iommu->seq_id] = iommu;
   3560	ret = iommu_init_domains(iommu);
   3561	if (ret == 0)
   3562		ret = iommu_alloc_root_entry(iommu);
   3563	if (ret)
   3564		goto out;
   3565
   3566	intel_svm_check(iommu);
   3567
   3568	if (dmaru->ignored) {
   3569		/*
   3570		 * we always have to disable PMRs or DMA may fail on this device
   3571		 */
   3572		if (force_on)
   3573			iommu_disable_protect_mem_regions(iommu);
   3574		return 0;
   3575	}
   3576
   3577	intel_iommu_init_qi(iommu);
   3578	iommu_flush_write_buffer(iommu);
   3579
   3580#ifdef CONFIG_INTEL_IOMMU_SVM
   3581	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
   3582		ret = intel_svm_enable_prq(iommu);
   3583		if (ret)
   3584			goto disable_iommu;
   3585	}
   3586#endif
   3587	ret = dmar_set_interrupt(iommu);
   3588	if (ret)
   3589		goto disable_iommu;
   3590
   3591	iommu_set_root_entry(iommu);
   3592	iommu_enable_translation(iommu);
   3593
   3594	iommu_disable_protect_mem_regions(iommu);
   3595	return 0;
   3596
   3597disable_iommu:
   3598	disable_dmar_iommu(iommu);
   3599out:
   3600	free_dmar_iommu(iommu);
   3601	return ret;
   3602}
   3603
   3604int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
   3605{
   3606	int ret = 0;
   3607	struct intel_iommu *iommu = dmaru->iommu;
   3608
   3609	if (!intel_iommu_enabled)
   3610		return 0;
   3611	if (iommu == NULL)
   3612		return -EINVAL;
   3613
   3614	if (insert) {
   3615		ret = intel_iommu_add(dmaru);
   3616	} else {
   3617		disable_dmar_iommu(iommu);
   3618		free_dmar_iommu(iommu);
   3619	}
   3620
   3621	return ret;
   3622}
   3623
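        /* Release all cached RMRR, ATSR and SATC units and their device scopes. */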
   3624static void intel_iommu_free_dmars(void)
   3625{
   3626	struct dmar_rmrr_unit *rmrru, *rmrr_n;
   3627	struct dmar_atsr_unit *atsru, *atsr_n;
   3628	struct dmar_satc_unit *satcu, *satc_n;
   3629
   3630	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
   3631		list_del(&rmrru->list);
   3632		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
   3633		kfree(rmrru);
   3634	}
   3635
   3636	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
   3637		list_del(&atsru->list);
   3638		intel_iommu_free_atsr(atsru);
   3639	}
   3640	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
   3641		list_del(&satcu->list);
   3642		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
   3643		kfree(satcu);
   3644	}
   3645}
   3646
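        /*
         * Return the SATC unit whose device scope contains @dev (or its
         * physical function), or NULL if the device is not listed in any
         * SATC table.
         */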
   3647static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
   3648{
   3649	struct dmar_satc_unit *satcu;
   3650	struct acpi_dmar_satc *satc;
   3651	struct device *tmp;
   3652	int i;
   3653
   3654	dev = pci_physfn(dev);
   3655	rcu_read_lock();
   3656
   3657	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
   3658		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
   3659		if (satc->segment != pci_domain_nr(dev->bus))
   3660			continue;
   3661		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
   3662			if (to_pci_dev(tmp) == dev)
   3663				goto out;
   3664	}
   3665	satcu = NULL;
   3666out:
   3667	rcu_read_unlock();
   3668	return satcu;
   3669}
   3670
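        /*
         * Decide whether ATS may be enabled for @dev: a SATC entry, if
         * present, is authoritative; otherwise walk up to the root port and
         * check it against the ATSR device scopes. Returns 1 if ATS is
         * allowed, 0 otherwise.
         */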
   3671static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
   3672{
   3673	int i, ret = 1;
   3674	struct pci_bus *bus;
   3675	struct pci_dev *bridge = NULL;
   3676	struct device *tmp;
   3677	struct acpi_dmar_atsr *atsr;
   3678	struct dmar_atsr_unit *atsru;
   3679	struct dmar_satc_unit *satcu;
   3680
   3681	dev = pci_physfn(dev);
   3682	satcu = dmar_find_matched_satc_unit(dev);
   3683	if (satcu)
    3684		/*
    3685		 * This device supports ATS because it is listed in the
    3686		 * SATC table. When the IOMMU is in legacy mode, the
    3687		 * hardware enables ATS automatically for any device that
    3688		 * requires it, so the OS must not enable ATS on this
    3689		 * device again, to avoid duplicated TLB invalidations.
    3690		 */
   3691		return !(satcu->atc_required && !sm_supported(iommu));
   3692
   3693	for (bus = dev->bus; bus; bus = bus->parent) {
   3694		bridge = bus->self;
   3695		/* If it's an integrated device, allow ATS */
   3696		if (!bridge)
   3697			return 1;
   3698		/* Connected via non-PCIe: no ATS */
   3699		if (!pci_is_pcie(bridge) ||
   3700		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
   3701			return 0;
   3702		/* If we found the root port, look it up in the ATSR */
   3703		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
   3704			break;
   3705	}
   3706
   3707	rcu_read_lock();
   3708	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
   3709		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
   3710		if (atsr->segment != pci_domain_nr(dev->bus))
   3711			continue;
   3712
   3713		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
   3714			if (tmp == &bridge->dev)
   3715				goto out;
   3716
   3717		if (atsru->include_all)
   3718			goto out;
   3719	}
   3720	ret = 0;
   3721out:
   3722	rcu_read_unlock();
   3723
   3724	return ret;
   3725}
   3726
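        /*
         * PCI bus notifier helper: add a hot-plugged device to, or remove it
         * from, the cached RMRR, ATSR and SATC device scopes.
         */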
   3727int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
   3728{
   3729	int ret;
   3730	struct dmar_rmrr_unit *rmrru;
   3731	struct dmar_atsr_unit *atsru;
   3732	struct dmar_satc_unit *satcu;
   3733	struct acpi_dmar_atsr *atsr;
   3734	struct acpi_dmar_reserved_memory *rmrr;
   3735	struct acpi_dmar_satc *satc;
   3736
   3737	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
   3738		return 0;
   3739
   3740	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
   3741		rmrr = container_of(rmrru->hdr,
   3742				    struct acpi_dmar_reserved_memory, header);
   3743		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
   3744			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
   3745				((void *)rmrr) + rmrr->header.length,
   3746				rmrr->segment, rmrru->devices,
   3747				rmrru->devices_cnt);
   3748			if (ret < 0)
   3749				return ret;
   3750		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
   3751			dmar_remove_dev_scope(info, rmrr->segment,
   3752				rmrru->devices, rmrru->devices_cnt);
   3753		}
   3754	}
   3755
   3756	list_for_each_entry(atsru, &dmar_atsr_units, list) {
   3757		if (atsru->include_all)
   3758			continue;
   3759
   3760		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
   3761		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
   3762			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
   3763					(void *)atsr + atsr->header.length,
   3764					atsr->segment, atsru->devices,
   3765					atsru->devices_cnt);
   3766			if (ret > 0)
   3767				break;
   3768			else if (ret < 0)
   3769				return ret;
   3770		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
   3771			if (dmar_remove_dev_scope(info, atsr->segment,
   3772					atsru->devices, atsru->devices_cnt))
   3773				break;
   3774		}
   3775	}
   3776	list_for_each_entry(satcu, &dmar_satc_units, list) {
   3777		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
   3778		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
   3779			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
   3780					(void *)satc + satc->header.length,
   3781					satc->segment, satcu->devices,
   3782					satcu->devices_cnt);
   3783			if (ret > 0)
   3784				break;
   3785			else if (ret < 0)
   3786				return ret;
   3787		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
   3788			if (dmar_remove_dev_scope(info, satc->segment,
   3789					satcu->devices, satcu->devices_cnt))
   3790				break;
   3791		}
   3792	}
   3793
   3794	return 0;
   3795}
   3796
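        /*
         * Memory hotplug notifier: extend the si_domain identity map when
         * memory goes online and unmap/flush the range again when it goes
         * offline.
         */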
   3797static int intel_iommu_memory_notifier(struct notifier_block *nb,
   3798				       unsigned long val, void *v)
   3799{
   3800	struct memory_notify *mhp = v;
   3801	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
   3802	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
   3803			mhp->nr_pages - 1);
   3804
   3805	switch (val) {
   3806	case MEM_GOING_ONLINE:
   3807		if (iommu_domain_identity_map(si_domain,
   3808					      start_vpfn, last_vpfn)) {
   3809			pr_warn("Failed to build identity map for [%lx-%lx]\n",
   3810				start_vpfn, last_vpfn);
   3811			return NOTIFY_BAD;
   3812		}
   3813		break;
   3814
   3815	case MEM_OFFLINE:
   3816	case MEM_CANCEL_ONLINE:
   3817		{
   3818			struct dmar_drhd_unit *drhd;
   3819			struct intel_iommu *iommu;
   3820			LIST_HEAD(freelist);
   3821
   3822			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
   3823
   3824			rcu_read_lock();
   3825			for_each_active_iommu(iommu, drhd)
   3826				iommu_flush_iotlb_psi(iommu, si_domain,
   3827					start_vpfn, mhp->nr_pages,
   3828					list_empty(&freelist), 0);
   3829			rcu_read_unlock();
   3830			put_pages_list(&freelist);
   3831		}
   3832		break;
   3833	}
   3834
   3835	return NOTIFY_OK;
   3836}
   3837
   3838static struct notifier_block intel_iommu_memory_nb = {
   3839	.notifier_call = intel_iommu_memory_notifier,
   3840	.priority = 0
   3841};
   3842
   3843static void intel_disable_iommus(void)
   3844{
   3845	struct intel_iommu *iommu = NULL;
   3846	struct dmar_drhd_unit *drhd;
   3847
   3848	for_each_iommu(iommu, drhd)
   3849		iommu_disable_translation(iommu);
   3850}
   3851
   3852void intel_iommu_shutdown(void)
   3853{
   3854	struct dmar_drhd_unit *drhd;
   3855	struct intel_iommu *iommu = NULL;
   3856
   3857	if (no_iommu || dmar_disabled)
   3858		return;
   3859
   3860	down_write(&dmar_global_lock);
   3861
   3862	/* Disable PMRs explicitly here. */
   3863	for_each_iommu(iommu, drhd)
   3864		iommu_disable_protect_mem_regions(iommu);
   3865
   3866	/* Make sure the IOMMUs are switched off */
   3867	intel_disable_iommus();
   3868
   3869	up_write(&dmar_global_lock);
   3870}
   3871
   3872static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
   3873{
   3874	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
   3875
   3876	return container_of(iommu_dev, struct intel_iommu, iommu);
   3877}
   3878
   3879static ssize_t version_show(struct device *dev,
   3880			    struct device_attribute *attr, char *buf)
   3881{
   3882	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
   3883	u32 ver = readl(iommu->reg + DMAR_VER_REG);
   3884	return sprintf(buf, "%d:%d\n",
   3885		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
   3886}
   3887static DEVICE_ATTR_RO(version);
   3888
   3889static ssize_t address_show(struct device *dev,
   3890			    struct device_attribute *attr, char *buf)
   3891{
   3892	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
   3893	return sprintf(buf, "%llx\n", iommu->reg_phys);
   3894}
   3895static DEVICE_ATTR_RO(address);
   3896
   3897static ssize_t cap_show(struct device *dev,
   3898			struct device_attribute *attr, char *buf)
   3899{
   3900	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
   3901	return sprintf(buf, "%llx\n", iommu->cap);
   3902}
   3903static DEVICE_ATTR_RO(cap);
   3904
   3905static ssize_t ecap_show(struct device *dev,
   3906			 struct device_attribute *attr, char *buf)
   3907{
   3908	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
   3909	return sprintf(buf, "%llx\n", iommu->ecap);
   3910}
   3911static DEVICE_ATTR_RO(ecap);
   3912
   3913static ssize_t domains_supported_show(struct device *dev,
   3914				      struct device_attribute *attr, char *buf)
   3915{
   3916	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
   3917	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
   3918}
   3919static DEVICE_ATTR_RO(domains_supported);
   3920
   3921static ssize_t domains_used_show(struct device *dev,
   3922				 struct device_attribute *attr, char *buf)
   3923{
   3924	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
   3925	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
   3926						  cap_ndoms(iommu->cap)));
   3927}
   3928static DEVICE_ATTR_RO(domains_used);
   3929
   3930static struct attribute *intel_iommu_attrs[] = {
   3931	&dev_attr_version.attr,
   3932	&dev_attr_address.attr,
   3933	&dev_attr_cap.attr,
   3934	&dev_attr_ecap.attr,
   3935	&dev_attr_domains_supported.attr,
   3936	&dev_attr_domains_used.attr,
   3937	NULL,
   3938};
   3939
   3940static struct attribute_group intel_iommu_group = {
   3941	.name = "intel-iommu",
   3942	.attrs = intel_iommu_attrs,
   3943};
   3944
   3945const struct attribute_group *intel_iommu_groups[] = {
   3946	&intel_iommu_group,
   3947	NULL,
   3948};
   3949
   3950static inline bool has_external_pci(void)
   3951{
   3952	struct pci_dev *pdev = NULL;
   3953
   3954	for_each_pci_dev(pdev)
   3955		if (pdev->external_facing)
   3956			return true;
   3957
   3958	return false;
   3959}
   3960
   3961static int __init platform_optin_force_iommu(void)
   3962{
   3963	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
   3964		return 0;
   3965
   3966	if (no_iommu || dmar_disabled)
   3967		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
   3968
   3969	/*
   3970	 * If Intel-IOMMU is disabled by default, we will apply identity
   3971	 * map for all devices except those marked as being untrusted.
   3972	 */
   3973	if (dmar_disabled)
   3974		iommu_set_default_passthrough(false);
   3975
   3976	dmar_disabled = 0;
   3977	no_iommu = 0;
   3978
   3979	return 1;
   3980}
   3981
   3982static int __init probe_acpi_namespace_devices(void)
   3983{
   3984	struct dmar_drhd_unit *drhd;
   3985	/* To avoid a -Wunused-but-set-variable warning. */
   3986	struct intel_iommu *iommu __maybe_unused;
   3987	struct device *dev;
   3988	int i, ret = 0;
   3989
   3990	for_each_active_iommu(iommu, drhd) {
   3991		for_each_active_dev_scope(drhd->devices,
   3992					  drhd->devices_cnt, i, dev) {
   3993			struct acpi_device_physical_node *pn;
   3994			struct iommu_group *group;
   3995			struct acpi_device *adev;
   3996
   3997			if (dev->bus != &acpi_bus_type)
   3998				continue;
   3999
   4000			adev = to_acpi_device(dev);
   4001			mutex_lock(&adev->physical_node_lock);
   4002			list_for_each_entry(pn,
   4003					    &adev->physical_node_list, node) {
   4004				group = iommu_group_get(pn->dev);
   4005				if (group) {
   4006					iommu_group_put(group);
   4007					continue;
   4008				}
   4009
   4010				pn->dev->bus->iommu_ops = &intel_iommu_ops;
   4011				ret = iommu_probe_device(pn->dev);
   4012				if (ret)
   4013					break;
   4014			}
   4015			mutex_unlock(&adev->physical_node_lock);
   4016
   4017			if (ret)
   4018				return ret;
   4019		}
   4020	}
   4021
   4022	return 0;
   4023}
   4024
   4025int __init intel_iommu_init(void)
   4026{
   4027	int ret = -ENODEV;
   4028	struct dmar_drhd_unit *drhd;
   4029	struct intel_iommu *iommu;
   4030
   4031	/*
   4032	 * Intel IOMMU is required for a TXT/tboot launch or platform
   4033	 * opt in, so enforce that.
   4034	 */
   4035	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
   4036		    platform_optin_force_iommu();
   4037
   4038	down_write(&dmar_global_lock);
   4039	if (dmar_table_init()) {
   4040		if (force_on)
   4041			panic("tboot: Failed to initialize DMAR table\n");
   4042		goto out_free_dmar;
   4043	}
   4044
   4045	if (dmar_dev_scope_init() < 0) {
   4046		if (force_on)
   4047			panic("tboot: Failed to initialize DMAR device scope\n");
   4048		goto out_free_dmar;
   4049	}
   4050
   4051	up_write(&dmar_global_lock);
   4052
   4053	/*
   4054	 * The bus notifier takes the dmar_global_lock, so lockdep will
   4055	 * complain later when we register it under the lock.
   4056	 */
   4057	dmar_register_bus_notifier();
   4058
   4059	down_write(&dmar_global_lock);
   4060
   4061	if (!no_iommu)
   4062		intel_iommu_debugfs_init();
   4063
   4064	if (no_iommu || dmar_disabled) {
   4065		/*
   4066		 * We exit the function here to ensure IOMMU's remapping and
   4067		 * mempool aren't setup, which means that the IOMMU's PMRs
   4068		 * won't be disabled via the call to init_dmars(). So disable
   4069		 * it explicitly here. The PMRs were setup by tboot prior to
   4070		 * calling SENTER, but the kernel is expected to reset/tear
   4071		 * down the PMRs.
   4072		 */
   4073		if (intel_iommu_tboot_noforce) {
   4074			for_each_iommu(iommu, drhd)
   4075				iommu_disable_protect_mem_regions(iommu);
   4076		}
   4077
   4078		/*
   4079		 * Make sure the IOMMUs are switched off, even when we
   4080		 * boot into a kexec kernel and the previous kernel left
   4081		 * them enabled
   4082		 */
   4083		intel_disable_iommus();
   4084		goto out_free_dmar;
   4085	}
   4086
   4087	if (list_empty(&dmar_rmrr_units))
   4088		pr_info("No RMRR found\n");
   4089
   4090	if (list_empty(&dmar_atsr_units))
   4091		pr_info("No ATSR found\n");
   4092
   4093	if (list_empty(&dmar_satc_units))
   4094		pr_info("No SATC found\n");
   4095
   4096	if (dmar_map_gfx)
   4097		intel_iommu_gfx_mapped = 1;
   4098
   4099	init_no_remapping_devices();
   4100
   4101	ret = init_dmars();
   4102	if (ret) {
   4103		if (force_on)
   4104			panic("tboot: Failed to initialize DMARs\n");
   4105		pr_err("Initialization failed\n");
   4106		goto out_free_dmar;
   4107	}
   4108	up_write(&dmar_global_lock);
   4109
   4110	init_iommu_pm_ops();
   4111
   4112	down_read(&dmar_global_lock);
   4113	for_each_active_iommu(iommu, drhd) {
   4114		/*
   4115		 * The flush queue implementation does not perform
   4116		 * page-selective invalidations that are required for efficient
   4117		 * TLB flushes in virtual environments.  The benefit of batching
   4118		 * is likely to be much lower than the overhead of synchronizing
   4119		 * the virtual and physical IOMMU page-tables.
   4120		 */
   4121		if (cap_caching_mode(iommu->cap)) {
   4122			pr_info_once("IOMMU batching disallowed due to virtualization\n");
   4123			iommu_set_dma_strict();
   4124		}
   4125		iommu_device_sysfs_add(&iommu->iommu, NULL,
   4126				       intel_iommu_groups,
   4127				       "%s", iommu->name);
   4128		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
   4129	}
   4130	up_read(&dmar_global_lock);
   4131
   4132	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
   4133	if (si_domain && !hw_pass_through)
   4134		register_memory_notifier(&intel_iommu_memory_nb);
   4135
   4136	down_read(&dmar_global_lock);
   4137	if (probe_acpi_namespace_devices())
   4138		pr_warn("ACPI name space devices didn't probe correctly\n");
   4139
   4140	/* Finally, we enable the DMA remapping hardware. */
   4141	for_each_iommu(iommu, drhd) {
   4142		if (!drhd->ignored && !translation_pre_enabled(iommu))
   4143			iommu_enable_translation(iommu);
   4144
   4145		iommu_disable_protect_mem_regions(iommu);
   4146	}
   4147	up_read(&dmar_global_lock);
   4148
   4149	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
   4150
   4151	intel_iommu_enabled = 1;
   4152
   4153	return 0;
   4154
   4155out_free_dmar:
   4156	intel_iommu_free_dmars();
   4157	up_write(&dmar_global_lock);
   4158	return ret;
   4159}
   4160
   4161static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
   4162{
   4163	struct device_domain_info *info = opaque;
   4164
   4165	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
   4166	return 0;
   4167}
   4168
   4169/*
   4170 * NB - intel-iommu lacks any sort of reference counting for the users of
   4171 * dependent devices.  If multiple endpoints have intersecting dependent
   4172 * devices, unbinding the driver from any one of them will possibly leave
   4173 * the others unable to operate.
   4174 */
   4175static void domain_context_clear(struct device_domain_info *info)
   4176{
   4177	if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
   4178		return;
   4179
   4180	pci_for_each_dma_alias(to_pci_dev(info->dev),
   4181			       &domain_context_clear_one_cb, info);
   4182}
   4183
   4184static void __dmar_remove_one_dev_info(struct device_domain_info *info)
   4185{
   4186	struct dmar_domain *domain;
   4187	struct intel_iommu *iommu;
   4188	unsigned long flags;
   4189
   4190	assert_spin_locked(&device_domain_lock);
   4191
   4192	if (WARN_ON(!info))
   4193		return;
   4194
   4195	iommu = info->iommu;
   4196	domain = info->domain;
   4197
   4198	if (info->dev && !dev_is_real_dma_subdevice(info->dev)) {
   4199		if (dev_is_pci(info->dev) && sm_supported(iommu))
   4200			intel_pasid_tear_down_entry(iommu, info->dev,
   4201					PASID_RID2PASID, false);
   4202
   4203		iommu_disable_dev_iotlb(info);
   4204		domain_context_clear(info);
   4205		intel_pasid_free_table(info->dev);
   4206	}
   4207
   4208	list_del(&info->link);
   4209
   4210	spin_lock_irqsave(&iommu->lock, flags);
   4211	domain_detach_iommu(domain, iommu);
   4212	spin_unlock_irqrestore(&iommu->lock, flags);
   4213}
   4214
   4215static void dmar_remove_one_dev_info(struct device *dev)
   4216{
   4217	struct device_domain_info *info;
   4218	unsigned long flags;
   4219
   4220	spin_lock_irqsave(&device_domain_lock, flags);
   4221	info = dev_iommu_priv_get(dev);
   4222	if (info)
   4223		__dmar_remove_one_dev_info(info);
   4224	spin_unlock_irqrestore(&device_domain_lock, flags);
   4225}
   4226
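        /*
         * Initialize an IOMMU-API managed domain for the given guest address
         * width: derive the AGAW and allocate the top-level page directory.
         */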
   4227static int md_domain_init(struct dmar_domain *domain, int guest_width)
   4228{
   4229	int adjust_width;
   4230
   4231	/* calculate AGAW */
   4232	domain->gaw = guest_width;
   4233	adjust_width = guestwidth_to_adjustwidth(guest_width);
   4234	domain->agaw = width_to_agaw(adjust_width);
   4235
   4236	domain->iommu_coherency = false;
   4237	domain->iommu_superpage = 0;
   4238	domain->max_addr = 0;
   4239
   4240	/* always allocate the top pgd */
   4241	domain->pgd = alloc_pgtable_page(domain->nid);
   4242	if (!domain->pgd)
   4243		return -ENOMEM;
   4244	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
   4245	return 0;
   4246}
   4247
   4248static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
   4249{
   4250	struct dmar_domain *dmar_domain;
   4251	struct iommu_domain *domain;
   4252
   4253	switch (type) {
   4254	case IOMMU_DOMAIN_DMA:
   4255	case IOMMU_DOMAIN_DMA_FQ:
   4256	case IOMMU_DOMAIN_UNMANAGED:
   4257		dmar_domain = alloc_domain(type);
   4258		if (!dmar_domain) {
   4259			pr_err("Can't allocate dmar_domain\n");
   4260			return NULL;
   4261		}
   4262		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
   4263			pr_err("Domain initialization failed\n");
   4264			domain_exit(dmar_domain);
   4265			return NULL;
   4266		}
   4267
   4268		domain = &dmar_domain->domain;
   4269		domain->geometry.aperture_start = 0;
   4270		domain->geometry.aperture_end   =
   4271				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
   4272		domain->geometry.force_aperture = true;
   4273
   4274		return domain;
   4275	case IOMMU_DOMAIN_IDENTITY:
   4276		return &si_domain->domain;
   4277	default:
   4278		return NULL;
   4279	}
   4280
   4281	return NULL;
   4282}
   4283
   4284static void intel_iommu_domain_free(struct iommu_domain *domain)
   4285{
   4286	if (domain != &si_domain->domain)
   4287		domain_exit(to_dmar_domain(domain));
   4288}
   4289
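        /*
         * Check that the IOMMU behind @dev can address everything already
         * mapped in the domain, clamp the domain's address width to what the
         * IOMMU supports and drop page-table levels it cannot use.
         */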
   4290static int prepare_domain_attach_device(struct iommu_domain *domain,
   4291					struct device *dev)
   4292{
   4293	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
   4294	struct intel_iommu *iommu;
   4295	int addr_width;
   4296
   4297	iommu = device_to_iommu(dev, NULL, NULL);
   4298	if (!iommu)
   4299		return -ENODEV;
   4300
   4301	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
   4302		return -EOPNOTSUPP;
   4303
   4304	/* check if this iommu agaw is sufficient for max mapped address */
   4305	addr_width = agaw_to_width(iommu->agaw);
   4306	if (addr_width > cap_mgaw(iommu->cap))
   4307		addr_width = cap_mgaw(iommu->cap);
   4308
   4309	if (dmar_domain->max_addr > (1LL << addr_width)) {
   4310		dev_err(dev, "%s: iommu width (%d) is not "
   4311		        "sufficient for the mapped address (%llx)\n",
   4312		        __func__, addr_width, dmar_domain->max_addr);
   4313		return -EFAULT;
   4314	}
   4315	dmar_domain->gaw = addr_width;
   4316
   4317	/*
   4318	 * Knock out extra levels of page tables if necessary
   4319	 */
   4320	while (iommu->agaw < dmar_domain->agaw) {
   4321		struct dma_pte *pte;
   4322
   4323		pte = dmar_domain->pgd;
   4324		if (dma_pte_present(pte)) {
   4325			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
   4326			free_pgtable_page(pte);
   4327		}
   4328		dmar_domain->agaw--;
   4329	}
   4330
   4331	return 0;
   4332}
   4333
   4334static int intel_iommu_attach_device(struct iommu_domain *domain,
   4335				     struct device *dev)
   4336{
   4337	int ret;
   4338
   4339	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
   4340	    device_is_rmrr_locked(dev)) {
   4341		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
   4342		return -EPERM;
   4343	}
   4344
   4345	/* normally dev is not mapped */
   4346	if (unlikely(domain_context_mapped(dev))) {
   4347		struct device_domain_info *info = dev_iommu_priv_get(dev);
   4348
   4349		if (info->domain)
   4350			dmar_remove_one_dev_info(dev);
   4351	}
   4352
   4353	ret = prepare_domain_attach_device(domain, dev);
   4354	if (ret)
   4355		return ret;
   4356
   4357	return domain_add_dev_info(to_dmar_domain(domain), dev);
   4358}
   4359
   4360static void intel_iommu_detach_device(struct iommu_domain *domain,
   4361				      struct device *dev)
   4362{
   4363	dmar_remove_one_dev_info(dev);
   4364}
   4365
   4366static int intel_iommu_map(struct iommu_domain *domain,
   4367			   unsigned long iova, phys_addr_t hpa,
   4368			   size_t size, int iommu_prot, gfp_t gfp)
   4369{
   4370	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
   4371	u64 max_addr;
   4372	int prot = 0;
   4373
   4374	if (iommu_prot & IOMMU_READ)
   4375		prot |= DMA_PTE_READ;
   4376	if (iommu_prot & IOMMU_WRITE)
   4377		prot |= DMA_PTE_WRITE;
   4378	if (dmar_domain->set_pte_snp)
   4379		prot |= DMA_PTE_SNP;
   4380
   4381	max_addr = iova + size;
   4382	if (dmar_domain->max_addr < max_addr) {
   4383		u64 end;
   4384
   4385		/* check if minimum agaw is sufficient for mapped address */
   4386		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
   4387		if (end < max_addr) {
   4388			pr_err("%s: iommu width (%d) is not "
   4389			       "sufficient for the mapped address (%llx)\n",
   4390			       __func__, dmar_domain->gaw, max_addr);
   4391			return -EFAULT;
   4392		}
   4393		dmar_domain->max_addr = max_addr;
   4394	}
    4395	/* Round size up to the next multiple of PAGE_SIZE if it, combined
    4396	   with the low bits of hpa, would take us onto the next page */
   4397	size = aligned_nrpages(hpa, size);
   4398	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
   4399				hpa >> VTD_PAGE_SHIFT, size, prot);
   4400}
   4401
   4402static int intel_iommu_map_pages(struct iommu_domain *domain,
   4403				 unsigned long iova, phys_addr_t paddr,
   4404				 size_t pgsize, size_t pgcount,
   4405				 int prot, gfp_t gfp, size_t *mapped)
   4406{
   4407	unsigned long pgshift = __ffs(pgsize);
   4408	size_t size = pgcount << pgshift;
   4409	int ret;
   4410
   4411	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
   4412		return -EINVAL;
   4413
   4414	if (!IS_ALIGNED(iova | paddr, pgsize))
   4415		return -EINVAL;
   4416
   4417	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
   4418	if (!ret && mapped)
   4419		*mapped = size;
   4420
   4421	return ret;
   4422}
   4423
   4424static size_t intel_iommu_unmap(struct iommu_domain *domain,
   4425				unsigned long iova, size_t size,
   4426				struct iommu_iotlb_gather *gather)
   4427{
   4428	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
   4429	unsigned long start_pfn, last_pfn;
   4430	int level = 0;
   4431
   4432	/* Cope with horrid API which requires us to unmap more than the
   4433	   size argument if it happens to be a large-page mapping. */
   4434	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
   4435
   4436	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
   4437		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
   4438
   4439	start_pfn = iova >> VTD_PAGE_SHIFT;
   4440	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
   4441
   4442	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
   4443
   4444	if (dmar_domain->max_addr == iova + size)
   4445		dmar_domain->max_addr = iova;
   4446
   4447	iommu_iotlb_gather_add_page(domain, gather, iova, size);
   4448
   4449	return size;
   4450}
   4451
   4452static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
   4453				      unsigned long iova,
   4454				      size_t pgsize, size_t pgcount,
   4455				      struct iommu_iotlb_gather *gather)
   4456{
   4457	unsigned long pgshift = __ffs(pgsize);
   4458	size_t size = pgcount << pgshift;
   4459
   4460	return intel_iommu_unmap(domain, iova, size, gather);
   4461}
   4462
   4463static void intel_iommu_tlb_sync(struct iommu_domain *domain,
   4464				 struct iommu_iotlb_gather *gather)
   4465{
   4466	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
   4467	unsigned long iova_pfn = IOVA_PFN(gather->start);
   4468	size_t size = gather->end - gather->start;
   4469	unsigned long start_pfn;
   4470	unsigned long nrpages;
   4471	int iommu_id;
   4472
   4473	nrpages = aligned_nrpages(gather->start, size);
   4474	start_pfn = mm_to_dma_pfn(iova_pfn);
   4475
   4476	for_each_domain_iommu(iommu_id, dmar_domain)
   4477		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
   4478				      start_pfn, nrpages,
   4479				      list_empty(&gather->freelist), 0);
   4480
   4481	put_pages_list(&gather->freelist);
   4482}
   4483
   4484static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
   4485					    dma_addr_t iova)
   4486{
   4487	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
   4488	struct dma_pte *pte;
   4489	int level = 0;
   4490	u64 phys = 0;
   4491
   4492	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
   4493	if (pte && dma_pte_present(pte))
   4494		phys = dma_pte_addr(pte) +
   4495			(iova & (BIT_MASK(level_to_offset_bits(level) +
   4496						VTD_PAGE_SHIFT) - 1));
   4497
   4498	return phys;
   4499}
   4500
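        /*
         * True only if every device attached to the domain sits behind an
         * IOMMU with snoop control (ecap SC). Caller holds device_domain_lock.
         */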
   4501static bool domain_support_force_snooping(struct dmar_domain *domain)
   4502{
   4503	struct device_domain_info *info;
   4504	bool support = true;
   4505
   4506	assert_spin_locked(&device_domain_lock);
   4507	list_for_each_entry(info, &domain->devices, link) {
   4508		if (!ecap_sc_support(info->iommu->ecap)) {
   4509			support = false;
   4510			break;
   4511		}
   4512	}
   4513
   4514	return support;
   4515}
   4516
   4517static void domain_set_force_snooping(struct dmar_domain *domain)
   4518{
   4519	struct device_domain_info *info;
   4520
   4521	assert_spin_locked(&device_domain_lock);
   4522
    4523	/*
    4524	 * The second-level page table supports per-PTE snoop control. The
    4525	 * iommu_map() interface will handle this by setting the SNP bit.
    4526	 */
   4527	if (!domain_use_first_level(domain)) {
   4528		domain->set_pte_snp = true;
   4529		return;
   4530	}
   4531
   4532	list_for_each_entry(info, &domain->devices, link)
   4533		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
   4534						     PASID_RID2PASID);
   4535}
   4536
   4537static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
   4538{
   4539	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
   4540	unsigned long flags;
   4541
   4542	if (dmar_domain->force_snooping)
   4543		return true;
   4544
   4545	spin_lock_irqsave(&device_domain_lock, flags);
   4546	if (!domain_support_force_snooping(dmar_domain)) {
   4547		spin_unlock_irqrestore(&device_domain_lock, flags);
   4548		return false;
   4549	}
   4550
   4551	domain_set_force_snooping(dmar_domain);
   4552	dmar_domain->force_snooping = true;
   4553	spin_unlock_irqrestore(&device_domain_lock, flags);
   4554
   4555	return true;
   4556}
   4557
   4558static bool intel_iommu_capable(enum iommu_cap cap)
   4559{
   4560	if (cap == IOMMU_CAP_CACHE_COHERENCY)
   4561		return true;
   4562	if (cap == IOMMU_CAP_INTR_REMAP)
   4563		return irq_remapping_enabled == 1;
   4564	if (cap == IOMMU_CAP_PRE_BOOT_PROTECTION)
   4565		return dmar_platform_optin();
   4566
   4567	return false;
   4568}
   4569
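        /*
         * iommu_ops->probe_device: allocate the per-device device_domain_info,
         * record bus/devfn/segment and the ATS, PASID and PRI capabilities,
         * and link it into device_domain_list.
         */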
   4570static struct iommu_device *intel_iommu_probe_device(struct device *dev)
   4571{
   4572	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
   4573	struct device_domain_info *info;
   4574	struct intel_iommu *iommu;
   4575	unsigned long flags;
   4576	u8 bus, devfn;
   4577
   4578	iommu = device_to_iommu(dev, &bus, &devfn);
   4579	if (!iommu)
   4580		return ERR_PTR(-ENODEV);
   4581
   4582	info = kzalloc(sizeof(*info), GFP_KERNEL);
   4583	if (!info)
   4584		return ERR_PTR(-ENOMEM);
   4585
   4586	if (dev_is_real_dma_subdevice(dev)) {
   4587		info->bus = pdev->bus->number;
   4588		info->devfn = pdev->devfn;
   4589		info->segment = pci_domain_nr(pdev->bus);
   4590	} else {
   4591		info->bus = bus;
   4592		info->devfn = devfn;
   4593		info->segment = iommu->segment;
   4594	}
   4595
   4596	info->dev = dev;
   4597	info->iommu = iommu;
   4598	if (dev_is_pci(dev)) {
   4599		if (ecap_dev_iotlb_support(iommu->ecap) &&
   4600		    pci_ats_supported(pdev) &&
   4601		    dmar_ats_supported(pdev, iommu))
   4602			info->ats_supported = 1;
   4603
   4604		if (sm_supported(iommu)) {
   4605			if (pasid_supported(iommu)) {
   4606				int features = pci_pasid_features(pdev);
   4607
   4608				if (features >= 0)
   4609					info->pasid_supported = features | 1;
   4610			}
   4611
   4612			if (info->ats_supported && ecap_prs(iommu->ecap) &&
   4613			    pci_pri_supported(pdev))
   4614				info->pri_supported = 1;
   4615		}
   4616	}
   4617
   4618	spin_lock_irqsave(&device_domain_lock, flags);
   4619	list_add(&info->global, &device_domain_list);
   4620	dev_iommu_priv_set(dev, info);
   4621	spin_unlock_irqrestore(&device_domain_lock, flags);
   4622
   4623	return &iommu->iommu;
   4624}
   4625
   4626static void intel_iommu_release_device(struct device *dev)
   4627{
   4628	struct device_domain_info *info = dev_iommu_priv_get(dev);
   4629	unsigned long flags;
   4630
   4631	dmar_remove_one_dev_info(dev);
   4632
   4633	spin_lock_irqsave(&device_domain_lock, flags);
   4634	dev_iommu_priv_set(dev, NULL);
   4635	list_del(&info->global);
   4636	spin_unlock_irqrestore(&device_domain_lock, flags);
   4637
   4638	kfree(info);
   4639	set_dma_ops(dev, NULL);
   4640}
   4641
   4642static void intel_iommu_probe_finalize(struct device *dev)
   4643{
   4644	set_dma_ops(dev, NULL);
   4645	iommu_setup_dma_ops(dev, 0, U64_MAX);
   4646}
   4647
   4648static void intel_iommu_get_resv_regions(struct device *device,
   4649					 struct list_head *head)
   4650{
   4651	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
   4652	struct iommu_resv_region *reg;
   4653	struct dmar_rmrr_unit *rmrr;
   4654	struct device *i_dev;
   4655	int i;
   4656
   4657	down_read(&dmar_global_lock);
   4658	for_each_rmrr_units(rmrr) {
   4659		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
   4660					  i, i_dev) {
   4661			struct iommu_resv_region *resv;
   4662			enum iommu_resv_type type;
   4663			size_t length;
   4664
   4665			if (i_dev != device &&
   4666			    !is_downstream_to_pci_bridge(device, i_dev))
   4667				continue;
   4668
   4669			length = rmrr->end_address - rmrr->base_address + 1;
   4670
   4671			type = device_rmrr_is_relaxable(device) ?
   4672				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
   4673
   4674			resv = iommu_alloc_resv_region(rmrr->base_address,
   4675						       length, prot, type);
   4676			if (!resv)
   4677				break;
   4678
   4679			list_add_tail(&resv->list, head);
   4680		}
   4681	}
   4682	up_read(&dmar_global_lock);
   4683
   4684#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
   4685	if (dev_is_pci(device)) {
   4686		struct pci_dev *pdev = to_pci_dev(device);
   4687
   4688		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
   4689			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
   4690						   IOMMU_RESV_DIRECT_RELAXABLE);
   4691			if (reg)
   4692				list_add_tail(&reg->list, head);
   4693		}
   4694	}
   4695#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
   4696
   4697	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
   4698				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
   4699				      0, IOMMU_RESV_MSI);
   4700	if (!reg)
   4701		return;
   4702	list_add_tail(&reg->list, head);
   4703}
   4704
   4705int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
   4706{
   4707	struct device_domain_info *info = dev_iommu_priv_get(dev);
   4708	struct context_entry *context;
   4709	struct dmar_domain *domain;
   4710	unsigned long flags;
   4711	u64 ctx_lo;
   4712	int ret;
   4713
   4714	domain = info->domain;
   4715	if (!domain)
   4716		return -EINVAL;
   4717
   4718	spin_lock_irqsave(&device_domain_lock, flags);
   4719	spin_lock(&iommu->lock);
   4720
   4721	ret = -EINVAL;
   4722	if (!info->pasid_supported)
   4723		goto out;
   4724
   4725	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
   4726	if (WARN_ON(!context))
   4727		goto out;
   4728
   4729	ctx_lo = context[0].lo;
   4730
   4731	if (!(ctx_lo & CONTEXT_PASIDE)) {
   4732		ctx_lo |= CONTEXT_PASIDE;
   4733		context[0].lo = ctx_lo;
   4734		wmb();
   4735		iommu->flush.flush_context(iommu,
   4736					   domain->iommu_did[iommu->seq_id],
   4737					   PCI_DEVID(info->bus, info->devfn),
   4738					   DMA_CCMD_MASK_NOBIT,
   4739					   DMA_CCMD_DEVICE_INVL);
   4740	}
   4741
   4742	/* Enable PASID support in the device, if it wasn't already */
   4743	if (!info->pasid_enabled)
   4744		iommu_enable_dev_iotlb(info);
   4745
   4746	ret = 0;
   4747
   4748 out:
   4749	spin_unlock(&iommu->lock);
   4750	spin_unlock_irqrestore(&device_domain_lock, flags);
   4751
   4752	return ret;
   4753}
   4754
   4755static struct iommu_group *intel_iommu_device_group(struct device *dev)
   4756{
   4757	if (dev_is_pci(dev))
   4758		return pci_device_group(dev);
   4759	return generic_device_group(dev);
   4760}
   4761
   4762static int intel_iommu_enable_sva(struct device *dev)
   4763{
   4764	struct device_domain_info *info = dev_iommu_priv_get(dev);
   4765	struct intel_iommu *iommu;
   4766	int ret;
   4767
   4768	if (!info || dmar_disabled)
   4769		return -EINVAL;
   4770
   4771	iommu = info->iommu;
   4772	if (!iommu)
   4773		return -EINVAL;
   4774
   4775	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
   4776		return -ENODEV;
   4777
   4778	if (intel_iommu_enable_pasid(iommu, dev))
   4779		return -ENODEV;
   4780
   4781	if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
   4782		return -EINVAL;
   4783
   4784	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
   4785	if (!ret)
   4786		ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
   4787
   4788	return ret;
   4789}
   4790
   4791static int intel_iommu_disable_sva(struct device *dev)
   4792{
   4793	struct device_domain_info *info = dev_iommu_priv_get(dev);
   4794	struct intel_iommu *iommu = info->iommu;
   4795	int ret;
   4796
   4797	ret = iommu_unregister_device_fault_handler(dev);
   4798	if (!ret)
   4799		ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
   4800
   4801	return ret;
   4802}
   4803
   4804static int intel_iommu_enable_iopf(struct device *dev)
   4805{
   4806	struct device_domain_info *info = dev_iommu_priv_get(dev);
   4807
   4808	if (info && info->pri_supported)
   4809		return 0;
   4810
   4811	return -ENODEV;
   4812}
   4813
   4814static int
   4815intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
   4816{
   4817	switch (feat) {
   4818	case IOMMU_DEV_FEAT_IOPF:
   4819		return intel_iommu_enable_iopf(dev);
   4820
   4821	case IOMMU_DEV_FEAT_SVA:
   4822		return intel_iommu_enable_sva(dev);
   4823
   4824	default:
   4825		return -ENODEV;
   4826	}
   4827}
   4828
   4829static int
   4830intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
   4831{
   4832	switch (feat) {
   4833	case IOMMU_DEV_FEAT_IOPF:
   4834		return 0;
   4835
   4836	case IOMMU_DEV_FEAT_SVA:
   4837		return intel_iommu_disable_sva(dev);
   4838
   4839	default:
   4840		return -ENODEV;
   4841	}
   4842}
   4843
   4844static bool intel_iommu_is_attach_deferred(struct device *dev)
   4845{
   4846	struct device_domain_info *info = dev_iommu_priv_get(dev);
   4847
   4848	return translation_pre_enabled(info->iommu) && !info->domain;
   4849}
   4850
    4851/*
    4852 * Check that the device does not live on an external-facing PCI port that
    4853 * is marked as untrusted. Such devices must not be allowed to apply quirks
    4854 * and thereby bypass the IOMMU restrictions.
    4855 */
   4856static bool risky_device(struct pci_dev *pdev)
   4857{
   4858	if (pdev->untrusted) {
   4859		pci_info(pdev,
   4860			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
   4861			 pdev->vendor, pdev->device);
   4862		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
   4863		return true;
   4864	}
   4865	return false;
   4866}
   4867
   4868static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
   4869				       unsigned long iova, size_t size)
   4870{
   4871	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
   4872	unsigned long pages = aligned_nrpages(iova, size);
   4873	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
   4874	struct intel_iommu *iommu;
   4875	int iommu_id;
   4876
   4877	for_each_domain_iommu(iommu_id, dmar_domain) {
   4878		iommu = g_iommus[iommu_id];
   4879		__mapping_notify_one(iommu, dmar_domain, pfn, pages);
   4880	}
   4881}
   4882
   4883const struct iommu_ops intel_iommu_ops = {
   4884	.capable		= intel_iommu_capable,
   4885	.domain_alloc		= intel_iommu_domain_alloc,
   4886	.probe_device		= intel_iommu_probe_device,
   4887	.probe_finalize		= intel_iommu_probe_finalize,
   4888	.release_device		= intel_iommu_release_device,
   4889	.get_resv_regions	= intel_iommu_get_resv_regions,
   4890	.put_resv_regions	= generic_iommu_put_resv_regions,
   4891	.device_group		= intel_iommu_device_group,
   4892	.dev_enable_feat	= intel_iommu_dev_enable_feat,
   4893	.dev_disable_feat	= intel_iommu_dev_disable_feat,
   4894	.is_attach_deferred	= intel_iommu_is_attach_deferred,
   4895	.def_domain_type	= device_def_domain_type,
   4896	.pgsize_bitmap		= SZ_4K,
   4897#ifdef CONFIG_INTEL_IOMMU_SVM
   4898	.sva_bind		= intel_svm_bind,
   4899	.sva_unbind		= intel_svm_unbind,
   4900	.sva_get_pasid		= intel_svm_get_pasid,
   4901	.page_response		= intel_svm_page_response,
   4902#endif
   4903	.default_domain_ops = &(const struct iommu_domain_ops) {
   4904		.attach_dev		= intel_iommu_attach_device,
   4905		.detach_dev		= intel_iommu_detach_device,
   4906		.map_pages		= intel_iommu_map_pages,
   4907		.unmap_pages		= intel_iommu_unmap_pages,
   4908		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
   4909		.flush_iotlb_all        = intel_flush_iotlb_all,
   4910		.iotlb_sync		= intel_iommu_tlb_sync,
   4911		.iova_to_phys		= intel_iommu_iova_to_phys,
   4912		.free			= intel_iommu_domain_free,
   4913		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
   4914	}
   4915};
   4916
   4917static void quirk_iommu_igfx(struct pci_dev *dev)
   4918{
   4919	if (risky_device(dev))
   4920		return;
   4921
   4922	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
   4923	dmar_map_gfx = 0;
   4924}
   4925
   4926/* G4x/GM45 integrated gfx dmar support is totally busted. */
   4927DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
   4928DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
   4929DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
   4930DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
   4931DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
   4932DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
   4933DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
   4934
   4935/* Broadwell igfx malfunctions with dmar */
   4936DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
   4937DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
   4938DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
   4939DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
   4940DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
   4941DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
   4942DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
   4943DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
   4944DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
   4945DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
   4946DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
   4947DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
   4948DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
   4949DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
   4950DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
   4951DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
   4952DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
   4953DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
   4954DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
   4955DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
   4956DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
   4957DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
   4958DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
   4959DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
   4960
   4961static void quirk_iommu_rwbf(struct pci_dev *dev)
   4962{
   4963	if (risky_device(dev))
   4964		return;
   4965
   4966	/*
   4967	 * Mobile 4 Series Chipset neglects to set RWBF capability,
   4968	 * but needs it. Same seems to hold for the desktop versions.
   4969	 */
   4970	pci_info(dev, "Forcing write-buffer flush capability\n");
   4971	rwbf_quirk = 1;
   4972}
   4973
   4974DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
   4975DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
   4976DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
   4977DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
   4978DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
   4979DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
   4980DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
   4981
   4982#define GGC 0x52
   4983#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
   4984#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
   4985#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
   4986#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
   4987#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
   4988#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
   4989#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
   4990#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
   4991
   4992static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
   4993{
   4994	unsigned short ggc;
   4995
   4996	if (risky_device(dev))
   4997		return;
   4998
   4999	if (pci_read_config_word(dev, GGC, &ggc))
   5000		return;
   5001
   5002	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
   5003		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
   5004		dmar_map_gfx = 0;
   5005	} else if (dmar_map_gfx) {
   5006		/* we have to ensure the gfx device is idle before we flush */
   5007		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
   5008		iommu_set_dma_strict();
   5009	}
   5010}
   5011DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
   5012DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
   5013DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
   5014DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
   5015
   5016static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
   5017{
   5018	unsigned short ver;
   5019
   5020	if (!IS_GFX_DEVICE(dev))
   5021		return;
   5022
   5023	ver = (dev->device >> 8) & 0xff;
   5024	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
   5025	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
   5026	    ver != 0x9a && ver != 0xa7)
   5027		return;
   5028
   5029	if (risky_device(dev))
   5030		return;
   5031
   5032	pci_info(dev, "Skip IOMMU disabling for graphics\n");
   5033	iommu_skip_te_disable = 1;
   5034}
   5035DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
   5036
   5037/* On Tylersburg chipsets, some BIOSes have been known to enable the
   5038   ISOCH DMAR unit for the Azalia sound device, but not give it any
   5039   TLB entries, which causes it to deadlock. Check for that.  We do
   5040   this in a function called from init_dmars(), instead of in a PCI
   5041   quirk, because we don't want to print the obnoxious "BIOS broken"
   5042   message if VT-d is actually disabled.
   5043*/
   5044static void __init check_tylersburg_isoch(void)
   5045{
   5046	struct pci_dev *pdev;
   5047	uint32_t vtisochctrl;
   5048
   5049	/* If there's no Azalia in the system anyway, forget it. */
   5050	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
   5051	if (!pdev)
   5052		return;
   5053
   5054	if (risky_device(pdev)) {
   5055		pci_dev_put(pdev);
   5056		return;
   5057	}
   5058
   5059	pci_dev_put(pdev);
   5060
   5061	/* System Management Registers. Might be hidden, in which case
   5062	   we can't do the sanity check. But that's OK, because the
   5063	   known-broken BIOSes _don't_ actually hide it, so far. */
   5064	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
   5065	if (!pdev)
   5066		return;
   5067
   5068	if (risky_device(pdev)) {
   5069		pci_dev_put(pdev);
   5070		return;
   5071	}
   5072
   5073	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
   5074		pci_dev_put(pdev);
   5075		return;
   5076	}
   5077
   5078	pci_dev_put(pdev);
   5079
   5080	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
   5081	if (vtisochctrl & 1)
   5082		return;
   5083
   5084	/* Drop all bits other than the number of TLB entries */
   5085	vtisochctrl &= 0x1c;
   5086
   5087	/* If we have the recommended number of TLB entries (16), fine. */
   5088	if (vtisochctrl == 0x10)
   5089		return;
   5090
   5091	/* Zero TLB entries? You get to ride the short bus to school. */
   5092	if (!vtisochctrl) {
   5093		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
   5094		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
   5095		     dmi_get_system_info(DMI_BIOS_VENDOR),
   5096		     dmi_get_system_info(DMI_BIOS_VERSION),
   5097		     dmi_get_system_info(DMI_PRODUCT_VERSION));
   5098		iommu_identity_mapping |= IDENTMAP_AZALIA;
   5099		return;
   5100	}
   5101
   5102	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
   5103	       vtisochctrl);
   5104}