io_pgtable.c (11978B)
1// SPDX-License-Identifier: GPL-2.0-only 2/* 3 * CPU-agnostic AMD IO page table allocator. 4 * 5 * Copyright (C) 2020 Advanced Micro Devices, Inc. 6 * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> 7 */ 8 9#define pr_fmt(fmt) "AMD-Vi: " fmt 10#define dev_fmt(fmt) pr_fmt(fmt) 11 12#include <linux/atomic.h> 13#include <linux/bitops.h> 14#include <linux/io-pgtable.h> 15#include <linux/kernel.h> 16#include <linux/sizes.h> 17#include <linux/slab.h> 18#include <linux/types.h> 19#include <linux/dma-mapping.h> 20 21#include <asm/barrier.h> 22 23#include "amd_iommu_types.h" 24#include "amd_iommu.h" 25 26static void v1_tlb_flush_all(void *cookie) 27{ 28} 29 30static void v1_tlb_flush_walk(unsigned long iova, size_t size, 31 size_t granule, void *cookie) 32{ 33} 34 35static void v1_tlb_add_page(struct iommu_iotlb_gather *gather, 36 unsigned long iova, size_t granule, 37 void *cookie) 38{ 39} 40 41static const struct iommu_flush_ops v1_flush_ops = { 42 .tlb_flush_all = v1_tlb_flush_all, 43 .tlb_flush_walk = v1_tlb_flush_walk, 44 .tlb_add_page = v1_tlb_add_page, 45}; 46 47/* 48 * Helper function to get the first pte of a large mapping 49 */ 50static u64 *first_pte_l7(u64 *pte, unsigned long *page_size, 51 unsigned long *count) 52{ 53 unsigned long pte_mask, pg_size, cnt; 54 u64 *fpte; 55 56 pg_size = PTE_PAGE_SIZE(*pte); 57 cnt = PAGE_SIZE_PTE_COUNT(pg_size); 58 pte_mask = ~((cnt << 3) - 1); 59 fpte = (u64 *)(((unsigned long)pte) & pte_mask); 60 61 if (page_size) 62 *page_size = pg_size; 63 64 if (count) 65 *count = cnt; 66 67 return fpte; 68} 69 70/**************************************************************************** 71 * 72 * The functions below are used the create the page table mappings for 73 * unity mapped regions. 74 * 75 ****************************************************************************/ 76 77static void free_pt_page(u64 *pt, struct list_head *freelist) 78{ 79 struct page *p = virt_to_page(pt); 80 81 list_add_tail(&p->lru, freelist); 82} 83 84static void free_pt_lvl(u64 *pt, struct list_head *freelist, int lvl) 85{ 86 u64 *p; 87 int i; 88 89 for (i = 0; i < 512; ++i) { 90 /* PTE present? */ 91 if (!IOMMU_PTE_PRESENT(pt[i])) 92 continue; 93 94 /* Large PTE? */ 95 if (PM_PTE_LEVEL(pt[i]) == 0 || 96 PM_PTE_LEVEL(pt[i]) == 7) 97 continue; 98 99 /* 100 * Free the next level. No need to look at l1 tables here since 101 * they can only contain leaf PTEs; just free them directly. 102 */ 103 p = IOMMU_PTE_PAGE(pt[i]); 104 if (lvl > 2) 105 free_pt_lvl(p, freelist, lvl - 1); 106 else 107 free_pt_page(p, freelist); 108 } 109 110 free_pt_page(pt, freelist); 111} 112 113static void free_sub_pt(u64 *root, int mode, struct list_head *freelist) 114{ 115 switch (mode) { 116 case PAGE_MODE_NONE: 117 case PAGE_MODE_7_LEVEL: 118 break; 119 case PAGE_MODE_1_LEVEL: 120 free_pt_page(root, freelist); 121 break; 122 case PAGE_MODE_2_LEVEL: 123 case PAGE_MODE_3_LEVEL: 124 case PAGE_MODE_4_LEVEL: 125 case PAGE_MODE_5_LEVEL: 126 case PAGE_MODE_6_LEVEL: 127 free_pt_lvl(root, freelist, mode); 128 break; 129 default: 130 BUG(); 131 } 132} 133 134void amd_iommu_domain_set_pgtable(struct protection_domain *domain, 135 u64 *root, int mode) 136{ 137 u64 pt_root; 138 139 /* lowest 3 bits encode pgtable mode */ 140 pt_root = mode & 7; 141 pt_root |= (u64)root; 142 143 amd_iommu_domain_set_pt_root(domain, pt_root); 144} 145 146/* 147 * This function is used to add another level to an IO page table. Adding 148 * another level increases the size of the address space by 9 bits to a size up 149 * to 64 bits. 150 */ 151static bool increase_address_space(struct protection_domain *domain, 152 unsigned long address, 153 gfp_t gfp) 154{ 155 unsigned long flags; 156 bool ret = true; 157 u64 *pte; 158 159 pte = (void *)get_zeroed_page(gfp); 160 if (!pte) 161 return false; 162 163 spin_lock_irqsave(&domain->lock, flags); 164 165 if (address <= PM_LEVEL_SIZE(domain->iop.mode)) 166 goto out; 167 168 ret = false; 169 if (WARN_ON_ONCE(domain->iop.mode == PAGE_MODE_6_LEVEL)) 170 goto out; 171 172 *pte = PM_LEVEL_PDE(domain->iop.mode, iommu_virt_to_phys(domain->iop.root)); 173 174 domain->iop.root = pte; 175 domain->iop.mode += 1; 176 amd_iommu_update_and_flush_device_table(domain); 177 amd_iommu_domain_flush_complete(domain); 178 179 /* 180 * Device Table needs to be updated and flushed before the new root can 181 * be published. 182 */ 183 amd_iommu_domain_set_pgtable(domain, pte, domain->iop.mode); 184 185 pte = NULL; 186 ret = true; 187 188out: 189 spin_unlock_irqrestore(&domain->lock, flags); 190 free_page((unsigned long)pte); 191 192 return ret; 193} 194 195static u64 *alloc_pte(struct protection_domain *domain, 196 unsigned long address, 197 unsigned long page_size, 198 u64 **pte_page, 199 gfp_t gfp, 200 bool *updated) 201{ 202 int level, end_lvl; 203 u64 *pte, *page; 204 205 BUG_ON(!is_power_of_2(page_size)); 206 207 while (address > PM_LEVEL_SIZE(domain->iop.mode)) { 208 /* 209 * Return an error if there is no memory to update the 210 * page-table. 211 */ 212 if (!increase_address_space(domain, address, gfp)) 213 return NULL; 214 } 215 216 217 level = domain->iop.mode - 1; 218 pte = &domain->iop.root[PM_LEVEL_INDEX(level, address)]; 219 address = PAGE_SIZE_ALIGN(address, page_size); 220 end_lvl = PAGE_SIZE_LEVEL(page_size); 221 222 while (level > end_lvl) { 223 u64 __pte, __npte; 224 int pte_level; 225 226 __pte = *pte; 227 pte_level = PM_PTE_LEVEL(__pte); 228 229 /* 230 * If we replace a series of large PTEs, we need 231 * to tear down all of them. 232 */ 233 if (IOMMU_PTE_PRESENT(__pte) && 234 pte_level == PAGE_MODE_7_LEVEL) { 235 unsigned long count, i; 236 u64 *lpte; 237 238 lpte = first_pte_l7(pte, NULL, &count); 239 240 /* 241 * Unmap the replicated PTEs that still match the 242 * original large mapping 243 */ 244 for (i = 0; i < count; ++i) 245 cmpxchg64(&lpte[i], __pte, 0ULL); 246 247 *updated = true; 248 continue; 249 } 250 251 if (!IOMMU_PTE_PRESENT(__pte) || 252 pte_level == PAGE_MODE_NONE) { 253 page = (u64 *)get_zeroed_page(gfp); 254 255 if (!page) 256 return NULL; 257 258 __npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page)); 259 260 /* pte could have been changed somewhere. */ 261 if (cmpxchg64(pte, __pte, __npte) != __pte) 262 free_page((unsigned long)page); 263 else if (IOMMU_PTE_PRESENT(__pte)) 264 *updated = true; 265 266 continue; 267 } 268 269 /* No level skipping support yet */ 270 if (pte_level != level) 271 return NULL; 272 273 level -= 1; 274 275 pte = IOMMU_PTE_PAGE(__pte); 276 277 if (pte_page && level == end_lvl) 278 *pte_page = pte; 279 280 pte = &pte[PM_LEVEL_INDEX(level, address)]; 281 } 282 283 return pte; 284} 285 286/* 287 * This function checks if there is a PTE for a given dma address. If 288 * there is one, it returns the pointer to it. 289 */ 290static u64 *fetch_pte(struct amd_io_pgtable *pgtable, 291 unsigned long address, 292 unsigned long *page_size) 293{ 294 int level; 295 u64 *pte; 296 297 *page_size = 0; 298 299 if (address > PM_LEVEL_SIZE(pgtable->mode)) 300 return NULL; 301 302 level = pgtable->mode - 1; 303 pte = &pgtable->root[PM_LEVEL_INDEX(level, address)]; 304 *page_size = PTE_LEVEL_PAGE_SIZE(level); 305 306 while (level > 0) { 307 308 /* Not Present */ 309 if (!IOMMU_PTE_PRESENT(*pte)) 310 return NULL; 311 312 /* Large PTE */ 313 if (PM_PTE_LEVEL(*pte) == 7 || 314 PM_PTE_LEVEL(*pte) == 0) 315 break; 316 317 /* No level skipping support yet */ 318 if (PM_PTE_LEVEL(*pte) != level) 319 return NULL; 320 321 level -= 1; 322 323 /* Walk to the next level */ 324 pte = IOMMU_PTE_PAGE(*pte); 325 pte = &pte[PM_LEVEL_INDEX(level, address)]; 326 *page_size = PTE_LEVEL_PAGE_SIZE(level); 327 } 328 329 /* 330 * If we have a series of large PTEs, make 331 * sure to return a pointer to the first one. 332 */ 333 if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL) 334 pte = first_pte_l7(pte, page_size, NULL); 335 336 return pte; 337} 338 339static void free_clear_pte(u64 *pte, u64 pteval, struct list_head *freelist) 340{ 341 u64 *pt; 342 int mode; 343 344 while (cmpxchg64(pte, pteval, 0) != pteval) { 345 pr_warn("AMD-Vi: IOMMU pte changed since we read it\n"); 346 pteval = *pte; 347 } 348 349 if (!IOMMU_PTE_PRESENT(pteval)) 350 return; 351 352 pt = IOMMU_PTE_PAGE(pteval); 353 mode = IOMMU_PTE_MODE(pteval); 354 355 free_sub_pt(pt, mode, freelist); 356} 357 358/* 359 * Generic mapping functions. It maps a physical address into a DMA 360 * address space. It allocates the page table pages if necessary. 361 * In the future it can be extended to a generic mapping function 362 * supporting all features of AMD IOMMU page tables like level skipping 363 * and full 64 bit address spaces. 364 */ 365static int iommu_v1_map_page(struct io_pgtable_ops *ops, unsigned long iova, 366 phys_addr_t paddr, size_t size, int prot, gfp_t gfp) 367{ 368 struct protection_domain *dom = io_pgtable_ops_to_domain(ops); 369 LIST_HEAD(freelist); 370 bool updated = false; 371 u64 __pte, *pte; 372 int ret, i, count; 373 374 BUG_ON(!IS_ALIGNED(iova, size)); 375 BUG_ON(!IS_ALIGNED(paddr, size)); 376 377 ret = -EINVAL; 378 if (!(prot & IOMMU_PROT_MASK)) 379 goto out; 380 381 count = PAGE_SIZE_PTE_COUNT(size); 382 pte = alloc_pte(dom, iova, size, NULL, gfp, &updated); 383 384 ret = -ENOMEM; 385 if (!pte) 386 goto out; 387 388 for (i = 0; i < count; ++i) 389 free_clear_pte(&pte[i], pte[i], &freelist); 390 391 if (!list_empty(&freelist)) 392 updated = true; 393 394 if (count > 1) { 395 __pte = PAGE_SIZE_PTE(__sme_set(paddr), size); 396 __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC; 397 } else 398 __pte = __sme_set(paddr) | IOMMU_PTE_PR | IOMMU_PTE_FC; 399 400 if (prot & IOMMU_PROT_IR) 401 __pte |= IOMMU_PTE_IR; 402 if (prot & IOMMU_PROT_IW) 403 __pte |= IOMMU_PTE_IW; 404 405 for (i = 0; i < count; ++i) 406 pte[i] = __pte; 407 408 ret = 0; 409 410out: 411 if (updated) { 412 unsigned long flags; 413 414 spin_lock_irqsave(&dom->lock, flags); 415 /* 416 * Flush domain TLB(s) and wait for completion. Any Device-Table 417 * Updates and flushing already happened in 418 * increase_address_space(). 419 */ 420 amd_iommu_domain_flush_tlb_pde(dom); 421 amd_iommu_domain_flush_complete(dom); 422 spin_unlock_irqrestore(&dom->lock, flags); 423 } 424 425 /* Everything flushed out, free pages now */ 426 put_pages_list(&freelist); 427 428 return ret; 429} 430 431static unsigned long iommu_v1_unmap_page(struct io_pgtable_ops *ops, 432 unsigned long iova, 433 size_t size, 434 struct iommu_iotlb_gather *gather) 435{ 436 struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); 437 unsigned long long unmapped; 438 unsigned long unmap_size; 439 u64 *pte; 440 441 BUG_ON(!is_power_of_2(size)); 442 443 unmapped = 0; 444 445 while (unmapped < size) { 446 pte = fetch_pte(pgtable, iova, &unmap_size); 447 if (pte) { 448 int i, count; 449 450 count = PAGE_SIZE_PTE_COUNT(unmap_size); 451 for (i = 0; i < count; i++) 452 pte[i] = 0ULL; 453 } 454 455 iova = (iova & ~(unmap_size - 1)) + unmap_size; 456 unmapped += unmap_size; 457 } 458 459 BUG_ON(unmapped && !is_power_of_2(unmapped)); 460 461 return unmapped; 462} 463 464static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova) 465{ 466 struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); 467 unsigned long offset_mask, pte_pgsize; 468 u64 *pte, __pte; 469 470 pte = fetch_pte(pgtable, iova, &pte_pgsize); 471 472 if (!pte || !IOMMU_PTE_PRESENT(*pte)) 473 return 0; 474 475 offset_mask = pte_pgsize - 1; 476 __pte = __sme_clr(*pte & PM_ADDR_MASK); 477 478 return (__pte & ~offset_mask) | (iova & offset_mask); 479} 480 481/* 482 * ---------------------------------------------------- 483 */ 484static void v1_free_pgtable(struct io_pgtable *iop) 485{ 486 struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, iop); 487 struct protection_domain *dom; 488 LIST_HEAD(freelist); 489 490 if (pgtable->mode == PAGE_MODE_NONE) 491 return; 492 493 dom = container_of(pgtable, struct protection_domain, iop); 494 495 /* Page-table is not visible to IOMMU anymore, so free it */ 496 BUG_ON(pgtable->mode < PAGE_MODE_NONE || 497 pgtable->mode > PAGE_MODE_6_LEVEL); 498 499 free_sub_pt(pgtable->root, pgtable->mode, &freelist); 500 501 /* Update data structure */ 502 amd_iommu_domain_clr_pt_root(dom); 503 504 /* Make changes visible to IOMMUs */ 505 amd_iommu_domain_update(dom); 506 507 put_pages_list(&freelist); 508} 509 510static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie) 511{ 512 struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg); 513 514 cfg->pgsize_bitmap = AMD_IOMMU_PGSIZES, 515 cfg->ias = IOMMU_IN_ADDR_BIT_SIZE, 516 cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE, 517 cfg->tlb = &v1_flush_ops; 518 519 pgtable->iop.ops.map = iommu_v1_map_page; 520 pgtable->iop.ops.unmap = iommu_v1_unmap_page; 521 pgtable->iop.ops.iova_to_phys = iommu_v1_iova_to_phys; 522 523 return &pgtable->iop; 524} 525 526struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns = { 527 .alloc = v1_alloc_pgtable, 528 .free = v1_free_pgtable, 529};