tdp_mmu.c (59287B)
// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

static bool __read_mostly tdp_mmu_enabled = true;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);

/*
 * Initializes the TDP MMU for the VM, if enabled.
 *
 * Returns 0 if the TDP MMU is not enabled, -ENOMEM if the zap workqueue
 * cannot be allocated, and 1 once the per-VM TDP MMU state has been set up.
 */
int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	struct workqueue_struct *wq;

	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
		return 0;

	wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
	if (!wq)
		return -ENOMEM;

	/* This should not be changed for the lifetime of the VM. */
	kvm->arch.tdp_mmu_enabled = true;
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
	kvm->arch.tdp_mmu_zap_wq = wq;
	return 1;
}

/*
 * Asserts that mmu_lock is held for read (@shared) or for write (!@shared).
 * Arbitrarily returns true so that this may be used in if statements.
 */
static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
							     bool shared)
{
	if (shared)
		lockdep_assert_held_read(&kvm->mmu_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	return true;
}

/*
 * Tears down the per-VM TDP MMU state; does nothing if the TDP MMU was not
 * enabled for this VM.
 */
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	if (!kvm->arch.tdp_mmu_enabled)
		return;

	/* Also waits for any queued work items. */
	destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);

	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
	 * Ensure that all the outstanding RCU callbacks to free shadow pages
	 * can run before the VM is torn down. Work items on tdp_mmu_zap_wq
	 * can call kvm_tdp_mmu_put_root and create new callbacks.
	 */
	rcu_barrier();
}

/* Frees the page table page of @sp, then the shadow page header itself. */
static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read critical
 * section, and freeing it after a grace period, lockless access to that
 * memory won't use it after it is freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
					       rcu_head);

	tdp_mmu_free_sp(sp);
}

static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			     bool shared);

/*
 * Work item queued by tdp_mmu_schedule_zap_root(): zaps the root with
 * mmu_lock held for read, then drops the reference owned by the worker.
 */
static void tdp_mmu_zap_root_work(struct work_struct *work)
{
	struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
						 tdp_mmu_async_work);
	struct kvm *kvm = root->tdp_mmu_async_data;

	read_lock(&kvm->mmu_lock);

	/*
	 * A TLB flush is not necessary as KVM performs a local TLB flush when
	 * allocating a new root (see kvm_mmu_load()), and when migrating vCPU
	 * to a different pCPU. Note, the local TLB flush on reuse also
	 * invalidates any paging-structure-cache entries, i.e. TLB entries for
	 * intermediate paging structures, that may be zapped, as such entries
	 * are associated with the ASID on both VMX and SVM.
	 */
	tdp_mmu_zap_root(kvm, root, true);

	/*
	 * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
	 * avoiding an infinite loop. By design, the root is reachable while
	 * it's being asynchronously zapped, thus a different task can put its
	 * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
	 * asynchronously zapped root is unavoidable.
	 */
	kvm_tdp_mmu_put_root(kvm, root, true);

	read_unlock(&kvm->mmu_lock);
}

/*
 * Queues tdp_mmu_zap_root_work() for @root on the VM's zap workqueue. @kvm
 * is stashed in the root so that the work item can retrieve it.
 */
static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	root->tdp_mmu_async_data = kvm;
	INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
	queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
}

/*
 * Sets the invalid bit in @page's role with an atomic exchange. Returns the
 * previous value of the invalid bit, i.e. true if the root was already
 * invalid before this call.
 */
static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page)
{
	union kvm_mmu_page_role role = page->role;
	role.invalid = true;

	/* No need to use cmpxchg, only the invalid bit can change. */
	role.word = xchg(&page->role.word, role.word);
	return role.invalid;
}

/*
 * Drops a reference to @root. When the last reference is dropped, a root
 * that was still valid is marked invalid and handed to the zap workqueue
 * with a fresh reference; a root that was already invalid is unlinked from
 * the roots list and freed via an RCU callback.
 *
 * @shared selects which mmu_lock mode (read vs. write) is asserted.
 */
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
			  bool shared)
{
	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
		return;

	WARN_ON(!root->tdp_mmu_page);

	/*
	 * The root now has refcount=0. It is valid, but readers already
	 * cannot acquire a reference to it because kvm_tdp_mmu_get_root()
	 * rejects it. This remains true for the rest of the execution
	 * of this function, because readers visit valid roots only
	 * (except for tdp_mmu_zap_root_work(), which however
	 * does not acquire any reference itself).
	 *
	 * Even though there are flows that need to visit all roots for
	 * correctness, they all take mmu_lock for write, so they cannot yet
	 * run concurrently. The same is true after kvm_tdp_root_mark_invalid,
	 * since the root still has refcount=0.
	 *
	 * However, tdp_mmu_zap_root can yield, and writers do not expect to
	 * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()).
	 * So the root temporarily gets an extra reference, going to refcount=1
	 * while staying invalid. Readers still cannot acquire any reference;
	 * but writers are now allowed to run if tdp_mmu_zap_root yields and
	 * they might take an extra reference if they themselves yield.
	 * Therefore, when the reference is given back by the worker,
	 * there is no guarantee that the refcount is still 1. If not, whoever
	 * puts the last reference will free the page, but they will not have to
	 * zap the root because a root cannot go from invalid to valid.
	 */
	if (!kvm_tdp_root_mark_invalid(root)) {
		refcount_set(&root->tdp_mmu_root_count, 1);

		/*
		 * Zapping the root in a worker is not just "nice to have";
		 * it is required because kvm_tdp_mmu_invalidate_all_roots()
		 * skips already-invalid roots. If kvm_tdp_mmu_put_root() did
		 * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast()
		 * might return with some roots not zapped yet.
		 */
		tdp_mmu_schedule_zap_root(kvm, root);
		return;
	}

	/* The root was already invalid: unlink it and free it after a grace period. */
	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_del_rcu(&root->link);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/*
 * Returns the next root after @prev_root (or the first root if @prev_root is
 * NULL). A reference to the returned root is acquired, and the reference to
 * @prev_root is released (the caller obviously must hold a reference to
 * @prev_root if it's non-NULL).
 *
 * If @only_valid is true, invalid roots are skipped.
 *
 * Returns NULL if the end of tdp_mmu_roots was reached.
203 */ 204static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, 205 struct kvm_mmu_page *prev_root, 206 bool shared, bool only_valid) 207{ 208 struct kvm_mmu_page *next_root; 209 210 rcu_read_lock(); 211 212 if (prev_root) 213 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, 214 &prev_root->link, 215 typeof(*prev_root), link); 216 else 217 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots, 218 typeof(*next_root), link); 219 220 while (next_root) { 221 if ((!only_valid || !next_root->role.invalid) && 222 kvm_tdp_mmu_get_root(next_root)) 223 break; 224 225 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, 226 &next_root->link, typeof(*next_root), link); 227 } 228 229 rcu_read_unlock(); 230 231 if (prev_root) 232 kvm_tdp_mmu_put_root(kvm, prev_root, shared); 233 234 return next_root; 235} 236 237/* 238 * Note: this iterator gets and puts references to the roots it iterates over. 239 * This makes it safe to release the MMU lock and yield within the loop, but 240 * if exiting the loop early, the caller must drop the reference to the most 241 * recent root. (Unless keeping a live reference is desirable.) 242 * 243 * If shared is set, this function is operating under the MMU lock in read 244 * mode. In the unlikely event that this thread must free a root, the lock 245 * will be temporarily dropped and reacquired in write mode. 
 */
#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);	\
	     _root;								\
	     _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))	\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&		\
		    kvm_mmu_page_as_id(_root) != _as_id) {			\
		} else

/* Yield-safe walk over the valid roots belonging to address space @_as_id. */
#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)

/*
 * Yield-safe walk over all roots (valid or invalid) belonging to address
 * space @_as_id. Asserts that mmu_lock is held for write.
 */
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)			\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)

/*
 * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write,
 * the implication being that any flow that holds mmu_lock for read is
 * inherently yield-friendly and should use the yield-safe variant above.
 * Holding mmu_lock for write obviates the need for RCU protection as the list
 * is guaranteed to be stable.
267 */ 268#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \ 269 list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \ 270 if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \ 271 kvm_mmu_page_as_id(_root) != _as_id) { \ 272 } else 273 274static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu) 275{ 276 struct kvm_mmu_page *sp; 277 278 sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); 279 sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); 280 281 return sp; 282} 283 284static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep, 285 gfn_t gfn, union kvm_mmu_page_role role) 286{ 287 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 288 289 sp->role = role; 290 sp->gfn = gfn; 291 sp->ptep = sptep; 292 sp->tdp_mmu_page = true; 293 294 trace_kvm_mmu_get_page(sp, true); 295} 296 297static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp, 298 struct tdp_iter *iter) 299{ 300 struct kvm_mmu_page *parent_sp; 301 union kvm_mmu_page_role role; 302 303 parent_sp = sptep_to_sp(rcu_dereference(iter->sptep)); 304 305 role = parent_sp->role; 306 role.level--; 307 308 tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role); 309} 310 311hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) 312{ 313 union kvm_mmu_page_role role = vcpu->arch.mmu->root_role; 314 struct kvm *kvm = vcpu->kvm; 315 struct kvm_mmu_page *root; 316 317 lockdep_assert_held_write(&kvm->mmu_lock); 318 319 /* 320 * Check for an existing root before allocating a new one. Note, the 321 * role check prevents consuming an invalid root. 
322 */ 323 for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) { 324 if (root->role.word == role.word && 325 kvm_tdp_mmu_get_root(root)) 326 goto out; 327 } 328 329 root = tdp_mmu_alloc_sp(vcpu); 330 tdp_mmu_init_sp(root, NULL, 0, role); 331 332 refcount_set(&root->tdp_mmu_root_count, 1); 333 334 spin_lock(&kvm->arch.tdp_mmu_pages_lock); 335 list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots); 336 spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 337 338out: 339 return __pa(root->spt); 340} 341 342static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, 343 u64 old_spte, u64 new_spte, int level, 344 bool shared); 345 346static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level) 347{ 348 if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level)) 349 return; 350 351 if (is_accessed_spte(old_spte) && 352 (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) || 353 spte_to_pfn(old_spte) != spte_to_pfn(new_spte))) 354 kvm_set_pfn_accessed(spte_to_pfn(old_spte)); 355} 356 357static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, 358 u64 old_spte, u64 new_spte, int level) 359{ 360 bool pfn_changed; 361 struct kvm_memory_slot *slot; 362 363 if (level > PG_LEVEL_4K) 364 return; 365 366 pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); 367 368 if ((!is_writable_pte(old_spte) || pfn_changed) && 369 is_writable_pte(new_spte)) { 370 slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn); 371 mark_page_dirty_in_slot(kvm, slot, gfn); 372 } 373} 374 375/** 376 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages 377 * 378 * @kvm: kvm instance 379 * @sp: the page to be removed 380 * @shared: This operation may not be running under the exclusive use of 381 * the MMU lock and the operation must synchronize with other 382 * threads that might be adding or removing pages. 
 */
static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
			      bool shared)
{
	/* In shared mode the list is protected by tdp_mmu_pages_lock. */
	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	list_del(&sp->link);
	if (sp->lpage_disallowed)
		unaccount_huge_nx_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * handle_removed_pt() - handle a page table removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *	    of the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 *
 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 * protection. Since this thread removed it from the paging structure,
 * this thread will be responsible for ensuring the page is freed. Hence the
 * early rcu_dereferences in the function.
 */
static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
	int level = sp->role.level;
	gfn_t base_gfn = sp->gfn;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	tdp_mmu_unlink_sp(kvm, sp, shared);

	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
		tdp_ptep_t sptep = pt + i;
		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
		u64 old_spte;

		if (shared) {
			/*
			 * Set the SPTE to a nonpresent value that other
			 * threads will not overwrite. If the SPTE was
			 * already marked as removed then another thread
			 * handling a page fault could overwrite it, so
			 * set the SPTE until it is set from some other
			 * value to the removed SPTE value.
			 */
			for (;;) {
				old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
				if (!is_removed_spte(old_spte))
					break;
				cpu_relax();
			}
		} else {
			/*
			 * If the SPTE is not MMU-present, there is no backing
			 * page associated with the SPTE and so no side effects
			 * that need to be recorded, and exclusive ownership of
			 * mmu_lock ensures the SPTE can't be made present.
			 * Note, zapping MMIO SPTEs is also unnecessary as they
			 * are guarded by the memslots generation, not by being
			 * unreachable.
			 */
			old_spte = kvm_tdp_mmu_read_spte(sptep);
			if (!is_shadow_present_pte(old_spte))
				continue;

			/*
			 * Use the common helper instead of a raw WRITE_ONCE as
			 * the SPTE needs to be updated atomically if it can be
			 * modified by a different vCPU outside of mmu_lock.
			 * Even though the parent SPTE is !PRESENT, the TLB
			 * hasn't yet been flushed, and both Intel and AMD
			 * document that A/D assists can use upper-level PxE
			 * entries that are cached in the TLB, i.e. the CPU can
			 * still access the page and mark it dirty.
			 *
			 * No retry is needed in the atomic update path as the
			 * sole concern is dropping a Dirty bit, i.e. no other
			 * task can zap/remove the SPTE as mmu_lock is held for
			 * write. Marking the SPTE as a removed SPTE is not
			 * strictly necessary for the same reason, but using
			 * the remove SPTE value keeps the shared/exclusive
			 * paths consistent and allows the handle_changed_spte()
			 * call below to hardcode the new value to REMOVED_SPTE.
			 *
			 * Note, even though dropping a Dirty bit is the only
			 * scenario where a non-atomic update could result in a
			 * functional bug, simply checking the Dirty bit isn't
			 * sufficient as a fast page fault could read the upper
			 * level SPTE before it is zapped, and then make this
			 * target SPTE writable, resume the guest, and set the
			 * Dirty bit between reading the SPTE above and writing
			 * it here.
			 */
			old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
							  REMOVED_SPTE, level);
		}
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
				    old_spte, REMOVED_SPTE, level, shared);
	}

	/* Defer freeing the page table to an RCU callback. */
	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/**
 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				  u64 old_spte, u64 new_spte, int level,
				  bool shared)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON(level < PG_LEVEL_4K);
	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	/* Nothing changed, so there is no bookkeeping to do. */
	if (old_spte == new_spte)
		return;

	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

	if (is_leaf)
		check_spte_writable_invariants(new_spte);

	/*
	 * The only times a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE or removed SPTE,
		 * it is unexpected. Log the change, though it should not
		 * impact the guest since both the former and current SPTEs
		 * are nonpresent.
		 */
		if (WARN_ON(!is_mmio_spte(old_spte) &&
			    !is_mmio_spte(new_spte) &&
			    !is_removed_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs, or the new SPTE is\n"
			       "a temporary removed SPTE.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	/* Account for a leaf mapping appearing or disappearing at this level. */
	if (is_leaf != was_leaf)
		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);

	/* Propagate the Dirty state of the old PFN before it is lost. */
	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure. Note the WARN on the PFN changing without the
	 * SPTE being converted to a hugepage (leaf) or being zapped. Shadow
	 * pages are kernel allocations and should never be migrated.
	 */
	if (was_present && !was_leaf &&
	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
}

/*
 * Performs all bookkeeping for an SPTE change: the structural bookkeeping in
 * __handle_changed_spte(), then accessed tracking, then dirty logging.
 */
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared)
{
	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
			      shared);
	handle_changed_spte_acc_track(old_spte, new_spte, level);
	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
				      new_spte, level);
}

/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
 * and handle the associated bookkeeping.
 * Do not mark the page dirty in KVM's dirty bitmaps.
 *
 * If setting the SPTE fails because it has changed, iter->old_spte will be
 * refreshed to the current value of the spte.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Return:
 * * 0      - If the SPTE was set.
 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
 *            no side-effects other than setting iter->old_spte to the last
 *            known value of the spte.
 */
static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
					  struct tdp_iter *iter,
					  u64 new_spte)
{
	u64 *sptep = rcu_dereference(iter->sptep);
	u64 old_spte;

	/*
	 * The caller is responsible for ensuring the old SPTE is not a REMOVED
	 * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE,
	 * and pre-checking before inserting a new SPTE is advantageous as it
	 * avoids unnecessary work.
	 */
	WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));

	lockdep_assert_held_read(&kvm->mmu_lock);

	/*
	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
	 * does not hold the mmu_lock.
	 */
	old_spte = cmpxchg64(sptep, iter->old_spte, new_spte);
	if (old_spte != iter->old_spte) {
		/*
		 * The page table entry was modified by a different logical
		 * CPU. Refresh iter->old_spte with the current value so the
		 * caller operates on fresh data, e.g. if it retries
		 * tdp_mmu_set_spte_atomic().
		 */
		iter->old_spte = old_spte;
		return -EBUSY;
	}

	/* Dirty logging is deliberately not performed on this path. */
	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			      new_spte, iter->level, true);
	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);

	return 0;
}

/*
 * Atomically zaps the SPTE the iterator points at: freezes it with
 * REMOVED_SPTE, flushes remote TLBs for the GFN range it covered, and then
 * clears it to 0.
 *
 * Returns 0 on success, or the error from tdp_mmu_set_spte_atomic() if the
 * SPTE could not be frozen (in which case nothing else is done).
 */
static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
					  struct tdp_iter *iter)
{
	int ret;

	/*
	 * Freeze the SPTE by setting it to a special,
	 * non-present value. This will stop other threads from
	 * immediately installing a present entry in its place
	 * before the TLBs are flushed.
	 */
	ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
	if (ret)
		return ret;

	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
					   KVM_PAGES_PER_HPAGE(iter->level));

	/*
	 * No other thread can overwrite the removed SPTE as they must either
	 * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
	 * overwrite the special removed SPTE value. No bookkeeping is needed
	 * here since the SPTE is going from non-present to non-present. Use
	 * the raw write helper to avoid an unnecessary check on volatile bits.
	 */
	__kvm_tdp_mmu_write_spte(iter->sptep, 0);

	return 0;
}


/*
 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm:	      KVM instance
 * @as_id:	      Address space ID, i.e. regular vs. SMM
 * @sptep:	      Pointer to the SPTE
 * @old_spte:	      The current value of the SPTE
 * @new_spte:	      The new value that will be set for the SPTE
 * @gfn:	      The base GFN that was (or will be) mapped by the SPTE
 * @level:	      The level _containing_ the SPTE (its parent PT's level)
 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
 *		      of the page. Should be set unless handling an MMU
 *		      notifier for access tracking. Leaving record_acc_track
 *		      unset in that case prevents page accesses from being
 *		      double counted.
 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
 *		      appropriate for the change being made. Should be set
 *		      unless performing certain dirty logging operations.
 *		      Leaving record_dirty_log unset in that case prevents page
 *		      writes from being double counted.
 *
 * Returns the old SPTE value, which _may_ be different than @old_spte if the
 * SPTE had volatile bits.
 */
static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
			      u64 old_spte, u64 new_spte, gfn_t gfn, int level,
			      bool record_acc_track, bool record_dirty_log)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * No thread should be using this function to set SPTEs to or from the
	 * temporary removed SPTE value.
	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
	 * should be used. If operating under the MMU lock in write mode, the
	 * use of the removed SPTE should not be necessary.
	 */
	WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));

	/* Bookkeeping below uses the old value reported by the write helper. */
	old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);

	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);

	if (record_acc_track)
		handle_changed_spte_acc_track(old_spte, new_spte, level);
	if (record_dirty_log)
		handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
					      new_spte, level);
	return old_spte;
}

/*
 * Iterator-based wrapper around __tdp_mmu_set_spte(): writes @new_spte to the
 * SPTE the iterator points at and refreshes iter->old_spte with the value
 * returned by __tdp_mmu_set_spte().
 */
static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				     u64 new_spte, bool record_acc_track,
				     bool record_dirty_log)
{
	WARN_ON_ONCE(iter->yielded);

	iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
					    iter->old_spte, new_spte,
					    iter->gfn, iter->level,
					    record_acc_track, record_dirty_log);
}

/* Sets the SPTE with both accessed tracking and dirty logging enabled. */
static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				    u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}

/* Sets the SPTE without recording accessed-state changes. */
static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}

/* Sets the SPTE without recording the page in the dirty log. */
static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}

/* Walks every SPTE under @_root that covers the GFN range [_start, _end). */
#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root, _start, _end)

/* As tdp_root_for_each_pte(), but skips SPTEs that are not present leaves. */
#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
		if (!is_shadow_present_pte(_iter.old_spte) ||		\
		    !is_last_spte(_iter.old_spte, _iter.level))		\
			continue;					\
		else

/* Walks the SPTEs under the root page table currently loaded in @_mmu. */
#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)

/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, iter->yielded is set and the caller must skip to
 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
 * over the paging structures to allow the iterator to continue its traversal
 * from the paging structure root.
 *
 * Returns true if this function yielded.
 */
static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
							  struct tdp_iter *iter,
							  bool flush, bool shared)
{
	WARN_ON(iter->yielded);

	/* Ensure forward progress has been made before yielding. */
	if (iter->next_last_level_gfn == iter->yielded_gfn)
		return false;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
		if (flush)
			kvm_flush_remote_tlbs(kvm);

		/* Leave the RCU read-side critical section while rescheduling. */
		rcu_read_unlock();

		if (shared)
			cond_resched_rwlock_read(&kvm->mmu_lock);
		else
			cond_resched_rwlock_write(&kvm->mmu_lock);

		rcu_read_lock();

		WARN_ON(iter->gfn > iter->next_last_level_gfn);

		iter->yielded = true;
	}

	return iter->yielded;
}

static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
{
	/*
	 * Bound TDP MMU walks at host.MAXPHYADDR. KVM disallows memslots with
	 * a gpa range that would exceed the max gfn, and KVM does not create
	 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
	 * the slow emulation path every time.
	 */
	return kvm_mmu_max_gfn() + 1;
}

/*
 * Zaps every present SPTE at @zap_level under @root, walking GFNs from 0 up
 * to tdp_mmu_max_gfn_exclusive(). SPTEs at levels above @zap_level are
 * skipped. In shared mode the SPTE is cleared atomically and the same entry
 * is retried if the atomic update fails.
 */
static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			       bool shared, int zap_level)
{
	struct tdp_iter iter;

	gfn_t end = tdp_mmu_max_gfn_exclusive();
	gfn_t start = 0;

	for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		if (iter.level > zap_level)
			continue;

		if (!shared)
			tdp_mmu_set_spte(kvm, &iter, 0);
		else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
			goto retry;
	}
}

static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			     bool shared)
{

	/*
	 * The root must have an elevated refcount so that it's reachable via
	 * mmu_notifier callbacks, which allows this path to yield and drop
	 * mmu_lock. When handling an unmap/release mmu_notifier command, KVM
	 * must drop all references to relevant pages prior to completing the
	 * callback.
Dropping mmu_lock with an unreachable root would result 889 * in zapping SPTEs after a relevant mmu_notifier callback completes 890 * and lead to use-after-free as zapping a SPTE triggers "writeback" of 891 * dirty accessed bits to the SPTE's associated struct page. 892 */ 893 WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count)); 894 895 kvm_lockdep_assert_mmu_lock_held(kvm, shared); 896 897 rcu_read_lock(); 898 899 /* 900 * To avoid RCU stalls due to recursively removing huge swaths of SPs, 901 * split the zap into two passes. On the first pass, zap at the 1gb 902 * level, and then zap top-level SPs on the second pass. "1gb" is not 903 * arbitrary, as KVM must be able to zap a 1gb shadow page without 904 * inducing a stall to allow in-place replacement with a 1gb hugepage. 905 * 906 * Because zapping a SP recurses on its children, stepping down to 907 * PG_LEVEL_4K in the iterator itself is unnecessary. 908 */ 909 __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G); 910 __tdp_mmu_zap_root(kvm, root, shared, root->role.level); 911 912 rcu_read_unlock(); 913} 914 915bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) 916{ 917 u64 old_spte; 918 919 /* 920 * This helper intentionally doesn't allow zapping a root shadow page, 921 * which doesn't have a parent page table and thus no associated entry. 922 */ 923 if (WARN_ON_ONCE(!sp->ptep)) 924 return false; 925 926 old_spte = kvm_tdp_mmu_read_spte(sp->ptep); 927 if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte))) 928 return false; 929 930 __tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0, 931 sp->gfn, sp->role.level + 1, true, true); 932 933 return true; 934} 935 936/* 937 * Zap leafs SPTEs for the range of gfns, [start, end). Returns true if SPTEs 938 * have been cleared and a TLB flush is needed before releasing the MMU lock. 939 * 940 * If can_yield is true, will release the MMU lock and reschedule if the 941 * scheduler needs the CPU or there is contention on the MMU lock. 
If this 942 * function cannot yield, it will not release the MMU lock or reschedule and 943 * the caller must ensure it does not supply too large a GFN range, or the 944 * operation can cause a soft lockup. 945 */ 946static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root, 947 gfn_t start, gfn_t end, bool can_yield, bool flush) 948{ 949 struct tdp_iter iter; 950 951 end = min(end, tdp_mmu_max_gfn_exclusive()); 952 953 lockdep_assert_held_write(&kvm->mmu_lock); 954 955 rcu_read_lock(); 956 957 for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) { 958 if (can_yield && 959 tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) { 960 flush = false; 961 continue; 962 } 963 964 if (!is_shadow_present_pte(iter.old_spte) || 965 !is_last_spte(iter.old_spte, iter.level)) 966 continue; 967 968 tdp_mmu_set_spte(kvm, &iter, 0); 969 flush = true; 970 } 971 972 rcu_read_unlock(); 973 974 /* 975 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need 976 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed. 977 */ 978 return flush; 979} 980 981/* 982 * Tears down the mappings for the range of gfns, [start, end), and frees the 983 * non-root pages mapping GFNs strictly within that range. Returns true if 984 * SPTEs have been cleared and a TLB flush is needed before releasing the 985 * MMU lock. 986 */ 987bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end, 988 bool can_yield, bool flush) 989{ 990 struct kvm_mmu_page *root; 991 992 for_each_tdp_mmu_root_yield_safe(kvm, root, as_id) 993 flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush); 994 995 return flush; 996} 997 998void kvm_tdp_mmu_zap_all(struct kvm *kvm) 999{ 1000 struct kvm_mmu_page *root; 1001 int i; 1002 1003 /* 1004 * Zap all roots, including invalid roots, as all SPTEs must be dropped 1005 * before returning to the caller. Zap directly even if the root is 1006 * also being zapped by a worker. 
Walking zapped top-level SPTEs isn't 1007 * all that expensive and mmu_lock is already held, which means the 1008 * worker has yielded, i.e. flushing the work instead of zapping here 1009 * isn't guaranteed to be any faster. 1010 * 1011 * A TLB flush is unnecessary, KVM zaps everything if and only the VM 1012 * is being destroyed or the userspace VMM has exited. In both cases, 1013 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request. 1014 */ 1015 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { 1016 for_each_tdp_mmu_root_yield_safe(kvm, root, i) 1017 tdp_mmu_zap_root(kvm, root, false); 1018 } 1019} 1020 1021/* 1022 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast 1023 * zap" completes. 1024 */ 1025void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) 1026{ 1027 flush_workqueue(kvm->arch.tdp_mmu_zap_wq); 1028} 1029 1030/* 1031 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that 1032 * is about to be zapped, e.g. in response to a memslots update. The actual 1033 * zapping is performed asynchronously, so a reference is taken on all roots. 1034 * Using a separate workqueue makes it easy to ensure that the destruction is 1035 * performed before the "fast zap" completes, without keeping a separate list 1036 * of invalidated roots; the list is effectively the list of work items in 1037 * the workqueue. 1038 * 1039 * Get a reference even if the root is already invalid, the asynchronous worker 1040 * assumes it was gifted a reference to the root it processes. Because mmu_lock 1041 * is held for write, it should be impossible to observe a root with zero refcount, 1042 * i.e. the list of roots cannot be stale. 1043 * 1044 * This has essentially the same effect for the TDP MMU 1045 * as updating mmu_valid_gen does for the shadow MMU. 
1046 */ 1047void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) 1048{ 1049 struct kvm_mmu_page *root; 1050 1051 lockdep_assert_held_write(&kvm->mmu_lock); 1052 list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) { 1053 if (!root->role.invalid && 1054 !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) { 1055 root->role.invalid = true; 1056 tdp_mmu_schedule_zap_root(kvm, root); 1057 } 1058 } 1059} 1060 1061/* 1062 * Installs a last-level SPTE to handle a TDP page fault. 1063 * (NPT/EPT violation/misconfiguration) 1064 */ 1065static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, 1066 struct kvm_page_fault *fault, 1067 struct tdp_iter *iter) 1068{ 1069 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep)); 1070 u64 new_spte; 1071 int ret = RET_PF_FIXED; 1072 bool wrprot = false; 1073 int modes[] = { 1074 KVM_PAGE_TRACK_EXEC, 1075 KVM_PAGE_TRACK_ACCESS, 1076 }; 1077 int i; 1078 1079 WARN_ON(sp->role.level != fault->goal_level); 1080 if (unlikely(!fault->slot)) 1081 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); 1082 else 1083 wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn, 1084 fault->pfn, iter->old_spte, fault->prefetch, true, 1085 fault->map_writable, &new_spte); 1086 1087 /* reprotect the spte according to tracking */ 1088 for (i = 0; i < 2; i++) { 1089 if (kvm_slot_page_track_is_active(vcpu->kvm, 1090 fault->slot, fault->gfn, modes[i])) { 1091 new_spte = cpc_protect_pte(new_spte, modes[i]); 1092 break; 1093 } 1094 } 1095 1096 if (new_spte == iter->old_spte) 1097 ret = RET_PF_SPURIOUS; 1098 else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) 1099 return RET_PF_RETRY; 1100 else if (is_shadow_present_pte(iter->old_spte) && 1101 !is_last_spte(iter->old_spte, iter->level)) 1102 kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, 1103 KVM_PAGES_PER_HPAGE(iter->level + 1)); 1104 1105 /* 1106 * If the page fault was caused by a write but the page is write 1107 * protected, emulation is needed. 
If the emulation was skipped, 1108 * the vCPU would have the same fault again. 1109 */ 1110 if (wrprot) { 1111 if (fault->write) 1112 ret = RET_PF_EMULATE; 1113 } 1114 1115 /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */ 1116 if (unlikely(is_mmio_spte(new_spte))) { 1117 vcpu->stat.pf_mmio_spte_created++; 1118 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn, 1119 new_spte); 1120 ret = RET_PF_EMULATE; 1121 } else { 1122 trace_kvm_mmu_set_spte(iter->level, iter->gfn, 1123 rcu_dereference(iter->sptep)); 1124 } 1125 1126 return ret; 1127} 1128 1129/* 1130 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the 1131 * provided page table. 1132 * 1133 * @kvm: kvm instance 1134 * @iter: a tdp_iter instance currently on the SPTE that should be set 1135 * @sp: The new TDP page table to install. 1136 * @account_nx: True if this page table is being installed to split a 1137 * non-executable huge page. 1138 * @shared: This operation is running under the MMU lock in read mode. 1139 * 1140 * Returns: 0 if the new page table was installed. Non-0 if the page table 1141 * could not be installed (e.g. the atomic compare-exchange failed). 1142 */ 1143static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter, 1144 struct kvm_mmu_page *sp, bool account_nx, 1145 bool shared) 1146{ 1147 u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled()); 1148 int ret = 0; 1149 1150 if (shared) { 1151 ret = tdp_mmu_set_spte_atomic(kvm, iter, spte); 1152 if (ret) 1153 return ret; 1154 } else { 1155 tdp_mmu_set_spte(kvm, iter, spte); 1156 } 1157 1158 spin_lock(&kvm->arch.tdp_mmu_pages_lock); 1159 list_add(&sp->link, &kvm->arch.tdp_mmu_pages); 1160 if (account_nx) 1161 account_huge_nx_page(kvm, sp); 1162 spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 1163 1164 return 0; 1165} 1166 1167/* 1168 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing 1169 * page tables and SPTEs to translate the faulting guest physical address. 
1170 */ 1171int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) 1172{ 1173 struct kvm_mmu *mmu = vcpu->arch.mmu; 1174 struct tdp_iter iter; 1175 struct kvm_mmu_page *sp; 1176 int ret; 1177 1178 kvm_mmu_hugepage_adjust(vcpu, fault); 1179 1180 trace_kvm_mmu_spte_requested(fault); 1181 1182 rcu_read_lock(); 1183 1184 tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) { 1185 if (fault->nx_huge_page_workaround_enabled) 1186 disallowed_hugepage_adjust(fault, iter.old_spte, iter.level); 1187 1188 if (iter.level == fault->goal_level) 1189 break; 1190 1191 /* 1192 * If there is an SPTE mapping a large page at a higher level 1193 * than the target, that SPTE must be cleared and replaced 1194 * with a non-leaf SPTE. 1195 */ 1196 if (is_shadow_present_pte(iter.old_spte) && 1197 is_large_pte(iter.old_spte)) { 1198 if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter)) 1199 break; 1200 1201 /* 1202 * The iter must explicitly re-read the spte here 1203 * because the new value informs the !present 1204 * path below. 1205 */ 1206 iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep); 1207 } 1208 1209 if (!is_shadow_present_pte(iter.old_spte)) { 1210 bool account_nx = fault->huge_page_disallowed && 1211 fault->req_level >= iter.level; 1212 1213 /* 1214 * If SPTE has been frozen by another thread, just 1215 * give up and retry, avoiding unnecessary page table 1216 * allocation and free. 1217 */ 1218 if (is_removed_spte(iter.old_spte)) 1219 break; 1220 1221 sp = tdp_mmu_alloc_sp(vcpu); 1222 tdp_mmu_init_child_sp(sp, &iter); 1223 1224 if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) { 1225 tdp_mmu_free_sp(sp); 1226 break; 1227 } 1228 } 1229 } 1230 1231 /* 1232 * Force the guest to retry the access if the upper level SPTEs aren't 1233 * in place, or if the target leaf SPTE is frozen by another CPU. 
1234 */ 1235 if (iter.level != fault->goal_level || is_removed_spte(iter.old_spte)) { 1236 rcu_read_unlock(); 1237 return RET_PF_RETRY; 1238 } 1239 1240 ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter); 1241 rcu_read_unlock(); 1242 1243 return ret; 1244} 1245 1246bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, 1247 bool flush) 1248{ 1249 return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start, 1250 range->end, range->may_block, flush); 1251} 1252 1253typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter, 1254 struct kvm_gfn_range *range); 1255 1256static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm, 1257 struct kvm_gfn_range *range, 1258 tdp_handler_t handler) 1259{ 1260 struct kvm_mmu_page *root; 1261 struct tdp_iter iter; 1262 bool ret = false; 1263 1264 /* 1265 * Don't support rescheduling, none of the MMU notifiers that funnel 1266 * into this helper allow blocking; it'd be dead, wasteful code. 1267 */ 1268 for_each_tdp_mmu_root(kvm, root, range->slot->as_id) { 1269 rcu_read_lock(); 1270 1271 tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) 1272 ret |= handler(kvm, &iter, range); 1273 1274 rcu_read_unlock(); 1275 } 1276 1277 return ret; 1278} 1279 1280/* 1281 * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero 1282 * if any of the GFNs in the range have been accessed. 1283 */ 1284static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter, 1285 struct kvm_gfn_range *range) 1286{ 1287 u64 new_spte = 0; 1288 1289 /* If we have a non-accessed entry we don't need to change the pte. */ 1290 if (!is_accessed_spte(iter->old_spte)) 1291 return false; 1292 1293 new_spte = iter->old_spte; 1294 1295 if (spte_ad_enabled(new_spte)) { 1296 new_spte &= ~shadow_accessed_mask; 1297 } else { 1298 /* 1299 * Capture the dirty status of the page, so that it doesn't get 1300 * lost when the SPTE is marked for access tracking. 
1301 */ 1302 if (is_writable_pte(new_spte)) 1303 kvm_set_pfn_dirty(spte_to_pfn(new_spte)); 1304 1305 new_spte = mark_spte_for_access_track(new_spte); 1306 } 1307 1308 tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte); 1309 1310 return true; 1311} 1312 1313bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) 1314{ 1315 return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range); 1316} 1317 1318static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter, 1319 struct kvm_gfn_range *range) 1320{ 1321 return is_accessed_spte(iter->old_spte); 1322} 1323 1324bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 1325{ 1326 return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn); 1327} 1328 1329static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter, 1330 struct kvm_gfn_range *range) 1331{ 1332 u64 new_spte; 1333 1334 /* Huge pages aren't expected to be modified without first being zapped. */ 1335 WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end); 1336 1337 if (iter->level != PG_LEVEL_4K || 1338 !is_shadow_present_pte(iter->old_spte)) 1339 return false; 1340 1341 /* 1342 * Note, when changing a read-only SPTE, it's not strictly necessary to 1343 * zero the SPTE before setting the new PFN, but doing so preserves the 1344 * invariant that the PFN of a present * leaf SPTE can never change. 1345 * See __handle_changed_spte(). 1346 */ 1347 tdp_mmu_set_spte(kvm, iter, 0); 1348 1349 if (!pte_write(range->pte)) { 1350 new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte, 1351 pte_pfn(range->pte)); 1352 1353 tdp_mmu_set_spte(kvm, iter, new_spte); 1354 } 1355 1356 return true; 1357} 1358 1359/* 1360 * Handle the changed_pte MMU notifier for the TDP MMU. 1361 * data is a pointer to the new pte_t mapping the HVA specified by the MMU 1362 * notifier. 1363 * Returns non-zero if a flush is needed before releasing the MMU lock. 
1364 */ 1365bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 1366{ 1367 /* 1368 * No need to handle the remote TLB flush under RCU protection, the 1369 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a 1370 * shadow page. See the WARN on pfn_changed in __handle_changed_spte(). 1371 */ 1372 return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn); 1373} 1374 1375/* 1376 * Remove write access from all SPTEs at or above min_level that map GFNs 1377 * [start, end). Returns true if an SPTE has been changed and the TLBs need to 1378 * be flushed. 1379 */ 1380static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, 1381 gfn_t start, gfn_t end, int min_level) 1382{ 1383 struct tdp_iter iter; 1384 u64 new_spte; 1385 bool spte_set = false; 1386 1387 rcu_read_lock(); 1388 1389 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); 1390 1391 for_each_tdp_pte_min_level(iter, root, min_level, start, end) { 1392retry: 1393 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) 1394 continue; 1395 1396 if (!is_shadow_present_pte(iter.old_spte) || 1397 !is_last_spte(iter.old_spte, iter.level) || 1398 !(iter.old_spte & PT_WRITABLE_MASK)) 1399 continue; 1400 1401 new_spte = iter.old_spte & ~PT_WRITABLE_MASK; 1402 1403 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) 1404 goto retry; 1405 1406 spte_set = true; 1407 } 1408 1409 rcu_read_unlock(); 1410 return spte_set; 1411} 1412 1413/* 1414 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will 1415 * only affect leaf SPTEs down to min_level. 1416 * Returns true if an SPTE has been changed and the TLBs need to be flushed. 
1417 */ 1418bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, 1419 const struct kvm_memory_slot *slot, int min_level) 1420{ 1421 struct kvm_mmu_page *root; 1422 bool spte_set = false; 1423 1424 lockdep_assert_held_read(&kvm->mmu_lock); 1425 1426 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) 1427 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn, 1428 slot->base_gfn + slot->npages, min_level); 1429 1430 return spte_set; 1431} 1432 1433static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp) 1434{ 1435 struct kvm_mmu_page *sp; 1436 1437 gfp |= __GFP_ZERO; 1438 1439 sp = kmem_cache_alloc(mmu_page_header_cache, gfp); 1440 if (!sp) 1441 return NULL; 1442 1443 sp->spt = (void *)__get_free_page(gfp); 1444 if (!sp->spt) { 1445 kmem_cache_free(mmu_page_header_cache, sp); 1446 return NULL; 1447 } 1448 1449 return sp; 1450} 1451 1452static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm, 1453 struct tdp_iter *iter, 1454 bool shared) 1455{ 1456 struct kvm_mmu_page *sp; 1457 1458 /* 1459 * Since we are allocating while under the MMU lock we have to be 1460 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct 1461 * reclaim and to avoid making any filesystem callbacks (which can end 1462 * up invoking KVM MMU notifiers, resulting in a deadlock). 1463 * 1464 * If this allocation fails we drop the lock and retry with reclaim 1465 * allowed. 
1466 */ 1467 sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT); 1468 if (sp) 1469 return sp; 1470 1471 rcu_read_unlock(); 1472 1473 if (shared) 1474 read_unlock(&kvm->mmu_lock); 1475 else 1476 write_unlock(&kvm->mmu_lock); 1477 1478 iter->yielded = true; 1479 sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT); 1480 1481 if (shared) 1482 read_lock(&kvm->mmu_lock); 1483 else 1484 write_lock(&kvm->mmu_lock); 1485 1486 rcu_read_lock(); 1487 1488 return sp; 1489} 1490 1491static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, 1492 struct kvm_mmu_page *sp, bool shared) 1493{ 1494 const u64 huge_spte = iter->old_spte; 1495 const int level = iter->level; 1496 int ret, i; 1497 1498 tdp_mmu_init_child_sp(sp, iter); 1499 1500 /* 1501 * No need for atomics when writing to sp->spt since the page table has 1502 * not been linked in yet and thus is not reachable from any other CPU. 1503 */ 1504 for (i = 0; i < PT64_ENT_PER_PAGE; i++) 1505 sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i); 1506 1507 /* 1508 * Replace the huge spte with a pointer to the populated lower level 1509 * page table. Since we are making this change without a TLB flush vCPUs 1510 * will see a mix of the split mappings and the original huge mapping, 1511 * depending on what's currently in their TLB. This is fine from a 1512 * correctness standpoint since the translation will be the same either 1513 * way. 1514 */ 1515 ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared); 1516 if (ret) 1517 goto out; 1518 1519 /* 1520 * tdp_mmu_link_sp_atomic() will handle subtracting the huge page we 1521 * are overwriting from the page stats. But we have to manually update 1522 * the page stats with the new present child pages. 
1523 */ 1524 kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE); 1525 1526out: 1527 trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret); 1528 return ret; 1529} 1530 1531static int tdp_mmu_split_huge_pages_root(struct kvm *kvm, 1532 struct kvm_mmu_page *root, 1533 gfn_t start, gfn_t end, 1534 int target_level, bool shared) 1535{ 1536 struct kvm_mmu_page *sp = NULL; 1537 struct tdp_iter iter; 1538 int ret = 0; 1539 1540 rcu_read_lock(); 1541 1542 /* 1543 * Traverse the page table splitting all huge pages above the target 1544 * level into one lower level. For example, if we encounter a 1GB page 1545 * we split it into 512 2MB pages. 1546 * 1547 * Since the TDP iterator uses a pre-order traversal, we are guaranteed 1548 * to visit an SPTE before ever visiting its children, which means we 1549 * will correctly recursively split huge pages that are more than one 1550 * level above the target level (e.g. splitting a 1GB to 512 2MB pages, 1551 * and then splitting each of those to 512 4KB pages). 1552 */ 1553 for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) { 1554retry: 1555 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared)) 1556 continue; 1557 1558 if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte)) 1559 continue; 1560 1561 if (!sp) { 1562 sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared); 1563 if (!sp) { 1564 ret = -ENOMEM; 1565 trace_kvm_mmu_split_huge_page(iter.gfn, 1566 iter.old_spte, 1567 iter.level, ret); 1568 break; 1569 } 1570 1571 if (iter.yielded) 1572 continue; 1573 } 1574 1575 if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared)) 1576 goto retry; 1577 1578 sp = NULL; 1579 } 1580 1581 rcu_read_unlock(); 1582 1583 /* 1584 * It's possible to exit the loop having never used the last sp if, for 1585 * example, a vCPU doing HugePage NX splitting wins the race and 1586 * installs its own sp in place of the last sp we tried to split. 
1587 */ 1588 if (sp) 1589 tdp_mmu_free_sp(sp); 1590 1591 return ret; 1592} 1593 1594 1595/* 1596 * Try to split all huge pages mapped by the TDP MMU down to the target level. 1597 */ 1598void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm, 1599 const struct kvm_memory_slot *slot, 1600 gfn_t start, gfn_t end, 1601 int target_level, bool shared) 1602{ 1603 struct kvm_mmu_page *root; 1604 int r = 0; 1605 1606 kvm_lockdep_assert_mmu_lock_held(kvm, shared); 1607 1608 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) { 1609 r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared); 1610 if (r) { 1611 kvm_tdp_mmu_put_root(kvm, root, shared); 1612 break; 1613 } 1614 } 1615} 1616 1617/* 1618 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If 1619 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. 1620 * If AD bits are not enabled, this will require clearing the writable bit on 1621 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to 1622 * be flushed. 
1623 */ 1624static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, 1625 gfn_t start, gfn_t end) 1626{ 1627 struct tdp_iter iter; 1628 u64 new_spte; 1629 bool spte_set = false; 1630 1631 rcu_read_lock(); 1632 1633 tdp_root_for_each_leaf_pte(iter, root, start, end) { 1634retry: 1635 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) 1636 continue; 1637 1638 if (!is_shadow_present_pte(iter.old_spte)) 1639 continue; 1640 1641 if (spte_ad_need_write_protect(iter.old_spte)) { 1642 if (is_writable_pte(iter.old_spte)) 1643 new_spte = iter.old_spte & ~PT_WRITABLE_MASK; 1644 else 1645 continue; 1646 } else { 1647 if (iter.old_spte & shadow_dirty_mask) 1648 new_spte = iter.old_spte & ~shadow_dirty_mask; 1649 else 1650 continue; 1651 } 1652 1653 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) 1654 goto retry; 1655 1656 spte_set = true; 1657 } 1658 1659 rcu_read_unlock(); 1660 return spte_set; 1661} 1662 1663/* 1664 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If 1665 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. 1666 * If AD bits are not enabled, this will require clearing the writable bit on 1667 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to 1668 * be flushed. 1669 */ 1670bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, 1671 const struct kvm_memory_slot *slot) 1672{ 1673 struct kvm_mmu_page *root; 1674 bool spte_set = false; 1675 1676 lockdep_assert_held_read(&kvm->mmu_lock); 1677 1678 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) 1679 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, 1680 slot->base_gfn + slot->npages); 1681 1682 return spte_set; 1683} 1684 1685/* 1686 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is 1687 * set in mask, starting at gfn. The given memslot is expected to contain all 1688 * the GFNs represented by set bits in the mask. 
If AD bits are enabled, 1689 * clearing the dirty status will involve clearing the dirty bit on each SPTE 1690 * or, if AD bits are not enabled, clearing the writable bit on each SPTE. 1691 */ 1692static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, 1693 gfn_t gfn, unsigned long mask, bool wrprot) 1694{ 1695 struct tdp_iter iter; 1696 u64 new_spte; 1697 1698 rcu_read_lock(); 1699 1700 tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask), 1701 gfn + BITS_PER_LONG) { 1702 if (!mask) 1703 break; 1704 1705 if (iter.level > PG_LEVEL_4K || 1706 !(mask & (1UL << (iter.gfn - gfn)))) 1707 continue; 1708 1709 mask &= ~(1UL << (iter.gfn - gfn)); 1710 1711 if (wrprot || spte_ad_need_write_protect(iter.old_spte)) { 1712 if (is_writable_pte(iter.old_spte)) 1713 new_spte = iter.old_spte & ~PT_WRITABLE_MASK; 1714 else 1715 continue; 1716 } else { 1717 if (iter.old_spte & shadow_dirty_mask) 1718 new_spte = iter.old_spte & ~shadow_dirty_mask; 1719 else 1720 continue; 1721 } 1722 1723 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); 1724 } 1725 1726 rcu_read_unlock(); 1727} 1728 1729/* 1730 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is 1731 * set in mask, starting at gfn. The given memslot is expected to contain all 1732 * the GFNs represented by set bits in the mask. If AD bits are enabled, 1733 * clearing the dirty status will involve clearing the dirty bit on each SPTE 1734 * or, if AD bits are not enabled, clearing the writable bit on each SPTE. 1735 */ 1736void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, 1737 struct kvm_memory_slot *slot, 1738 gfn_t gfn, unsigned long mask, 1739 bool wrprot) 1740{ 1741 struct kvm_mmu_page *root; 1742 1743 lockdep_assert_held_write(&kvm->mmu_lock); 1744 for_each_tdp_mmu_root(kvm, root, slot->as_id) 1745 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); 1746} 1747 1748/* 1749 * Clear leaf entries which could be replaced by large mappings, for 1750 * GFNs within the slot. 
1751 */ 1752static void zap_collapsible_spte_range(struct kvm *kvm, 1753 struct kvm_mmu_page *root, 1754 const struct kvm_memory_slot *slot) 1755{ 1756 gfn_t start = slot->base_gfn; 1757 gfn_t end = start + slot->npages; 1758 struct tdp_iter iter; 1759 int max_mapping_level; 1760 kvm_pfn_t pfn; 1761 1762 rcu_read_lock(); 1763 1764 tdp_root_for_each_pte(iter, root, start, end) { 1765 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) 1766 continue; 1767 1768 if (!is_shadow_present_pte(iter.old_spte) || 1769 !is_last_spte(iter.old_spte, iter.level)) 1770 continue; 1771 1772 /* 1773 * This is a leaf SPTE. Check if the PFN it maps can 1774 * be mapped at a higher level. 1775 */ 1776 pfn = spte_to_pfn(iter.old_spte); 1777 1778 if (kvm_is_reserved_pfn(pfn)) 1779 continue; 1780 1781 max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, 1782 iter.gfn, pfn, PG_LEVEL_NUM); 1783 1784 WARN_ON(max_mapping_level < iter.level); 1785 1786 /* 1787 * If this page is already mapped at the highest 1788 * viable level, there's nothing more to do. 1789 */ 1790 if (max_mapping_level == iter.level) 1791 continue; 1792 1793 /* 1794 * The page can be remapped at a higher level, so step 1795 * up to zap the parent SPTE. 1796 */ 1797 while (max_mapping_level > iter.level) 1798 tdp_iter_step_up(&iter); 1799 1800 /* Note, a successful atomic zap also does a remote TLB flush. */ 1801 tdp_mmu_zap_spte_atomic(kvm, &iter); 1802 1803 /* 1804 * If the atomic zap fails, the iter will recurse back into 1805 * the same subtree to retry. 1806 */ 1807 } 1808 1809 rcu_read_unlock(); 1810} 1811 1812/* 1813 * Clear non-leaf entries (and free associated page tables) which could 1814 * be replaced by large mappings, for GFNs within the slot. 
1815 */ 1816void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, 1817 const struct kvm_memory_slot *slot) 1818{ 1819 struct kvm_mmu_page *root; 1820 1821 lockdep_assert_held_read(&kvm->mmu_lock); 1822 1823 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) 1824 zap_collapsible_spte_range(kvm, root, slot); 1825} 1826 1827static bool cpc_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, 1828 gfn_t gfn, int min_level, int mode) 1829{ 1830 struct tdp_iter iter; 1831 u64 new_spte; 1832 bool spte_set = false; 1833 1834 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); 1835 1836 rcu_read_lock(); 1837 1838 for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) { 1839 if (!is_shadow_present_pte(iter.old_spte) || 1840 !is_last_spte(iter.old_spte, iter.level)) 1841 continue; 1842 1843 new_spte = iter.old_spte & ~shadow_mmu_writable_mask; 1844 new_spte = cpc_protect_pte(new_spte, mode); 1845 1846 if (new_spte == iter.old_spte) 1847 break; 1848 1849 tdp_mmu_set_spte(kvm, &iter, new_spte); 1850 spte_set = true; 1851 } 1852 1853 rcu_read_unlock(); 1854 1855 return spte_set; 1856} 1857 1858bool cpc_tdp_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, 1859 gfn_t gfn, int min_level, enum kvm_page_track_mode mode) 1860{ 1861 struct kvm_mmu_page *root; 1862 bool spte_set = false; 1863 1864 lockdep_assert_held_write(&kvm->mmu_lock); 1865 for_each_tdp_mmu_root(kvm, root, slot->as_id) 1866 spte_set |= cpc_protect_gfn(kvm, root, gfn, min_level, mode); 1867 1868 return spte_set; 1869} 1870 1871/* 1872 * Removes write access on the last level SPTE mapping this GFN and unsets the 1873 * MMU-writable bit to ensure future writes continue to be intercepted. 1874 * Returns true if an SPTE was set and a TLB flush is needed. 
1875 */ 1876bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, 1877 struct kvm_memory_slot *slot, gfn_t gfn, 1878 int min_level) 1879{ 1880 return cpc_tdp_protect_gfn(kvm, slot, gfn, min_level, 1881 KVM_PAGE_TRACK_WRITE); 1882} 1883 1884/* 1885 * Return the level of the lowest level SPTE added to sptes. 1886 * That SPTE may be non-present. 1887 * 1888 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. 1889 */ 1890int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, 1891 int *root_level) 1892{ 1893 struct tdp_iter iter; 1894 struct kvm_mmu *mmu = vcpu->arch.mmu; 1895 gfn_t gfn = addr >> PAGE_SHIFT; 1896 int leaf = -1; 1897 1898 *root_level = vcpu->arch.mmu->root_role.level; 1899 1900 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { 1901 leaf = iter.level; 1902 sptes[leaf] = iter.old_spte; 1903 } 1904 1905 return leaf; 1906} 1907 1908/* 1909 * Returns the last level spte pointer of the shadow page walk for the given 1910 * gpa, and sets *spte to the spte value. This spte may be non-preset. If no 1911 * walk could be performed, returns NULL and *spte does not contain valid data. 1912 * 1913 * Contract: 1914 * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. 1915 * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end. 1916 * 1917 * WARNING: This function is only intended to be called during fast_page_fault. 
1918 */ 1919u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, 1920 u64 *spte) 1921{ 1922 struct tdp_iter iter; 1923 struct kvm_mmu *mmu = vcpu->arch.mmu; 1924 gfn_t gfn = addr >> PAGE_SHIFT; 1925 tdp_ptep_t sptep = NULL; 1926 1927 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { 1928 *spte = iter.old_spte; 1929 sptep = iter.sptep; 1930 } 1931 1932 /* 1933 * Perform the rcu_dereference to get the raw spte pointer value since 1934 * we are passing it up to fast_page_fault, which is shared with the 1935 * legacy MMU and thus does not retain the TDP MMU-specific __rcu 1936 * annotation. 1937 * 1938 * This is safe since fast_page_fault obeys the contracts of this 1939 * function as well as all TDP MMU contracts around modifying SPTEs 1940 * outside of mmu_lock. 1941 */ 1942 return rcu_dereference(sptep); 1943}