cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

tdp_mmu.c (59287B)


      1// SPDX-License-Identifier: GPL-2.0
      2
      3#include "mmu.h"
      4#include "mmu_internal.h"
      5#include "mmutrace.h"
      6#include "tdp_iter.h"
      7#include "tdp_mmu.h"
      8#include "spte.h"
      9
     10#include <asm/cmpxchg.h>
     11#include <trace/events/kvm.h>
     12
     13static bool __read_mostly tdp_mmu_enabled = true;
     14module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
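
/*
 * Usage note (editorial, not part of the original file): because the
 * parameter is declared with mode 0644 it appears as
 * /sys/module/kvm/parameters/tdp_mmu, and the TDP MMU can be disabled when
 * the module is loaded, e.g. "modprobe kvm tdp_mmu=0".
 * kvm_mmu_init_tdp_mmu() samples the flag once with READ_ONCE() when a VM
 * is created, so changing it later only affects newly created VMs.
 */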
     15
     16/* Initializes the TDP MMU for the VM, if enabled. */
     17int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
     18{
     19	struct workqueue_struct *wq;
     20
     21	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
     22		return 0;
     23
     24	wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
     25	if (!wq)
     26		return -ENOMEM;
     27
     28	/* This should not be changed for the lifetime of the VM. */
     29	kvm->arch.tdp_mmu_enabled = true;
     30	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
     31	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
     32	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
     33	kvm->arch.tdp_mmu_zap_wq = wq;
     34	return 1;
     35}
     36
     37/* Arbitrarily returns true so that this may be used in if statements. */
     38static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
     39							     bool shared)
     40{
     41	if (shared)
     42		lockdep_assert_held_read(&kvm->mmu_lock);
     43	else
     44		lockdep_assert_held_write(&kvm->mmu_lock);
     45
     46	return true;
     47}
     48
     49void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
     50{
     51	if (!kvm->arch.tdp_mmu_enabled)
     52		return;
     53
     54	/* Also waits for any queued work items.  */
     55	destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
     56
     57	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
     58	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
     59
     60	/*
     61	 * Ensure that all the outstanding RCU callbacks to free shadow pages
     62	 * can run before the VM is torn down.  Work items on tdp_mmu_zap_wq
     63	 * can call kvm_tdp_mmu_put_root and create new callbacks.
     64	 */
     65	rcu_barrier();
     66}
     67
     68static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
     69{
     70	free_page((unsigned long)sp->spt);
     71	kmem_cache_free(mmu_page_header_cache, sp);
     72}
     73
     74/*
     75 * This is called through call_rcu in order to free TDP page table memory
     76 * safely with respect to other kernel threads that may be operating on
     77 * the memory.
     78 * By only accessing TDP MMU page table memory in an RCU read critical
     79 * section, and freeing it after a grace period, lockless access to that
     80 * memory won't use it after it is freed.
     81 */
     82static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
     83{
     84	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
     85					       rcu_head);
     86
     87	tdp_mmu_free_sp(sp);
     88}
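
/*
 * Illustrative summary of the pattern above (wording added for this
 * annotated listing): writers retire a page table with
 * call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback), while lockless
 * walkers only dereference TDP page table memory between rcu_read_lock()
 * and rcu_read_unlock(), so a full grace period elapses before
 * tdp_mmu_free_sp() actually frees the page.
 */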
     89
     90static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
     91			     bool shared);
     92
     93static void tdp_mmu_zap_root_work(struct work_struct *work)
     94{
     95	struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
     96						 tdp_mmu_async_work);
     97	struct kvm *kvm = root->tdp_mmu_async_data;
     98
     99	read_lock(&kvm->mmu_lock);
    100
    101	/*
    102	 * A TLB flush is not necessary as KVM performs a local TLB flush when
    103	 * allocating a new root (see kvm_mmu_load()), and when migrating vCPU
    104	 * to a different pCPU.  Note, the local TLB flush on reuse also
    105	 * invalidates any paging-structure-cache entries, i.e. TLB entries for
    106	 * intermediate paging structures, that may be zapped, as such entries
    107	 * are associated with the ASID on both VMX and SVM.
    108	 */
    109	tdp_mmu_zap_root(kvm, root, true);
    110
    111	/*
    112	 * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
    113	 * avoiding an infinite loop.  By design, the root is reachable while
    114	 * it's being asynchronously zapped, thus a different task can put its
    115	 * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
    116	 * asynchronously zapped root is unavoidable.
    117	 */
    118	kvm_tdp_mmu_put_root(kvm, root, true);
    119
    120	read_unlock(&kvm->mmu_lock);
    121}
    122
    123static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
    124{
    125	root->tdp_mmu_async_data = kvm;
    126	INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
    127	queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
    128}
    129
    130static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page)
    131{
    132	union kvm_mmu_page_role role = page->role;
    133	role.invalid = true;
    134
    135	/* No need to use cmpxchg, only the invalid bit can change.  */
    136	role.word = xchg(&page->role.word, role.word);
    137	return role.invalid;
    138}
    139
    140void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
    141			  bool shared)
    142{
    143	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
    144
    145	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
    146		return;
    147
    148	WARN_ON(!root->tdp_mmu_page);
    149
    150	/*
    151	 * The root now has refcount=0.  It is valid, but readers already
    152	 * cannot acquire a reference to it because kvm_tdp_mmu_get_root()
    153	 * rejects it.  This remains true for the rest of the execution
    154	 * of this function, because readers visit valid roots only
    155	 * (except for tdp_mmu_zap_root_work(), which however
    156	 * does not acquire any reference itself).
    157	 *
    158	 * Even though there are flows that need to visit all roots for
    159	 * correctness, they all take mmu_lock for write, so they cannot yet
    160	 * run concurrently. The same is true after kvm_tdp_root_mark_invalid,
    161	 * since the root still has refcount=0.
    162	 *
    163	 * However, tdp_mmu_zap_root can yield, and writers do not expect to
    164	 * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()).
    165	 * So the root temporarily gets an extra reference, going to refcount=1
    166	 * while staying invalid.  Readers still cannot acquire any reference;
    167	 * but writers are now allowed to run if tdp_mmu_zap_root yields and
    168	 * they might take an extra reference if they themselves yield.
    169	 * Therefore, when the reference is given back by the worker,
    170	 * there is no guarantee that the refcount is still 1.  If not, whoever
    171	 * puts the last reference will free the page, but they will not have to
    172	 * zap the root because a root cannot go from invalid to valid.
    173	 */
    174	if (!kvm_tdp_root_mark_invalid(root)) {
    175		refcount_set(&root->tdp_mmu_root_count, 1);
    176
    177		/*
    178		 * Zapping the root in a worker is not just "nice to have";
    179		 * it is required because kvm_tdp_mmu_invalidate_all_roots()
    180		 * skips already-invalid roots.  If kvm_tdp_mmu_put_root() did
    181		 * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast()
    182		 * might return with some roots not zapped yet.
    183		 */
    184		tdp_mmu_schedule_zap_root(kvm, root);
    185		return;
    186	}
    187
    188	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
    189	list_del_rcu(&root->link);
    190	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
    191	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
    192}
    193
    194/*
    195 * Returns the next root after @prev_root (or the first root if @prev_root is
    196 * NULL).  A reference to the returned root is acquired, and the reference to
    197 * @prev_root is released (the caller obviously must hold a reference to
    198 * @prev_root if it's non-NULL).
    199 *
    200 * If @only_valid is true, invalid roots are skipped.
    201 *
    202 * Returns NULL if the end of tdp_mmu_roots was reached.
    203 */
    204static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
    205					      struct kvm_mmu_page *prev_root,
    206					      bool shared, bool only_valid)
    207{
    208	struct kvm_mmu_page *next_root;
    209
    210	rcu_read_lock();
    211
    212	if (prev_root)
    213		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
    214						  &prev_root->link,
    215						  typeof(*prev_root), link);
    216	else
    217		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
    218						   typeof(*next_root), link);
    219
    220	while (next_root) {
    221		if ((!only_valid || !next_root->role.invalid) &&
    222		    kvm_tdp_mmu_get_root(next_root))
    223			break;
    224
    225		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
    226				&next_root->link, typeof(*next_root), link);
    227	}
    228
    229	rcu_read_unlock();
    230
    231	if (prev_root)
    232		kvm_tdp_mmu_put_root(kvm, prev_root, shared);
    233
    234	return next_root;
    235}
    236
    237/*
    238 * Note: this iterator gets and puts references to the roots it iterates over.
    239 * This makes it safe to release the MMU lock and yield within the loop, but
    240 * if exiting the loop early, the caller must drop the reference to the most
    241 * recent root. (Unless keeping a live reference is desirable.)
    242 *
    243 * If shared is set, this function is operating under the MMU lock in read
    244 * mode. In the unlikely event that this thread must free a root, the lock
    245 * will be temporarily dropped and reacquired in write mode.
    246 */
    247#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
    248	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);	\
    249	     _root;								\
    250	     _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))	\
    251		if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&		\
    252		    kvm_mmu_page_as_id(_root) != _as_id) {			\
    253		} else
    254
    255#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
    256	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
    257
    258#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)			\
    259	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)
    260
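/*
 * Illustrative sketch of the rule described above (not code taken from this
 * file; done_early() is a placeholder): a caller that leaves the yield-safe
 * loop early still holds a reference to the current root and must drop it
 * itself.
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id) {
 *		if (done_early(root)) {
 *			kvm_tdp_mmu_put_root(kvm, root, false);
 *			break;
 *		}
 *	}
 */
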
    261/*
    262 * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
    263 * the implication being that any flow that holds mmu_lock for read is
    264 * inherently yield-friendly and should use the yield-safe variant above.
    265 * Holding mmu_lock for write obviates the need for RCU protection as the list
    266 * is guaranteed to be stable.
    267 */
    268#define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
    269	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)	\
    270		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&	\
    271		    kvm_mmu_page_as_id(_root) != _as_id) {		\
    272		} else
    273
    274static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
    275{
    276	struct kvm_mmu_page *sp;
    277
    278	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
    279	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
    280
    281	return sp;
    282}
    283
    284static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
    285			    gfn_t gfn, union kvm_mmu_page_role role)
    286{
    287	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
    288
    289	sp->role = role;
    290	sp->gfn = gfn;
    291	sp->ptep = sptep;
    292	sp->tdp_mmu_page = true;
    293
    294	trace_kvm_mmu_get_page(sp, true);
    295}
    296
    297static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
    298				  struct tdp_iter *iter)
    299{
    300	struct kvm_mmu_page *parent_sp;
    301	union kvm_mmu_page_role role;
    302
    303	parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
    304
    305	role = parent_sp->role;
    306	role.level--;
    307
    308	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
    309}
    310
    311hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
    312{
    313	union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
    314	struct kvm *kvm = vcpu->kvm;
    315	struct kvm_mmu_page *root;
    316
    317	lockdep_assert_held_write(&kvm->mmu_lock);
    318
    319	/*
    320	 * Check for an existing root before allocating a new one.  Note, the
    321	 * role check prevents consuming an invalid root.
    322	 */
    323	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
    324		if (root->role.word == role.word &&
    325		    kvm_tdp_mmu_get_root(root))
    326			goto out;
    327	}
    328
    329	root = tdp_mmu_alloc_sp(vcpu);
    330	tdp_mmu_init_sp(root, NULL, 0, role);
    331
    332	refcount_set(&root->tdp_mmu_root_count, 1);
    333
    334	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
    335	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
    336	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
    337
    338out:
    339	return __pa(root->spt);
    340}
    341
    342static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
    343				u64 old_spte, u64 new_spte, int level,
    344				bool shared);
    345
    346static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
    347{
    348	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
    349		return;
    350
    351	if (is_accessed_spte(old_spte) &&
    352	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
    353	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
    354		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
    355}
    356
    357static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
    358					  u64 old_spte, u64 new_spte, int level)
    359{
    360	bool pfn_changed;
    361	struct kvm_memory_slot *slot;
    362
    363	if (level > PG_LEVEL_4K)
    364		return;
    365
    366	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
    367
    368	if ((!is_writable_pte(old_spte) || pfn_changed) &&
    369	    is_writable_pte(new_spte)) {
    370		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
    371		mark_page_dirty_in_slot(kvm, slot, gfn);
    372	}
    373}
    374
    375/**
    376 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
    377 *
    378 * @kvm: kvm instance
    379 * @sp: the page to be removed
    380 * @shared: This operation may not be running under the exclusive use of
    381 *	    the MMU lock and the operation must synchronize with other
    382 *	    threads that might be adding or removing pages.
    383 */
    384static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
    385			      bool shared)
    386{
    387	if (shared)
    388		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
    389	else
    390		lockdep_assert_held_write(&kvm->mmu_lock);
    391
    392	list_del(&sp->link);
    393	if (sp->lpage_disallowed)
    394		unaccount_huge_nx_page(kvm, sp);
    395
    396	if (shared)
    397		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
    398}
    399
    400/**
    401 * handle_removed_pt() - handle a page table removed from the TDP structure
    402 *
    403 * @kvm: kvm instance
    404 * @pt: the page removed from the paging structure
    405 * @shared: This operation may not be running under the exclusive use
    406 *	    of the MMU lock and the operation must synchronize with other
    407 *	    threads that might be modifying SPTEs.
    408 *
    409 * Given a page table that has been removed from the TDP paging structure,
    410 * iterates through the page table to clear SPTEs and free child page tables.
    411 *
    412 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
    413 * protection. Since this thread removed it from the paging structure,
    414 * this thread will be responsible for ensuring the page is freed. Hence the
    415 * early rcu_dereferences in the function.
    416 */
    417static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
    418{
    419	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
    420	int level = sp->role.level;
    421	gfn_t base_gfn = sp->gfn;
    422	int i;
    423
    424	trace_kvm_mmu_prepare_zap_page(sp);
    425
    426	tdp_mmu_unlink_sp(kvm, sp, shared);
    427
    428	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
    429		tdp_ptep_t sptep = pt + i;
    430		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
    431		u64 old_spte;
    432
    433		if (shared) {
    434			/*
    435			 * Set the SPTE to a nonpresent value that other
    436			 * threads will not overwrite. If the SPTE was
    437			 * already marked as removed then another thread
    438			 * handling a page fault could overwrite it, so
     439			 * keep setting the SPTE until this thread changes it
     440			 * from some other value to the removed SPTE value.
    441			 */
    442			for (;;) {
    443				old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
    444				if (!is_removed_spte(old_spte))
    445					break;
    446				cpu_relax();
    447			}
    448		} else {
    449			/*
    450			 * If the SPTE is not MMU-present, there is no backing
    451			 * page associated with the SPTE and so no side effects
    452			 * that need to be recorded, and exclusive ownership of
    453			 * mmu_lock ensures the SPTE can't be made present.
    454			 * Note, zapping MMIO SPTEs is also unnecessary as they
    455			 * are guarded by the memslots generation, not by being
    456			 * unreachable.
    457			 */
    458			old_spte = kvm_tdp_mmu_read_spte(sptep);
    459			if (!is_shadow_present_pte(old_spte))
    460				continue;
    461
    462			/*
    463			 * Use the common helper instead of a raw WRITE_ONCE as
    464			 * the SPTE needs to be updated atomically if it can be
    465			 * modified by a different vCPU outside of mmu_lock.
    466			 * Even though the parent SPTE is !PRESENT, the TLB
    467			 * hasn't yet been flushed, and both Intel and AMD
    468			 * document that A/D assists can use upper-level PxE
    469			 * entries that are cached in the TLB, i.e. the CPU can
    470			 * still access the page and mark it dirty.
    471			 *
    472			 * No retry is needed in the atomic update path as the
    473			 * sole concern is dropping a Dirty bit, i.e. no other
    474			 * task can zap/remove the SPTE as mmu_lock is held for
    475			 * write.  Marking the SPTE as a removed SPTE is not
    476			 * strictly necessary for the same reason, but using
     477			 * the removed SPTE value keeps the shared/exclusive
    478			 * paths consistent and allows the handle_changed_spte()
    479			 * call below to hardcode the new value to REMOVED_SPTE.
    480			 *
    481			 * Note, even though dropping a Dirty bit is the only
    482			 * scenario where a non-atomic update could result in a
    483			 * functional bug, simply checking the Dirty bit isn't
    484			 * sufficient as a fast page fault could read the upper
    485			 * level SPTE before it is zapped, and then make this
    486			 * target SPTE writable, resume the guest, and set the
    487			 * Dirty bit between reading the SPTE above and writing
    488			 * it here.
    489			 */
    490			old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
    491							  REMOVED_SPTE, level);
    492		}
    493		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
    494				    old_spte, REMOVED_SPTE, level, shared);
    495	}
    496
    497	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
    498}
    499
    500/**
    501 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
    502 * @kvm: kvm instance
    503 * @as_id: the address space of the paging structure the SPTE was a part of
    504 * @gfn: the base GFN that was mapped by the SPTE
    505 * @old_spte: The value of the SPTE before the change
    506 * @new_spte: The value of the SPTE after the change
    507 * @level: the level of the PT the SPTE is part of in the paging structure
    508 * @shared: This operation may not be running under the exclusive use of
    509 *	    the MMU lock and the operation must synchronize with other
    510 *	    threads that might be modifying SPTEs.
    511 *
    512 * Handle bookkeeping that might result from the modification of a SPTE.
    513 * This function must be called for all TDP SPTE modifications.
    514 */
    515static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
    516				  u64 old_spte, u64 new_spte, int level,
    517				  bool shared)
    518{
    519	bool was_present = is_shadow_present_pte(old_spte);
    520	bool is_present = is_shadow_present_pte(new_spte);
    521	bool was_leaf = was_present && is_last_spte(old_spte, level);
    522	bool is_leaf = is_present && is_last_spte(new_spte, level);
    523	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
    524
    525	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
    526	WARN_ON(level < PG_LEVEL_4K);
    527	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
    528
    529	/*
    530	 * If this warning were to trigger it would indicate that there was a
    531	 * missing MMU notifier or a race with some notifier handler.
    532	 * A present, leaf SPTE should never be directly replaced with another
    533	 * present leaf SPTE pointing to a different PFN. A notifier handler
    534	 * should be zapping the SPTE before the main MM's page table is
    535	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
    536	 * thread before replacement.
    537	 */
    538	if (was_leaf && is_leaf && pfn_changed) {
    539		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
    540		       "SPTE with another present leaf SPTE mapping a\n"
    541		       "different PFN!\n"
    542		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
    543		       as_id, gfn, old_spte, new_spte, level);
    544
    545		/*
    546		 * Crash the host to prevent error propagation and guest data
    547		 * corruption.
    548		 */
    549		BUG();
    550	}
    551
    552	if (old_spte == new_spte)
    553		return;
    554
    555	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
    556
    557	if (is_leaf)
    558		check_spte_writable_invariants(new_spte);
    559
    560	/*
    561	 * The only times a SPTE should be changed from a non-present to
    562	 * non-present state is when an MMIO entry is installed/modified/
    563	 * removed. In that case, there is nothing to do here.
    564	 */
    565	if (!was_present && !is_present) {
    566		/*
    567		 * If this change does not involve a MMIO SPTE or removed SPTE,
    568		 * it is unexpected. Log the change, though it should not
    569		 * impact the guest since both the former and current SPTEs
    570		 * are nonpresent.
    571		 */
    572		if (WARN_ON(!is_mmio_spte(old_spte) &&
    573			    !is_mmio_spte(new_spte) &&
    574			    !is_removed_spte(new_spte)))
    575			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
    576			       "should not be replaced with another,\n"
    577			       "different nonpresent SPTE, unless one or both\n"
    578			       "are MMIO SPTEs, or the new SPTE is\n"
    579			       "a temporary removed SPTE.\n"
    580			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
    581			       as_id, gfn, old_spte, new_spte, level);
    582		return;
    583	}
    584
    585	if (is_leaf != was_leaf)
    586		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
    587
    588	if (was_leaf && is_dirty_spte(old_spte) &&
    589	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
    590		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
    591
    592	/*
    593	 * Recursively handle child PTs if the change removed a subtree from
    594	 * the paging structure.  Note the WARN on the PFN changing without the
    595	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
    596	 * pages are kernel allocations and should never be migrated.
    597	 */
    598	if (was_present && !was_leaf &&
    599	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
    600		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
    601}
    602
    603static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
    604				u64 old_spte, u64 new_spte, int level,
    605				bool shared)
    606{
    607	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
    608			      shared);
    609	handle_changed_spte_acc_track(old_spte, new_spte, level);
    610	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
    611				      new_spte, level);
    612}
    613
    614/*
    615 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
    616 * and handle the associated bookkeeping.  Do not mark the page dirty
    617 * in KVM's dirty bitmaps.
    618 *
    619 * If setting the SPTE fails because it has changed, iter->old_spte will be
    620 * refreshed to the current value of the spte.
    621 *
    622 * @kvm: kvm instance
    623 * @iter: a tdp_iter instance currently on the SPTE that should be set
    624 * @new_spte: The value the SPTE should be set to
    625 * Return:
    626 * * 0      - If the SPTE was set.
    627 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
    628 *            no side-effects other than setting iter->old_spte to the last
    629 *            known value of the spte.
    630 */
    631static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
    632					  struct tdp_iter *iter,
    633					  u64 new_spte)
    634{
    635	u64 *sptep = rcu_dereference(iter->sptep);
    636	u64 old_spte;
    637
    638	/*
    639	 * The caller is responsible for ensuring the old SPTE is not a REMOVED
    640	 * SPTE.  KVM should never attempt to zap or manipulate a REMOVED SPTE,
    641	 * and pre-checking before inserting a new SPTE is advantageous as it
    642	 * avoids unnecessary work.
    643	 */
    644	WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
    645
    646	lockdep_assert_held_read(&kvm->mmu_lock);
    647
    648	/*
    649	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
    650	 * does not hold the mmu_lock.
    651	 */
    652	old_spte = cmpxchg64(sptep, iter->old_spte, new_spte);
    653	if (old_spte != iter->old_spte) {
    654		/*
    655		 * The page table entry was modified by a different logical
    656		 * CPU. Refresh iter->old_spte with the current value so the
    657		 * caller operates on fresh data, e.g. if it retries
    658		 * tdp_mmu_set_spte_atomic().
    659		 */
    660		iter->old_spte = old_spte;
    661		return -EBUSY;
    662	}
    663
    664	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
    665			      new_spte, iter->level, true);
    666	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
    667
    668	return 0;
    669}
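
/*
 * Typical caller pattern (sketch, mirroring the retry: labels used later in
 * this file): an -EBUSY return is handled by retrying with the refreshed
 * iter->old_spte.
 *
 *	if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
 *		goto retry;
 */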
    670
    671static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
    672					  struct tdp_iter *iter)
    673{
    674	int ret;
    675
    676	/*
    677	 * Freeze the SPTE by setting it to a special,
    678	 * non-present value. This will stop other threads from
    679	 * immediately installing a present entry in its place
    680	 * before the TLBs are flushed.
    681	 */
    682	ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
    683	if (ret)
    684		return ret;
    685
    686	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
    687					   KVM_PAGES_PER_HPAGE(iter->level));
    688
    689	/*
    690	 * No other thread can overwrite the removed SPTE as they must either
    691	 * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
    692	 * overwrite the special removed SPTE value. No bookkeeping is needed
    693	 * here since the SPTE is going from non-present to non-present.  Use
    694	 * the raw write helper to avoid an unnecessary check on volatile bits.
    695	 */
    696	__kvm_tdp_mmu_write_spte(iter->sptep, 0);
    697
    698	return 0;
    699}
    700
    701
    702/*
    703 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
    704 * @kvm:	      KVM instance
    705 * @as_id:	      Address space ID, i.e. regular vs. SMM
    706 * @sptep:	      Pointer to the SPTE
    707 * @old_spte:	      The current value of the SPTE
    708 * @new_spte:	      The new value that will be set for the SPTE
    709 * @gfn:	      The base GFN that was (or will be) mapped by the SPTE
    710 * @level:	      The level _containing_ the SPTE (its parent PT's level)
    711 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
    712 *		      of the page. Should be set unless handling an MMU
    713 *		      notifier for access tracking. Leaving record_acc_track
    714 *		      unset in that case prevents page accesses from being
    715 *		      double counted.
    716 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
    717 *		      appropriate for the change being made. Should be set
    718 *		      unless performing certain dirty logging operations.
    719 *		      Leaving record_dirty_log unset in that case prevents page
    720 *		      writes from being double counted.
    721 *
     722 * Returns the old SPTE value, which _may_ differ from @old_spte if the
     723 * SPTE had volatile bits.
    724 */
    725static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
    726			      u64 old_spte, u64 new_spte, gfn_t gfn, int level,
    727			      bool record_acc_track, bool record_dirty_log)
    728{
    729	lockdep_assert_held_write(&kvm->mmu_lock);
    730
    731	/*
    732	 * No thread should be using this function to set SPTEs to or from the
    733	 * temporary removed SPTE value.
    734	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
    735	 * should be used. If operating under the MMU lock in write mode, the
    736	 * use of the removed SPTE should not be necessary.
    737	 */
    738	WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
    739
    740	old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
    741
    742	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
    743
    744	if (record_acc_track)
    745		handle_changed_spte_acc_track(old_spte, new_spte, level);
    746	if (record_dirty_log)
    747		handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
    748					      new_spte, level);
    749	return old_spte;
    750}
    751
    752static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
    753				     u64 new_spte, bool record_acc_track,
    754				     bool record_dirty_log)
    755{
    756	WARN_ON_ONCE(iter->yielded);
    757
    758	iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
    759					    iter->old_spte, new_spte,
    760					    iter->gfn, iter->level,
    761					    record_acc_track, record_dirty_log);
    762}
    763
    764static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
    765				    u64 new_spte)
    766{
    767	_tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
    768}
    769
    770static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
    771						 struct tdp_iter *iter,
    772						 u64 new_spte)
    773{
    774	_tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
    775}
    776
    777static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
    778						 struct tdp_iter *iter,
    779						 u64 new_spte)
    780{
    781	_tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
    782}
    783
    784#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
    785	for_each_tdp_pte(_iter, _root, _start, _end)
    786
    787#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
    788	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
    789		if (!is_shadow_present_pte(_iter.old_spte) ||		\
    790		    !is_last_spte(_iter.old_spte, _iter.level))		\
    791			continue;					\
    792		else
    793
    794#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
    795	for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)
    796
    797/*
    798 * Yield if the MMU lock is contended or this thread needs to return control
    799 * to the scheduler.
    800 *
    801 * If this function should yield and flush is set, it will perform a remote
    802 * TLB flush before yielding.
    803 *
    804 * If this function yields, iter->yielded is set and the caller must skip to
    805 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
    806 * over the paging structures to allow the iterator to continue its traversal
    807 * from the paging structure root.
    808 *
    809 * Returns true if this function yielded.
    810 */
    811static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
    812							  struct tdp_iter *iter,
    813							  bool flush, bool shared)
    814{
    815	WARN_ON(iter->yielded);
    816
    817	/* Ensure forward progress has been made before yielding. */
    818	if (iter->next_last_level_gfn == iter->yielded_gfn)
    819		return false;
    820
    821	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
    822		if (flush)
    823			kvm_flush_remote_tlbs(kvm);
    824
    825		rcu_read_unlock();
    826
    827		if (shared)
    828			cond_resched_rwlock_read(&kvm->mmu_lock);
    829		else
    830			cond_resched_rwlock_write(&kvm->mmu_lock);
    831
    832		rcu_read_lock();
    833
    834		WARN_ON(iter->gfn > iter->next_last_level_gfn);
    835
    836		iter->yielded = true;
    837	}
    838
    839	return iter->yielded;
    840}
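
/*
 * Caller pattern (sketch, as used by tdp_mmu_zap_leafs() below): when this
 * helper yields with flush=true it has already done the remote TLB flush,
 * so the caller clears its pending-flush state and moves on to the next
 * iteration.
 *
 *	if (can_yield &&
 *	    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
 *		flush = false;
 *		continue;
 *	}
 */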
    841
    842static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
    843{
    844	/*
    845	 * Bound TDP MMU walks at host.MAXPHYADDR.  KVM disallows memslots with
    846	 * a gpa range that would exceed the max gfn, and KVM does not create
    847	 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
    848	 * the slow emulation path every time.
    849	 */
    850	return kvm_mmu_max_gfn() + 1;
    851}
    852
    853static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
    854			       bool shared, int zap_level)
    855{
    856	struct tdp_iter iter;
    857
    858	gfn_t end = tdp_mmu_max_gfn_exclusive();
    859	gfn_t start = 0;
    860
    861	for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
    862retry:
    863		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
    864			continue;
    865
    866		if (!is_shadow_present_pte(iter.old_spte))
    867			continue;
    868
    869		if (iter.level > zap_level)
    870			continue;
    871
    872		if (!shared)
    873			tdp_mmu_set_spte(kvm, &iter, 0);
    874		else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
    875			goto retry;
    876	}
    877}
    878
    879static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
    880			     bool shared)
    881{
    882
    883	/*
    884	 * The root must have an elevated refcount so that it's reachable via
    885	 * mmu_notifier callbacks, which allows this path to yield and drop
    886	 * mmu_lock.  When handling an unmap/release mmu_notifier command, KVM
    887	 * must drop all references to relevant pages prior to completing the
    888	 * callback.  Dropping mmu_lock with an unreachable root would result
    889	 * in zapping SPTEs after a relevant mmu_notifier callback completes
    890	 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
    891	 * dirty accessed bits to the SPTE's associated struct page.
    892	 */
    893	WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
    894
    895	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
    896
    897	rcu_read_lock();
    898
    899	/*
    900	 * To avoid RCU stalls due to recursively removing huge swaths of SPs,
    901	 * split the zap into two passes.  On the first pass, zap at the 1gb
    902	 * level, and then zap top-level SPs on the second pass.  "1gb" is not
    903	 * arbitrary, as KVM must be able to zap a 1gb shadow page without
    904	 * inducing a stall to allow in-place replacement with a 1gb hugepage.
    905	 *
    906	 * Because zapping a SP recurses on its children, stepping down to
    907	 * PG_LEVEL_4K in the iterator itself is unnecessary.
    908	 */
    909	__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
    910	__tdp_mmu_zap_root(kvm, root, shared, root->role.level);
    911
    912	rcu_read_unlock();
    913}
    914
    915bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
    916{
    917	u64 old_spte;
    918
    919	/*
    920	 * This helper intentionally doesn't allow zapping a root shadow page,
    921	 * which doesn't have a parent page table and thus no associated entry.
    922	 */
    923	if (WARN_ON_ONCE(!sp->ptep))
    924		return false;
    925
    926	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
    927	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
    928		return false;
    929
    930	__tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
    931			   sp->gfn, sp->role.level + 1, true, true);
    932
    933	return true;
    934}
    935
    936/*
     937 * Zap leaf SPTEs for the range of gfns, [start, end). Returns true if SPTEs
    938 * have been cleared and a TLB flush is needed before releasing the MMU lock.
    939 *
    940 * If can_yield is true, will release the MMU lock and reschedule if the
    941 * scheduler needs the CPU or there is contention on the MMU lock. If this
    942 * function cannot yield, it will not release the MMU lock or reschedule and
    943 * the caller must ensure it does not supply too large a GFN range, or the
    944 * operation can cause a soft lockup.
    945 */
    946static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
    947			      gfn_t start, gfn_t end, bool can_yield, bool flush)
    948{
    949	struct tdp_iter iter;
    950
    951	end = min(end, tdp_mmu_max_gfn_exclusive());
    952
    953	lockdep_assert_held_write(&kvm->mmu_lock);
    954
    955	rcu_read_lock();
    956
    957	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
    958		if (can_yield &&
    959		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
    960			flush = false;
    961			continue;
    962		}
    963
    964		if (!is_shadow_present_pte(iter.old_spte) ||
    965		    !is_last_spte(iter.old_spte, iter.level))
    966			continue;
    967
    968		tdp_mmu_set_spte(kvm, &iter, 0);
    969		flush = true;
    970	}
    971
    972	rcu_read_unlock();
    973
    974	/*
    975	 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
    976	 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
    977	 */
    978	return flush;
    979}
    980
    981/*
    982 * Tears down the mappings for the range of gfns, [start, end), and frees the
    983 * non-root pages mapping GFNs strictly within that range. Returns true if
    984 * SPTEs have been cleared and a TLB flush is needed before releasing the
    985 * MMU lock.
    986 */
    987bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
    988			   bool can_yield, bool flush)
    989{
    990	struct kvm_mmu_page *root;
    991
    992	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
    993		flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);
    994
    995	return flush;
    996}
    997
    998void kvm_tdp_mmu_zap_all(struct kvm *kvm)
    999{
   1000	struct kvm_mmu_page *root;
   1001	int i;
   1002
   1003	/*
   1004	 * Zap all roots, including invalid roots, as all SPTEs must be dropped
   1005	 * before returning to the caller.  Zap directly even if the root is
   1006	 * also being zapped by a worker.  Walking zapped top-level SPTEs isn't
   1007	 * all that expensive and mmu_lock is already held, which means the
   1008	 * worker has yielded, i.e. flushing the work instead of zapping here
   1009	 * isn't guaranteed to be any faster.
   1010	 *
    1011	 * A TLB flush is unnecessary; KVM zaps everything if and only if the VM
   1012	 * is being destroyed or the userspace VMM has exited.  In both cases,
   1013	 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
   1014	 */
   1015	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
   1016		for_each_tdp_mmu_root_yield_safe(kvm, root, i)
   1017			tdp_mmu_zap_root(kvm, root, false);
   1018	}
   1019}
   1020
   1021/*
   1022 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
   1023 * zap" completes.
   1024 */
   1025void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
   1026{
   1027	flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
   1028}
   1029
   1030/*
   1031 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
   1032 * is about to be zapped, e.g. in response to a memslots update.  The actual
   1033 * zapping is performed asynchronously, so a reference is taken on all roots.
   1034 * Using a separate workqueue makes it easy to ensure that the destruction is
   1035 * performed before the "fast zap" completes, without keeping a separate list
   1036 * of invalidated roots; the list is effectively the list of work items in
   1037 * the workqueue.
   1038 *
   1039 * Get a reference even if the root is already invalid, the asynchronous worker
   1040 * assumes it was gifted a reference to the root it processes.  Because mmu_lock
   1041 * is held for write, it should be impossible to observe a root with zero refcount,
   1042 * i.e. the list of roots cannot be stale.
   1043 *
   1044 * This has essentially the same effect for the TDP MMU
   1045 * as updating mmu_valid_gen does for the shadow MMU.
   1046 */
   1047void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
   1048{
   1049	struct kvm_mmu_page *root;
   1050
   1051	lockdep_assert_held_write(&kvm->mmu_lock);
   1052	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
   1053		if (!root->role.invalid &&
   1054		    !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
   1055			root->role.invalid = true;
   1056			tdp_mmu_schedule_zap_root(kvm, root);
   1057		}
   1058	}
   1059}
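
/*
 * Rough sketch (for orientation only; the actual sequencing lives in common
 * MMU code, e.g. kvm_mmu_zap_all_fast()) of how the two functions above
 * cooperate:
 *
 *	write_lock(&kvm->mmu_lock);
 *	kvm_tdp_mmu_invalidate_all_roots(kvm);	(mark roots invalid, queue zap work)
 *	write_unlock(&kvm->mmu_lock);
 *	...
 *	kvm_tdp_mmu_zap_invalidated_roots(kvm);	(wait for the queued zaps to finish)
 */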
   1060
   1061/*
   1062 * Installs a last-level SPTE to handle a TDP page fault.
   1063 * (NPT/EPT violation/misconfiguration)
   1064 */
   1065static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
   1066					  struct kvm_page_fault *fault,
   1067					  struct tdp_iter *iter)
   1068{
   1069	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
   1070	u64 new_spte;
   1071	int ret = RET_PF_FIXED;
   1072	bool wrprot = false;
   1073	int modes[] = {
   1074		KVM_PAGE_TRACK_EXEC,
   1075		KVM_PAGE_TRACK_ACCESS,
   1076	};
   1077	int i;
   1078
   1079	WARN_ON(sp->role.level != fault->goal_level);
   1080	if (unlikely(!fault->slot))
   1081		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
   1082	else
   1083		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
   1084					 fault->pfn, iter->old_spte, fault->prefetch, true,
   1085					 fault->map_writable, &new_spte);
   1086
    1087	/* CachePC: reprotect the new SPTE via cpc_protect_pte() for the active page-tracking mode (exec or access) */
   1088	for (i = 0; i < 2; i++) {
   1089		if (kvm_slot_page_track_is_active(vcpu->kvm,
   1090				fault->slot, fault->gfn, modes[i])) {
   1091			new_spte = cpc_protect_pte(new_spte, modes[i]);
   1092			break;
   1093		}
   1094	}
   1095
   1096	if (new_spte == iter->old_spte)
   1097		ret = RET_PF_SPURIOUS;
   1098	else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
   1099		return RET_PF_RETRY;
   1100	else if (is_shadow_present_pte(iter->old_spte) &&
   1101		 !is_last_spte(iter->old_spte, iter->level))
   1102		kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
   1103						   KVM_PAGES_PER_HPAGE(iter->level + 1));
   1104
   1105	/*
   1106	 * If the page fault was caused by a write but the page is write
   1107	 * protected, emulation is needed. If the emulation was skipped,
   1108	 * the vCPU would have the same fault again.
   1109	 */
   1110	if (wrprot) {
   1111		if (fault->write)
   1112			ret = RET_PF_EMULATE;
   1113	}
   1114
   1115	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
   1116	if (unlikely(is_mmio_spte(new_spte))) {
   1117		vcpu->stat.pf_mmio_spte_created++;
   1118		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
   1119				     new_spte);
   1120		ret = RET_PF_EMULATE;
   1121	} else {
   1122		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
   1123				       rcu_dereference(iter->sptep));
   1124	}
   1125
   1126	return ret;
   1127}
   1128
   1129/*
   1130 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
   1131 * provided page table.
   1132 *
   1133 * @kvm: kvm instance
   1134 * @iter: a tdp_iter instance currently on the SPTE that should be set
   1135 * @sp: The new TDP page table to install.
   1136 * @account_nx: True if this page table is being installed to split a
   1137 *              non-executable huge page.
   1138 * @shared: This operation is running under the MMU lock in read mode.
   1139 *
   1140 * Returns: 0 if the new page table was installed. Non-0 if the page table
   1141 *          could not be installed (e.g. the atomic compare-exchange failed).
   1142 */
   1143static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
   1144			   struct kvm_mmu_page *sp, bool account_nx,
   1145			   bool shared)
   1146{
   1147	u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
   1148	int ret = 0;
   1149
   1150	if (shared) {
   1151		ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
   1152		if (ret)
   1153			return ret;
   1154	} else {
   1155		tdp_mmu_set_spte(kvm, iter, spte);
   1156	}
   1157
   1158	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
   1159	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
   1160	if (account_nx)
   1161		account_huge_nx_page(kvm, sp);
   1162	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
   1163
   1164	return 0;
   1165}
   1166
   1167/*
   1168 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
   1169 * page tables and SPTEs to translate the faulting guest physical address.
   1170 */
   1171int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
   1172{
   1173	struct kvm_mmu *mmu = vcpu->arch.mmu;
   1174	struct tdp_iter iter;
   1175	struct kvm_mmu_page *sp;
   1176	int ret;
   1177
   1178	kvm_mmu_hugepage_adjust(vcpu, fault);
   1179
   1180	trace_kvm_mmu_spte_requested(fault);
   1181
   1182	rcu_read_lock();
   1183
   1184	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
   1185		if (fault->nx_huge_page_workaround_enabled)
   1186			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
   1187
   1188		if (iter.level == fault->goal_level)
   1189			break;
   1190
   1191		/*
   1192		 * If there is an SPTE mapping a large page at a higher level
   1193		 * than the target, that SPTE must be cleared and replaced
   1194		 * with a non-leaf SPTE.
   1195		 */
   1196		if (is_shadow_present_pte(iter.old_spte) &&
   1197		    is_large_pte(iter.old_spte)) {
   1198			if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
   1199				break;
   1200
   1201			/*
   1202			 * The iter must explicitly re-read the spte here
   1203			 * because the new value informs the !present
   1204			 * path below.
   1205			 */
   1206			iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep);
   1207		}
   1208
   1209		if (!is_shadow_present_pte(iter.old_spte)) {
   1210			bool account_nx = fault->huge_page_disallowed &&
   1211					  fault->req_level >= iter.level;
   1212
   1213			/*
   1214			 * If SPTE has been frozen by another thread, just
   1215			 * give up and retry, avoiding unnecessary page table
   1216			 * allocation and free.
   1217			 */
   1218			if (is_removed_spte(iter.old_spte))
   1219				break;
   1220
   1221			sp = tdp_mmu_alloc_sp(vcpu);
   1222			tdp_mmu_init_child_sp(sp, &iter);
   1223
   1224			if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
   1225				tdp_mmu_free_sp(sp);
   1226				break;
   1227			}
   1228		}
   1229	}
   1230
   1231	/*
   1232	 * Force the guest to retry the access if the upper level SPTEs aren't
   1233	 * in place, or if the target leaf SPTE is frozen by another CPU.
   1234	 */
   1235	if (iter.level != fault->goal_level || is_removed_spte(iter.old_spte)) {
   1236		rcu_read_unlock();
   1237		return RET_PF_RETRY;
   1238	}
   1239
   1240	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
   1241	rcu_read_unlock();
   1242
   1243	return ret;
   1244}
   1245
   1246bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
   1247				 bool flush)
   1248{
   1249	return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
   1250				     range->end, range->may_block, flush);
   1251}
   1252
   1253typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
   1254			      struct kvm_gfn_range *range);
   1255
   1256static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
   1257						   struct kvm_gfn_range *range,
   1258						   tdp_handler_t handler)
   1259{
   1260	struct kvm_mmu_page *root;
   1261	struct tdp_iter iter;
   1262	bool ret = false;
   1263
   1264	/*
    1265	 * Don't support rescheduling; none of the MMU notifiers that funnel
   1266	 * into this helper allow blocking; it'd be dead, wasteful code.
   1267	 */
   1268	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
   1269		rcu_read_lock();
   1270
   1271		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
   1272			ret |= handler(kvm, &iter, range);
   1273
   1274		rcu_read_unlock();
   1275	}
   1276
   1277	return ret;
   1278}
   1279
   1280/*
    1281 * Mark the SPTEs in the range of GFNs [start, end) unaccessed and return non-zero
   1282 * if any of the GFNs in the range have been accessed.
   1283 */
   1284static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
   1285			  struct kvm_gfn_range *range)
   1286{
   1287	u64 new_spte = 0;
   1288
   1289	/* If we have a non-accessed entry we don't need to change the pte. */
   1290	if (!is_accessed_spte(iter->old_spte))
   1291		return false;
   1292
   1293	new_spte = iter->old_spte;
   1294
   1295	if (spte_ad_enabled(new_spte)) {
   1296		new_spte &= ~shadow_accessed_mask;
   1297	} else {
   1298		/*
   1299		 * Capture the dirty status of the page, so that it doesn't get
   1300		 * lost when the SPTE is marked for access tracking.
   1301		 */
   1302		if (is_writable_pte(new_spte))
   1303			kvm_set_pfn_dirty(spte_to_pfn(new_spte));
   1304
   1305		new_spte = mark_spte_for_access_track(new_spte);
   1306	}
   1307
   1308	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
   1309
   1310	return true;
   1311}
   1312
   1313bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
   1314{
   1315	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
   1316}
   1317
   1318static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
   1319			 struct kvm_gfn_range *range)
   1320{
   1321	return is_accessed_spte(iter->old_spte);
   1322}
   1323
   1324bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
   1325{
   1326	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
   1327}
   1328
   1329static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
   1330			 struct kvm_gfn_range *range)
   1331{
   1332	u64 new_spte;
   1333
   1334	/* Huge pages aren't expected to be modified without first being zapped. */
   1335	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
   1336
   1337	if (iter->level != PG_LEVEL_4K ||
   1338	    !is_shadow_present_pte(iter->old_spte))
   1339		return false;
   1340
   1341	/*
   1342	 * Note, when changing a read-only SPTE, it's not strictly necessary to
   1343	 * zero the SPTE before setting the new PFN, but doing so preserves the
    1344	 * invariant that the PFN of a present leaf SPTE can never change.
   1345	 * See __handle_changed_spte().
   1346	 */
   1347	tdp_mmu_set_spte(kvm, iter, 0);
   1348
   1349	if (!pte_write(range->pte)) {
   1350		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
   1351								  pte_pfn(range->pte));
   1352
   1353		tdp_mmu_set_spte(kvm, iter, new_spte);
   1354	}
   1355
   1356	return true;
   1357}
   1358
   1359/*
   1360 * Handle the changed_pte MMU notifier for the TDP MMU.
   1361 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
   1362 * notifier.
   1363 * Returns non-zero if a flush is needed before releasing the MMU lock.
   1364 */
   1365bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
   1366{
   1367	/*
   1368	 * No need to handle the remote TLB flush under RCU protection, the
   1369	 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
   1370	 * shadow page.  See the WARN on pfn_changed in __handle_changed_spte().
   1371	 */
   1372	return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
   1373}
   1374
   1375/*
   1376 * Remove write access from all SPTEs at or above min_level that map GFNs
   1377 * [start, end). Returns true if an SPTE has been changed and the TLBs need to
   1378 * be flushed.
   1379 */
   1380static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
   1381			     gfn_t start, gfn_t end, int min_level)
   1382{
   1383	struct tdp_iter iter;
   1384	u64 new_spte;
   1385	bool spte_set = false;
   1386
   1387	rcu_read_lock();
   1388
   1389	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
   1390
   1391	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
   1392retry:
   1393		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
   1394			continue;
   1395
   1396		if (!is_shadow_present_pte(iter.old_spte) ||
   1397		    !is_last_spte(iter.old_spte, iter.level) ||
   1398		    !(iter.old_spte & PT_WRITABLE_MASK))
   1399			continue;
   1400
   1401		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
   1402
   1403		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
   1404			goto retry;
   1405
   1406		spte_set = true;
   1407	}
   1408
   1409	rcu_read_unlock();
   1410	return spte_set;
   1411}
   1412
   1413/*
   1414 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
   1415 * only affect leaf SPTEs down to min_level.
   1416 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
   1417 */
   1418bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
   1419			     const struct kvm_memory_slot *slot, int min_level)
   1420{
   1421	struct kvm_mmu_page *root;
   1422	bool spte_set = false;
   1423
   1424	lockdep_assert_held_read(&kvm->mmu_lock);
   1425
   1426	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
   1427		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
   1428			     slot->base_gfn + slot->npages, min_level);
   1429
   1430	return spte_set;
   1431}
   1432
   1433static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
   1434{
   1435	struct kvm_mmu_page *sp;
   1436
   1437	gfp |= __GFP_ZERO;
   1438
   1439	sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
   1440	if (!sp)
   1441		return NULL;
   1442
   1443	sp->spt = (void *)__get_free_page(gfp);
   1444	if (!sp->spt) {
   1445		kmem_cache_free(mmu_page_header_cache, sp);
   1446		return NULL;
   1447	}
   1448
   1449	return sp;
   1450}
   1451
   1452static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
   1453						       struct tdp_iter *iter,
   1454						       bool shared)
   1455{
   1456	struct kvm_mmu_page *sp;
   1457
   1458	/*
   1459	 * Since we are allocating while under the MMU lock we have to be
   1460	 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
   1461	 * reclaim and to avoid making any filesystem callbacks (which can end
   1462	 * up invoking KVM MMU notifiers, resulting in a deadlock).
   1463	 *
   1464	 * If this allocation fails we drop the lock and retry with reclaim
   1465	 * allowed.
   1466	 */
   1467	sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
   1468	if (sp)
   1469		return sp;
   1470
   1471	rcu_read_unlock();
   1472
   1473	if (shared)
   1474		read_unlock(&kvm->mmu_lock);
   1475	else
   1476		write_unlock(&kvm->mmu_lock);
   1477
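       	/*
       	 * Flag that mmu_lock was dropped: the next tdp_iter_next() will then
       	 * restart the walk at the current GFN, since the SPTE may have been
       	 * zapped or changed while the lock was not held.
       	 */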
   1478	iter->yielded = true;
   1479	sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
   1480
   1481	if (shared)
   1482		read_lock(&kvm->mmu_lock);
   1483	else
   1484		write_lock(&kvm->mmu_lock);
   1485
   1486	rcu_read_lock();
   1487
   1488	return sp;
   1489}
   1490
   1491static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
   1492				   struct kvm_mmu_page *sp, bool shared)
   1493{
   1494	const u64 huge_spte = iter->old_spte;
   1495	const int level = iter->level;
   1496	int ret, i;
   1497
   1498	tdp_mmu_init_child_sp(sp, iter);
   1499
   1500	/*
   1501	 * No need for atomics when writing to sp->spt since the page table has
   1502	 * not been linked in yet and thus is not reachable from any other CPU.
   1503	 */
   1504	for (i = 0; i < PT64_ENT_PER_PAGE; i++)
   1505		sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i);
   1506
   1507	/*
   1508	 * Replace the huge spte with a pointer to the populated lower level
   1509	 * page table. Since we are making this change without a TLB flush vCPUs
   1510	 * will see a mix of the split mappings and the original huge mapping,
   1511	 * depending on what's currently in their TLB. This is fine from a
   1512	 * correctness standpoint since the translation will be the same either
   1513	 * way.
   1514	 */
   1515	ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
   1516	if (ret)
   1517		goto out;
   1518
   1519	/*
   1520	 * tdp_mmu_link_sp() will handle subtracting the huge page we
   1521	 * are overwriting from the page stats. But we have to manually update
   1522	 * the page stats with the new present child pages.
   1523	 */
   1524	kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE);
   1525
   1526out:
   1527	trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
   1528	return ret;
   1529}
   1530
   1531static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
   1532					 struct kvm_mmu_page *root,
   1533					 gfn_t start, gfn_t end,
   1534					 int target_level, bool shared)
   1535{
   1536	struct kvm_mmu_page *sp = NULL;
   1537	struct tdp_iter iter;
   1538	int ret = 0;
   1539
   1540	rcu_read_lock();
   1541
   1542	/*
   1543	 * Traverse the page table splitting all huge pages above the target
   1544	 * level into one lower level. For example, if we encounter a 1GB page
   1545	 * we split it into 512 2MB pages.
   1546	 *
   1547	 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
   1548	 * to visit an SPTE before ever visiting its children, which means we
   1549	 * will correctly recursively split huge pages that are more than one
   1550	 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
   1551	 * and then splitting each of those to 512 4KB pages).
   1552	 */
   1553	for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
   1554retry:
   1555		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
   1556			continue;
   1557
   1558		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
   1559			continue;
   1560
   1561		if (!sp) {
   1562			sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
   1563			if (!sp) {
   1564				ret = -ENOMEM;
   1565				trace_kvm_mmu_split_huge_page(iter.gfn,
   1566							      iter.old_spte,
   1567							      iter.level, ret);
   1568				break;
   1569			}
   1570
   1571			if (iter.yielded)
   1572				continue;
   1573		}
   1574
   1575		if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
   1576			goto retry;
   1577
   1578		sp = NULL;
   1579	}
   1580
   1581	rcu_read_unlock();
   1582
   1583	/*
   1584	 * It's possible to exit the loop having never used the last sp if, for
   1585	 * example, a vCPU doing HugePage NX splitting wins the race and
   1586	 * installs its own sp in place of the last sp we tried to split.
   1587	 */
   1588	if (sp)
   1589		tdp_mmu_free_sp(sp);
   1590
   1591	return ret;
   1592}
   1593
   1595/*
   1596 * Try to split all huge pages mapped by the TDP MMU down to the target level.
   1597 */
   1598void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
   1599				      const struct kvm_memory_slot *slot,
   1600				      gfn_t start, gfn_t end,
   1601				      int target_level, bool shared)
   1602{
   1603	struct kvm_mmu_page *root;
   1604	int r = 0;
   1605
   1606	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
   1607
   1608	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
   1609		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
   1610		if (r) {
   1611			kvm_tdp_mmu_put_root(kvm, root, shared);
   1612			break;
   1613		}
   1614	}
   1615}
   1616
   1617/*
   1618 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
   1619 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
   1620 * If AD bits are not enabled, this will require clearing the writable bit on
   1621 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
   1622 * be flushed.
   1623 */
   1624static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
   1625			   gfn_t start, gfn_t end)
   1626{
   1627	struct tdp_iter iter;
   1628	u64 new_spte;
   1629	bool spte_set = false;
   1630
   1631	rcu_read_lock();
   1632
   1633	tdp_root_for_each_leaf_pte(iter, root, start, end) {
   1634retry:
   1635		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
   1636			continue;
   1637
   1638		if (!is_shadow_present_pte(iter.old_spte))
   1639			continue;
   1640
   1641		if (spte_ad_need_write_protect(iter.old_spte)) {
   1642			if (is_writable_pte(iter.old_spte))
   1643				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
   1644			else
   1645				continue;
   1646		} else {
   1647			if (iter.old_spte & shadow_dirty_mask)
   1648				new_spte = iter.old_spte & ~shadow_dirty_mask;
   1649			else
   1650				continue;
   1651		}
   1652
   1653		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
   1654			goto retry;
   1655
   1656		spte_set = true;
   1657	}
   1658
   1659	rcu_read_unlock();
   1660	return spte_set;
   1661}
   1662
   1663/*
   1664 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
   1665 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
   1666 * If AD bits are not enabled, this will require clearing the writable bit on
   1667 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
   1668 * be flushed.
   1669 */
   1670bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
   1671				  const struct kvm_memory_slot *slot)
   1672{
   1673	struct kvm_mmu_page *root;
   1674	bool spte_set = false;
   1675
   1676	lockdep_assert_held_read(&kvm->mmu_lock);
   1677
   1678	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
   1679		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
   1680				slot->base_gfn + slot->npages);
   1681
   1682	return spte_set;
   1683}
   1684
   1685/*
   1686 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
   1687 * set in mask, starting at gfn. The given memslot is expected to contain all
   1688 * the GFNs represented by set bits in the mask. If AD bits are enabled,
   1689 * clearing the dirty status will involve clearing the dirty bit on each SPTE
   1690 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
   1691 */
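       /*
        * Illustration (hedged): bit i of @mask corresponds to @gfn + i, so e.g.
        * mask == 0x5 clears the dirty status of gfn + 0 and gfn + 2 only. The
        * walk below covers [gfn + __ffs(mask), gfn + BITS_PER_LONG) and bails
        * out early once every requested bit has been handled.
        */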
   1692static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
   1693				  gfn_t gfn, unsigned long mask, bool wrprot)
   1694{
   1695	struct tdp_iter iter;
   1696	u64 new_spte;
   1697
   1698	rcu_read_lock();
   1699
   1700	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
   1701				    gfn + BITS_PER_LONG) {
   1702		if (!mask)
   1703			break;
   1704
   1705		if (iter.level > PG_LEVEL_4K ||
   1706		    !(mask & (1UL << (iter.gfn - gfn))))
   1707			continue;
   1708
   1709		mask &= ~(1UL << (iter.gfn - gfn));
   1710
   1711		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
   1712			if (is_writable_pte(iter.old_spte))
   1713				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
   1714			else
   1715				continue;
   1716		} else {
   1717			if (iter.old_spte & shadow_dirty_mask)
   1718				new_spte = iter.old_spte & ~shadow_dirty_mask;
   1719			else
   1720				continue;
   1721		}
   1722
   1723		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
   1724	}
   1725
   1726	rcu_read_unlock();
   1727}
   1728
   1729/*
   1730 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
   1731 * set in mask, starting at gfn. The given memslot is expected to contain all
   1732 * the GFNs represented by set bits in the mask. If AD bits are enabled,
   1733 * clearing the dirty status will involve clearing the dirty bit on each SPTE
   1734 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
   1735 */
   1736void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
   1737				       struct kvm_memory_slot *slot,
   1738				       gfn_t gfn, unsigned long mask,
   1739				       bool wrprot)
   1740{
   1741	struct kvm_mmu_page *root;
   1742
   1743	lockdep_assert_held_write(&kvm->mmu_lock);
   1744	for_each_tdp_mmu_root(kvm, root, slot->as_id)
   1745		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
   1746}
   1747
   1748/*
   1749 * Clear leaf entries which could be replaced by large mappings, for
   1750 * GFNs within the slot.
   1751 */
   1752static void zap_collapsible_spte_range(struct kvm *kvm,
   1753				       struct kvm_mmu_page *root,
   1754				       const struct kvm_memory_slot *slot)
   1755{
   1756	gfn_t start = slot->base_gfn;
   1757	gfn_t end = start + slot->npages;
   1758	struct tdp_iter iter;
   1759	int max_mapping_level;
   1760	kvm_pfn_t pfn;
   1761
   1762	rcu_read_lock();
   1763
   1764	tdp_root_for_each_pte(iter, root, start, end) {
   1765		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
   1766			continue;
   1767
   1768		if (!is_shadow_present_pte(iter.old_spte) ||
   1769		    !is_last_spte(iter.old_spte, iter.level))
   1770			continue;
   1771
   1772		/*
   1773		 * This is a leaf SPTE. Check if the PFN it maps can
   1774		 * be mapped at a higher level.
   1775		 */
   1776		pfn = spte_to_pfn(iter.old_spte);
   1777
   1778		if (kvm_is_reserved_pfn(pfn))
   1779			continue;
   1780
   1781		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
   1782				iter.gfn, pfn, PG_LEVEL_NUM);
   1783
   1784		WARN_ON(max_mapping_level < iter.level);
   1785
   1786		/*
   1787		 * If this page is already mapped at the highest
   1788		 * viable level, there's nothing more to do.
   1789		 */
   1790		if (max_mapping_level == iter.level)
   1791			continue;
   1792
   1793		/*
   1794		 * The page can be remapped at a higher level, so step
   1795	 * up to zap the parent SPTE. For example, a 4KB leaf SPTE whose PFN
       	 * can be mapped at 2MB steps up one level, so that the SPTE covering
       	 * the whole 2MB region is zapped and can be refaulted as a huge page.
   1796		 */
   1797		while (max_mapping_level > iter.level)
   1798			tdp_iter_step_up(&iter);
   1799
   1800		/* Note, a successful atomic zap also does a remote TLB flush. */
   1801		tdp_mmu_zap_spte_atomic(kvm, &iter);
   1802
   1803		/*
   1804		 * If the atomic zap fails, the iter will recurse back into
   1805		 * the same subtree to retry.
   1806		 */
   1807	}
   1808
   1809	rcu_read_unlock();
   1810}
   1811
   1812/*
   1813 * Clear non-leaf entries (and free associated page tables) which could
   1814 * be replaced by large mappings, for GFNs within the slot.
   1815 */
   1816void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
   1817				       const struct kvm_memory_slot *slot)
   1818{
   1819	struct kvm_mmu_page *root;
   1820
   1821	lockdep_assert_held_read(&kvm->mmu_lock);
   1822
   1823	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
   1824		zap_collapsible_spte_range(kvm, root, slot);
   1825}
   1826
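       /*
        * CachePC helper: walk the leaf SPTEs at or above min_level that map the
        * single GFN, clear the MMU-writable bit and apply cpc_protect_pte() for
        * the requested page-track mode. Returns true if an SPTE was changed and
        * the caller needs to flush the TLBs.
        */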
   1827static bool cpc_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
   1828	gfn_t gfn, int min_level, int mode)
   1829{
   1830	struct tdp_iter iter;
   1831	u64 new_spte;
   1832	bool spte_set = false;
   1833
   1834	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
   1835
   1836	rcu_read_lock();
   1837
   1838	for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
   1839		if (!is_shadow_present_pte(iter.old_spte) ||
   1840		    !is_last_spte(iter.old_spte, iter.level))
   1841			continue;
   1842
   1843		new_spte = iter.old_spte & ~shadow_mmu_writable_mask;
   1844		new_spte = cpc_protect_pte(new_spte, mode);
   1845
   1846		if (new_spte == iter.old_spte)
   1847			break;
   1848
   1849		tdp_mmu_set_spte(kvm, &iter, new_spte);
   1850		spte_set = true;
   1851	}
   1852
   1853	rcu_read_unlock();
   1854
   1855	return spte_set;
   1856}
   1857
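       /*
        * CachePC entry point: apply cpc_protect_gfn() to every TDP MMU root in
        * the memslot's address space. The caller must hold mmu_lock for write
        * and is responsible for flushing TLBs when true is returned (see the
        * hedged example after this function).
        */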
   1858bool cpc_tdp_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
   1859	gfn_t gfn, int min_level, enum kvm_page_track_mode mode)
   1860{
   1861	struct kvm_mmu_page *root;
   1862	bool spte_set = false;
   1863
   1864	lockdep_assert_held_write(&kvm->mmu_lock);
   1865	for_each_tdp_mmu_root(kvm, root, slot->as_id)
   1866		spte_set |= cpc_protect_gfn(kvm, root, gfn, min_level, mode);
   1867
   1868	return spte_set;
   1869}
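
       /*
        * Hedged example (not part of the original file and never called): one
        * way a CachePC user of the helper above might write-protect a single
        * guest page. The example name and the flush policy are assumptions for
        * illustration only.
        */
       static bool __maybe_unused cpc_tdp_protect_gfn_example(struct kvm *kvm,
       	struct kvm_memory_slot *slot, gfn_t gfn)
       {
       	bool flush;

       	write_lock(&kvm->mmu_lock);
       	flush = cpc_tdp_protect_gfn(kvm, slot, gfn, PG_LEVEL_4K,
       				    KVM_PAGE_TRACK_WRITE);
       	write_unlock(&kvm->mmu_lock);

       	/* Only SPTEs were changed above; stale TLB entries must go as well. */
       	if (flush)
       		kvm_flush_remote_tlbs(kvm);

       	return flush;
       }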
   1870
   1871/*
   1872 * Removes write access on the last level SPTE mapping this GFN and unsets the
   1873 * MMU-writable bit to ensure future writes continue to be intercepted.
   1874 * Returns true if an SPTE was set and a TLB flush is needed. In this fork
        * the actual work is delegated to the CachePC helper cpc_tdp_protect_gfn()
        * with the KVM_PAGE_TRACK_WRITE mode.
   1875 */
   1876bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
   1877				   struct kvm_memory_slot *slot, gfn_t gfn,
   1878				   int min_level)
   1879{
   1880	return cpc_tdp_protect_gfn(kvm, slot, gfn, min_level,
   1881		KVM_PAGE_TRACK_WRITE);
   1882}
   1883
   1884/*
   1885 * Return the level of the lowest level SPTE added to sptes.
   1886 * That SPTE may be non-present.
   1887 *
   1888 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
   1889 */
   1890int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
   1891			 int *root_level)
   1892{
   1893	struct tdp_iter iter;
   1894	struct kvm_mmu *mmu = vcpu->arch.mmu;
   1895	gfn_t gfn = addr >> PAGE_SHIFT;
   1896	int leaf = -1;
   1897
   1898	*root_level = vcpu->arch.mmu->root_role.level;
   1899
   1900	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
   1901		leaf = iter.level;
   1902		sptes[leaf] = iter.old_spte;
   1903	}
   1904
   1905	return leaf;
   1906}
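
       /*
        * Hedged example (not part of the original file and never called): how a
        * caller such as the MMIO SPTE lookup in mmu.c is expected to use the
        * walk above, bracketed by the lockless helpers as the contract requires;
        * the helpers are assumed to be the rcu_read_lock()/rcu_read_unlock()
        * wrappers from tdp_mmu.h.
        */
       static int __maybe_unused tdp_mmu_get_walk_example(struct kvm_vcpu *vcpu,
       						    u64 addr)
       {
       	/* One entry per possible level; the walk fills [leaf..root_level]. */
       	u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
       	int root_level, leaf;

       	kvm_tdp_mmu_walk_lockless_begin();
       	leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root_level);
       	kvm_tdp_mmu_walk_lockless_end();

       	/* leaf == -1 means no SPTE was recorded for this address. */
       	return leaf;
       }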
   1907
   1908/*
   1909 * Returns the last level spte pointer of the shadow page walk for the given
   1910 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
   1911 * walk could be performed, returns NULL and *spte does not contain valid data.
   1912 *
   1913 * Contract:
   1914 *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
   1915 *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
   1916 *
   1917 * WARNING: This function is only intended to be called during fast_page_fault.
   1918 */
   1919u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
   1920					u64 *spte)
   1921{
   1922	struct tdp_iter iter;
   1923	struct kvm_mmu *mmu = vcpu->arch.mmu;
   1924	gfn_t gfn = addr >> PAGE_SHIFT;
   1925	tdp_ptep_t sptep = NULL;
   1926
   1927	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
   1928		*spte = iter.old_spte;
   1929		sptep = iter.sptep;
   1930	}
   1931
   1932	/*
   1933	 * Perform the rcu_dereference to get the raw spte pointer value since
   1934	 * we are passing it up to fast_page_fault, which is shared with the
   1935	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
   1936	 * annotation.
   1937	 *
   1938	 * This is safe since fast_page_fault obeys the contracts of this
   1939	 * function as well as all TDP MMU contracts around modifying SPTEs
   1940	 * outside of mmu_lock.
   1941	 */
   1942	return rcu_dereference(sptep);
   1943}