cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

mmu.c (188529B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * Kernel-based Virtual Machine driver for Linux
      4 *
      5 * This module enables machines with Intel VT-x extensions to run virtual
      6 * machines without emulation or binary translation.
      7 *
      8 * MMU support
      9 *
     10 * Copyright (C) 2006 Qumranet, Inc.
     11 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
     12 *
     13 * Authors:
     14 *   Yaniv Kamay  <yaniv@qumranet.com>
     15 *   Avi Kivity   <avi@qumranet.com>
     16 */
     17
     18#include "irq.h"
     19#include "ioapic.h"
     20#include "mmu.h"
     21#include "mmu_internal.h"
     22#include "tdp_mmu.h"
     23#include "x86.h"
     24#include "kvm_cache_regs.h"
     25#include "kvm_emulate.h"
     26#include "cpuid.h"
     27#include "spte.h"
     28
     29#include <linux/kvm_host.h>
     30#include <linux/types.h>
     31#include <linux/string.h>
     32#include <linux/mm.h>
     33#include <linux/highmem.h>
     34#include <linux/moduleparam.h>
     35#include <linux/export.h>
     36#include <linux/swap.h>
     37#include <linux/hugetlb.h>
     38#include <linux/compiler.h>
     39#include <linux/srcu.h>
     40#include <linux/slab.h>
     41#include <linux/sched/signal.h>
     42#include <linux/uaccess.h>
     43#include <linux/hash.h>
     44#include <linux/kern_levels.h>
     45#include <linux/kthread.h>
     46#include <linux/sev.h>
     47
     48#include <asm/page.h>
     49#include <asm/memtype.h>
     50#include <asm/cmpxchg.h>
     51#include <asm/io.h>
     52#include <asm/set_memory.h>
     53#include <asm/vmx.h>
     54#include <asm/kvm_page_track.h>
     55#include "trace.h"
     56
     57#include "paging.h"
     58
     59#include "../cachepc/cachepc.h"
     60#include "../cachepc/track.h"
     61#include "../cachepc/event.h"
     62#include "svm/svm.h"
     63
     64extern bool itlb_multihit_kvm_mitigation;
     65
     66int __read_mostly nx_huge_pages = -1;
     67static uint __read_mostly nx_huge_pages_recovery_period_ms;
     68#ifdef CONFIG_PREEMPT_RT
     69/* Recovery can cause latency spikes, disable it for PREEMPT_RT.  */
     70static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
     71#else
     72static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
     73#endif
     74
     75static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
     76static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp);
     77
     78static const struct kernel_param_ops nx_huge_pages_ops = {
     79	.set = set_nx_huge_pages,
     80	.get = param_get_bool,
     81};
     82
     83static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = {
     84	.set = set_nx_huge_pages_recovery_param,
     85	.get = param_get_uint,
     86};
     87
     88module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
     89__MODULE_PARM_TYPE(nx_huge_pages, "bool");
     90module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_param_ops,
     91		&nx_huge_pages_recovery_ratio, 0644);
     92__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
     93module_param_cb(nx_huge_pages_recovery_period_ms, &nx_huge_pages_recovery_param_ops,
     94		&nx_huge_pages_recovery_period_ms, 0644);
     95__MODULE_PARM_TYPE(nx_huge_pages_recovery_period_ms, "uint");
     96
     97static bool __read_mostly force_flush_and_sync_on_reuse;
     98module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
     99
    100/*
    101 * When this variable is set to true, it enables Two-Dimensional Paging,
    102 * where the hardware walks 2 sets of page tables:
    103 * 1. the guest-virtual to guest-physical tables, and
    104 * 2. for each access made in 1., the guest-physical to host-physical tables.
    105 * If the hardware supports this, we don't need to do shadow paging.
    106 */
    107bool tdp_enabled = false;
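/*
 * Illustrative sketch of the two-dimensional walk described above (the helper
 * names here are hypothetical, not KVM functions): a guest access to a
 * guest-virtual address is resolved roughly as
 *
 *	gpa = walk_guest_page_tables(guest_cr3, gva);      // dimension 1
 *	hpa = walk_nested_page_tables(ncr3_or_eptp, gpa);  // dimension 2
 *
 * where every guest-physical reference made during dimension 1 is itself
 * translated through dimension 2.  Without TDP, KVM must instead build shadow
 * page tables mapping guest-virtual addresses directly to host-physical ones.
 */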
    108
    109static int max_huge_page_level __read_mostly;
    110static int tdp_root_level __read_mostly;
    111static int max_tdp_level __read_mostly;
    112
    113#ifdef MMU_DEBUG
    114bool dbg = 0;
    115module_param(dbg, bool, 0644);
    116#endif
    117
    118#define PTE_PREFETCH_NUM		8
    119
    120#define PT32_LEVEL_BITS 10
    121
    122#define PT32_LEVEL_SHIFT(level) \
    123		(PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
    124
    125#define PT32_LVL_OFFSET_MASK(level) \
    126	(PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
    127						* PT32_LEVEL_BITS))) - 1))
    128
    129#define PT32_INDEX(address, level)\
    130	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
    131
    132
    133#define PT32_BASE_ADDR_MASK PAGE_MASK
    134#define PT32_DIR_BASE_ADDR_MASK \
    135	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
    136#define PT32_LVL_ADDR_MASK(level) \
    137	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
    138					    * PT32_LEVEL_BITS))) - 1))
    139
    140#include <trace/events/kvm.h>
    141
    142/* make pte_list_desc fit well in cache lines */
    143#define PTE_LIST_EXT 14
    144
    145/*
    146 * Slight optimization of cacheline layout: by putting `more' and `spte_count'
    147 * at the start, accessing them uses only a single cacheline for both the
    148 * full (entries == PTE_LIST_EXT) case and the entries <= 6 case.
    149 */
    150struct pte_list_desc {
    151	struct pte_list_desc *more;
    152	/*
    153	 * Number of entries stored in this pte_list_desc.  It doesn't need to be
    154	 * u64, but u64 makes alignment easier.  A value of PTE_LIST_EXT means full.
    155	 */
    156	u64 spte_count;
    157	u64 *sptes[PTE_LIST_EXT];
    158};
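/*
 * Size sketch for the layout above (an illustration, assuming 64-byte
 * cachelines and 8-byte pointers): the header is 16 bytes and each spte
 * pointer is 8 bytes, so six entries end exactly at the first cacheline
 * boundary and a full descriptor spans two cachelines.  This could be
 * checked with:
 *
 *	static_assert(sizeof(struct pte_list_desc) == 128);
 *	static_assert(offsetof(struct pte_list_desc, sptes[6]) == 64);
 */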
    159
    160struct kvm_shadow_walk_iterator {
    161	u64 addr;
    162	hpa_t shadow_addr;
    163	u64 *sptep;
    164	int level;
    165	unsigned index;
    166};
    167
    168#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker)     \
    169	for (shadow_walk_init_using_root(&(_walker), (_vcpu),              \
    170					 (_root), (_addr));                \
    171	     shadow_walk_okay(&(_walker));			           \
    172	     shadow_walk_next(&(_walker)))
    173
    174#define for_each_shadow_entry(_vcpu, _addr, _walker)            \
    175	for (shadow_walk_init(&(_walker), _vcpu, _addr);	\
    176	     shadow_walk_okay(&(_walker));			\
    177	     shadow_walk_next(&(_walker)))
    178
    179#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)	\
    180	for (shadow_walk_init(&(_walker), _vcpu, _addr);		\
    181	     shadow_walk_okay(&(_walker)) &&				\
    182		({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });	\
    183	     __shadow_walk_next(&(_walker), spte))
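/*
 * Usage sketch for the shadow walkers above (illustrative only; "gpa" and
 * "goal_level" are hypothetical variables, see the real users later in this
 * file):
 *
 *	struct kvm_shadow_walk_iterator it;
 *
 *	for_each_shadow_entry(vcpu, gpa, it) {
 *		if (it.level == goal_level)
 *			break;
 *		... inspect or install *it.sptep at it.level ...
 *	}
 */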
    184
    185static struct kmem_cache *pte_list_desc_cache;
    186struct kmem_cache *mmu_page_header_cache;
    187static struct percpu_counter kvm_total_used_mmu_pages;
    188
    189static void mmu_spte_set(u64 *sptep, u64 spte);
    190
    191struct kvm_mmu_role_regs {
    192	const unsigned long cr0;
    193	const unsigned long cr4;
    194	const u64 efer;
    195};
    196
    197#define CREATE_TRACE_POINTS
    198#include "mmutrace.h"
    199
    200/*
    201 * Yes, lots of underscores.  They're a hint that you probably shouldn't be
    202 * reading from the role_regs.  Once the root_role is constructed, it becomes
    203 * the single source of truth for the MMU's state.
    204 */
    205#define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag)			\
    206static inline bool __maybe_unused					\
    207____is_##reg##_##name(const struct kvm_mmu_role_regs *regs)		\
    208{									\
    209	return !!(regs->reg & flag);					\
    210}
    211BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG);
    212BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, wp, X86_CR0_WP);
    213BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pse, X86_CR4_PSE);
    214BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pae, X86_CR4_PAE);
    215BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP);
    216BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP);
    217BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE);
    218BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57);
    219BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX);
    220BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);
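/*
 * For reference, BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG) above
 * expands to:
 *
 *	static inline bool __maybe_unused
 *	____is_cr0_pg(const struct kvm_mmu_role_regs *regs)
 *	{
 *		return !!(regs->cr0 & X86_CR0_PG);
 *	}
 */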
    221
    222/*
    223 * The MMU itself (with a valid role) is the single source of truth for the
    224 * MMU.  Do not use the regs used to build the MMU/role, nor the vCPU.  The
    225 * regs don't account for dependencies, e.g. clearing CR4 bits if CR0.PG=1,
    226 * and the vCPU may be incorrect/irrelevant.
    227 */
    228#define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name)		\
    229static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu)	\
    230{								\
    231	return !!(mmu->cpu_role. base_or_ext . reg##_##name);	\
    232}
    233BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp);
    234BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pse);
    235BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, smep);
    236BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, smap);
    237BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pke);
    238BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, la57);
    239BUILD_MMU_ROLE_ACCESSOR(base, efer, nx);
    240BUILD_MMU_ROLE_ACCESSOR(ext,  efer, lma);
    241
    242static inline bool is_cr0_pg(struct kvm_mmu *mmu)
    243{
    244        return mmu->cpu_role.base.level > 0;
    245}
    246
    247static inline bool is_cr4_pae(struct kvm_mmu *mmu)
    248{
    249        return !mmu->cpu_role.base.has_4_byte_gpte;
    250}
    251
    252static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
    253{
    254	struct kvm_mmu_role_regs regs = {
    255		.cr0 = kvm_read_cr0_bits(vcpu, KVM_MMU_CR0_ROLE_BITS),
    256		.cr4 = kvm_read_cr4_bits(vcpu, KVM_MMU_CR4_ROLE_BITS),
    257		.efer = vcpu->arch.efer,
    258	};
    259
    260	return regs;
    261}
    262
    263static inline bool kvm_available_flush_tlb_with_range(void)
    264{
    265	return kvm_x86_ops.tlb_remote_flush_with_range;
    266}
    267
    268static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
    269		struct kvm_tlb_range *range)
    270{
    271	int ret = -ENOTSUPP;
    272
    273	if (range && kvm_x86_ops.tlb_remote_flush_with_range)
    274		ret = static_call(kvm_x86_tlb_remote_flush_with_range)(kvm, range);
    275
    276	if (ret)
    277		kvm_flush_remote_tlbs(kvm);
    278}
    279
    280void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
    281		u64 start_gfn, u64 pages)
    282{
    283	struct kvm_tlb_range range;
    284
    285	range.start_gfn = start_gfn;
    286	range.pages = pages;
    287
    288	kvm_flush_remote_tlbs_with_range(kvm, &range);
    289}
    290
    291static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
    292			   unsigned int access)
    293{
    294	u64 spte = make_mmio_spte(vcpu, gfn, access);
    295
    296	trace_mark_mmio_spte(sptep, gfn, spte);
    297	mmu_spte_set(sptep, spte);
    298}
    299
    300static gfn_t get_mmio_spte_gfn(u64 spte)
    301{
    302	u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
    303
    304	gpa |= (spte >> SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)
    305	       & shadow_nonpresent_or_rsvd_mask;
    306
    307	return gpa >> PAGE_SHIFT;
    308}
    309
    310static unsigned get_mmio_spte_access(u64 spte)
    311{
    312	return spte & shadow_mmio_access_mask;
    313}
    314
    315static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
    316{
    317	u64 kvm_gen, spte_gen, gen;
    318
    319	gen = kvm_vcpu_memslots(vcpu)->generation;
    320	if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
    321		return false;
    322
    323	kvm_gen = gen & MMIO_SPTE_GEN_MASK;
    324	spte_gen = get_mmio_spte_generation(spte);
    325
    326	trace_check_mmio_spte(spte, kvm_gen, spte_gen);
    327	return likely(kvm_gen == spte_gen);
    328}
    329
    330static int is_cpuid_PSE36(void)
    331{
    332	return 1;
    333}
    334
    335static gfn_t pse36_gfn_delta(u32 gpte)
    336{
    337	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
    338
    339	return (gpte & PT32_DIR_PSE36_MASK) << shift;
    340}
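/*
 * Worked example for the shift above, assuming PT32_DIR_PSE36_SHIFT == 13 and
 * PAGE_SHIFT == 12 (the classic PSE-36 layout, where PDE bits 16:13 carry
 * physical address bits 35:32 of a 4MB guest page): the gfn contribution is
 *
 *	((gpte & PT32_DIR_PSE36_MASK) >> 13) << (32 - PAGE_SHIFT)
 *		== (gpte & PT32_DIR_PSE36_MASK) << 7
 *
 * which matches shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT = 7.
 */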
    341
    342#ifdef CONFIG_X86_64
    343static void __set_spte(u64 *sptep, u64 spte)
    344{
    345	WRITE_ONCE(*sptep, spte);
    346}
    347
    348static void __update_clear_spte_fast(u64 *sptep, u64 spte)
    349{
    350	WRITE_ONCE(*sptep, spte);
    351}
    352
    353static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
    354{
    355	return xchg(sptep, spte);
    356}
    357
    358static u64 __get_spte_lockless(u64 *sptep)
    359{
    360	return READ_ONCE(*sptep);
    361}
    362#else
    363union split_spte {
    364	struct {
    365		u32 spte_low;
    366		u32 spte_high;
    367	};
    368	u64 spte;
    369};
    370
    371static void count_spte_clear(u64 *sptep, u64 spte)
    372{
    373	struct kvm_mmu_page *sp =  sptep_to_sp(sptep);
    374
    375	if (is_shadow_present_pte(spte))
    376		return;
    377
    378	/* Ensure the spte is completely set before we increase the count */
    379	smp_wmb();
    380	sp->clear_spte_count++;
    381}
    382
    383static void __set_spte(u64 *sptep, u64 spte)
    384{
    385	union split_spte *ssptep, sspte;
    386
    387	ssptep = (union split_spte *)sptep;
    388	sspte = (union split_spte)spte;
    389
    390	ssptep->spte_high = sspte.spte_high;
    391
    392	/*
    393	 * If we map the spte from nonpresent to present, we should store
    394	 * the high bits first and only then set the present bit, so the CPU
    395	 * cannot fetch this spte while we are still setting it.
    396	 */
    397	smp_wmb();
    398
    399	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
    400}
    401
    402static void __update_clear_spte_fast(u64 *sptep, u64 spte)
    403{
    404	union split_spte *ssptep, sspte;
    405
    406	ssptep = (union split_spte *)sptep;
    407	sspte = (union split_spte)spte;
    408
    409	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
    410
    411	/*
    412	 * If we map the spte from present to nonpresent, we should clear the
    413	 * present bit first to avoid the vCPU fetching the old high bits.
    414	 */
    415	smp_wmb();
    416
    417	ssptep->spte_high = sspte.spte_high;
    418	count_spte_clear(sptep, spte);
    419}
    420
    421static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
    422{
    423	union split_spte *ssptep, sspte, orig;
    424
    425	ssptep = (union split_spte *)sptep;
    426	sspte = (union split_spte)spte;
    427
    428	/* xchg acts as a barrier before the setting of the high bits */
    429	orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
    430	orig.spte_high = ssptep->spte_high;
    431	ssptep->spte_high = sspte.spte_high;
    432	count_spte_clear(sptep, spte);
    433
    434	return orig.spte;
    435}
    436
    437/*
    438 * The idea of using this lightweight way of getting the spte on x86_32
    439 * guests comes from gup_get_pte (mm/gup.c).
    440 *
    441 * An spte tlb flush may be pending, because kvm_set_pte_rmapp
    442 * coalesces them and we are running outside of the MMU lock.  Therefore
    443 * we need to protect against in-progress updates of the spte.
    444 *
    445 * Reading the spte while an update is in progress may get the old value
    446 * for the high part of the spte.  The race is fine for a present->non-present
    447 * change (because the high part of the spte is ignored for non-present spte),
    448 * but for a present->present change we must reread the spte.
    449 *
    450 * All such changes are done in two steps (present->non-present and
    451 * non-present->present), hence it is enough to count the number of
    452 * present->non-present updates: if it changed while reading the spte,
    453 * we might have hit the race.  This is done using clear_spte_count.
    454 */
    455static u64 __get_spte_lockless(u64 *sptep)
    456{
    457	struct kvm_mmu_page *sp =  sptep_to_sp(sptep);
    458	union split_spte spte, *orig = (union split_spte *)sptep;
    459	int count;
    460
    461retry:
    462	count = sp->clear_spte_count;
    463	smp_rmb();
    464
    465	spte.spte_low = orig->spte_low;
    466	smp_rmb();
    467
    468	spte.spte_high = orig->spte_high;
    469	smp_rmb();
    470
    471	if (unlikely(spte.spte_low != orig->spte_low ||
    472	      count != sp->clear_spte_count))
    473		goto retry;
    474
    475	return spte.spte;
    476}
    477#endif
    478
    479/* Rules for using mmu_spte_set:
    480 * Set the sptep from nonpresent to present.
    481 * Note: the sptep being assigned *must* be either not present
    482 * or in a state where the hardware will not attempt to update
    483 * the spte.
    484 */
    485static void mmu_spte_set(u64 *sptep, u64 new_spte)
    486{
    487	WARN_ON(is_shadow_present_pte(*sptep));
    488	__set_spte(sptep, new_spte);
    489}
    490
    491/*
    492 * Update the SPTE (excluding the PFN), but do not track changes in its
    493 * accessed/dirty status.
    494 */
    495static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
    496{
    497	u64 old_spte = *sptep;
    498
    499	WARN_ON(!is_shadow_present_pte(new_spte));
    500	check_spte_writable_invariants(new_spte);
    501
    502	if (!is_shadow_present_pte(old_spte)) {
    503		mmu_spte_set(sptep, new_spte);
    504		return old_spte;
    505	}
    506
    507	if (!spte_has_volatile_bits(old_spte))
    508		__update_clear_spte_fast(sptep, new_spte);
    509	else
    510		old_spte = __update_clear_spte_slow(sptep, new_spte);
    511
    512	WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
    513
    514	return old_spte;
    515}
    516
    517/* Rules for using mmu_spte_update:
    518 * Update the state bits; the mapped pfn is not changed.
    519 *
    520 * Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote
    521 * TLBs must be flushed. Otherwise rmap_write_protect will find a read-only
    522 * spte, even though the writable spte might be cached on a CPU's TLB.
    523 *
    524 * Returns true if the TLB needs to be flushed
    525 */
    526static bool mmu_spte_update(u64 *sptep, u64 new_spte)
    527{
    528	bool flush = false;
    529	u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
    530
    531	if (!is_shadow_present_pte(old_spte))
    532		return false;
    533
    534	/*
    535	 * Updating the spte outside of the mmu-lock is safe, since
    536	 * we always update it atomically; see the comments in
    537	 * spte_has_volatile_bits().
    538	 */
    539	if (is_mmu_writable_spte(old_spte) &&
    540	      !is_writable_pte(new_spte))
    541		flush = true;
    542
    543	/*
    544	 * Flush TLB when accessed/dirty states are changed in the page tables,
    545	 * to guarantee consistency between TLB and page tables.
    546	 */
    547
    548	if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
    549		flush = true;
    550		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
    551	}
    552
    553	if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
    554		flush = true;
    555		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
    556	}
    557
    558	return flush;
    559}
    560
    561/*
    562 * Rules for using mmu_spte_clear_track_bits:
    563 * It sets the sptep from present to nonpresent while tracking the
    564 * state bits; it is used to clear the last level sptep.
    565 * Returns the old PTE.
    566 */
    567static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
    568{
    569	kvm_pfn_t pfn;
    570	u64 old_spte = *sptep;
    571	int level = sptep_to_sp(sptep)->role.level;
    572
    573	if (!is_shadow_present_pte(old_spte) ||
    574	    !spte_has_volatile_bits(old_spte))
    575		__update_clear_spte_fast(sptep, 0ull);
    576	else
    577		old_spte = __update_clear_spte_slow(sptep, 0ull);
    578
    579	if (!is_shadow_present_pte(old_spte))
    580		return old_spte;
    581
    582	kvm_update_page_stats(kvm, level, -1);
    583
    584	pfn = spte_to_pfn(old_spte);
    585
    586	/*
    587	 * KVM does not hold a refcount on the pages used by the
    588	 * KVM MMU; before reclaiming such a page, we should
    589	 * unmap it from the MMU first.
    590	 */
    591	WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
    592
    593	if (is_accessed_spte(old_spte))
    594		kvm_set_pfn_accessed(pfn);
    595
    596	if (is_dirty_spte(old_spte))
    597		kvm_set_pfn_dirty(pfn);
    598
    599	return old_spte;
    600}
    601
    602/*
    603 * Rules for using mmu_spte_clear_no_track:
    604 * Directly clear the spte without caring about the state bits of the sptep;
    605 * it is used when clearing upper level sptes.
    606 */
    607static void mmu_spte_clear_no_track(u64 *sptep)
    608{
    609	__update_clear_spte_fast(sptep, 0ull);
    610}
    611
    612static u64 mmu_spte_get_lockless(u64 *sptep)
    613{
    614	return __get_spte_lockless(sptep);
    615}
    616
    617/* Returns the Accessed status of the PTE and resets it at the same time. */
    618static bool mmu_spte_age(u64 *sptep)
    619{
    620	u64 spte = mmu_spte_get_lockless(sptep);
    621
    622	if (!is_accessed_spte(spte))
    623		return false;
    624
    625	if (spte_ad_enabled(spte)) {
    626		clear_bit((ffs(shadow_accessed_mask) - 1),
    627			  (unsigned long *)sptep);
    628	} else {
    629		/*
    630		 * Capture the dirty status of the page, so that it doesn't get
    631		 * lost when the SPTE is marked for access tracking.
    632		 */
    633		if (is_writable_pte(spte))
    634			kvm_set_pfn_dirty(spte_to_pfn(spte));
    635
    636		spte = mark_spte_for_access_track(spte);
    637		mmu_spte_update_no_track(sptep, spte);
    638	}
    639
    640	return true;
    641}
    642
    643static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
    644{
    645	if (is_tdp_mmu(vcpu->arch.mmu)) {
    646		kvm_tdp_mmu_walk_lockless_begin();
    647	} else {
    648		/*
    649		 * Prevent page table teardown by making any free-er wait during
    650		 * kvm_flush_remote_tlbs() IPI to all active vcpus.
    651		 */
    652		local_irq_disable();
    653
    654		/*
    655		 * Make sure a following spte read is not reordered ahead of the write
    656		 * to vcpu->mode.
    657		 */
    658		smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
    659	}
    660}
    661
    662static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
    663{
    664	if (is_tdp_mmu(vcpu->arch.mmu)) {
    665		kvm_tdp_mmu_walk_lockless_end();
    666	} else {
    667		/*
    668		 * Make sure the write to vcpu->mode is not reordered in front of
    669	 * reads to sptes.  If it is, kvm_mmu_commit_zap_page() can see us
    670		 * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
    671		 */
    672		smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
    673		local_irq_enable();
    674	}
    675}
    676
    677static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
    678{
    679	int r;
    680
    681	/* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
    682	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
    683				       1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
    684	if (r)
    685		return r;
    686	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
    687				       PT64_ROOT_MAX_LEVEL);
    688	if (r)
    689		return r;
    690	if (maybe_indirect) {
    691		r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache,
    692					       PT64_ROOT_MAX_LEVEL);
    693		if (r)
    694			return r;
    695	}
    696	return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
    697					  PT64_ROOT_MAX_LEVEL);
    698}
    699
    700static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
    701{
    702	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
    703	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
    704	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache);
    705	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
    706}
    707
    708static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
    709{
    710	return kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
    711}
    712
    713static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
    714{
    715	kmem_cache_free(pte_list_desc_cache, pte_list_desc);
    716}
    717
    718static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
    719{
    720	if (sp->role.passthrough)
    721		return sp->gfn;
    722
    723	if (!sp->role.direct)
    724		return sp->gfns[index];
    725
    726	return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
    727}
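/*
 * Worked example for the direct case above: with PT64_LEVEL_BITS == 9, a
 * direct shadow page at role.level == 2 has entries that each cover
 * 1 << 9 == 512 gfns, so index 5 corresponds to gfn sp->gfn + (5 << 9)
 * == sp->gfn + 2560.
 */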
    728
    729static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
    730{
    731	if (sp->role.passthrough) {
    732		WARN_ON_ONCE(gfn != sp->gfn);
    733		return;
    734	}
    735
    736	if (!sp->role.direct) {
    737		sp->gfns[index] = gfn;
    738		return;
    739	}
    740
    741	if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index)))
    742		pr_err_ratelimited("gfn mismatch under direct page %llx "
    743				   "(expected %llx, got %llx)\n",
    744				   sp->gfn,
    745				   kvm_mmu_page_get_gfn(sp, index), gfn);
    746}
    747
    748/*
    749 * Return the pointer to the large page information for a given gfn,
    750 * handling slots that are not large page aligned.
    751 */
    752static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
    753		const struct kvm_memory_slot *slot, int level)
    754{
    755	unsigned long idx;
    756
    757	idx = gfn_to_index(gfn, slot->base_gfn, level);
    758	return &slot->arch.lpage_info[level - 2][idx];
    759}
    760
    761static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
    762					    gfn_t gfn, int count)
    763{
    764	struct kvm_lpage_info *linfo;
    765	int i;
    766
    767	for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
    768		linfo = lpage_info_slot(gfn, slot, i);
    769		linfo->disallow_lpage += count;
    770		WARN_ON(linfo->disallow_lpage < 0);
    771	}
    772}
    773
    774void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
    775{
    776	update_gfn_disallow_lpage_count(slot, gfn, 1);
    777}
    778
    779void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
    780{
    781	update_gfn_disallow_lpage_count(slot, gfn, -1);
    782}
    783
    784static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
    785{
    786	struct kvm_memslots *slots;
    787	struct kvm_memory_slot *slot;
    788	gfn_t gfn;
    789
    790	kvm->arch.indirect_shadow_pages++;
    791	gfn = sp->gfn;
    792	slots = kvm_memslots_for_spte_role(kvm, sp->role);
    793	slot = __gfn_to_memslot(slots, gfn);
    794
    795	/* The non-leaf shadow pages are kept read-only. */
    796	if (sp->role.level > PG_LEVEL_4K)
    797		return kvm_slot_page_track_add_page(kvm, slot, gfn,
    798						    KVM_PAGE_TRACK_WRITE);
    799
    800	kvm_mmu_gfn_disallow_lpage(slot, gfn);
    801}
    802
    803void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
    804{
    805	if (sp->lpage_disallowed)
    806		return;
    807
    808	++kvm->stat.nx_lpage_splits;
    809	list_add_tail(&sp->lpage_disallowed_link,
    810		      &kvm->arch.lpage_disallowed_mmu_pages);
    811	sp->lpage_disallowed = true;
    812}
    813
    814static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
    815{
    816	struct kvm_memslots *slots;
    817	struct kvm_memory_slot *slot;
    818	gfn_t gfn;
    819
    820	kvm->arch.indirect_shadow_pages--;
    821	gfn = sp->gfn;
    822	slots = kvm_memslots_for_spte_role(kvm, sp->role);
    823	slot = __gfn_to_memslot(slots, gfn);
    824	if (sp->role.level > PG_LEVEL_4K)
    825		return kvm_slot_page_track_remove_page(kvm, slot, gfn,
    826						       KVM_PAGE_TRACK_WRITE);
    827
    828	kvm_mmu_gfn_allow_lpage(slot, gfn);
    829}
    830
    831void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
    832{
    833	--kvm->stat.nx_lpage_splits;
    834	sp->lpage_disallowed = false;
    835	list_del(&sp->lpage_disallowed_link);
    836}
    837
    838static struct kvm_memory_slot *
    839gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
    840			    bool no_dirty_log)
    841{
    842	struct kvm_memory_slot *slot;
    843
    844	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
    845	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
    846		return NULL;
    847	if (no_dirty_log && kvm_slot_dirty_track_enabled(slot))
    848		return NULL;
    849
    850	return slot;
    851}
    852
    853/*
    854 * About rmap_head encoding:
    855 *
    856 * If the bit zero of rmap_head->val is clear, then it points to the only spte
    857 * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
    858 * pte_list_desc containing more mappings.
    859 */
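/*
 * Decoding sketch for the encoding above (illustrative only; rmap_get_first()
 * further below is the real implementation):
 *
 *	if (!rmap_head->val)
 *		return NULL;                           (empty chain)
 *	if (!(rmap_head->val & 1))
 *		return (u64 *)rmap_head->val;          (exactly one spte)
 *	desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
 */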
    860
    861/*
    862 * Returns the number of pointers in the rmap chain, not counting the new one.
    863 */
    864static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
    865			struct kvm_rmap_head *rmap_head)
    866{
    867	struct pte_list_desc *desc;
    868	int count = 0;
    869
    870	if (!rmap_head->val) {
    871		rmap_printk("%p %llx 0->1\n", spte, *spte);
    872		rmap_head->val = (unsigned long)spte;
    873	} else if (!(rmap_head->val & 1)) {
    874		rmap_printk("%p %llx 1->many\n", spte, *spte);
    875		desc = mmu_alloc_pte_list_desc(vcpu);
    876		desc->sptes[0] = (u64 *)rmap_head->val;
    877		desc->sptes[1] = spte;
    878		desc->spte_count = 2;
    879		rmap_head->val = (unsigned long)desc | 1;
    880		++count;
    881	} else {
    882		rmap_printk("%p %llx many->many\n", spte, *spte);
    883		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
    884		while (desc->spte_count == PTE_LIST_EXT) {
    885			count += PTE_LIST_EXT;
    886			if (!desc->more) {
    887				desc->more = mmu_alloc_pte_list_desc(vcpu);
    888				desc = desc->more;
    889				desc->spte_count = 0;
    890				break;
    891			}
    892			desc = desc->more;
    893		}
    894		count += desc->spte_count;
    895		desc->sptes[desc->spte_count++] = spte;
    896	}
    897	return count;
    898}
    899
    900static void
    901pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
    902			   struct pte_list_desc *desc, int i,
    903			   struct pte_list_desc *prev_desc)
    904{
    905	int j = desc->spte_count - 1;
    906
    907	desc->sptes[i] = desc->sptes[j];
    908	desc->sptes[j] = NULL;
    909	desc->spte_count--;
    910	if (desc->spte_count)
    911		return;
    912	if (!prev_desc && !desc->more)
    913		rmap_head->val = 0;
    914	else
    915		if (prev_desc)
    916			prev_desc->more = desc->more;
    917		else
    918			rmap_head->val = (unsigned long)desc->more | 1;
    919	mmu_free_pte_list_desc(desc);
    920}
    921
    922static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
    923{
    924	struct pte_list_desc *desc;
    925	struct pte_list_desc *prev_desc;
    926	int i;
    927
    928	if (!rmap_head->val) {
    929		pr_err("%s: %p 0->BUG\n", __func__, spte);
    930		BUG();
    931	} else if (!(rmap_head->val & 1)) {
    932		rmap_printk("%p 1->0\n", spte);
    933		if ((u64 *)rmap_head->val != spte) {
    934			pr_err("%s:  %p 1->BUG\n", __func__, spte);
    935			BUG();
    936		}
    937		rmap_head->val = 0;
    938	} else {
    939		rmap_printk("%p many->many\n", spte);
    940		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
    941		prev_desc = NULL;
    942		while (desc) {
    943			for (i = 0; i < desc->spte_count; ++i) {
    944				if (desc->sptes[i] == spte) {
    945					pte_list_desc_remove_entry(rmap_head,
    946							desc, i, prev_desc);
    947					return;
    948				}
    949			}
    950			prev_desc = desc;
    951			desc = desc->more;
    952		}
    953		pr_err("%s: %p many->many\n", __func__, spte);
    954		BUG();
    955	}
    956}
    957
    958static void pte_list_remove(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
    959			    u64 *sptep)
    960{
    961	mmu_spte_clear_track_bits(kvm, sptep);
    962	__pte_list_remove(sptep, rmap_head);
    963}
    964
    965/* Return true if rmap existed, false otherwise */
    966static bool pte_list_destroy(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
    967{
    968	struct pte_list_desc *desc, *next;
    969	int i;
    970
    971	if (!rmap_head->val)
    972		return false;
    973
    974	if (!(rmap_head->val & 1)) {
    975		mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val);
    976		goto out;
    977	}
    978
    979	desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
    980
    981	for (; desc; desc = next) {
    982		for (i = 0; i < desc->spte_count; i++)
    983			mmu_spte_clear_track_bits(kvm, desc->sptes[i]);
    984		next = desc->more;
    985		mmu_free_pte_list_desc(desc);
    986	}
    987out:
    988	/* rmap_head is meaningless now, remember to reset it */
    989	rmap_head->val = 0;
    990	return true;
    991}
    992
    993unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
    994{
    995	struct pte_list_desc *desc;
    996	unsigned int count = 0;
    997
    998	if (!rmap_head->val)
    999		return 0;
   1000	else if (!(rmap_head->val & 1))
   1001		return 1;
   1002
   1003	desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
   1004
   1005	while (desc) {
   1006		count += desc->spte_count;
   1007		desc = desc->more;
   1008	}
   1009
   1010	return count;
   1011}
   1012
   1013static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
   1014					 const struct kvm_memory_slot *slot)
   1015{
   1016	unsigned long idx;
   1017
   1018	idx = gfn_to_index(gfn, slot->base_gfn, level);
   1019	return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
   1020}
   1021
   1022static bool rmap_can_add(struct kvm_vcpu *vcpu)
   1023{
   1024	struct kvm_mmu_memory_cache *mc;
   1025
   1026	mc = &vcpu->arch.mmu_pte_list_desc_cache;
   1027	return kvm_mmu_memory_cache_nr_free_objects(mc);
   1028}
   1029
   1030static void rmap_remove(struct kvm *kvm, u64 *spte)
   1031{
   1032	struct kvm_memslots *slots;
   1033	struct kvm_memory_slot *slot;
   1034	struct kvm_mmu_page *sp;
   1035	gfn_t gfn;
   1036	struct kvm_rmap_head *rmap_head;
   1037
   1038	sp = sptep_to_sp(spte);
   1039	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
   1040
   1041	/*
   1042	 * Unlike rmap_add, rmap_remove does not run in the context of a vCPU
   1043	 * so we have to determine which memslots to use based on context
   1044	 * information in sp->role.
   1045	 */
   1046	slots = kvm_memslots_for_spte_role(kvm, sp->role);
   1047
   1048	slot = __gfn_to_memslot(slots, gfn);
   1049	rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
   1050
   1051	__pte_list_remove(spte, rmap_head);
   1052}
   1053
   1054/*
   1055 * Used by the following functions to iterate through the sptes linked by a
   1056 * rmap.  All fields are private and not assumed to be used outside.
   1057 */
   1058struct rmap_iterator {
   1059	/* private fields */
   1060	struct pte_list_desc *desc;	/* holds the sptep if not NULL */
   1061	int pos;			/* index of the sptep */
   1062};
   1063
   1064/*
   1065 * Iteration must be started by this function.  This should also be used after
   1066 * removing/dropping sptes from the rmap link because in such cases the
   1067 * information in the iterator may not be valid.
   1068 *
   1069 * Returns sptep if found, NULL otherwise.
   1070 */
   1071static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
   1072			   struct rmap_iterator *iter)
   1073{
   1074	u64 *sptep;
   1075
   1076	if (!rmap_head->val)
   1077		return NULL;
   1078
   1079	if (!(rmap_head->val & 1)) {
   1080		iter->desc = NULL;
   1081		sptep = (u64 *)rmap_head->val;
   1082		goto out;
   1083	}
   1084
   1085	iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
   1086	iter->pos = 0;
   1087	sptep = iter->desc->sptes[iter->pos];
   1088out:
   1089	BUG_ON(!is_shadow_present_pte(*sptep));
   1090	return sptep;
   1091}
   1092
   1093/*
   1094 * Must be used with a valid iterator: e.g. after rmap_get_first().
   1095 *
   1096 * Returns sptep if found, NULL otherwise.
   1097 */
   1098static u64 *rmap_get_next(struct rmap_iterator *iter)
   1099{
   1100	u64 *sptep;
   1101
   1102	if (iter->desc) {
   1103		if (iter->pos < PTE_LIST_EXT - 1) {
   1104			++iter->pos;
   1105			sptep = iter->desc->sptes[iter->pos];
   1106			if (sptep)
   1107				goto out;
   1108		}
   1109
   1110		iter->desc = iter->desc->more;
   1111
   1112		if (iter->desc) {
   1113			iter->pos = 0;
   1114			/* desc->sptes[0] cannot be NULL */
   1115			sptep = iter->desc->sptes[iter->pos];
   1116			goto out;
   1117		}
   1118	}
   1119
   1120	return NULL;
   1121out:
   1122	BUG_ON(!is_shadow_present_pte(*sptep));
   1123	return sptep;
   1124}
   1125
   1126#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_)			\
   1127	for (_spte_ = rmap_get_first(_rmap_head_, _iter_);		\
   1128	     _spte_; _spte_ = rmap_get_next(_iter_))
   1129
   1130static void drop_spte(struct kvm *kvm, u64 *sptep)
   1131{
   1132	u64 old_spte = mmu_spte_clear_track_bits(kvm, sptep);
   1133
   1134	if (is_shadow_present_pte(old_spte))
   1135		rmap_remove(kvm, sptep);
   1136}
   1137
   1138
   1139static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
   1140{
   1141	if (is_large_pte(*sptep)) {
   1142		WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K);
   1143		drop_spte(kvm, sptep);
   1144		return true;
   1145	}
   1146
   1147	return false;
   1148}
   1149
   1150static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
   1151{
   1152	if (__drop_large_spte(vcpu->kvm, sptep)) {
   1153		struct kvm_mmu_page *sp = sptep_to_sp(sptep);
   1154
   1155		kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
   1156			KVM_PAGES_PER_HPAGE(sp->role.level));
   1157	}
   1158}
   1159
   1160bool
   1161cpc_spte_protect(u64 *sptep, bool pt_protect, enum kvm_page_track_mode mode)
   1162{
   1163	u64 spte;
   1164
   1165	spte = *sptep;
   1166	if (!is_writable_pte(spte) && !(pt_protect && is_mmu_writable_spte(spte)))
   1167		return false;
   1168
   1169	if (pt_protect)
   1170		spte &= ~shadow_mmu_writable_mask;
   1171
   1172	spte = cpc_protect_pte(spte, mode);
   1173
   1174	mmu_spte_update(sptep, spte);
   1175
   1176	return true;
   1177}
   1178
   1179bool cpc_rmap_protect(struct kvm_rmap_head *rmap_head,
   1180	bool pt_protect, enum kvm_page_track_mode mode)
   1181{
   1182	struct rmap_iterator iter;
   1183	bool flush;
   1184	u64 *sptep;
   1185
   1186	flush = false;
   1187	for_each_rmap_spte(rmap_head, &iter, sptep) {
   1188		flush |= cpc_spte_protect(sptep, pt_protect, mode);
   1189	}
   1190
   1191	return flush;
   1192}
   1193/*
   1194 * Write-protect the specified @sptep; @pt_protect indicates whether the
   1195 * spte write-protection is caused by protecting the shadow page table.
   1196 *
   1197 * Note: write protection differs between dirty logging and spte
   1198 * protection:
   1199 * - for dirty logging, the spte can be made writable at any time if
   1200 *   its dirty bitmap is properly set.
   1201 * - for spte protection, the spte can be made writable only after
   1202 *   unsync-ing the shadow page.
   1203 *
   1204 * Return true if the TLB needs to be flushed.
   1205 */
   1206static bool rmap_write_protect(struct kvm_rmap_head *rmap_head,
   1207			       bool pt_protect)
   1208{
   1209	return cpc_rmap_protect(rmap_head, pt_protect, KVM_PAGE_TRACK_WRITE);
   1210}
   1211
   1212static bool spte_clear_dirty(u64 *sptep)
   1213{
   1214	u64 spte = *sptep;
   1215
   1216	rmap_printk("spte %p %llx\n", sptep, *sptep);
   1217
   1218	MMU_WARN_ON(!spte_ad_enabled(spte));
   1219	spte &= ~shadow_dirty_mask;
   1220	return mmu_spte_update(sptep, spte);
   1221}
   1222
   1223static bool spte_wrprot_for_clear_dirty(u64 *sptep)
   1224{
   1225	bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
   1226					       (unsigned long *)sptep);
   1227	if (was_writable && !spte_ad_enabled(*sptep))
   1228		kvm_set_pfn_dirty(spte_to_pfn(*sptep));
   1229
   1230	return was_writable;
   1231}
   1232
   1233/*
   1234 * Gets the GFN ready for another round of dirty logging by clearing the
   1235 *	- D bit on ad-enabled SPTEs, and
   1236 *	- W bit on ad-disabled SPTEs.
   1237 * Returns true iff any D or W bits were cleared.
   1238 */
   1239static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
   1240			       const struct kvm_memory_slot *slot)
   1241{
   1242	u64 *sptep;
   1243	struct rmap_iterator iter;
   1244	bool flush = false;
   1245
   1246	for_each_rmap_spte(rmap_head, &iter, sptep)
   1247		if (spte_ad_need_write_protect(*sptep))
   1248			flush |= spte_wrprot_for_clear_dirty(sptep);
   1249		else
   1250			flush |= spte_clear_dirty(sptep);
   1251
   1252	return flush;
   1253}
   1254
   1255/**
   1256 * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
   1257 * @kvm: kvm instance
   1258 * @slot: slot to protect
   1259 * @gfn_offset: start of the BITS_PER_LONG pages we care about
   1260 * @mask: indicates which pages we should protect
   1261 *
   1262 * Used when we do not need to care about huge page mappings.
   1263 */
   1264static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
   1265				     struct kvm_memory_slot *slot,
   1266				     gfn_t gfn_offset, unsigned long mask)
   1267{
   1268	struct kvm_rmap_head *rmap_head;
   1269
   1270	if (is_tdp_mmu_enabled(kvm))
   1271		kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
   1272				slot->base_gfn + gfn_offset, mask, true);
   1273
   1274	if (!kvm_memslots_have_rmaps(kvm))
   1275		return;
   1276
   1277	while (mask) {
   1278		rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
   1279					PG_LEVEL_4K, slot);
   1280		rmap_write_protect(rmap_head, false);
   1281
   1282		/* clear the first set bit */
   1283		mask &= mask - 1;
   1284	}
   1285}
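/*
 * Example of the mask walk used above: for mask == 0b101001 the loop visits
 * bit offsets 0, 3 and 5 (i.e. gfn_offset + 0, + 3 and + 5), since
 * __ffs(mask) yields the lowest set bit and mask &= mask - 1 clears it.
 */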
   1286
   1287/**
   1288 * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
   1289 * protect the page if the D-bit isn't supported.
   1290 * @kvm: kvm instance
   1291 * @slot: slot to clear D-bit
   1292 * @gfn_offset: start of the BITS_PER_LONG pages we care about
   1293 * @mask: indicates which pages we should clear D-bit
   1294 *
   1295 * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
   1296 */
   1297static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
   1298					 struct kvm_memory_slot *slot,
   1299					 gfn_t gfn_offset, unsigned long mask)
   1300{
   1301	struct kvm_rmap_head *rmap_head;
   1302
   1303	if (is_tdp_mmu_enabled(kvm))
   1304		kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
   1305				slot->base_gfn + gfn_offset, mask, false);
   1306
   1307	if (!kvm_memslots_have_rmaps(kvm))
   1308		return;
   1309
   1310	while (mask) {
   1311		rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
   1312					PG_LEVEL_4K, slot);
   1313		__rmap_clear_dirty(kvm, rmap_head, slot);
   1314
   1315		/* clear the first set bit */
   1316		mask &= mask - 1;
   1317	}
   1318}
   1319
   1320/**
   1321 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
   1322 * PT level pages.
   1323 *
   1324 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
   1325 * enable dirty logging for them.
   1326 *
   1327 * We need to care about huge page mappings: e.g. during dirty logging we may
   1328 * have such mappings.
   1329 */
   1330void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
   1331				struct kvm_memory_slot *slot,
   1332				gfn_t gfn_offset, unsigned long mask)
   1333{
   1334	/*
   1335	 * Huge pages are NOT write protected when we start dirty logging in
   1336	 * initially-all-set mode; must write protect them here so that they
   1337	 * are split to 4K on the first write.
   1338	 *
   1339	 * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn
   1340	 * of memslot has no such restriction, so the range can cross two large
   1341	 * pages.
   1342	 */
   1343	if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
   1344		gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask);
   1345		gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);
   1346
   1347		if (READ_ONCE(eager_page_split))
   1348			kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K);
   1349
   1350		kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);
   1351
   1352		/* Cross two large pages? */
   1353		if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) !=
   1354		    ALIGN(end << PAGE_SHIFT, PMD_SIZE))
   1355			kvm_mmu_slot_gfn_write_protect(kvm, slot, end,
   1356						       PG_LEVEL_2M);
   1357	}
   1358
   1359	/* Now handle 4K PTEs.  */
   1360	if (kvm_x86_ops.cpu_dirty_log_size)
   1361		kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
   1362	else
   1363		kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
   1364}
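/*
 * Worked example for the "cross two large pages" check above: a 2MB page
 * covers 512 gfns, so if start == 0x1f8 and end == 0x208, start lies in the
 * large page covering gfns 0x000-0x1ff while end lies in the one covering
 * 0x200-0x3ff, and both large pages must be write-protected.
 */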
   1365
   1366int kvm_cpu_dirty_log_size(void)
   1367{
   1368	return kvm_x86_ops.cpu_dirty_log_size;
   1369}
   1370
   1371bool
   1372cpc_kvm_mmu_slot_gfn_protect(struct kvm *kvm, struct kvm_memory_slot *slot,
   1373	uint64_t gfn, int min_level, enum kvm_page_track_mode mode)
   1374{
   1375	struct kvm_rmap_head *rmap_head;
   1376	bool flush;
   1377	int i;
   1378
   1379	flush = false;
   1380
   1381	if (kvm_memslots_have_rmaps(kvm)) {
   1382		for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
   1383			rmap_head = gfn_to_rmap(gfn, i, slot);
   1384			flush |= cpc_rmap_protect(rmap_head, true, mode);
   1385		}
   1386	} else if (is_tdp_mmu_enabled(kvm)) {
   1387		flush |= cpc_tdp_protect_gfn(kvm, slot, gfn, min_level, mode);
   1388	} else {
   1389		CPC_ERR("Tracking unsupported!\n");
   1390	}
   1391
   1392	return flush;
   1393}
   1394
   1395bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
   1396				    struct kvm_memory_slot *slot, u64 gfn,
   1397				    int min_level)
   1398{
   1399	return cpc_kvm_mmu_slot_gfn_protect(kvm, slot,
   1400		gfn, min_level, KVM_PAGE_TRACK_WRITE);
   1401}
   1402
   1403static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn)
   1404{
   1405	struct kvm_memory_slot *slot;
   1406
   1407	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
   1408	return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K);
   1409}
   1410
   1411static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
   1412			  const struct kvm_memory_slot *slot)
   1413{
   1414	return pte_list_destroy(kvm, rmap_head);
   1415}
   1416
   1417static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
   1418			    struct kvm_memory_slot *slot, gfn_t gfn, int level,
   1419			    pte_t unused)
   1420{
   1421	return kvm_zap_rmapp(kvm, rmap_head, slot);
   1422}
   1423
   1424static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
   1425			      struct kvm_memory_slot *slot, gfn_t gfn, int level,
   1426			      pte_t pte)
   1427{
   1428	u64 *sptep;
   1429	struct rmap_iterator iter;
   1430	bool need_flush = false;
   1431	u64 new_spte;
   1432	kvm_pfn_t new_pfn;
   1433
   1434	WARN_ON(pte_huge(pte));
   1435	new_pfn = pte_pfn(pte);
   1436
   1437restart:
   1438	for_each_rmap_spte(rmap_head, &iter, sptep) {
   1439		rmap_printk("spte %p %llx gfn %llx (%d)\n",
   1440			    sptep, *sptep, gfn, level);
   1441
   1442		need_flush = true;
   1443
   1444		if (pte_write(pte)) {
   1445			pte_list_remove(kvm, rmap_head, sptep);
   1446			goto restart;
   1447		} else {
   1448			new_spte = kvm_mmu_changed_pte_notifier_make_spte(
   1449					*sptep, new_pfn);
   1450
   1451			mmu_spte_clear_track_bits(kvm, sptep);
   1452			mmu_spte_set(sptep, new_spte);
   1453		}
   1454	}
   1455
   1456	if (need_flush && kvm_available_flush_tlb_with_range()) {
   1457		kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
   1458		return false;
   1459	}
   1460
   1461	return need_flush;
   1462}
   1463
   1464struct slot_rmap_walk_iterator {
   1465	/* input fields. */
   1466	const struct kvm_memory_slot *slot;
   1467	gfn_t start_gfn;
   1468	gfn_t end_gfn;
   1469	int start_level;
   1470	int end_level;
   1471
   1472	/* output fields. */
   1473	gfn_t gfn;
   1474	struct kvm_rmap_head *rmap;
   1475	int level;
   1476
   1477	/* private field. */
   1478	struct kvm_rmap_head *end_rmap;
   1479};
   1480
   1481static void
   1482rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
   1483{
   1484	iterator->level = level;
   1485	iterator->gfn = iterator->start_gfn;
   1486	iterator->rmap = gfn_to_rmap(iterator->gfn, level, iterator->slot);
   1487	iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot);
   1488}
   1489
   1490static void
   1491slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
   1492		    const struct kvm_memory_slot *slot, int start_level,
   1493		    int end_level, gfn_t start_gfn, gfn_t end_gfn)
   1494{
   1495	iterator->slot = slot;
   1496	iterator->start_level = start_level;
   1497	iterator->end_level = end_level;
   1498	iterator->start_gfn = start_gfn;
   1499	iterator->end_gfn = end_gfn;
   1500
   1501	rmap_walk_init_level(iterator, iterator->start_level);
   1502}
   1503
   1504static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
   1505{
   1506	return !!iterator->rmap;
   1507}
   1508
   1509static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
   1510{
   1511	while (++iterator->rmap <= iterator->end_rmap) {
   1512		iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
   1513
   1514		if (iterator->rmap->val)
   1515			return;
   1516	}
   1517
   1518	if (++iterator->level > iterator->end_level) {
   1519		iterator->rmap = NULL;
   1520		return;
   1521	}
   1522
   1523	rmap_walk_init_level(iterator, iterator->level);
   1524}
   1525
   1526#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_,	\
   1527	   _start_gfn, _end_gfn, _iter_)				\
   1528	for (slot_rmap_walk_init(_iter_, _slot_, _start_level_,		\
   1529				 _end_level_, _start_gfn, _end_gfn);	\
   1530	     slot_rmap_walk_okay(_iter_);				\
   1531	     slot_rmap_walk_next(_iter_))
   1532
   1533typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
   1534			       struct kvm_memory_slot *slot, gfn_t gfn,
   1535			       int level, pte_t pte);
   1536
   1537static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
   1538						 struct kvm_gfn_range *range,
   1539						 rmap_handler_t handler)
   1540{
   1541	struct slot_rmap_walk_iterator iterator;
   1542	bool ret = false;
   1543
   1544	for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
   1545				 range->start, range->end - 1, &iterator)
   1546		ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
   1547			       iterator.level, range->pte);
   1548
   1549	return ret;
   1550}
   1551
   1552bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
   1553{
   1554	bool flush = false;
   1555
   1556	if (kvm_memslots_have_rmaps(kvm))
   1557		flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp);
   1558
   1559	if (is_tdp_mmu_enabled(kvm))
   1560		flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
   1561
   1562	return flush;
   1563}
   1564
   1565bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
   1566{
   1567	bool flush = false;
   1568
   1569	if (kvm_memslots_have_rmaps(kvm))
   1570		flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp);
   1571
   1572	if (is_tdp_mmu_enabled(kvm))
   1573		flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);
   1574
   1575	return flush;
   1576}
   1577
   1578static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
   1579			  struct kvm_memory_slot *slot, gfn_t gfn, int level,
   1580			  pte_t unused)
   1581{
   1582	u64 *sptep;
   1583	struct rmap_iterator iter;
   1584	int young = 0;
   1585
   1586	for_each_rmap_spte(rmap_head, &iter, sptep)
   1587		young |= mmu_spte_age(sptep);
   1588
   1589	return young;
   1590}
   1591
   1592static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
   1593			       struct kvm_memory_slot *slot, gfn_t gfn,
   1594			       int level, pte_t unused)
   1595{
   1596	u64 *sptep;
   1597	struct rmap_iterator iter;
   1598
   1599	for_each_rmap_spte(rmap_head, &iter, sptep)
   1600		if (is_accessed_spte(*sptep))
   1601			return true;
   1602	return false;
   1603}
   1604
   1605#define RMAP_RECYCLE_THRESHOLD 1000
   1606
   1607static void rmap_add(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
   1608		     u64 *spte, gfn_t gfn)
   1609{
   1610	struct kvm_mmu_page *sp;
   1611	struct kvm_rmap_head *rmap_head;
   1612	int rmap_count;
   1613
   1614	sp = sptep_to_sp(spte);
   1615	kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
   1616	rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
   1617	rmap_count = pte_list_add(vcpu, spte, rmap_head);
   1618
   1619	if (rmap_count > RMAP_RECYCLE_THRESHOLD) {
   1620		kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0));
   1621		kvm_flush_remote_tlbs_with_address(
   1622				vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
   1623	}
   1624}
   1625
   1626bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
   1627{
   1628	bool young = false;
   1629
   1630	if (kvm_memslots_have_rmaps(kvm))
   1631		young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp);
   1632
   1633	if (is_tdp_mmu_enabled(kvm))
   1634		young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
   1635
   1636	return young;
   1637}
   1638
   1639bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
   1640{
   1641	bool young = false;
   1642
   1643	if (kvm_memslots_have_rmaps(kvm))
   1644		young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp);
   1645
   1646	if (is_tdp_mmu_enabled(kvm))
   1647		young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
   1648
   1649	return young;
   1650}
   1651
   1652#ifdef MMU_DEBUG
   1653static int is_empty_shadow_page(u64 *spt)
   1654{
   1655	u64 *pos;
   1656	u64 *end;
   1657
   1658	for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
   1659		if (is_shadow_present_pte(*pos)) {
   1660			printk(KERN_ERR "%s: %p %llx\n", __func__,
   1661			       pos, *pos);
   1662			return 0;
   1663		}
   1664	return 1;
   1665}
   1666#endif
   1667
   1668/*
   1669 * This value is the sum of all of the kvm instances'
   1670 * kvm->arch.n_used_mmu_pages values.  We need a global,
   1671 * aggregate version in order to make the slab shrinker
   1672 * faster.
   1673 */
   1674static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
   1675{
   1676	kvm->arch.n_used_mmu_pages += nr;
   1677	percpu_counter_add(&kvm_total_used_mmu_pages, nr);
   1678}
   1679
   1680static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
   1681{
   1682	MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
   1683	hlist_del(&sp->hash_link);
   1684	list_del(&sp->link);
   1685	free_page((unsigned long)sp->spt);
   1686	if (!sp->role.direct)
   1687		free_page((unsigned long)sp->gfns);
   1688	kmem_cache_free(mmu_page_header_cache, sp);
   1689}
   1690
   1691static unsigned kvm_page_table_hashfn(gfn_t gfn)
   1692{
   1693	return hash_64(gfn, KVM_MMU_HASH_SHIFT);
   1694}
   1695
   1696static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
   1697				    struct kvm_mmu_page *sp, u64 *parent_pte)
   1698{
   1699	if (!parent_pte)
   1700		return;
   1701
   1702	pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
   1703}
   1704
   1705static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
   1706				       u64 *parent_pte)
   1707{
   1708	__pte_list_remove(parent_pte, &sp->parent_ptes);
   1709}
   1710
   1711static void drop_parent_pte(struct kvm_mmu_page *sp,
   1712			    u64 *parent_pte)
   1713{
   1714	mmu_page_remove_parent_pte(sp, parent_pte);
   1715	mmu_spte_clear_no_track(parent_pte);
   1716}
   1717
   1718static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
   1719{
   1720	struct kvm_mmu_page *sp;
   1721
   1722	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
   1723	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
   1724	if (!direct)
   1725		sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache);
   1726	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
   1727
   1728	/*
   1729	 * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
   1730	 * depends on valid pages being added to the head of the list.  See
   1731	 * comments in kvm_zap_obsolete_pages().
   1732	 */
   1733	sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
   1734	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
   1735	kvm_mod_used_mmu_pages(vcpu->kvm, +1);
   1736	return sp;
   1737}
   1738
   1739static void mark_unsync(u64 *spte);
   1740static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
   1741{
   1742	u64 *sptep;
   1743	struct rmap_iterator iter;
   1744
   1745	for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
   1746		mark_unsync(sptep);
   1747	}
   1748}
   1749
   1750static void mark_unsync(u64 *spte)
   1751{
   1752	struct kvm_mmu_page *sp;
   1753	unsigned int index;
   1754
   1755	sp = sptep_to_sp(spte);
   1756	index = spte - sp->spt;
   1757	if (__test_and_set_bit(index, sp->unsync_child_bitmap))
   1758		return;
   1759	if (sp->unsync_children++)
   1760		return;
   1761	kvm_mmu_mark_parents_unsync(sp);
   1762}
   1763
   1764static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
   1765			       struct kvm_mmu_page *sp)
   1766{
   1767	return -1;
   1768}
   1769
   1770#define KVM_PAGE_ARRAY_NR 16
   1771
   1772struct kvm_mmu_pages {
   1773	struct mmu_page_and_offset {
   1774		struct kvm_mmu_page *sp;
   1775		unsigned int idx;
   1776	} page[KVM_PAGE_ARRAY_NR];
   1777	unsigned int nr;
   1778};
   1779
   1780static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
   1781			 int idx)
   1782{
   1783	int i;
   1784
   1785	if (sp->unsync)
    1786		for (i = 0; i < pvec->nr; i++)
   1787			if (pvec->page[i].sp == sp)
   1788				return 0;
   1789
   1790	pvec->page[pvec->nr].sp = sp;
   1791	pvec->page[pvec->nr].idx = idx;
   1792	pvec->nr++;
   1793	return (pvec->nr == KVM_PAGE_ARRAY_NR);
   1794}
   1795
   1796static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
   1797{
   1798	--sp->unsync_children;
   1799	WARN_ON((int)sp->unsync_children < 0);
   1800	__clear_bit(idx, sp->unsync_child_bitmap);
   1801}
   1802
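        /*
         * Walk sp's unsync_child_bitmap and add each reachable unsync leaf
         * page (and any intermediate page that still has unsync children) to
         * @pvec.  Returns the number of unsync leaves found, or -ENOSPC if
         * @pvec overflows.
         */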
   1803static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
   1804			   struct kvm_mmu_pages *pvec)
   1805{
   1806	int i, ret, nr_unsync_leaf = 0;
   1807
   1808	for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
   1809		struct kvm_mmu_page *child;
   1810		u64 ent = sp->spt[i];
   1811
   1812		if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
   1813			clear_unsync_child_bit(sp, i);
   1814			continue;
   1815		}
   1816
   1817		child = to_shadow_page(ent & PT64_BASE_ADDR_MASK);
   1818
   1819		if (child->unsync_children) {
   1820			if (mmu_pages_add(pvec, child, i))
   1821				return -ENOSPC;
   1822
   1823			ret = __mmu_unsync_walk(child, pvec);
   1824			if (!ret) {
   1825				clear_unsync_child_bit(sp, i);
   1826				continue;
   1827			} else if (ret > 0) {
   1828				nr_unsync_leaf += ret;
   1829			} else
   1830				return ret;
   1831		} else if (child->unsync) {
   1832			nr_unsync_leaf++;
   1833			if (mmu_pages_add(pvec, child, i))
   1834				return -ENOSPC;
   1835		} else
   1836			clear_unsync_child_bit(sp, i);
   1837	}
   1838
   1839	return nr_unsync_leaf;
   1840}
   1841
   1842#define INVALID_INDEX (-1)
   1843
   1844static int mmu_unsync_walk(struct kvm_mmu_page *sp,
   1845			   struct kvm_mmu_pages *pvec)
   1846{
   1847	pvec->nr = 0;
   1848	if (!sp->unsync_children)
   1849		return 0;
   1850
   1851	mmu_pages_add(pvec, sp, INVALID_INDEX);
   1852	return __mmu_unsync_walk(sp, pvec);
   1853}
   1854
   1855static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
   1856{
   1857	WARN_ON(!sp->unsync);
   1858	trace_kvm_mmu_sync_page(sp);
   1859	sp->unsync = 0;
   1860	--kvm->stat.mmu_unsync;
   1861}
   1862
   1863static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
   1864				     struct list_head *invalid_list);
   1865static void kvm_mmu_commit_zap_page(struct kvm *kvm,
   1866				    struct list_head *invalid_list);
   1867
   1868static bool sp_has_gptes(struct kvm_mmu_page *sp)
   1869{
   1870	if (sp->role.direct)
   1871		return false;
   1872
   1873	if (sp->role.passthrough)
   1874		return false;
   1875
   1876	return true;
   1877}
   1878
   1879#define for_each_valid_sp(_kvm, _sp, _list)				\
   1880	hlist_for_each_entry(_sp, _list, hash_link)			\
   1881		if (is_obsolete_sp((_kvm), (_sp))) {			\
   1882		} else
   1883
   1884#define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn)		\
   1885	for_each_valid_sp(_kvm, _sp,					\
   1886	  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)])	\
   1887		if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else
   1888
   1889static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
   1890			 struct list_head *invalid_list)
   1891{
   1892	int ret = vcpu->arch.mmu->sync_page(vcpu, sp);
   1893
   1894	if (ret < 0)
   1895		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
   1896	return ret;
   1897}
   1898
   1899static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
   1900					struct list_head *invalid_list,
   1901					bool remote_flush)
   1902{
   1903	if (!remote_flush && list_empty(invalid_list))
   1904		return false;
   1905
   1906	if (!list_empty(invalid_list))
   1907		kvm_mmu_commit_zap_page(kvm, invalid_list);
   1908	else
   1909		kvm_flush_remote_tlbs(kvm);
   1910	return true;
   1911}
   1912
   1913static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
   1914{
   1915	if (sp->role.invalid)
   1916		return true;
   1917
    1918	/* TDP MMU pages do not use the MMU generation. */
   1919	return !sp->tdp_mmu_page &&
   1920	       unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
   1921}
   1922
   1923struct mmu_page_path {
   1924	struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
   1925	unsigned int idx[PT64_ROOT_MAX_LEVEL];
   1926};
   1927
   1928#define for_each_sp(pvec, sp, parents, i)			\
   1929		for (i = mmu_pages_first(&pvec, &parents);	\
   1930			i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});	\
   1931			i = mmu_pages_next(&pvec, &parents, i))
   1932
   1933static int mmu_pages_next(struct kvm_mmu_pages *pvec,
   1934			  struct mmu_page_path *parents,
   1935			  int i)
   1936{
   1937	int n;
   1938
   1939	for (n = i+1; n < pvec->nr; n++) {
   1940		struct kvm_mmu_page *sp = pvec->page[n].sp;
   1941		unsigned idx = pvec->page[n].idx;
   1942		int level = sp->role.level;
   1943
   1944		parents->idx[level-1] = idx;
   1945		if (level == PG_LEVEL_4K)
   1946			break;
   1947
   1948		parents->parent[level-2] = sp;
   1949	}
   1950
   1951	return n;
   1952}
   1953
   1954static int mmu_pages_first(struct kvm_mmu_pages *pvec,
   1955			   struct mmu_page_path *parents)
   1956{
   1957	struct kvm_mmu_page *sp;
   1958	int level;
   1959
   1960	if (pvec->nr == 0)
   1961		return 0;
   1962
   1963	WARN_ON(pvec->page[0].idx != INVALID_INDEX);
   1964
   1965	sp = pvec->page[0].sp;
   1966	level = sp->role.level;
   1967	WARN_ON(level == PG_LEVEL_4K);
   1968
   1969	parents->parent[level-2] = sp;
   1970
   1971	/* Also set up a sentinel.  Further entries in pvec are all
   1972	 * children of sp, so this element is never overwritten.
   1973	 */
   1974	parents->parent[level-1] = NULL;
   1975	return mmu_pages_next(pvec, parents, 0);
   1976}
   1977
   1978static void mmu_pages_clear_parents(struct mmu_page_path *parents)
   1979{
   1980	struct kvm_mmu_page *sp;
   1981	unsigned int level = 0;
   1982
   1983	do {
   1984		unsigned int idx = parents->idx[level];
   1985		sp = parents->parent[level];
   1986		if (!sp)
   1987			return;
   1988
   1989		WARN_ON(idx == INVALID_INDEX);
   1990		clear_unsync_child_bit(sp, idx);
   1991		level++;
   1992	} while (!sp->unsync_children);
   1993}
   1994
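        /*
         * Sync all unsync children reachable from @parent: write-protect
         * their gfns, resync the shadow pages against the guest page tables
         * and flush or zap as needed.  May yield mmu_lock if @can_yield; if
         * yielding is needed but not allowed, KVM_REQ_MMU_SYNC is requested
         * and -EINTR returned.
         */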
   1995static int mmu_sync_children(struct kvm_vcpu *vcpu,
   1996			     struct kvm_mmu_page *parent, bool can_yield)
   1997{
   1998	int i;
   1999	struct kvm_mmu_page *sp;
   2000	struct mmu_page_path parents;
   2001	struct kvm_mmu_pages pages;
   2002	LIST_HEAD(invalid_list);
   2003	bool flush = false;
   2004
   2005	while (mmu_unsync_walk(parent, &pages)) {
   2006		bool protected = false;
   2007
   2008		for_each_sp(pages, sp, parents, i)
   2009			protected |= kvm_vcpu_write_protect_gfn(vcpu, sp->gfn);
   2010
   2011		if (protected) {
   2012			kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true);
   2013			flush = false;
   2014		}
   2015
   2016		for_each_sp(pages, sp, parents, i) {
   2017			kvm_unlink_unsync_page(vcpu->kvm, sp);
   2018			flush |= kvm_sync_page(vcpu, sp, &invalid_list) > 0;
   2019			mmu_pages_clear_parents(&parents);
   2020		}
   2021		if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
   2022			kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
   2023			if (!can_yield) {
   2024				kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
   2025				return -EINTR;
   2026			}
   2027
   2028			cond_resched_rwlock_write(&vcpu->kvm->mmu_lock);
   2029			flush = false;
   2030		}
   2031	}
   2032
   2033	kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
   2034	return 0;
   2035}
   2036
   2037static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
   2038{
    2039	atomic_set(&sp->write_flooding_count, 0);
   2040}
   2041
   2042static void clear_sp_write_flooding_count(u64 *spte)
   2043{
   2044	__clear_sp_write_flooding_count(sptep_to_sp(spte));
   2045}
   2046
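        /*
         * Look up a shadow page for @gfn with a matching role in the page
         * hash, syncing or zapping stale and unsync entries along the way;
         * on a miss, allocate and account a new shadow page.
         */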
   2047static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
   2048					     gfn_t gfn,
   2049					     gva_t gaddr,
   2050					     unsigned level,
   2051					     int direct,
   2052					     unsigned int access)
   2053{
   2054	bool direct_mmu = vcpu->arch.mmu->root_role.direct;
   2055	union kvm_mmu_page_role role;
   2056	struct hlist_head *sp_list;
   2057	unsigned quadrant;
   2058	struct kvm_mmu_page *sp;
   2059	int ret;
   2060	int collisions = 0;
   2061	LIST_HEAD(invalid_list);
   2062
   2063	role = vcpu->arch.mmu->root_role;
   2064	role.level = level;
   2065	role.direct = direct;
   2066	role.access = access;
   2067	if (role.has_4_byte_gpte) {
   2068		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
   2069		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
   2070		role.quadrant = quadrant;
   2071	}
   2072	if (level <= vcpu->arch.mmu->cpu_role.base.level)
   2073		role.passthrough = 0;
   2074
   2075	sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
   2076	for_each_valid_sp(vcpu->kvm, sp, sp_list) {
   2077		if (sp->gfn != gfn) {
   2078			collisions++;
   2079			continue;
   2080		}
   2081
   2082		if (sp->role.word != role.word) {
   2083			/*
   2084			 * If the guest is creating an upper-level page, zap
   2085			 * unsync pages for the same gfn.  While it's possible
   2086			 * the guest is using recursive page tables, in all
   2087			 * likelihood the guest has stopped using the unsync
   2088			 * page and is installing a completely unrelated page.
   2089			 * Unsync pages must not be left as is, because the new
   2090			 * upper-level page will be write-protected.
   2091			 */
   2092			if (level > PG_LEVEL_4K && sp->unsync)
   2093				kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
   2094							 &invalid_list);
   2095			continue;
   2096		}
   2097
   2098		if (direct_mmu)
   2099			goto trace_get_page;
   2100
   2101		if (sp->unsync) {
   2102			/*
   2103			 * The page is good, but is stale.  kvm_sync_page does
   2104			 * get the latest guest state, but (unlike mmu_unsync_children)
   2105			 * it doesn't write-protect the page or mark it synchronized!
   2106			 * This way the validity of the mapping is ensured, but the
   2107			 * overhead of write protection is not incurred until the
   2108			 * guest invalidates the TLB mapping.  This allows multiple
   2109			 * SPs for a single gfn to be unsync.
   2110			 *
   2111			 * If the sync fails, the page is zapped.  If so, break
   2112			 * in order to rebuild it.
   2113			 */
   2114			ret = kvm_sync_page(vcpu, sp, &invalid_list);
   2115			if (ret < 0)
   2116				break;
   2117
   2118			WARN_ON(!list_empty(&invalid_list));
   2119			if (ret > 0)
   2120				kvm_flush_remote_tlbs(vcpu->kvm);
   2121		}
   2122
   2123		__clear_sp_write_flooding_count(sp);
   2124
   2125trace_get_page:
   2126		trace_kvm_mmu_get_page(sp, false);
   2127		goto out;
   2128	}
   2129
   2130	++vcpu->kvm->stat.mmu_cache_miss;
   2131
   2132	sp = kvm_mmu_alloc_page(vcpu, direct);
   2133
   2134	sp->gfn = gfn;
   2135	sp->role = role;
   2136	hlist_add_head(&sp->hash_link, sp_list);
   2137	if (sp_has_gptes(sp)) {
   2138		account_shadowed(vcpu->kvm, sp);
   2139		if (level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn))
   2140			kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
   2141	}
   2142	trace_kvm_mmu_get_page(sp, true);
   2143out:
   2144	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
   2145
   2146	if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions)
   2147		vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions;
   2148	return sp;
   2149}
   2150
   2151static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
   2152					struct kvm_vcpu *vcpu, hpa_t root,
   2153					u64 addr)
   2154{
   2155	iterator->addr = addr;
   2156	iterator->shadow_addr = root;
   2157	iterator->level = vcpu->arch.mmu->root_role.level;
   2158
   2159	if (iterator->level >= PT64_ROOT_4LEVEL &&
   2160	    vcpu->arch.mmu->cpu_role.base.level < PT64_ROOT_4LEVEL &&
   2161	    !vcpu->arch.mmu->root_role.direct)
   2162		iterator->level = PT32E_ROOT_LEVEL;
   2163
   2164	if (iterator->level == PT32E_ROOT_LEVEL) {
   2165		/*
   2166		 * prev_root is currently only used for 64-bit hosts. So only
   2167		 * the active root_hpa is valid here.
   2168		 */
   2169		BUG_ON(root != vcpu->arch.mmu->root.hpa);
   2170
   2171		iterator->shadow_addr
   2172			= vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
   2173		iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
   2174		--iterator->level;
   2175		if (!iterator->shadow_addr)
   2176			iterator->level = 0;
   2177	}
   2178}
   2179
   2180static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
   2181			     struct kvm_vcpu *vcpu, u64 addr)
   2182{
   2183	shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root.hpa,
   2184				    addr);
   2185}
   2186
   2187static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
   2188{
   2189	if (iterator->level < PG_LEVEL_4K)
   2190		return false;
   2191
   2192	iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
   2193	iterator->sptep	= ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
   2194	return true;
   2195}
   2196
   2197static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
   2198			       u64 spte)
   2199{
   2200	if (!is_shadow_present_pte(spte) || is_last_spte(spte, iterator->level)) {
   2201		iterator->level = 0;
   2202		return;
   2203	}
   2204
   2205	iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
   2206	--iterator->level;
   2207}
   2208
   2209static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
   2210{
   2211	__shadow_walk_next(iterator, *iterator->sptep);
   2212}
   2213
   2214static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
   2215			     struct kvm_mmu_page *sp)
   2216{
   2217	u64 spte;
   2218
   2219	BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
   2220
   2221	spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp));
   2222
   2223	mmu_spte_set(sptep, spte);
   2224
   2225	mmu_page_add_parent_pte(vcpu, sp, sptep);
   2226
   2227	if (sp->unsync_children || sp->unsync)
   2228		mark_unsync(sptep);
   2229}
   2230
   2231static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
   2232				   unsigned direct_access)
   2233{
   2234	if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
   2235		struct kvm_mmu_page *child;
   2236
   2237		/*
   2238		 * For the direct sp, if the guest pte's dirty bit
    2239		 * changed from clean to dirty, it would corrupt the
    2240		 * sp's access, i.e. allow writes through a read-only sp,
    2241		 * so update the spte at this point to get a new sp
    2242		 * with the correct access.
   2243		 */
   2244		child = to_shadow_page(*sptep & PT64_BASE_ADDR_MASK);
   2245		if (child->role.access == direct_access)
   2246			return;
   2247
   2248		drop_parent_pte(child, sptep);
   2249		kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1);
   2250	}
   2251}
   2252
   2253/* Returns the number of zapped non-leaf child shadow pages. */
   2254static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
   2255			    u64 *spte, struct list_head *invalid_list)
   2256{
   2257	u64 pte;
   2258	struct kvm_mmu_page *child;
   2259
   2260	pte = *spte;
   2261	if (is_shadow_present_pte(pte)) {
   2262		if (is_last_spte(pte, sp->role.level)) {
   2263			drop_spte(kvm, spte);
   2264		} else {
   2265			child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
   2266			drop_parent_pte(child, spte);
   2267
   2268			/*
    2269			 * Recursively zap nested TDP SPs; parentless SPs are
   2270			 * unlikely to be used again in the near future.  This
   2271			 * avoids retaining a large number of stale nested SPs.
   2272			 */
   2273			if (tdp_enabled && invalid_list &&
   2274			    child->role.guest_mode && !child->parent_ptes.val)
   2275				return kvm_mmu_prepare_zap_page(kvm, child,
   2276								invalid_list);
   2277		}
   2278	} else if (is_mmio_spte(pte)) {
   2279		mmu_spte_clear_no_track(spte);
   2280	}
   2281	return 0;
   2282}
   2283
   2284static int kvm_mmu_page_unlink_children(struct kvm *kvm,
   2285					struct kvm_mmu_page *sp,
   2286					struct list_head *invalid_list)
   2287{
   2288	int zapped = 0;
   2289	unsigned i;
   2290
   2291	for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
   2292		zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list);
   2293
   2294	return zapped;
   2295}
   2296
   2297static void kvm_mmu_unlink_parents(struct kvm_mmu_page *sp)
   2298{
   2299	u64 *sptep;
   2300	struct rmap_iterator iter;
   2301
   2302	while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
   2303		drop_parent_pte(sp, sptep);
   2304}
   2305
   2306static int mmu_zap_unsync_children(struct kvm *kvm,
   2307				   struct kvm_mmu_page *parent,
   2308				   struct list_head *invalid_list)
   2309{
   2310	int i, zapped = 0;
   2311	struct mmu_page_path parents;
   2312	struct kvm_mmu_pages pages;
   2313
   2314	if (parent->role.level == PG_LEVEL_4K)
   2315		return 0;
   2316
   2317	while (mmu_unsync_walk(parent, &pages)) {
   2318		struct kvm_mmu_page *sp;
   2319
   2320		for_each_sp(pages, sp, parents, i) {
   2321			kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
   2322			mmu_pages_clear_parents(&parents);
   2323			zapped++;
   2324		}
   2325	}
   2326
   2327	return zapped;
   2328}
   2329
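        /*
         * Disconnect @sp from the MMU: zap its children and unsync
         * descendants, unlink its parent SPTEs and, unless the page is still
         * in use as a root, move it to @invalid_list to be freed after a
         * remote TLB flush.  Returns true if active_mmu_pages became
         * unstable, i.e. callers iterating the list must restart.
         */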
   2330static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
   2331				       struct kvm_mmu_page *sp,
   2332				       struct list_head *invalid_list,
   2333				       int *nr_zapped)
   2334{
   2335	bool list_unstable, zapped_root = false;
   2336
   2337	trace_kvm_mmu_prepare_zap_page(sp);
   2338	++kvm->stat.mmu_shadow_zapped;
   2339	*nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
   2340	*nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list);
   2341	kvm_mmu_unlink_parents(sp);
   2342
   2343	/* Zapping children means active_mmu_pages has become unstable. */
   2344	list_unstable = *nr_zapped;
   2345
   2346	if (!sp->role.invalid && sp_has_gptes(sp))
   2347		unaccount_shadowed(kvm, sp);
   2348
   2349	if (sp->unsync)
   2350		kvm_unlink_unsync_page(kvm, sp);
   2351	if (!sp->root_count) {
   2352		/* Count self */
   2353		(*nr_zapped)++;
   2354
   2355		/*
   2356		 * Already invalid pages (previously active roots) are not on
   2357		 * the active page list.  See list_del() in the "else" case of
   2358		 * !sp->root_count.
   2359		 */
   2360		if (sp->role.invalid)
   2361			list_add(&sp->link, invalid_list);
   2362		else
   2363			list_move(&sp->link, invalid_list);
   2364		kvm_mod_used_mmu_pages(kvm, -1);
   2365	} else {
   2366		/*
   2367		 * Remove the active root from the active page list, the root
   2368		 * will be explicitly freed when the root_count hits zero.
   2369		 */
   2370		list_del(&sp->link);
   2371
   2372		/*
   2373		 * Obsolete pages cannot be used on any vCPUs, see the comment
   2374		 * in kvm_mmu_zap_all_fast().  Note, is_obsolete_sp() also
   2375		 * treats invalid shadow pages as being obsolete.
   2376		 */
   2377		zapped_root = !is_obsolete_sp(kvm, sp);
   2378	}
   2379
   2380	if (sp->lpage_disallowed)
   2381		unaccount_huge_nx_page(kvm, sp);
   2382
   2383	sp->role.invalid = 1;
   2384
   2385	/*
   2386	 * Make the request to free obsolete roots after marking the root
   2387	 * invalid, otherwise other vCPUs may not see it as invalid.
   2388	 */
   2389	if (zapped_root)
   2390		kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
   2391	return list_unstable;
   2392}
   2393
   2394static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
   2395				     struct list_head *invalid_list)
   2396{
   2397	int nr_zapped;
   2398
   2399	__kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped);
   2400	return nr_zapped;
   2401}
   2402
   2403static void kvm_mmu_commit_zap_page(struct kvm *kvm,
   2404				    struct list_head *invalid_list)
   2405{
   2406	struct kvm_mmu_page *sp, *nsp;
   2407
   2408	if (list_empty(invalid_list))
   2409		return;
   2410
   2411	/*
   2412	 * We need to make sure everyone sees our modifications to
    2413	 * the page tables and sees changes to vcpu->mode here. The barrier
   2414	 * in the kvm_flush_remote_tlbs() achieves this. This pairs
   2415	 * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
   2416	 *
   2417	 * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
   2418	 * guest mode and/or lockless shadow page table walks.
   2419	 */
   2420	kvm_flush_remote_tlbs(kvm);
   2421
   2422	list_for_each_entry_safe(sp, nsp, invalid_list, link) {
   2423		WARN_ON(!sp->role.invalid || sp->root_count);
   2424		kvm_mmu_free_page(sp);
   2425	}
   2426}
   2427
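        /*
         * Zap up to @nr_to_zap shadow pages, starting from the tail (oldest
         * entries) of active_mmu_pages and skipping pages still in use as
         * roots.  Returns the number of pages actually zapped.
         */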
   2428static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm,
   2429						  unsigned long nr_to_zap)
   2430{
   2431	unsigned long total_zapped = 0;
   2432	struct kvm_mmu_page *sp, *tmp;
   2433	LIST_HEAD(invalid_list);
   2434	bool unstable;
   2435	int nr_zapped;
   2436
   2437	if (list_empty(&kvm->arch.active_mmu_pages))
   2438		return 0;
   2439
   2440restart:
   2441	list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) {
   2442		/*
    2443		 * Don't zap active root pages; the page itself can't be freed
   2444		 * and zapping it will just force vCPUs to realloc and reload.
   2445		 */
   2446		if (sp->root_count)
   2447			continue;
   2448
   2449		unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list,
   2450						      &nr_zapped);
   2451		total_zapped += nr_zapped;
   2452		if (total_zapped >= nr_to_zap)
   2453			break;
   2454
   2455		if (unstable)
   2456			goto restart;
   2457	}
   2458
   2459	kvm_mmu_commit_zap_page(kvm, &invalid_list);
   2460
   2461	kvm->stat.mmu_recycled += total_zapped;
   2462	return total_zapped;
   2463}
   2464
   2465static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm)
   2466{
   2467	if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
   2468		return kvm->arch.n_max_mmu_pages -
   2469			kvm->arch.n_used_mmu_pages;
   2470
   2471	return 0;
   2472}
   2473
   2474static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
   2475{
   2476	unsigned long avail = kvm_mmu_available_pages(vcpu->kvm);
   2477
   2478	if (likely(avail >= KVM_MIN_FREE_MMU_PAGES))
   2479		return 0;
   2480
   2481	kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail);
   2482
   2483	/*
   2484	 * Note, this check is intentionally soft, it only guarantees that one
   2485	 * page is available, while the caller may end up allocating as many as
   2486	 * four pages, e.g. for PAE roots or for 5-level paging.  Temporarily
   2487	 * exceeding the (arbitrary by default) limit will not harm the host,
   2488	 * being too aggressive may unnecessarily kill the guest, and getting an
   2489	 * exact count is far more trouble than it's worth, especially in the
   2490	 * page fault paths.
   2491	 */
   2492	if (!kvm_mmu_available_pages(vcpu->kvm))
   2493		return -ENOSPC;
   2494	return 0;
   2495}
   2496
   2497/*
    2498 * Change the number of mmu pages allocated to the vm.
    2499 * Note: if goal_nr_mmu_pages is too small, you will get a deadlock.
   2500 */
   2501void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
   2502{
   2503	write_lock(&kvm->mmu_lock);
   2504
   2505	if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
   2506		kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages -
   2507						  goal_nr_mmu_pages);
   2508
   2509		goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
   2510	}
   2511
   2512	kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
   2513
   2514	write_unlock(&kvm->mmu_lock);
   2515}
   2516
   2517int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
   2518{
   2519	struct kvm_mmu_page *sp;
   2520	LIST_HEAD(invalid_list);
   2521	int r;
   2522
   2523	pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
   2524	r = 0;
   2525	write_lock(&kvm->mmu_lock);
   2526	for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
   2527		pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
   2528			 sp->role.word);
   2529		r = 1;
   2530		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
   2531	}
   2532	kvm_mmu_commit_zap_page(kvm, &invalid_list);
   2533	write_unlock(&kvm->mmu_lock);
   2534
   2535	return r;
   2536}
   2537
   2538static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
   2539{
   2540	gpa_t gpa;
   2541	int r;
   2542
   2543	if (vcpu->arch.mmu->root_role.direct)
   2544		return 0;
   2545
   2546	gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
   2547
   2548	r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
   2549
   2550	return r;
   2551}
   2552
   2553static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
   2554{
   2555	trace_kvm_mmu_unsync_page(sp);
   2556	++kvm->stat.mmu_unsync;
   2557	sp->unsync = 1;
   2558
   2559	kvm_mmu_mark_parents_unsync(sp);
   2560}
   2561
   2562/*
   2563 * Attempt to unsync any shadow pages that can be reached by the specified gfn,
    2564 * for which KVM is creating a writable mapping.  Returns 0 if all pages
   2565 * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must
   2566 * be write-protected.
   2567 */
   2568int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
   2569			    gfn_t gfn, bool can_unsync, bool prefetch)
   2570{
   2571	struct kvm_mmu_page *sp;
   2572	bool locked = false;
   2573
   2574	/*
   2575	 * Force write-protection if the page is being tracked.  Note, the page
   2576	 * track machinery is used to write-protect upper-level shadow pages,
   2577	 * i.e. this guards the role.level == 4K assertion below!
   2578	 */
   2579	if (kvm_slot_page_track_is_active(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE))
   2580		return -EPERM;
   2581
   2582	/*
   2583	 * The page is not write-tracked, mark existing shadow pages unsync
   2584	 * unless KVM is synchronizing an unsync SP (can_unsync = false).  In
   2585	 * that case, KVM must complete emulation of the guest TLB flush before
   2586	 * allowing shadow pages to become unsync (writable by the guest).
   2587	 */
   2588	for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
   2589		if (!can_unsync)
   2590			return -EPERM;
   2591
   2592		if (sp->unsync)
   2593			continue;
   2594
   2595		if (prefetch)
   2596			return -EEXIST;
   2597
   2598		/*
   2599		 * TDP MMU page faults require an additional spinlock as they
   2600		 * run with mmu_lock held for read, not write, and the unsync
    2601		 * logic is not thread safe.  Take the spinlock regardless of
   2602		 * the MMU type to avoid extra conditionals/parameters, there's
   2603		 * no meaningful penalty if mmu_lock is held for write.
   2604		 */
   2605		if (!locked) {
   2606			locked = true;
   2607			spin_lock(&kvm->arch.mmu_unsync_pages_lock);
   2608
   2609			/*
   2610			 * Recheck after taking the spinlock, a different vCPU
   2611			 * may have since marked the page unsync.  A false
   2612			 * positive on the unprotected check above is not
   2613			 * possible as clearing sp->unsync _must_ hold mmu_lock
   2614			 * for write, i.e. unsync cannot transition from 0->1
   2615			 * while this CPU holds mmu_lock for read (or write).
   2616			 */
   2617			if (READ_ONCE(sp->unsync))
   2618				continue;
   2619		}
   2620
   2621		WARN_ON(sp->role.level != PG_LEVEL_4K);
   2622		kvm_unsync_page(kvm, sp);
   2623	}
   2624	if (locked)
   2625		spin_unlock(&kvm->arch.mmu_unsync_pages_lock);
   2626
   2627	/*
   2628	 * We need to ensure that the marking of unsync pages is visible
   2629	 * before the SPTE is updated to allow writes because
   2630	 * kvm_mmu_sync_roots() checks the unsync flags without holding
   2631	 * the MMU lock and so can race with this. If the SPTE was updated
   2632	 * before the page had been marked as unsync-ed, something like the
   2633	 * following could happen:
   2634	 *
   2635	 * CPU 1                    CPU 2
   2636	 * ---------------------------------------------------------------------
   2637	 * 1.2 Host updates SPTE
   2638	 *     to be writable
   2639	 *                      2.1 Guest writes a GPTE for GVA X.
   2640	 *                          (GPTE being in the guest page table shadowed
   2641	 *                           by the SP from CPU 1.)
   2642	 *                          This reads SPTE during the page table walk.
   2643	 *                          Since SPTE.W is read as 1, there is no
   2644	 *                          fault.
   2645	 *
   2646	 *                      2.2 Guest issues TLB flush.
   2647	 *                          That causes a VM Exit.
   2648	 *
   2649	 *                      2.3 Walking of unsync pages sees sp->unsync is
   2650	 *                          false and skips the page.
   2651	 *
   2652	 *                      2.4 Guest accesses GVA X.
   2653	 *                          Since the mapping in the SP was not updated,
    2654	 *                          the old mapping for GVA X incorrectly
   2655	 *                          gets used.
   2656	 * 1.1 Host marks SP
   2657	 *     as unsync
   2658	 *     (sp->unsync = true)
   2659	 *
   2660	 * The write barrier below ensures that 1.1 happens before 1.2 and thus
   2661	 * the situation in 2.4 does not arise.  It pairs with the read barrier
   2662	 * in is_unsync_root(), placed between 2.1's load of SPTE.W and 2.3.
   2663	 */
   2664	smp_wmb();
   2665
   2666	return 0;
   2667}
   2668
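        /*
         * Install or update the leaf SPTE at @sptep for @gfn/@pfn, dropping
         * any conflicting mapping first and adding the new SPTE to the gfn's
         * rmap.  Returns a RET_PF_* value, e.g. RET_PF_EMULATE for MMIO or
         * write-protected accesses that require emulation.
         */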
   2669static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
   2670			u64 *sptep, unsigned int pte_access, gfn_t gfn,
   2671			kvm_pfn_t pfn, struct kvm_page_fault *fault)
   2672{
   2673	struct kvm_mmu_page *sp = sptep_to_sp(sptep);
   2674	int level = sp->role.level;
   2675	int was_rmapped = 0;
   2676	int ret = RET_PF_FIXED;
   2677	bool flush = false;
   2678	bool wrprot;
   2679	u64 spte;
   2680
   2681	/* Prefetching always gets a writable pfn.  */
   2682	bool host_writable = !fault || fault->map_writable;
   2683	bool prefetch = !fault || fault->prefetch;
   2684	bool write_fault = fault && fault->write;
   2685
   2686	pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
   2687		 *sptep, write_fault, gfn);
   2688
   2689	if (unlikely(is_noslot_pfn(pfn))) {
   2690		vcpu->stat.pf_mmio_spte_created++;
   2691		mark_mmio_spte(vcpu, sptep, gfn, pte_access);
   2692		return RET_PF_EMULATE;
   2693	}
   2694
   2695	if (is_shadow_present_pte(*sptep)) {
   2696		/*
   2697		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
   2698		 * the parent of the now unreachable PTE.
   2699		 */
   2700		if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) {
   2701			struct kvm_mmu_page *child;
   2702			u64 pte = *sptep;
   2703
   2704			child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
   2705			drop_parent_pte(child, sptep);
   2706			flush = true;
   2707		} else if (pfn != spte_to_pfn(*sptep)) {
   2708			pgprintk("hfn old %llx new %llx\n",
   2709				 spte_to_pfn(*sptep), pfn);
   2710			drop_spte(vcpu->kvm, sptep);
   2711			flush = true;
   2712		} else
   2713			was_rmapped = 1;
   2714	}
   2715
   2716	wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch,
   2717			   true, host_writable, &spte);
   2718
   2719	if (*sptep == spte) {
   2720		ret = RET_PF_SPURIOUS;
   2721	} else {
   2722		flush |= mmu_spte_update(sptep, spte);
   2723		trace_kvm_mmu_set_spte(level, gfn, sptep);
   2724	}
   2725
   2726	if (wrprot) {
   2727		if (write_fault)
   2728			ret = RET_PF_EMULATE;
   2729	}
   2730
   2731	if (flush)
   2732		kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
   2733				KVM_PAGES_PER_HPAGE(level));
   2734
   2735	pgprintk("%s: setting spte %llx\n", __func__, *sptep);
   2736
   2737	if (!was_rmapped) {
   2738		WARN_ON_ONCE(ret == RET_PF_SPURIOUS);
   2739		kvm_update_page_stats(vcpu->kvm, level, 1);
   2740		rmap_add(vcpu, slot, sptep, gfn);
   2741	}
   2742
   2743	return ret;
   2744}
   2745
   2746static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
   2747				    struct kvm_mmu_page *sp,
   2748				    u64 *start, u64 *end)
   2749{
   2750	struct page *pages[PTE_PREFETCH_NUM];
   2751	struct kvm_memory_slot *slot;
   2752	unsigned int access = sp->role.access;
   2753	int i, ret;
   2754	gfn_t gfn;
   2755
   2756	gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
   2757	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
   2758	if (!slot)
   2759		return -1;
   2760
   2761	ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
   2762	if (ret <= 0)
   2763		return -1;
   2764
   2765	for (i = 0; i < ret; i++, gfn++, start++) {
   2766		mmu_set_spte(vcpu, slot, start, access, gfn,
   2767			     page_to_pfn(pages[i]), NULL);
   2768		put_page(pages[i]);
   2769	}
   2770
   2771	return 0;
   2772}
   2773
   2774static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
   2775				  struct kvm_mmu_page *sp, u64 *sptep)
   2776{
   2777	u64 *spte, *start = NULL;
   2778	int i;
   2779
   2780	WARN_ON(!sp->role.direct);
   2781
   2782	i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
   2783	spte = sp->spt + i;
   2784
   2785	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
   2786		if (is_shadow_present_pte(*spte) || spte == sptep) {
   2787			if (!start)
   2788				continue;
   2789			if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
   2790				return;
   2791			start = NULL;
   2792		} else if (!start)
   2793			start = spte;
   2794	}
   2795	if (start)
   2796		direct_pte_prefetch_many(vcpu, sp, start, spte);
   2797}
   2798
   2799static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
   2800{
   2801	struct kvm_mmu_page *sp;
   2802
   2803	sp = sptep_to_sp(sptep);
   2804
   2805	/*
   2806	 * Without accessed bits, there's no way to distinguish between
   2807	 * actually accessed translations and prefetched, so disable pte
   2808	 * prefetch if accessed bits aren't available.
   2809	 */
   2810	if (sp_ad_disabled(sp))
   2811		return;
   2812
   2813	if (sp->role.level > PG_LEVEL_4K)
   2814		return;
   2815
   2816	/*
   2817	 * If addresses are being invalidated, skip prefetching to avoid
   2818	 * accidentally prefetching those addresses.
   2819	 */
   2820	if (unlikely(vcpu->kvm->mmu_notifier_count))
   2821		return;
   2822
   2823	__direct_pte_prefetch(vcpu, sp, sptep);
   2824}
   2825
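        /*
         * Walk the host page tables for the hva backing @gfn to find the
         * level (4K/2M/1G) at which the host maps the page, with IRQs
         * disabled so the tables cannot be torn down under the walk.  The
         * result may be reduced further for the SEV-SNP RMP page size.
         */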
   2826static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
   2827				  const struct kvm_memory_slot *slot)
   2828{
   2829	unsigned long hva;
   2830	unsigned long flags;
   2831	int level = PG_LEVEL_4K;
   2832	pgd_t pgd;
   2833	p4d_t p4d;
   2834	pud_t pud;
   2835	pmd_t pmd;
   2836
   2837	if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn))
   2838		return PG_LEVEL_4K;
   2839
   2840	/*
   2841	 * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
   2842	 * is not solely for performance, it's also necessary to avoid the
   2843	 * "writable" check in __gfn_to_hva_many(), which will always fail on
   2844	 * read-only memslots due to gfn_to_hva() assuming writes.  Earlier
   2845	 * page fault steps have already verified the guest isn't writing a
   2846	 * read-only memslot.
   2847	 */
   2848	hva = __gfn_to_hva_memslot(slot, gfn);
   2849
   2850	/*
   2851	 * Lookup the mapping level in the current mm.  The information
   2852	 * may become stale soon, but it is safe to use as long as
   2853	 * 1) mmu_notifier_retry was checked after taking mmu_lock, and
   2854	 * 2) mmu_lock is taken now.
   2855	 *
   2856	 * We still need to disable IRQs to prevent concurrent tear down
   2857	 * of page tables.
   2858	 */
   2859	local_irq_save(flags);
   2860
   2861	pgd = READ_ONCE(*pgd_offset(kvm->mm, hva));
   2862	if (pgd_none(pgd))
   2863		goto out;
   2864
   2865	p4d = READ_ONCE(*p4d_offset(&pgd, hva));
   2866	if (p4d_none(p4d) || !p4d_present(p4d))
   2867		goto out;
   2868
   2869	pud = READ_ONCE(*pud_offset(&p4d, hva));
   2870	if (pud_none(pud) || !pud_present(pud))
   2871		goto out;
   2872
   2873	if (pud_large(pud)) {
   2874		level = PG_LEVEL_1G;
   2875		goto out;
   2876	}
   2877
   2878	pmd = READ_ONCE(*pmd_offset(&pud, hva));
   2879	if (pmd_none(pmd) || !pmd_present(pmd))
   2880		goto out;
   2881
   2882	if (pmd_large(pmd))
   2883		level = PG_LEVEL_2M;
   2884
   2885out:
   2886	local_irq_restore(flags);
   2887
   2888	/* Adjust the page level based on the SEV-SNP RMP page level. */
   2889	if (kvm_x86_ops.rmp_page_level_adjust)
   2890		static_call(kvm_x86_rmp_page_level_adjust)(kvm, pfn, &level);
   2891
   2892	return level;
   2893}
   2894
   2895int kvm_mmu_max_mapping_level(struct kvm *kvm,
   2896			      const struct kvm_memory_slot *slot, gfn_t gfn,
   2897			      kvm_pfn_t pfn, int max_level)
   2898{
   2899	struct kvm_lpage_info *linfo;
   2900	int host_level;
   2901
   2902	max_level = min(max_level, max_huge_page_level);
   2903	for ( ; max_level > PG_LEVEL_4K; max_level--) {
   2904		linfo = lpage_info_slot(gfn, slot, max_level);
   2905		if (!linfo->disallow_lpage)
   2906			break;
   2907	}
   2908
   2909	if (max_level == PG_LEVEL_4K)
   2910		return PG_LEVEL_4K;
   2911
   2912	host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot);
   2913	return min(host_level, max_level);
   2914}
   2915
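        /*
         * Choose the hugepage level for this fault: clamp fault->max_level
         * by the memslot and host mapping levels, and keep the default 4K
         * goal when dirty logging is enabled or the iTLB multihit workaround
         * disallows a huge page.  The pfn is then aligned to the chosen
         * level.
         */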
   2916void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
   2917{
   2918	struct kvm_memory_slot *slot = fault->slot;
   2919	kvm_pfn_t mask;
   2920
   2921	fault->huge_page_disallowed = fault->exec && fault->nx_huge_page_workaround_enabled;
   2922
   2923	if (unlikely(fault->max_level == PG_LEVEL_4K))
   2924		return;
   2925
   2926	if (is_error_noslot_pfn(fault->pfn) || kvm_is_reserved_pfn(fault->pfn))
   2927		return;
   2928
   2929	if (kvm_slot_dirty_track_enabled(slot))
   2930		return;
   2931
   2932	/*
   2933	 * Enforce the iTLB multihit workaround after capturing the requested
   2934	 * level, which will be used to do precise, accurate accounting.
   2935	 */
   2936	fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot,
   2937						     fault->gfn, fault->pfn,
   2938						     fault->max_level);
   2939	if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
   2940		return;
   2941
   2942	/*
   2943	 * mmu_notifier_retry() was successful and mmu_lock is held, so
   2944	 * the pmd can't be split from under us.
   2945	 */
   2946	fault->goal_level = fault->req_level;
   2947	mask = KVM_PAGES_PER_HPAGE(fault->goal_level) - 1;
   2948	VM_BUG_ON((fault->gfn & mask) != (fault->pfn & mask));
   2949	fault->pfn &= ~mask;
   2950}
   2951
   2952void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level)
   2953{
   2954	if (cur_level > PG_LEVEL_4K &&
   2955	    cur_level == fault->goal_level &&
   2956	    is_shadow_present_pte(spte) &&
   2957	    !is_large_pte(spte)) {
   2958		/*
   2959		 * A small SPTE exists for this pfn, but FNAME(fetch)
   2960		 * and __direct_map would like to create a large PTE
   2961		 * instead: just force them to go down another level,
    2962		 * patching the next 9 bits of the address back into
    2963		 * pfn for them.
   2964		 */
   2965		u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) -
   2966				KVM_PAGES_PER_HPAGE(cur_level - 1);
   2967		fault->pfn |= fault->gfn & page_mask;
   2968		fault->goal_level--;
   2969	}
   2970}
   2971
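        /*
         * Map the faulting GPA in the direct shadow page tables: walk down
         * to fault->goal_level, allocating and linking intermediate shadow
         * pages as needed, then install the final leaf SPTE via
         * mmu_set_spte().
         */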
   2972static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
   2973{
   2974	struct kvm_shadow_walk_iterator it;
   2975	struct kvm_mmu_page *sp;
   2976	int ret;
   2977	gfn_t base_gfn = fault->gfn;
   2978
   2979	kvm_mmu_hugepage_adjust(vcpu, fault);
   2980
   2981	trace_kvm_mmu_spte_requested(fault);
   2982	for_each_shadow_entry(vcpu, fault->addr, it) {
   2983		/*
   2984		 * We cannot overwrite existing page tables with an NX
   2985		 * large page, as the leaf could be executable.
   2986		 */
   2987		if (fault->nx_huge_page_workaround_enabled)
   2988			disallowed_hugepage_adjust(fault, *it.sptep, it.level);
   2989
   2990		base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
   2991		if (it.level == fault->goal_level)
   2992			break;
   2993
   2994		drop_large_spte(vcpu, it.sptep);
   2995		if (is_shadow_present_pte(*it.sptep))
   2996			continue;
   2997
   2998		sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
   2999				      it.level - 1, true, ACC_ALL);
   3000
   3001		link_shadow_page(vcpu, it.sptep, sp);
   3002		if (fault->is_tdp && fault->huge_page_disallowed &&
   3003		    fault->req_level >= it.level)
   3004			account_huge_nx_page(vcpu->kvm, sp);
   3005	}
   3006
   3007	if (WARN_ON_ONCE(it.level != fault->goal_level))
   3008		return -EFAULT;
   3009
   3010	ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL,
   3011			   base_gfn, fault->pfn, fault);
   3012	if (ret == RET_PF_SPURIOUS)
   3013		return ret;
   3014
   3015	direct_pte_prefetch(vcpu, it.sptep);
   3016	return ret;
   3017}
   3018
   3019static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
   3020{
   3021	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
   3022}
   3023
   3024static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
   3025{
   3026	/*
   3027	 * Do not cache the mmio info caused by writing the readonly gfn
    3028	 * into the spte; otherwise a read access on the readonly gfn can
    3029	 * also cause an mmio page fault and be treated as an mmio access.
   3030	 */
   3031	if (pfn == KVM_PFN_ERR_RO_FAULT)
   3032		return RET_PF_EMULATE;
   3033
   3034	if (pfn == KVM_PFN_ERR_HWPOISON) {
   3035		kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
   3036		return RET_PF_RETRY;
   3037	}
   3038
   3039	return -EFAULT;
   3040}
   3041
   3042static int handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
   3043			       unsigned int access)
   3044{
   3045	/* The pfn is invalid, report the error! */
   3046	if (unlikely(is_error_pfn(fault->pfn)))
   3047		return kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn);
   3048
   3049	if (unlikely(!fault->slot)) {
   3050		gva_t gva = fault->is_tdp ? 0 : fault->addr;
   3051
   3052		vcpu_cache_mmio_info(vcpu, gva, fault->gfn,
   3053				     access & shadow_mmio_access_mask);
   3054		/*
   3055		 * If MMIO caching is disabled, emulate immediately without
   3056		 * touching the shadow page tables as attempting to install an
   3057		 * MMIO SPTE will just be an expensive nop.  Do not cache MMIO
    3058		 * whose gfn is greater than host.MAXPHYADDR; any guest that
   3059		 * generates such gfns is running nested and is being tricked
   3060		 * by L0 userspace (you can observe gfn > L1.MAXPHYADDR if
   3061		 * and only if L1's MAXPHYADDR is inaccurate with respect to
   3062		 * the hardware's).
   3063		 */
   3064		if (unlikely(!enable_mmio_caching) ||
   3065		    unlikely(fault->gfn > kvm_mmu_max_gfn()))
   3066			return RET_PF_EMULATE;
   3067	}
   3068
   3069	return RET_PF_CONTINUE;
   3070}
   3071
   3072static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
   3073{
   3074	/*
   3075	 * Page faults with reserved bits set, i.e. faults on MMIO SPTEs, only
   3076	 * reach the common page fault handler if the SPTE has an invalid MMIO
   3077	 * generation number.  Refreshing the MMIO generation needs to go down
   3078	 * the slow path.  Note, EPT Misconfigs do NOT set the PRESENT flag!
   3079	 */
   3080	if (fault->rsvd)
   3081		return false;
   3082
   3083	/*
   3084	 * #PF can be fast if:
   3085	 *
   3086	 * 1. The shadow page table entry is not present and A/D bits are
   3087	 *    disabled _by KVM_, which could mean that the fault is potentially
   3088	 *    caused by access tracking (if enabled).  If A/D bits are enabled
   3089	 *    by KVM, but disabled by L1 for L2, KVM is forced to disable A/D
   3090	 *    bits for L2 and employ access tracking, but the fast page fault
   3091	 *    mechanism only supports direct MMUs.
   3092	 * 2. The shadow page table entry is present, the access is a write,
   3093	 *    and no reserved bits are set (MMIO SPTEs cannot be "fixed"), i.e.
   3094	 *    the fault was caused by a write-protection violation.  If the
   3095	 *    SPTE is MMU-writable (determined later), the fault can be fixed
   3096	 *    by setting the Writable bit, which can be done out of mmu_lock.
   3097	 */
   3098	if (!fault->present)
   3099		return !kvm_ad_enabled();
   3100
   3101	/*
   3102	 * Note, instruction fetches and writes are mutually exclusive, ignore
   3103	 * the "exec" flag.
   3104	 */
   3105	return fault->write;
   3106}
   3107
   3108/*
   3109 * Returns true if the SPTE was fixed successfully. Otherwise,
   3110 * someone else modified the SPTE from its original value.
   3111 */
   3112static bool
   3113fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
   3114			u64 *sptep, u64 old_spte, u64 new_spte)
   3115{
   3116	/*
   3117	 * Theoretically we could also set dirty bit (and flush TLB) here in
   3118	 * order to eliminate unnecessary PML logging. See comments in
   3119	 * set_spte. But fast_page_fault is very unlikely to happen with PML
   3120	 * enabled, so we do not do this. This might result in the same GPA
    3121	 * to be logged in the PML buffer again when the write really happens, and
    3122	 * mark_page_dirty to eventually be called twice. But that does no
    3123	 * harm. This also avoids the TLB flush needed after setting the dirty bit
   3124	 * so non-PML cases won't be impacted.
   3125	 *
   3126	 * Compare with set_spte where instead shadow_dirty_mask is set.
   3127	 */
   3128	if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
   3129		return false;
   3130
   3131	if (is_writable_pte(new_spte) && !is_writable_pte(old_spte))
   3132		mark_page_dirty_in_slot(vcpu->kvm, fault->slot, fault->gfn);
   3133
   3134	return true;
   3135}
   3136
   3137static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte)
   3138{
   3139	if (fault->exec)
   3140		return is_executable_pte(spte);
   3141
   3142	if (fault->write)
   3143		return is_writable_pte(spte);
   3144
   3145	/* Fault was on Read access */
   3146	return spte & PT_PRESENT_MASK;
   3147}
   3148
   3149/*
   3150 * Returns the last level spte pointer of the shadow page walk for the given
    3151 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
   3152 * walk could be performed, returns NULL and *spte does not contain valid data.
   3153 *
   3154 * Contract:
   3155 *  - Must be called between walk_shadow_page_lockless_{begin,end}.
   3156 *  - The returned sptep must not be used after walk_shadow_page_lockless_end.
   3157 */
   3158static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte)
   3159{
   3160	struct kvm_shadow_walk_iterator iterator;
   3161	u64 old_spte;
   3162	u64 *sptep = NULL;
   3163
   3164	for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) {
   3165		sptep = iterator.sptep;
   3166		*spte = old_spte;
   3167	}
   3168
   3169	return sptep;
   3170}
   3171
   3172/*
   3173 * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
   3174 */
   3175static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
   3176{
   3177	struct kvm_mmu_page *sp;
   3178	int ret = RET_PF_INVALID;
   3179	u64 spte = 0ull;
   3180	u64 *sptep = NULL;
   3181	uint retry_count = 0;
   3182
   3183	if (!page_fault_can_be_fast(fault))
   3184		return ret;
   3185
   3186	walk_shadow_page_lockless_begin(vcpu);
   3187
   3188	do {
   3189		u64 new_spte;
   3190
   3191		if (is_tdp_mmu(vcpu->arch.mmu))
   3192			sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
   3193		else
   3194			sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
   3195
   3196		if (!is_shadow_present_pte(spte))
   3197			break;
   3198
   3199		sp = sptep_to_sp(sptep);
   3200		if (!is_last_spte(spte, sp->role.level))
   3201			break;
   3202
   3203		/*
   3204		 * Check whether the memory access that caused the fault would
   3205		 * still cause it if it were to be performed right now. If not,
   3206		 * then this is a spurious fault caused by TLB lazily flushed,
   3207		 * or some other CPU has already fixed the PTE after the
   3208		 * current CPU took the fault.
   3209		 *
   3210		 * Need not check the access of upper level table entries since
   3211		 * they are always ACC_ALL.
   3212		 */
   3213		if (is_access_allowed(fault, spte)) {
   3214			ret = RET_PF_SPURIOUS;
   3215			break;
   3216		}
   3217
   3218		new_spte = spte;
   3219
   3220		/*
   3221		 * KVM only supports fixing page faults outside of MMU lock for
   3222		 * direct MMUs, nested MMUs are always indirect, and KVM always
   3223		 * uses A/D bits for non-nested MMUs.  Thus, if A/D bits are
   3224		 * enabled, the SPTE can't be an access-tracked SPTE.
   3225		 */
   3226		if (unlikely(!kvm_ad_enabled()) && is_access_track_spte(spte))
   3227			new_spte = restore_acc_track_spte(new_spte);
   3228
   3229		/*
   3230		 * To keep things simple, only SPTEs that are MMU-writable can
   3231		 * be made fully writable outside of mmu_lock, e.g. only SPTEs
   3232		 * that were write-protected for dirty-logging or access
   3233		 * tracking are handled here.  Don't bother checking if the
   3234		 * SPTE is writable to prioritize running with A/D bits enabled.
   3235		 * The is_access_allowed() check above handles the common case
   3236		 * of the fault being spurious, and the SPTE is known to be
   3237		 * shadow-present, i.e. except for access tracking restoration
   3238		 * making the new SPTE writable, the check is wasteful.
   3239		 */
   3240		if (fault->write && is_mmu_writable_spte(spte)) {
   3241			new_spte |= PT_WRITABLE_MASK;
   3242
   3243			/*
   3244			 * Do not fix write-permission on the large spte when
   3245			 * dirty logging is enabled. Since we only dirty the
   3246			 * first page into the dirty-bitmap in
   3247			 * fast_pf_fix_direct_spte(), other pages are missed
   3248			 * if its slot has dirty logging enabled.
   3249			 *
   3250			 * Instead, we let the slow page fault path create a
   3251			 * normal spte to fix the access.
   3252			 */
   3253			if (sp->role.level > PG_LEVEL_4K &&
   3254			    kvm_slot_dirty_track_enabled(fault->slot))
   3255				break;
   3256		}
   3257
   3258		/* Verify that the fault can be handled in the fast path */
   3259		if (new_spte == spte ||
   3260		    !is_access_allowed(fault, new_spte))
   3261			break;
   3262
   3263		/*
   3264		 * Currently, fast page fault only works for direct mapping
   3265		 * since the gfn is not stable for indirect shadow page. See
   3266		 * Documentation/virt/kvm/locking.rst to get more detail.
   3267		 */
   3268		if (fast_pf_fix_direct_spte(vcpu, fault, sptep, spte, new_spte)) {
   3269			ret = RET_PF_FIXED;
   3270			break;
   3271		}
   3272
   3273		if (++retry_count > 4) {
   3274			printk_once(KERN_WARNING
   3275				"kvm: Fast #PF retrying more than 4 times.\n");
   3276			break;
   3277		}
   3278
   3279	} while (true);
   3280
   3281	trace_fast_page_fault(vcpu, fault, sptep, spte, ret);
   3282	walk_shadow_page_lockless_end(vcpu);
   3283
   3284	if (ret != RET_PF_INVALID)
   3285		vcpu->stat.pf_fast++;
   3286
   3287	return ret;
   3288}
   3289
   3290static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
   3291			       struct list_head *invalid_list)
   3292{
   3293	struct kvm_mmu_page *sp;
   3294
   3295	if (!VALID_PAGE(*root_hpa))
   3296		return;
   3297
   3298	sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
   3299	if (WARN_ON(!sp))
   3300		return;
   3301
   3302	if (is_tdp_mmu_page(sp))
   3303		kvm_tdp_mmu_put_root(kvm, sp, false);
   3304	else if (!--sp->root_count && sp->role.invalid)
   3305		kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
   3306
   3307	*root_hpa = INVALID_PAGE;
   3308}
   3309
   3310/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
   3311void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
   3312			ulong roots_to_free)
   3313{
   3314	int i;
   3315	LIST_HEAD(invalid_list);
   3316	bool free_active_root;
   3317
   3318	BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
   3319
   3320	/* Before acquiring the MMU lock, see if we need to do any real work. */
   3321	free_active_root = (roots_to_free & KVM_MMU_ROOT_CURRENT)
   3322		&& VALID_PAGE(mmu->root.hpa);
   3323
   3324	if (!free_active_root) {
   3325		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
   3326			if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
   3327			    VALID_PAGE(mmu->prev_roots[i].hpa))
   3328				break;
   3329
   3330		if (i == KVM_MMU_NUM_PREV_ROOTS)
   3331			return;
   3332	}
   3333
   3334	write_lock(&kvm->mmu_lock);
   3335
   3336	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
   3337		if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
   3338			mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa,
   3339					   &invalid_list);
   3340
   3341	if (free_active_root) {
   3342		if (to_shadow_page(mmu->root.hpa)) {
   3343			mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list);
   3344		} else if (mmu->pae_root) {
   3345			for (i = 0; i < 4; ++i) {
   3346				if (!IS_VALID_PAE_ROOT(mmu->pae_root[i]))
   3347					continue;
   3348
   3349				mmu_free_root_page(kvm, &mmu->pae_root[i],
   3350						   &invalid_list);
   3351				mmu->pae_root[i] = INVALID_PAE_ROOT;
   3352			}
   3353		}
   3354		mmu->root.hpa = INVALID_PAGE;
   3355		mmu->root.pgd = 0;
   3356	}
   3357
   3358	kvm_mmu_commit_zap_page(kvm, &invalid_list);
   3359	write_unlock(&kvm->mmu_lock);
   3360}
   3361EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
   3362
   3363void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
   3364{
   3365	unsigned long roots_to_free = 0;
   3366	hpa_t root_hpa;
   3367	int i;
   3368
   3369	/*
   3370	 * This should not be called while L2 is active, L2 can't invalidate
   3371	 * _only_ its own roots, e.g. INVVPID unconditionally exits.
   3372	 */
   3373	WARN_ON_ONCE(mmu->root_role.guest_mode);
   3374
   3375	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
   3376		root_hpa = mmu->prev_roots[i].hpa;
   3377		if (!VALID_PAGE(root_hpa))
   3378			continue;
   3379
   3380		if (!to_shadow_page(root_hpa) ||
   3381			to_shadow_page(root_hpa)->role.guest_mode)
   3382			roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
   3383	}
   3384
   3385	kvm_mmu_free_roots(kvm, mmu, roots_to_free);
   3386}
   3387EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots);
   3388
   3389
   3390static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
   3391{
   3392	int ret = 0;
   3393
   3394	if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
   3395		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
   3396		ret = 1;
   3397	}
   3398
   3399	return ret;
   3400}
   3401
   3402static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
   3403			    u8 level, bool direct)
   3404{
   3405	struct kvm_mmu_page *sp;
   3406
   3407	sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
   3408	++sp->root_count;
   3409
   3410	return __pa(sp->spt);
   3411}
   3412
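        /* Allocate the root page(s) for a direct (TDP or nonpaging) MMU. */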
   3413static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
   3414{
   3415	struct kvm_mmu *mmu = vcpu->arch.mmu;
   3416	u8 shadow_root_level = mmu->root_role.level;
   3417	hpa_t root;
   3418	unsigned i;
   3419	int r;
   3420
   3421	write_lock(&vcpu->kvm->mmu_lock);
   3422	r = make_mmu_pages_available(vcpu);
   3423	if (r < 0)
   3424		goto out_unlock;
   3425
   3426	if (is_tdp_mmu_enabled(vcpu->kvm)) {
   3427		root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
   3428		mmu->root.hpa = root;
   3429	} else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
   3430		root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true);
   3431		mmu->root.hpa = root;
   3432	} else if (shadow_root_level == PT32E_ROOT_LEVEL) {
   3433		if (WARN_ON_ONCE(!mmu->pae_root)) {
   3434			r = -EIO;
   3435			goto out_unlock;
   3436		}
   3437
   3438		for (i = 0; i < 4; ++i) {
   3439			WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
   3440
   3441			root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT),
   3442					      i << 30, PT32_ROOT_LEVEL, true);
   3443			mmu->pae_root[i] = root | PT_PRESENT_MASK |
   3444					   shadow_me_value;
   3445		}
   3446		mmu->root.hpa = __pa(mmu->pae_root);
   3447	} else {
   3448		WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level);
   3449		r = -EIO;
   3450		goto out_unlock;
   3451	}
   3452
   3453	/* root.pgd is ignored for direct MMUs. */
   3454	mmu->root.pgd = 0;
   3455out_unlock:
   3456	write_unlock(&vcpu->kvm->mmu_lock);
   3457	return r;
   3458}
   3459
   3460static int mmu_first_shadow_root_alloc(struct kvm *kvm)
   3461{
   3462	struct kvm_memslots *slots;
   3463	struct kvm_memory_slot *slot;
   3464	int r = 0, i, bkt;
   3465
   3466	/*
   3467	 * Check if this is the first shadow root being allocated before
   3468	 * taking the lock.
   3469	 */
   3470	if (kvm_shadow_root_allocated(kvm))
   3471		return 0;
   3472
   3473	mutex_lock(&kvm->slots_arch_lock);
   3474
   3475	/* Recheck, under the lock, whether this is the first shadow root. */
   3476	if (kvm_shadow_root_allocated(kvm))
   3477		goto out_unlock;
   3478
   3479	/*
   3480	 * Check if anything actually needs to be allocated, e.g. all metadata
   3481	 * will be allocated upfront if TDP is disabled.
   3482	 */
   3483	if (kvm_memslots_have_rmaps(kvm) &&
   3484	    kvm_page_track_write_tracking_enabled(kvm))
   3485		goto out_success;
   3486
   3487	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
   3488		slots = __kvm_memslots(kvm, i);
   3489		kvm_for_each_memslot(slot, bkt, slots) {
   3490			/*
   3491			 * Both of these functions are no-ops if the target is
   3492			 * already allocated, so unconditionally calling both
   3493			 * is safe.  Intentionally do NOT free allocations on
   3494			 * failure to avoid having to track which allocations
   3495			 * were made now versus when the memslot was created.
   3496			 * The metadata is guaranteed to be freed when the slot
   3497			 * is freed, and will be kept/used if userspace retries
   3498			 * KVM_RUN instead of killing the VM.
   3499			 */
   3500			r = memslot_rmap_alloc(slot, slot->npages);
   3501			if (r)
   3502				goto out_unlock;
   3503			r = kvm_page_track_write_tracking_alloc(slot);
   3504			if (r)
   3505				goto out_unlock;
   3506		}
   3507	}
   3508
   3509	/*
   3510	 * Ensure that shadow_root_allocated becomes true strictly after
   3511	 * all the related pointers are set.
   3512	 */
   3513out_success:
   3514	smp_store_release(&kvm->arch.shadow_root_allocated, true);
   3515
   3516out_unlock:
   3517	mutex_unlock(&kvm->slots_arch_lock);
   3518	return r;
   3519}
   3520
   3521static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
   3522{
   3523	struct kvm_mmu *mmu = vcpu->arch.mmu;
   3524	u64 pdptrs[4], pm_mask;
   3525	gfn_t root_gfn, root_pgd;
   3526	hpa_t root;
   3527	unsigned i;
   3528	int r;
   3529
   3530	root_pgd = mmu->get_guest_pgd(vcpu);
   3531	root_gfn = root_pgd >> PAGE_SHIFT;
   3532
   3533	if (mmu_check_root(vcpu, root_gfn))
   3534		return 1;
   3535
   3536	/*
   3537	 * On SVM, reading PDPTRs might access guest memory, which might fault
   3538	 * and thus might sleep.  Grab the PDPTRs before acquiring mmu_lock.
   3539	 */
   3540	if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) {
   3541		for (i = 0; i < 4; ++i) {
   3542			pdptrs[i] = mmu->get_pdptr(vcpu, i);
   3543			if (!(pdptrs[i] & PT_PRESENT_MASK))
   3544				continue;
   3545
   3546			if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT))
   3547				return 1;
   3548		}
   3549	}
   3550
   3551	r = mmu_first_shadow_root_alloc(vcpu->kvm);
   3552	if (r)
   3553		return r;
   3554
   3555	write_lock(&vcpu->kvm->mmu_lock);
   3556	r = make_mmu_pages_available(vcpu);
   3557	if (r < 0)
   3558		goto out_unlock;
   3559
   3560	/*
    3561	 * Do we shadow a long mode page table? If so, we need to
    3562	 * write-protect the guest's page table root.
   3563	 */
   3564	if (mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
   3565		root = mmu_alloc_root(vcpu, root_gfn, 0,
   3566				      mmu->root_role.level, false);
   3567		mmu->root.hpa = root;
   3568		goto set_root_pgd;
   3569	}
   3570
   3571	if (WARN_ON_ONCE(!mmu->pae_root)) {
   3572		r = -EIO;
   3573		goto out_unlock;
   3574	}
   3575
   3576	/*
   3577	 * We shadow a 32 bit page table. This may be a legacy 2-level
   3578	 * or a PAE 3-level page table. In either case we need to be aware that
   3579	 * the shadow page table may be a PAE or a long mode page table.
   3580	 */
   3581	pm_mask = PT_PRESENT_MASK | shadow_me_value;
   3582	if (mmu->root_role.level >= PT64_ROOT_4LEVEL) {
   3583		pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
   3584
   3585		if (WARN_ON_ONCE(!mmu->pml4_root)) {
   3586			r = -EIO;
   3587			goto out_unlock;
   3588		}
   3589		mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask;
   3590
   3591		if (mmu->root_role.level == PT64_ROOT_5LEVEL) {
   3592			if (WARN_ON_ONCE(!mmu->pml5_root)) {
   3593				r = -EIO;
   3594				goto out_unlock;
   3595			}
   3596			mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask;
   3597		}
   3598	}
   3599
   3600	for (i = 0; i < 4; ++i) {
   3601		WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
   3602
   3603		if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) {
   3604			if (!(pdptrs[i] & PT_PRESENT_MASK)) {
   3605				mmu->pae_root[i] = INVALID_PAE_ROOT;
   3606				continue;
   3607			}
   3608			root_gfn = pdptrs[i] >> PAGE_SHIFT;
   3609		}
   3610
   3611		root = mmu_alloc_root(vcpu, root_gfn, i << 30,
   3612				      PT32_ROOT_LEVEL, false);
   3613		mmu->pae_root[i] = root | pm_mask;
   3614	}
   3615
   3616	if (mmu->root_role.level == PT64_ROOT_5LEVEL)
   3617		mmu->root.hpa = __pa(mmu->pml5_root);
   3618	else if (mmu->root_role.level == PT64_ROOT_4LEVEL)
   3619		mmu->root.hpa = __pa(mmu->pml4_root);
   3620	else
   3621		mmu->root.hpa = __pa(mmu->pae_root);
   3622
   3623set_root_pgd:
   3624	mmu->root.pgd = root_pgd;
   3625out_unlock:
   3626	write_unlock(&vcpu->kvm->mmu_lock);
   3627
   3628	return r;
   3629}
   3630
   3631static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
   3632{
   3633	struct kvm_mmu *mmu = vcpu->arch.mmu;
   3634	bool need_pml5 = mmu->root_role.level > PT64_ROOT_4LEVEL;
   3635	u64 *pml5_root = NULL;
   3636	u64 *pml4_root = NULL;
   3637	u64 *pae_root;
   3638
   3639	/*
   3640	 * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
   3641	 * tables are allocated and initialized at root creation as there is no
   3642	 * equivalent level in the guest's NPT to shadow.  Allocate the tables
   3643	 * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare.
   3644	 */
   3645	if (mmu->root_role.direct ||
   3646	    mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL ||
   3647	    mmu->root_role.level < PT64_ROOT_4LEVEL)
   3648		return 0;
   3649
   3650	/*
   3651	 * NPT, the only paging mode that uses this horror, uses a fixed number
   3652	 * of levels for the shadow page tables, e.g. all MMUs are 4-level or
    3653	 * all MMUs are 5-level.  Thus, this can safely require that pml5_root
   3654	 * is allocated if the other roots are valid and pml5 is needed, as any
   3655	 * prior MMU would also have required pml5.
   3656	 */
   3657	if (mmu->pae_root && mmu->pml4_root && (!need_pml5 || mmu->pml5_root))
   3658		return 0;
   3659
   3660	/*
   3661	 * The special roots should always be allocated in concert.  Yell and
   3662	 * bail if KVM ends up in a state where only one of the roots is valid.
   3663	 */
   3664	if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root ||
   3665			 (need_pml5 && mmu->pml5_root)))
   3666		return -EIO;
   3667
   3668	/*
   3669	 * Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and
   3670	 * doesn't need to be decrypted.
   3671	 */
   3672	pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
   3673	if (!pae_root)
   3674		return -ENOMEM;
   3675
   3676#ifdef CONFIG_X86_64
   3677	pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
   3678	if (!pml4_root)
   3679		goto err_pml4;
   3680
   3681	if (need_pml5) {
   3682		pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
   3683		if (!pml5_root)
   3684			goto err_pml5;
   3685	}
   3686#endif
   3687
   3688	mmu->pae_root = pae_root;
   3689	mmu->pml4_root = pml4_root;
   3690	mmu->pml5_root = pml5_root;
   3691
   3692	return 0;
   3693
   3694#ifdef CONFIG_X86_64
   3695err_pml5:
   3696	free_page((unsigned long)pml4_root);
   3697err_pml4:
   3698	free_page((unsigned long)pae_root);
   3699	return -ENOMEM;
   3700#endif
   3701}
   3702
   3703static bool is_unsync_root(hpa_t root)
   3704{
   3705	struct kvm_mmu_page *sp;
   3706
   3707	if (!VALID_PAGE(root))
   3708		return false;
   3709
   3710	/*
   3711	 * The read barrier orders the CPU's read of SPTE.W during the page table
   3712	 * walk before the reads of sp->unsync/sp->unsync_children here.
   3713	 *
   3714	 * Even if another CPU was marking the SP as unsync-ed simultaneously,
   3715	 * any guest page table changes are not guaranteed to be visible anyway
   3716	 * until this VCPU issues a TLB flush strictly after those changes are
   3717	 * made.  We only need to ensure that the other CPU sets these flags
   3718	 * before any actual changes to the page tables are made.  The comments
   3719	 * in mmu_try_to_unsync_pages() describe what could go wrong if this
   3720	 * requirement isn't satisfied.
   3721	 */
   3722	smp_rmb();
   3723	sp = to_shadow_page(root);
   3724
   3725	/*
   3726	 * PAE roots (somewhat arbitrarily) aren't backed by shadow pages, the
   3727	 * PDPTEs for a given PAE root need to be synchronized individually.
   3728	 */
   3729	if (WARN_ON_ONCE(!sp))
   3730		return false;
   3731
   3732	if (sp->unsync || sp->unsync_children)
   3733		return true;
   3734
   3735	return false;
   3736}
   3737
   3738void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
   3739{
   3740	int i;
   3741	struct kvm_mmu_page *sp;
   3742
   3743	if (vcpu->arch.mmu->root_role.direct)
   3744		return;
   3745
   3746	if (!VALID_PAGE(vcpu->arch.mmu->root.hpa))
   3747		return;
   3748
   3749	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
   3750
   3751	if (vcpu->arch.mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
   3752		hpa_t root = vcpu->arch.mmu->root.hpa;
   3753		sp = to_shadow_page(root);
   3754
   3755		if (!is_unsync_root(root))
   3756			return;
   3757
   3758		write_lock(&vcpu->kvm->mmu_lock);
   3759		mmu_sync_children(vcpu, sp, true);
   3760		write_unlock(&vcpu->kvm->mmu_lock);
   3761		return;
   3762	}
   3763
   3764	write_lock(&vcpu->kvm->mmu_lock);
   3765
   3766	for (i = 0; i < 4; ++i) {
   3767		hpa_t root = vcpu->arch.mmu->pae_root[i];
   3768
   3769		if (IS_VALID_PAE_ROOT(root)) {
   3770			root &= PT64_BASE_ADDR_MASK;
   3771			sp = to_shadow_page(root);
   3772			mmu_sync_children(vcpu, sp, true);
   3773		}
   3774	}
   3775
   3776	write_unlock(&vcpu->kvm->mmu_lock);
   3777}
   3778
   3779void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu)
   3780{
   3781	unsigned long roots_to_free = 0;
   3782	int i;
   3783
   3784	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
   3785		if (is_unsync_root(vcpu->arch.mmu->prev_roots[i].hpa))
   3786			roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
   3787
   3788	/* sync prev_roots by simply freeing them */
   3789	kvm_mmu_free_roots(vcpu->kvm, vcpu->arch.mmu, roots_to_free);
   3790}
   3791
   3792static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
   3793				  gpa_t vaddr, u64 access,
   3794				  struct x86_exception *exception)
   3795{
   3796	if (exception)
   3797		exception->error_code = 0;
   3798	return kvm_translate_gpa(vcpu, mmu, vaddr, access, exception);
   3799}
   3800
   3801static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
   3802{
   3803	/*
   3804	 * A nested guest cannot use the MMIO cache if it is using nested
   3805	 * page tables, because cr2 is a nGPA while the cache stores GPAs.
   3806	 */
   3807	if (mmu_is_nested(vcpu))
   3808		return false;
   3809
   3810	if (direct)
   3811		return vcpu_match_mmio_gpa(vcpu, addr);
   3812
   3813	return vcpu_match_mmio_gva(vcpu, addr);
   3814}
   3815
   3816/*
   3817 * Return the level of the lowest level SPTE added to sptes.
   3818 * That SPTE may be non-present.
   3819 *
   3820 * Must be called between walk_shadow_page_lockless_{begin,end}.
   3821 */
   3822static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level)
   3823{
   3824	struct kvm_shadow_walk_iterator iterator;
   3825	int leaf = -1;
   3826	u64 spte;
   3827
   3828	for (shadow_walk_init(&iterator, vcpu, addr),
   3829	     *root_level = iterator.level;
   3830	     shadow_walk_okay(&iterator);
   3831	     __shadow_walk_next(&iterator, spte)) {
   3832		leaf = iterator.level;
   3833		spte = mmu_spte_get_lockless(iterator.sptep);
   3834
   3835		sptes[leaf] = spte;
   3836	}
   3837
   3838	return leaf;
   3839}
   3840
   3841/* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */
   3842static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
   3843{
   3844	u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
   3845	struct rsvd_bits_validate *rsvd_check;
   3846	int root, leaf, level;
   3847	bool reserved = false;
   3848
   3849	walk_shadow_page_lockless_begin(vcpu);
   3850
   3851	if (is_tdp_mmu(vcpu->arch.mmu))
   3852		leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root);
   3853	else
   3854		leaf = get_walk(vcpu, addr, sptes, &root);
   3855
   3856	walk_shadow_page_lockless_end(vcpu);
   3857
   3858	if (unlikely(leaf < 0)) {
   3859		*sptep = 0ull;
   3860		return reserved;
   3861	}
   3862
   3863	*sptep = sptes[leaf];
   3864
   3865	/*
   3866	 * Skip reserved bits checks on the terminal leaf if it's not a valid
   3867	 * SPTE.  Note, this also (intentionally) skips MMIO SPTEs, which, by
   3868	 * design, always have reserved bits set.  The purpose of the checks is
    3869	 * to detect reserved bits on non-MMIO SPTEs, i.e. buggy SPTEs.
   3870	 */
   3871	if (!is_shadow_present_pte(sptes[leaf]))
   3872		leaf++;
   3873
   3874	rsvd_check = &vcpu->arch.mmu->shadow_zero_check;
   3875
   3876	for (level = root; level >= leaf; level--)
   3877		reserved |= is_rsvd_spte(rsvd_check, sptes[level], level);
   3878
   3879	if (reserved) {
   3880		pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n",
   3881		       __func__, addr);
   3882		for (level = root; level >= leaf; level--)
   3883			pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx",
   3884			       sptes[level], level,
   3885			       get_rsvd_bits(rsvd_check, sptes[level], level));
   3886	}
   3887
   3888	return reserved;
   3889}
   3890
   3891static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
   3892{
   3893	u64 spte;
   3894	bool reserved;
   3895
   3896	if (mmio_info_in_cache(vcpu, addr, direct))
   3897		return RET_PF_EMULATE;
   3898
   3899	reserved = get_mmio_spte(vcpu, addr, &spte);
   3900	if (WARN_ON(reserved))
   3901		return -EINVAL;
   3902
   3903	if (is_mmio_spte(spte)) {
   3904		gfn_t gfn = get_mmio_spte_gfn(spte);
   3905		unsigned int access = get_mmio_spte_access(spte);
   3906
   3907		if (!check_mmio_spte(vcpu, spte))
   3908			return RET_PF_INVALID;
   3909
   3910		if (direct)
   3911			addr = 0;
   3912
   3913		trace_handle_mmio_page_fault(addr, gfn, access);
   3914		vcpu_cache_mmio_info(vcpu, addr, gfn, access);
   3915		return RET_PF_EMULATE;
   3916	}
   3917
   3918	/*
    3919	 * If the page table is zapped by other CPUs, let the CPU fault again
    3920	 * on the address.
   3921	 */
   3922	return RET_PF_RETRY;
   3923}
   3924
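        /*
         * CachePC hook: check whether the faulting gfn is page-tracked for one
         * of the CPC tracking modes and update that mode's tracking state.
         * Returning true makes the caller treat this as a tracked-page fault
         * (RET_PF_EMULATE in direct_page_fault); returning false lets the
         * normal fault path map the gfn, typically after it has been untracked
         * here.
         */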
   3925static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
   3926					 struct kvm_page_fault *fault)
   3927{
   3928	int modes[] = {
   3929		KVM_PAGE_TRACK_EXEC,
   3930		KVM_PAGE_TRACK_ACCESS,
   3931		KVM_PAGE_TRACK_WRITE
   3932	};
   3933	struct cpc_fault *tmp, *alloc;
   3934	size_t count, i;
   3935	bool inst_fetch;
   3936	bool is_prev_gfn;
   3937
   3938	CPC_DBG("Page fault (gfn:%08llx err:%u)\n",
   3939		fault->gfn, fault->error_code);
   3940
   3941	for (i = 0; i < 3; i++) {
   3942		if (kvm_slot_page_track_is_active(vcpu->kvm,
   3943				fault->slot, fault->gfn, modes[i]))
   3944			break;
   3945	}
   3946	if (i == 3) {
   3947		return false;
   3948	}
   3949
   3950	inst_fetch = fault->error_code & PFERR_FETCH_MASK;
   3951
   3952	count = 0;
   3953	list_for_each_entry(tmp, &cpc_faults, list)
   3954		count += 1;
   3955
   3956	switch (cpc_track_mode) {
   3957	case CPC_TRACK_FAULT_NO_RUN:
   3958		BUG_ON(modes[i] != KVM_PAGE_TRACK_ACCESS);
   3959
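        		/* report the tracked access to userspace; the gfn stays tracked */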
   3960		cpc_send_track_step_event_single(fault->gfn, fault->error_code,
   3961			cpc_retinst, cpc_guest_misses);
   3962
   3963		return true;
   3964	case CPC_TRACK_STEPS:
   3965		if (cpc_track_steps.with_data && cpc_track_steps.stepping)
   3966			BUG_ON(modes[i] != KVM_PAGE_TRACK_ACCESS);
   3967		else
   3968			BUG_ON(modes[i] != KVM_PAGE_TRACK_EXEC);
   3969
   3970		//CPC_WARN("here\n");
   3971		//if (!fault->present) return false;
   3972
   3973		if (modes[i] == KVM_PAGE_TRACK_EXEC && !inst_fetch)
   3974			return false;
   3975
   3976		CPC_INFO("Got fault cnt:%lu gfn:%08llx err:%u\n", count,
   3977			fault->gfn, fault->error_code);
   3978
   3979		cpc_untrack_single(vcpu, fault->gfn, modes[i]);
   3980
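        		/* when a target gfn is configured, entering it switches from
        		 * per-page exec tracking to tracking all other pages so that
        		 * execution inside the target can be single-stepped */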
   3981		if (cpc_track_steps.use_target && !cpc_track_steps.in_target
   3982				&& inst_fetch && fault->gfn == cpc_track_steps.target_gfn) {
   3983			CPC_INFO("Entering target gfn for stepping\n");
   3984			cpc_track_steps.in_target = true;
   3985			cpc_track_steps.stepping = true;
   3986			cpc_untrack_single(vcpu, fault->gfn,
   3987				KVM_PAGE_TRACK_EXEC);
   3988			if (cpc_track_steps.with_data) {
   3989				cpc_track_all(vcpu, KVM_PAGE_TRACK_ACCESS);
   3990				cpc_untrack_single(vcpu, fault->gfn,
   3991					KVM_PAGE_TRACK_ACCESS);
   3992			} else {
   3993				cpc_track_all(vcpu, KVM_PAGE_TRACK_EXEC);
   3994				cpc_untrack_single(vcpu, fault->gfn,
   3995					KVM_PAGE_TRACK_EXEC);
   3996			}
   3997		} else if (cpc_track_steps.use_target && cpc_track_steps.in_target
   3998				&& inst_fetch && fault->gfn != cpc_track_steps.target_gfn) {
   3999			CPC_INFO("Leaving target gfn for stepping\n");
   4000			cpc_track_steps.in_target = false;
   4001			cpc_track_steps.stepping = false;
   4002			if (cpc_track_steps.with_data) {
   4003				cpc_untrack_all(vcpu, KVM_PAGE_TRACK_ACCESS);
   4004			} else {
   4005				cpc_untrack_all(vcpu, KVM_PAGE_TRACK_EXEC);
   4006			}
   4007			cpc_track_single(vcpu, cpc_track_steps.target_gfn,
   4008				KVM_PAGE_TRACK_EXEC);
   4009			cpc_singlestep = false;
   4010			cpc_prime_probe = false;
   4011		}
   4012
   4013		if (cpc_track_steps.stepping) {
   4014			alloc = kmalloc(sizeof(struct cpc_fault), GFP_KERNEL);
   4015			BUG_ON(!alloc);
   4016			alloc->gfn = fault->gfn;
   4017			alloc->err = fault->error_code;
   4018			list_add_tail(&alloc->list, &cpc_faults);
   4019
   4020			cpc_singlestep_reset = true;
   4021			if (cpc_track_steps.with_data)
   4022				cpc_prime_probe = true;
   4023		}
   4024
   4025		break;
   4026	case CPC_TRACK_PAGES:
   4027		BUG_ON(modes[i] != KVM_PAGE_TRACK_EXEC);
   4028
   4029		if (!inst_fetch || !fault->present)
   4030			return false;
   4031
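        		/* maintain a prev/cur/next window of executed gfns so that an
        		 * instruction straddling a page boundary can be detected and
        		 * both pages left untracked until the boundary is resolved */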
   4032		CPC_INFO("Got fault cnt:%lu gfn:%08llx err:%u ret:%llu\n", count,
   4033			fault->gfn, fault->error_code, cpc_track_pages.retinst);
   4034
   4035		is_prev_gfn = (cpc_track_pages.prev_avail &&
   4036			fault->gfn == cpc_track_pages.prev_gfn);
   4037
   4038		/* no conflict if next pagefault happens on a different inst */
   4039		if (cpc_track_pages.in_step && cpc_track_pages.retinst > 2 && !is_prev_gfn) {
   4040			cpc_track_pages.in_step = false;
   4041			cpc_singlestep = false;
   4042		}
   4043
   4044		if (cpc_track_pages.cur_avail && cpc_track_pages.next_avail) {
   4045			WARN_ON(cpc_track_pages.singlestep_resolve);
   4046			CPC_INFO("Boundary %08llx -> %08llx resolved through fault\n",
   4047				cpc_track_pages.cur_gfn, cpc_track_pages.next_gfn);
   4048			cpc_track_single(vcpu, cpc_track_pages.cur_gfn,
   4049				KVM_PAGE_TRACK_EXEC);
   4050			cpc_track_pages.prev_gfn = cpc_track_pages.cur_gfn;
   4051			cpc_track_pages.prev_err = cpc_track_pages.cur_err;
   4052			cpc_track_pages.prev_avail = true;
   4053			cpc_track_pages.cur_gfn = cpc_track_pages.next_gfn;
   4054			cpc_track_pages.cur_err = cpc_track_pages.next_err;
   4055			cpc_track_pages.cur_avail = true;
   4056			cpc_track_pages.next_avail = false;
   4057			cpc_track_pages.in_step = false;
   4058		}
   4059
   4060		cpc_untrack_single(vcpu, fault->gfn, modes[i]);
   4061
   4062		if (!cpc_track_pages.in_step) {
    4063		/* assume the instruction is not on a page boundary:
    4064		 * retrack the previous gfn, keep the current one untracked */
   4065			if (cpc_track_pages.cur_avail) {
   4066				cpc_track_single(vcpu,
   4067					cpc_track_pages.cur_gfn, modes[i]);
   4068				cpc_send_track_page_event(cpc_track_pages.cur_gfn,
   4069					fault->gfn, fault->error_code,
   4070					cpc_track_pages.retinst,
   4071					cpc_track_pages.retinst_user);
   4072				cpc_track_pages.prev_gfn = cpc_track_pages.cur_gfn;
   4073				cpc_track_pages.prev_err = cpc_track_pages.cur_err;
   4074				cpc_track_pages.prev_avail = true;
   4075			}
   4076
   4077			cpc_track_pages.cur_gfn = fault->gfn;
   4078			cpc_track_pages.cur_err = fault->error_code;
   4079			cpc_track_pages.cur_avail = true;
   4080			cpc_track_pages.next_avail = false;
   4081			cpc_track_pages.retinst = 0;
   4082			cpc_track_pages.retinst_user = 0;
   4083			cpc_track_pages.in_step = true;
   4084		} else {
   4085			WARN_ON(cpc_track_pages.next_avail);
   4086			if (is_prev_gfn) {
    4087				/* instruction on boundary A -> B, but we
    4088				 * untracked A previously, so now it is being
    4089				 * retracked to load the instruction;
    4090				 * reorder the gfns chronologically */
   4091				cpc_track_pages.next_gfn = cpc_track_pages.cur_gfn;
   4092				cpc_track_pages.next_err = cpc_track_pages.cur_err;
   4093				cpc_track_pages.next_avail = true;
   4094				cpc_track_pages.cur_gfn = cpc_track_pages.prev_gfn;
   4095				cpc_track_pages.cur_err = cpc_track_pages.prev_err;
   4096				cpc_track_pages.cur_avail = true;
   4097				cpc_track_pages.prev_avail = false;
   4098			} else {
   4099				/* instruction on boundary B -> C in order */
   4100				cpc_track_pages.next_gfn = fault->gfn;
   4101				cpc_track_pages.next_err = fault->error_code;
   4102				cpc_track_pages.next_avail = true;
   4103				cpc_send_track_page_event(cpc_track_pages.cur_gfn,
   4104					cpc_track_pages.next_gfn,
   4105					cpc_track_pages.cur_err,
   4106					cpc_track_pages.retinst,
   4107					cpc_track_pages.retinst_user);
   4108				cpc_track_pages.retinst = 0;
   4109				cpc_track_pages.retinst_user = 0;
   4110			}
   4111
   4112			CPC_INFO("Instruction on boundary %08llx -> %08llx\n",
   4113				cpc_track_pages.cur_gfn, cpc_track_pages.next_gfn);
   4114
   4115			if (cpc_track_pages.singlestep_resolve)
   4116				cpc_singlestep_reset = true;
   4117		}
   4118
   4119		break;
   4120	}
   4121
   4122	return false;
   4123}
   4124
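        /* Reset the write-flooding counters along the shadow walk for @addr. */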
   4125static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
   4126{
   4127	struct kvm_shadow_walk_iterator iterator;
   4128	u64 spte;
   4129
   4130	walk_shadow_page_lockless_begin(vcpu);
   4131	for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
   4132		clear_sp_write_flooding_count(iterator.sptep);
   4133	walk_shadow_page_lockless_end(vcpu);
   4134}
   4135
   4136static u32 alloc_apf_token(struct kvm_vcpu *vcpu)
   4137{
   4138	/* make sure the token value is not 0 */
   4139	u32 id = vcpu->arch.apf.id;
   4140
   4141	if (id << 12 == 0)
   4142		vcpu->arch.apf.id = 1;
   4143
   4144	return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
   4145}
   4146
   4147static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
   4148				    gfn_t gfn)
   4149{
   4150	struct kvm_arch_async_pf arch;
   4151
   4152	arch.token = alloc_apf_token(vcpu);
   4153	arch.gfn = gfn;
   4154	arch.direct_map = vcpu->arch.mmu->root_role.direct;
   4155	arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu);
   4156
   4157	return kvm_setup_async_pf(vcpu, cr2_or_gpa,
   4158				  kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
   4159}
   4160
   4161void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
   4162{
   4163	int r;
   4164
   4165	if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) ||
   4166	      work->wakeup_all)
   4167		return;
   4168
   4169	r = kvm_mmu_reload(vcpu);
   4170	if (unlikely(r))
   4171		return;
   4172
   4173	if (!vcpu->arch.mmu->root_role.direct &&
   4174	      work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu))
   4175		return;
   4176
   4177	kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
   4178}
   4179
   4180static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
   4181{
   4182	struct kvm_memory_slot *slot = fault->slot;
   4183	bool async;
   4184
   4185	/*
   4186	 * Retry the page fault if the gfn hit a memslot that is being deleted
   4187	 * or moved.  This ensures any existing SPTEs for the old memslot will
   4188	 * be zapped before KVM inserts a new MMIO SPTE for the gfn.
   4189	 */
   4190	if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
   4191		return RET_PF_RETRY;
   4192
   4193	if (!kvm_is_visible_memslot(slot)) {
   4194		/* Don't expose private memslots to L2. */
   4195		if (is_guest_mode(vcpu)) {
   4196			fault->slot = NULL;
   4197			fault->pfn = KVM_PFN_NOSLOT;
   4198			fault->map_writable = false;
   4199			return RET_PF_CONTINUE;
   4200		}
   4201		/*
   4202		 * If the APIC access page exists but is disabled, go directly
   4203		 * to emulation without caching the MMIO access or creating a
   4204		 * MMIO SPTE.  That way the cache doesn't need to be purged
   4205		 * when the AVIC is re-enabled.
   4206		 */
   4207		if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
   4208		    !kvm_apicv_activated(vcpu->kvm))
   4209			return RET_PF_EMULATE;
   4210	}
   4211
   4212	async = false;
   4213	fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async,
   4214					  fault->write, &fault->map_writable,
   4215					  &fault->hva);
   4216	if (!async)
   4217		return RET_PF_CONTINUE; /* *pfn has correct page already */
   4218
   4219	if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) {
   4220		trace_kvm_try_async_get_page(fault->addr, fault->gfn);
   4221		if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) {
   4222			trace_kvm_async_pf_doublefault(fault->addr, fault->gfn);
   4223			kvm_make_request(KVM_REQ_APF_HALT, vcpu);
   4224			return RET_PF_RETRY;
   4225		} else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) {
   4226			return RET_PF_RETRY;
   4227		}
   4228	}
   4229
   4230	fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, NULL,
   4231					  fault->write, &fault->map_writable,
   4232					  &fault->hva);
   4233	return RET_PF_CONTINUE;
   4234}
   4235
   4236/*
   4237 * Returns true if the page fault is stale and needs to be retried, i.e. if the
   4238 * root was invalidated by a memslot update or a relevant mmu_notifier fired.
   4239 */
   4240static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
   4241				struct kvm_page_fault *fault, int mmu_seq)
   4242{
   4243	struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root.hpa);
   4244
   4245	/* Special roots, e.g. pae_root, are not backed by shadow pages. */
   4246	if (sp && is_obsolete_sp(vcpu->kvm, sp))
   4247		return true;
   4248
   4249	/*
   4250	 * Roots without an associated shadow page are considered invalid if
   4251	 * there is a pending request to free obsolete roots.  The request is
   4252	 * only a hint that the current root _may_ be obsolete and needs to be
   4253	 * reloaded, e.g. if the guest frees a PGD that KVM is tracking as a
   4254	 * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs
   4255	 * to reload even if no vCPU is actively using the root.
   4256	 */
   4257	if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
   4258		return true;
   4259
   4260	return fault->slot &&
   4261	       mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
   4262}
   4263
   4264static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
   4265{
   4266	bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu);
   4267
   4268	unsigned long mmu_seq;
   4269	int r;
   4270
   4271	fault->gfn = fault->addr >> PAGE_SHIFT;
   4272	fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn);
   4273
   4274	if (page_fault_handle_page_track(vcpu, fault))
   4275		return RET_PF_EMULATE;
   4276
   4277	r = fast_page_fault(vcpu, fault);
   4278	if (r != RET_PF_INVALID)
   4279		return r;
   4280
   4281	r = mmu_topup_memory_caches(vcpu, false);
   4282	if (r)
   4283		return r;
   4284
   4285	mmu_seq = vcpu->kvm->mmu_notifier_seq;
   4286	smp_rmb();
   4287
   4288	r = kvm_faultin_pfn(vcpu, fault);
   4289	if (r != RET_PF_CONTINUE)
   4290		return r;
   4291
   4292	r = handle_abnormal_pfn(vcpu, fault, ACC_ALL);
   4293	if (r != RET_PF_CONTINUE)
   4294		return r;
   4295
   4296	r = RET_PF_RETRY;
   4297
   4298	if (is_tdp_mmu_fault)
   4299		read_lock(&vcpu->kvm->mmu_lock);
   4300	else
   4301		write_lock(&vcpu->kvm->mmu_lock);
   4302
   4303	if (is_page_fault_stale(vcpu, fault, mmu_seq))
   4304		goto out_unlock;
   4305
   4306	r = make_mmu_pages_available(vcpu);
   4307	if (r)
   4308		goto out_unlock;
   4309
   4310	if (is_tdp_mmu_fault)
   4311		r = kvm_tdp_mmu_map(vcpu, fault);
   4312	else
   4313		r = __direct_map(vcpu, fault);
   4314
   4315out_unlock:
   4316	if (is_tdp_mmu_fault)
   4317		read_unlock(&vcpu->kvm->mmu_lock);
   4318	else
   4319		write_unlock(&vcpu->kvm->mmu_lock);
   4320	kvm_release_pfn_clean(fault->pfn);
   4321	return r;
   4322}
   4323
   4324static int nonpaging_page_fault(struct kvm_vcpu *vcpu,
   4325				struct kvm_page_fault *fault)
   4326{
   4327	pgprintk("%s: gva %lx error %x\n", __func__, fault->addr, fault->error_code);
   4328
    4329	/* This path builds a PAE pagetable, so we can map 2MB pages at maximum. */
   4330	fault->max_level = PG_LEVEL_2M;
   4331	return direct_page_fault(vcpu, fault);
   4332}
   4333
   4334int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
   4335				u64 fault_address, char *insn, int insn_len)
   4336{
   4337	int r = 1;
   4338	u32 flags = vcpu->arch.apf.host_apf_flags;
   4339
   4340#ifndef CONFIG_X86_64
   4341	/* A 64-bit CR2 should be impossible on 32-bit KVM. */
   4342	if (WARN_ON_ONCE(fault_address >> 32))
   4343		return -EFAULT;
   4344#endif
   4345
   4346	vcpu->arch.l1tf_flush_l1d = true;
   4347	if (!flags) {
   4348		trace_kvm_page_fault(fault_address, error_code);
   4349
   4350		if (kvm_event_needs_reinjection(vcpu))
   4351			kvm_mmu_unprotect_page_virt(vcpu, fault_address);
   4352		r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
   4353				insn_len);
   4354	} else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
   4355		vcpu->arch.apf.host_apf_flags = 0;
   4356		local_irq_disable();
   4357		kvm_async_pf_task_wait_schedule(fault_address);
   4358		local_irq_enable();
   4359	} else {
   4360		WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags);
   4361	}
   4362
   4363	return r;
   4364}
   4365EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
   4366
   4367int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
   4368{
   4369	while (fault->max_level > PG_LEVEL_4K) {
   4370		int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
   4371		gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1);
   4372
   4373		if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
   4374			break;
   4375
   4376		--fault->max_level;
   4377	}
   4378
   4379	return direct_page_fault(vcpu, fault);
   4380}
   4381
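        /*
         * Resolve @gpa to a host pfn by running the TDP page-fault path directly
         * with the synthetic error code @err, retrying while the fault path
         * returns RET_PF_RETRY for a valid pfn.
         */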
   4382kvm_pfn_t kvm_mmu_map_tdp_page(struct kvm_vcpu *vcpu, gpa_t gpa,
   4383			       u32 err, int max_level)
   4384{
   4385	struct kvm_page_fault fault = {
   4386		.addr = gpa,
   4387		.error_code = err,
   4388		.exec = err & PFERR_FETCH_MASK,
   4389		.write = err & PFERR_WRITE_MASK,
   4390		.present = err & PFERR_PRESENT_MASK,
   4391		.rsvd = err & PFERR_RSVD_MASK,
   4392		.user = err & PFERR_USER_MASK,
   4393		.prefetch = false,
   4394		.is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault),
   4395		.nx_huge_page_workaround_enabled =
   4396			is_nx_huge_page_enabled(),
   4397
   4398		.max_level = max_level,
   4399		.req_level = PG_LEVEL_4K,
   4400		.goal_level = PG_LEVEL_4K,
   4401	};
   4402	int r;
   4403
   4404	if (mmu_topup_memory_caches(vcpu, false))
   4405		return KVM_PFN_ERR_FAULT;
   4406
   4407	/*
   4408	 * Loop on the page fault path to handle the case where an mmu_notifier
   4409	 * invalidation triggers RET_PF_RETRY.  In the normal page fault path,
   4410	 * KVM needs to resume the guest in case the invalidation changed any
   4411	 * of the page fault properties, i.e. the gpa or error code.  For this
   4412	 * path, the gpa and error code are fixed by the caller, and the caller
   4413	 * expects failure if and only if the page fault can't be fixed.
   4414	 */
   4415	do {
   4416		/*
   4417		 * TODO: this should probably go through kvm_mmu_do_page_fault(),
   4418		 * but we need a way to control the max_level, so maybe a direct
   4419		 * call to kvm_tdp_page_fault, which will call into
   4420		 * direct_page_fault() when appropriate.
   4421		 */
   4422		//r = direct_page_fault(vcpu, &fault);
    4423#ifdef CONFIG_RETPOLINE
    4424		if (fault.is_tdp)
    4425			r = kvm_tdp_page_fault(vcpu, &fault);
    4426		else
    4427#endif
    4428			r = vcpu->arch.mmu->page_fault(vcpu, &fault);
   4429	} while (r == RET_PF_RETRY && !is_error_noslot_pfn(fault.pfn));
   4430	return fault.pfn;
   4431}
   4432EXPORT_SYMBOL_GPL(kvm_mmu_map_tdp_page);
   4433
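        /*
         * Walk the current (TDP or shadow) page tables locklessly for @gpa and
         * report the pfn and level of the leaf SPTE, if one is present.
         */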
   4434bool kvm_mmu_get_tdp_walk(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_pfn_t *pfn, int *level)
   4435{
   4436	u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
   4437	int leaf, root;
   4438
   4439	walk_shadow_page_lockless_begin(vcpu);
   4440
   4441	if (is_tdp_mmu(vcpu->arch.mmu))
   4442		leaf = kvm_tdp_mmu_get_walk(vcpu, gpa, sptes, &root);
   4443	else
   4444		leaf = get_walk(vcpu, gpa, sptes, &root);
   4445
   4446	walk_shadow_page_lockless_end(vcpu);
   4447
   4448	if (unlikely(leaf < 0))
   4449		return false;
   4450
   4451	/* Check if the leaf SPTE is present */
   4452	if (!is_shadow_present_pte(sptes[leaf]))
   4453		return false;
   4454
   4455	*pfn = spte_to_pfn(sptes[leaf]);
   4456	if (leaf > PG_LEVEL_4K) {
   4457		u64 page_mask = KVM_PAGES_PER_HPAGE(leaf) - KVM_PAGES_PER_HPAGE(leaf - 1);
   4458		*pfn |= (gpa_to_gfn(gpa) & page_mask);
   4459	}
   4460
   4461	*level = leaf;
   4462
   4463	return true;
   4464}
   4465EXPORT_SYMBOL_GPL(kvm_mmu_get_tdp_walk);
   4466
   4467static void nonpaging_init_context(struct kvm_mmu *context)
   4468{
   4469	context->page_fault = nonpaging_page_fault;
   4470	context->gva_to_gpa = nonpaging_gva_to_gpa;
   4471	context->sync_page = nonpaging_sync_page;
   4472	context->invlpg = NULL;
   4473}
   4474
   4475static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
   4476				  union kvm_mmu_page_role role)
   4477{
   4478	return (role.direct || pgd == root->pgd) &&
   4479	       VALID_PAGE(root->hpa) &&
   4480	       role.word == to_shadow_page(root->hpa)->role.word;
   4481}
   4482
   4483/*
   4484 * Find out if a previously cached root matching the new pgd/role is available,
   4485 * and insert the current root as the MRU in the cache.
   4486 * If a matching root is found, it is assigned to kvm_mmu->root and
   4487 * true is returned.
   4488 * If no match is found, kvm_mmu->root is left invalid, the LRU root is
   4489 * evicted to make room for the current root, and false is returned.
   4490 */
   4491static bool cached_root_find_and_keep_current(struct kvm *kvm, struct kvm_mmu *mmu,
   4492					      gpa_t new_pgd,
   4493					      union kvm_mmu_page_role new_role)
   4494{
   4495	uint i;
   4496
   4497	if (is_root_usable(&mmu->root, new_pgd, new_role))
   4498		return true;
   4499
   4500	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
   4501		/*
   4502		 * The swaps end up rotating the cache like this:
   4503		 *   C   0 1 2 3   (on entry to the function)
   4504		 *   0   C 1 2 3
   4505		 *   1   C 0 2 3
   4506		 *   2   C 0 1 3
   4507		 *   3   C 0 1 2   (on exit from the loop)
   4508		 */
   4509		swap(mmu->root, mmu->prev_roots[i]);
   4510		if (is_root_usable(&mmu->root, new_pgd, new_role))
   4511			return true;
   4512	}
   4513
   4514	kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
   4515	return false;
   4516}
   4517
   4518/*
   4519 * Find out if a previously cached root matching the new pgd/role is available.
   4520 * On entry, mmu->root is invalid.
   4521 * If a matching root is found, it is assigned to kvm_mmu->root, the LRU entry
   4522 * of the cache becomes invalid, and true is returned.
   4523 * If no match is found, kvm_mmu->root is left invalid and false is returned.
   4524 */
   4525static bool cached_root_find_without_current(struct kvm *kvm, struct kvm_mmu *mmu,
   4526					     gpa_t new_pgd,
   4527					     union kvm_mmu_page_role new_role)
   4528{
   4529	uint i;
   4530
   4531	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
   4532		if (is_root_usable(&mmu->prev_roots[i], new_pgd, new_role))
   4533			goto hit;
   4534
   4535	return false;
   4536
   4537hit:
   4538	swap(mmu->root, mmu->prev_roots[i]);
   4539	/* Bubble up the remaining roots.  */
   4540	for (; i < KVM_MMU_NUM_PREV_ROOTS - 1; i++)
   4541		mmu->prev_roots[i] = mmu->prev_roots[i + 1];
   4542	mmu->prev_roots[i].hpa = INVALID_PAGE;
   4543	return true;
   4544}
   4545
   4546static bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu,
   4547			    gpa_t new_pgd, union kvm_mmu_page_role new_role)
   4548{
   4549	/*
   4550	 * For now, limit the caching to 64-bit hosts+VMs in order to avoid
   4551	 * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs
   4552	 * later if necessary.
   4553	 */
   4554	if (VALID_PAGE(mmu->root.hpa) && !to_shadow_page(mmu->root.hpa))
   4555		kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
   4556
   4557	if (VALID_PAGE(mmu->root.hpa))
   4558		return cached_root_find_and_keep_current(kvm, mmu, new_pgd, new_role);
   4559	else
   4560		return cached_root_find_without_current(kvm, mmu, new_pgd, new_role);
   4561}
   4562
   4563void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
   4564{
   4565	struct kvm_mmu *mmu = vcpu->arch.mmu;
   4566	union kvm_mmu_page_role new_role = mmu->root_role;
   4567
   4568	if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role)) {
   4569		/* kvm_mmu_ensure_valid_pgd will set up a new root.  */
   4570		return;
   4571	}
   4572
   4573	/*
   4574	 * It's possible that the cached previous root page is obsolete because
   4575	 * of a change in the MMU generation number. However, changing the
   4576	 * generation number is accompanied by KVM_REQ_MMU_FREE_OBSOLETE_ROOTS,
   4577	 * which will free the root set here and allocate a new one.
   4578	 */
   4579	kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
   4580
   4581	if (force_flush_and_sync_on_reuse) {
   4582		kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
   4583		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
   4584	}
   4585
   4586	/*
   4587	 * The last MMIO access's GVA and GPA are cached in the VCPU. When
   4588	 * switching to a new CR3, that GVA->GPA mapping may no longer be
   4589	 * valid. So clear any cached MMIO info even when we don't need to sync
   4590	 * the shadow page tables.
   4591	 */
   4592	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
   4593
   4594	/*
   4595	 * If this is a direct root page, it doesn't have a write flooding
   4596	 * count. Otherwise, clear the write flooding count.
   4597	 */
   4598	if (!new_role.direct)
   4599		__clear_sp_write_flooding_count(
   4600				to_shadow_page(vcpu->arch.mmu->root.hpa));
   4601}
   4602EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
   4603
   4604static unsigned long get_cr3(struct kvm_vcpu *vcpu)
   4605{
   4606	return kvm_read_cr3(vcpu);
   4607}
   4608
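        /*
         * Refresh an MMIO SPTE during sync: drop it if the gfn no longer matches,
         * otherwise re-mark it with the new access bits.  Returns true if the
         * SPTE was an MMIO SPTE and has been handled.
         */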
   4609static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
   4610			   unsigned int access)
   4611{
   4612	if (unlikely(is_mmio_spte(*sptep))) {
   4613		if (gfn != get_mmio_spte_gfn(*sptep)) {
   4614			mmu_spte_clear_no_track(sptep);
   4615			return true;
   4616		}
   4617
   4618		mark_mmio_spte(vcpu, sptep, gfn, access);
   4619		return true;
   4620	}
   4621
   4622	return false;
   4623}
   4624
   4625#define PTTYPE_EPT 18 /* arbitrary */
   4626#define PTTYPE PTTYPE_EPT
   4627#include "paging_tmpl.h"
   4628#undef PTTYPE
   4629
   4630#define PTTYPE 64
   4631#include "paging_tmpl.h"
   4632#undef PTTYPE
   4633
   4634#define PTTYPE 32
   4635#include "paging_tmpl.h"
   4636#undef PTTYPE
   4637
   4638static void
   4639__reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
   4640			u64 pa_bits_rsvd, int level, bool nx, bool gbpages,
   4641			bool pse, bool amd)
   4642{
   4643	u64 gbpages_bit_rsvd = 0;
   4644	u64 nonleaf_bit8_rsvd = 0;
   4645	u64 high_bits_rsvd;
   4646
   4647	rsvd_check->bad_mt_xwr = 0;
   4648
   4649	if (!gbpages)
   4650		gbpages_bit_rsvd = rsvd_bits(7, 7);
   4651
   4652	if (level == PT32E_ROOT_LEVEL)
   4653		high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 62);
   4654	else
   4655		high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
   4656
   4657	/* Note, NX doesn't exist in PDPTEs, this is handled below. */
   4658	if (!nx)
   4659		high_bits_rsvd |= rsvd_bits(63, 63);
   4660
   4661	/*
   4662	 * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
   4663	 * leaf entries) on AMD CPUs only.
   4664	 */
   4665	if (amd)
   4666		nonleaf_bit8_rsvd = rsvd_bits(8, 8);
   4667
   4668	switch (level) {
   4669	case PT32_ROOT_LEVEL:
   4670		/* no rsvd bits for 2 level 4K page table entries */
   4671		rsvd_check->rsvd_bits_mask[0][1] = 0;
   4672		rsvd_check->rsvd_bits_mask[0][0] = 0;
   4673		rsvd_check->rsvd_bits_mask[1][0] =
   4674			rsvd_check->rsvd_bits_mask[0][0];
   4675
   4676		if (!pse) {
   4677			rsvd_check->rsvd_bits_mask[1][1] = 0;
   4678			break;
   4679		}
   4680
   4681		if (is_cpuid_PSE36())
   4682			/* 36bits PSE 4MB page */
   4683			rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
   4684		else
   4685			/* 32 bits PSE 4MB page */
   4686			rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
   4687		break;
   4688	case PT32E_ROOT_LEVEL:
   4689		rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) |
   4690						   high_bits_rsvd |
   4691						   rsvd_bits(5, 8) |
   4692						   rsvd_bits(1, 2);	/* PDPTE */
   4693		rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd;	/* PDE */
   4694		rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;	/* PTE */
   4695		rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
   4696						   rsvd_bits(13, 20);	/* large page */
   4697		rsvd_check->rsvd_bits_mask[1][0] =
   4698			rsvd_check->rsvd_bits_mask[0][0];
   4699		break;
   4700	case PT64_ROOT_5LEVEL:
   4701		rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd |
   4702						   nonleaf_bit8_rsvd |
   4703						   rsvd_bits(7, 7);
   4704		rsvd_check->rsvd_bits_mask[1][4] =
   4705			rsvd_check->rsvd_bits_mask[0][4];
   4706		fallthrough;
   4707	case PT64_ROOT_4LEVEL:
   4708		rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd |
   4709						   nonleaf_bit8_rsvd |
   4710						   rsvd_bits(7, 7);
   4711		rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd |
   4712						   gbpages_bit_rsvd;
   4713		rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd;
   4714		rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
   4715		rsvd_check->rsvd_bits_mask[1][3] =
   4716			rsvd_check->rsvd_bits_mask[0][3];
   4717		rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd |
   4718						   gbpages_bit_rsvd |
   4719						   rsvd_bits(13, 29);
   4720		rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
   4721						   rsvd_bits(13, 20); /* large page */
   4722		rsvd_check->rsvd_bits_mask[1][0] =
   4723			rsvd_check->rsvd_bits_mask[0][0];
   4724		break;
   4725	}
   4726}
   4727
   4728static bool guest_can_use_gbpages(struct kvm_vcpu *vcpu)
   4729{
   4730	/*
   4731	 * If TDP is enabled, let the guest use GBPAGES if they're supported in
   4732	 * hardware.  The hardware page walker doesn't let KVM disable GBPAGES,
   4733	 * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA
   4734	 * walk for performance and complexity reasons.  Not to mention KVM
   4735	 * _can't_ solve the problem because GVA->GPA walks aren't visible to
   4736	 * KVM once a TDP translation is installed.  Mimic hardware behavior so
   4737	 * that KVM's is at least consistent, i.e. doesn't randomly inject #PF.
   4738	 */
   4739	return tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) :
   4740			     guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES);
   4741}
   4742
   4743static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu,
   4744					struct kvm_mmu *context)
   4745{
   4746	__reset_rsvds_bits_mask(&context->guest_rsvd_check,
   4747				vcpu->arch.reserved_gpa_bits,
   4748				context->cpu_role.base.level, is_efer_nx(context),
   4749				guest_can_use_gbpages(vcpu),
   4750				is_cr4_pse(context),
   4751				guest_cpuid_is_amd_or_hygon(vcpu));
   4752}
   4753
   4754static void
   4755__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
   4756			    u64 pa_bits_rsvd, bool execonly, int huge_page_level)
   4757{
   4758	u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
   4759	u64 large_1g_rsvd = 0, large_2m_rsvd = 0;
   4760	u64 bad_mt_xwr;
   4761
   4762	if (huge_page_level < PG_LEVEL_1G)
   4763		large_1g_rsvd = rsvd_bits(7, 7);
   4764	if (huge_page_level < PG_LEVEL_2M)
   4765		large_2m_rsvd = rsvd_bits(7, 7);
   4766
   4767	rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7);
   4768	rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7);
   4769	rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd;
   4770	rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd;
   4771	rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
   4772
   4773	/* large page */
   4774	rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
   4775	rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
   4776	rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd;
   4777	rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd;
   4778	rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
   4779
   4780	bad_mt_xwr = 0xFFull << (2 * 8);	/* bits 3..5 must not be 2 */
   4781	bad_mt_xwr |= 0xFFull << (3 * 8);	/* bits 3..5 must not be 3 */
   4782	bad_mt_xwr |= 0xFFull << (7 * 8);	/* bits 3..5 must not be 7 */
   4783	bad_mt_xwr |= REPEAT_BYTE(1ull << 2);	/* bits 0..2 must not be 010 */
   4784	bad_mt_xwr |= REPEAT_BYTE(1ull << 6);	/* bits 0..2 must not be 110 */
   4785	if (!execonly) {
   4786		/* bits 0..2 must not be 100 unless VMX capabilities allow it */
   4787		bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
   4788	}
   4789	rsvd_check->bad_mt_xwr = bad_mt_xwr;
   4790}
   4791
   4792static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
   4793		struct kvm_mmu *context, bool execonly, int huge_page_level)
   4794{
   4795	__reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
   4796				    vcpu->arch.reserved_gpa_bits, execonly,
   4797				    huge_page_level);
   4798}
   4799
   4800static inline u64 reserved_hpa_bits(void)
   4801{
   4802	return rsvd_bits(shadow_phys_bits, 63);
   4803}
   4804
   4805/*
    4806 * The page table on the host is the shadow page table for the page
    4807 * table in the guest or an AMD nested guest; its MMU features completely
    4808 * follow the features in the guest.
   4809 */
   4810static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
   4811					struct kvm_mmu *context)
   4812{
    4813	/* @amd adds a check on bit 8 of SPTEs, which KVM shouldn't use anyways. */
   4814	bool is_amd = true;
   4815	/* KVM doesn't use 2-level page tables for the shadow MMU. */
   4816	bool is_pse = false;
   4817	struct rsvd_bits_validate *shadow_zero_check;
   4818	int i;
   4819
   4820	WARN_ON_ONCE(context->root_role.level < PT32E_ROOT_LEVEL);
   4821
   4822	shadow_zero_check = &context->shadow_zero_check;
   4823	__reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
   4824				context->root_role.level,
   4825				context->root_role.efer_nx,
   4826				guest_can_use_gbpages(vcpu), is_pse, is_amd);
   4827
   4828	if (!shadow_me_mask)
   4829		return;
   4830
   4831	for (i = context->root_role.level; --i >= 0;) {
   4832		/*
   4833		 * So far shadow_me_value is a constant during KVM's life
   4834		 * time.  Bits in shadow_me_value are allowed to be set.
   4835		 * Bits in shadow_me_mask but not in shadow_me_value are
   4836		 * not allowed to be set.
   4837		 */
   4838		shadow_zero_check->rsvd_bits_mask[0][i] |= shadow_me_mask;
   4839		shadow_zero_check->rsvd_bits_mask[1][i] |= shadow_me_mask;
   4840		shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_value;
   4841		shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_value;
   4842	}
   4843
   4844}
   4845
   4846static inline bool boot_cpu_is_amd(void)
   4847{
   4848	WARN_ON_ONCE(!tdp_enabled);
   4849	return shadow_x_mask == 0;
   4850}
   4851
   4852/*
    4853 * The direct page table on the host uses as many MMU features as
    4854 * possible; however, KVM currently does not do execution-protection.
   4855 */
   4856static void
   4857reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
   4858{
   4859	struct rsvd_bits_validate *shadow_zero_check;
   4860	int i;
   4861
   4862	shadow_zero_check = &context->shadow_zero_check;
   4863
   4864	if (boot_cpu_is_amd())
   4865		__reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
   4866					context->root_role.level, false,
   4867					boot_cpu_has(X86_FEATURE_GBPAGES),
   4868					false, true);
   4869	else
   4870		__reset_rsvds_bits_mask_ept(shadow_zero_check,
   4871					    reserved_hpa_bits(), false,
   4872					    max_huge_page_level);
   4873
   4874	if (!shadow_me_mask)
   4875		return;
   4876
   4877	for (i = context->root_role.level; --i >= 0;) {
   4878		shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
   4879		shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
   4880	}
   4881}
   4882
   4883/*
    4884 * Same as the comments in reset_shadow_zero_bits_mask(), except this
    4885 * is the shadow page table for an Intel nested guest.
   4886 */
   4887static void
   4888reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly)
   4889{
   4890	__reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
   4891				    reserved_hpa_bits(), execonly,
   4892				    max_huge_page_level);
   4893}
   4894
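        /*
         * Expand a single ACC_*_MASK bit into an 8-bit map: bit 'combo' (1..7)
         * is set iff the 3-bit access combination 'combo' includes that access
         * bit.  Used below to build the per-PFEC permission bitmaps.
         */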
   4895#define BYTE_MASK(access) \
   4896	((1 & (access) ? 2 : 0) | \
   4897	 (2 & (access) ? 4 : 0) | \
   4898	 (3 & (access) ? 8 : 0) | \
   4899	 (4 & (access) ? 16 : 0) | \
   4900	 (5 & (access) ? 32 : 0) | \
   4901	 (6 & (access) ? 64 : 0) | \
   4902	 (7 & (access) ? 128 : 0))
   4903
   4904
   4905static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept)
   4906{
   4907	unsigned byte;
   4908
   4909	const u8 x = BYTE_MASK(ACC_EXEC_MASK);
   4910	const u8 w = BYTE_MASK(ACC_WRITE_MASK);
   4911	const u8 u = BYTE_MASK(ACC_USER_MASK);
   4912
   4913	bool cr4_smep = is_cr4_smep(mmu);
   4914	bool cr4_smap = is_cr4_smap(mmu);
   4915	bool cr0_wp = is_cr0_wp(mmu);
   4916	bool efer_nx = is_efer_nx(mmu);
   4917
   4918	for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
   4919		unsigned pfec = byte << 1;
   4920
   4921		/*
   4922		 * Each "*f" variable has a 1 bit for each UWX value
   4923		 * that causes a fault with the given PFEC.
   4924		 */
   4925
   4926		/* Faults from writes to non-writable pages */
   4927		u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0;
   4928		/* Faults from user mode accesses to supervisor pages */
   4929		u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0;
    4930		/* Faults from fetches of non-executable pages */
   4931		u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0;
   4932		/* Faults from kernel mode fetches of user pages */
   4933		u8 smepf = 0;
   4934		/* Faults from kernel mode accesses of user pages */
   4935		u8 smapf = 0;
   4936
   4937		if (!ept) {
   4938			/* Faults from kernel mode accesses to user pages */
   4939			u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
   4940
   4941			/* Not really needed: !nx will cause pte.nx to fault */
   4942			if (!efer_nx)
   4943				ff = 0;
   4944
   4945			/* Allow supervisor writes if !cr0.wp */
   4946			if (!cr0_wp)
   4947				wf = (pfec & PFERR_USER_MASK) ? wf : 0;
   4948
   4949			/* Disallow supervisor fetches of user code if cr4.smep */
   4950			if (cr4_smep)
   4951				smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
   4952
   4953			/*
   4954			 * SMAP:kernel-mode data accesses from user-mode
   4955			 * mappings should fault. A fault is considered
   4956			 * as a SMAP violation if all of the following
   4957			 * conditions are true:
   4958			 *   - X86_CR4_SMAP is set in CR4
   4959			 *   - A user page is accessed
   4960			 *   - The access is not a fetch
   4961			 *   - The access is supervisor mode
   4962			 *   - If implicit supervisor access or X86_EFLAGS_AC is clear
   4963			 *
   4964			 * Here, we cover the first four conditions.
   4965			 * The fifth is computed dynamically in permission_fault();
   4966			 * PFERR_RSVD_MASK bit will be set in PFEC if the access is
   4967			 * *not* subject to SMAP restrictions.
   4968			 */
   4969			if (cr4_smap)
   4970				smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
   4971		}
   4972
   4973		mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
   4974	}
   4975}
   4976
   4977/*
   4978* PKU is an additional mechanism by which the paging controls access to
   4979* user-mode addresses based on the value in the PKRU register.  Protection
   4980* key violations are reported through a bit in the page fault error code.
   4981* Unlike other bits of the error code, the PK bit is not known at the
   4982* call site of e.g. gva_to_gpa; it must be computed directly in
   4983* permission_fault based on two bits of PKRU, on some machine state (CR4,
   4984* CR0, EFER, CPL), and on other bits of the error code and the page tables.
   4985*
   4986* In particular the following conditions come from the error code, the
   4987* page tables and the machine state:
   4988* - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
   4989* - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
   4990* - PK is always zero if U=0 in the page tables
   4991* - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
   4992*
   4993* The PKRU bitmask caches the result of these four conditions.  The error
   4994* code (minus the P bit) and the page table's U bit form an index into the
   4995* PKRU bitmask.  Two bits of the PKRU bitmask are then extracted and ANDed
   4996* with the two bits of the PKRU register corresponding to the protection key.
   4997* For the first three conditions above the bits will be 00, thus masking
   4998* away both AD and WD.  For all reads or if the last condition holds, WD
   4999* only will be masked away.
   5000*/
   5001static void update_pkru_bitmask(struct kvm_mmu *mmu)
   5002{
   5003	unsigned bit;
   5004	bool wp;
   5005
   5006	mmu->pkru_mask = 0;
   5007
   5008	if (!is_cr4_pke(mmu))
   5009		return;
   5010
   5011	wp = is_cr0_wp(mmu);
   5012
   5013	for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
   5014		unsigned pfec, pkey_bits;
   5015		bool check_pkey, check_write, ff, uf, wf, pte_user;
   5016
   5017		pfec = bit << 1;
   5018		ff = pfec & PFERR_FETCH_MASK;
   5019		uf = pfec & PFERR_USER_MASK;
   5020		wf = pfec & PFERR_WRITE_MASK;
   5021
   5022		/* PFEC.RSVD is replaced by ACC_USER_MASK. */
   5023		pte_user = pfec & PFERR_RSVD_MASK;
   5024
   5025		/*
   5026		 * We only need to check accesses that are not instruction
   5027		 * fetches and that target a user page.
   5028		 */
   5029		check_pkey = (!ff && pte_user);
   5030		/*
   5031		 * Write access is controlled by PKRU if it is a
   5032		 * user access or CR0.WP = 1.
   5033		 */
   5034		check_write = check_pkey && wf && (uf || wp);
   5035
   5036		/* PKRU.AD stops both read and write access. */
   5037		pkey_bits = !!check_pkey;
   5038		/* PKRU.WD stops write access. */
   5039		pkey_bits |= (!!check_write) << 1;
   5040
   5041		mmu->pkru_mask |= (pkey_bits & 3) << pfec;
   5042	}
   5043}
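
       /*
        * Rough sketch of how pkru_mask is consumed (see permission_fault()
        * in mmu.h for the exact offset computation): the two mask bits
        * selected by the PFEC/pte.U-derived offset are ANDed with the AD/WD
        * bits of PKRU for the page's protection key, approximately:
        *
        *   pkru_bits  = (vcpu->arch.pkru >> (pte_pkey * 2)) & 3;
        *   pkru_bits &= mmu->pkru_mask >> offset;
        *   fault     |= (pkru_bits != 0);
        */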
   5044
   5045static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu,
   5046					struct kvm_mmu *mmu)
   5047{
   5048	if (!is_cr0_pg(mmu))
   5049		return;
   5050
   5051	reset_guest_rsvds_bits_mask(vcpu, mmu);
   5052	update_permission_bitmask(mmu, false);
   5053	update_pkru_bitmask(mmu);
   5054}
   5055
   5056static void paging64_init_context(struct kvm_mmu *context)
   5057{
   5058	context->page_fault = paging64_page_fault;
   5059	context->gva_to_gpa = paging64_gva_to_gpa;
   5060	context->sync_page = paging64_sync_page;
   5061	context->invlpg = paging64_invlpg;
   5062}
   5063
   5064static void paging32_init_context(struct kvm_mmu *context)
   5065{
   5066	context->page_fault = paging32_page_fault;
   5067	context->gva_to_gpa = paging32_gva_to_gpa;
   5068	context->sync_page = paging32_sync_page;
   5069	context->invlpg = paging32_invlpg;
   5070}
   5071
   5072static union kvm_cpu_role
   5073kvm_calc_cpu_role(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs)
   5074{
   5075	union kvm_cpu_role role = {0};
   5076
   5077	role.base.access = ACC_ALL;
   5078	role.base.smm = is_smm(vcpu);
   5079	role.base.guest_mode = is_guest_mode(vcpu);
   5080	role.ext.valid = 1;
   5081
   5082	if (!____is_cr0_pg(regs)) {
   5083		role.base.direct = 1;
   5084		return role;
   5085	}
   5086
   5087	role.base.efer_nx = ____is_efer_nx(regs);
   5088	role.base.cr0_wp = ____is_cr0_wp(regs);
   5089	role.base.smep_andnot_wp = ____is_cr4_smep(regs) && !____is_cr0_wp(regs);
   5090	role.base.smap_andnot_wp = ____is_cr4_smap(regs) && !____is_cr0_wp(regs);
   5091	role.base.has_4_byte_gpte = !____is_cr4_pae(regs);
   5092
   5093	if (____is_efer_lma(regs))
   5094		role.base.level = ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL
   5095							: PT64_ROOT_4LEVEL;
   5096	else if (____is_cr4_pae(regs))
   5097		role.base.level = PT32E_ROOT_LEVEL;
   5098	else
   5099		role.base.level = PT32_ROOT_LEVEL;
   5100
   5101	role.ext.cr4_smep = ____is_cr4_smep(regs);
   5102	role.ext.cr4_smap = ____is_cr4_smap(regs);
   5103	role.ext.cr4_pse = ____is_cr4_pse(regs);
   5104
   5105	/* PKEY and LA57 are active iff long mode is active. */
   5106	role.ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
   5107	role.ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
   5108	role.ext.efer_lma = ____is_efer_lma(regs);
   5109	return role;
   5110}
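
       /*
        * Example: for a 64-bit guest (EFER.LMA=1) with CR4.LA57=0 and
        * CR4.PKE=1 this yields role.base.level == PT64_ROOT_4LEVEL and
        * role.ext.cr4_pke == 1, whereas with CR0.PG=0 the role is simply
        * "direct" and all paging-derived bits stay clear.
        */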
   5111
   5112static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
   5113{
   5114	/* tdp_root_level is the architecture-forced level, use it if nonzero */
   5115	if (tdp_root_level)
   5116		return tdp_root_level;
   5117
   5118	/* Use 5-level TDP if and only if it's useful/necessary. */
   5119	if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
   5120		return 4;
   5121
   5122	return max_tdp_level;
   5123}
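
       /*
        * Example: with max_tdp_level == 5, no forced tdp_root_level and a
        * vCPU whose CPUID reports MAXPHYADDR <= 48, this returns 4, since a
        * 4-level TDP root already covers the guest's physical address space.
        */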
   5124
   5125static union kvm_mmu_page_role
   5126kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
   5127				union kvm_cpu_role cpu_role)
   5128{
   5129	union kvm_mmu_page_role role = {0};
   5130
   5131	role.access = ACC_ALL;
   5132	role.cr0_wp = true;
   5133	role.efer_nx = true;
   5134	role.smm = cpu_role.base.smm;
   5135	role.guest_mode = cpu_role.base.guest_mode;
   5136	role.ad_disabled = !kvm_ad_enabled();
   5137	role.level = kvm_mmu_get_tdp_level(vcpu);
   5138	role.direct = true;
   5139	role.has_4_byte_gpte = false;
   5140
   5141	return role;
   5142}
   5143
   5144static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
   5145			     union kvm_cpu_role cpu_role)
   5146{
   5147	struct kvm_mmu *context = &vcpu->arch.root_mmu;
   5148	union kvm_mmu_page_role root_role = kvm_calc_tdp_mmu_root_page_role(vcpu, cpu_role);
   5149
   5150	if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
   5151	    root_role.word == context->root_role.word)
   5152		return;
   5153
   5154	context->cpu_role.as_u64 = cpu_role.as_u64;
   5155	context->root_role.word = root_role.word;
   5156	context->page_fault = kvm_tdp_page_fault;
   5157	context->sync_page = nonpaging_sync_page;
   5158	context->invlpg = NULL;
   5159	context->get_guest_pgd = get_cr3;
   5160	context->get_pdptr = kvm_pdptr_read;
   5161	context->inject_page_fault = kvm_inject_page_fault;
   5162
   5163	if (!is_cr0_pg(context))
   5164		context->gva_to_gpa = nonpaging_gva_to_gpa;
   5165	else if (is_cr4_pae(context))
   5166		context->gva_to_gpa = paging64_gva_to_gpa;
   5167	else
   5168		context->gva_to_gpa = paging32_gva_to_gpa;
   5169
   5170	reset_guest_paging_metadata(vcpu, context);
   5171	reset_tdp_shadow_zero_bits_mask(context);
   5172}
   5173
   5174static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
   5175				    union kvm_cpu_role cpu_role,
   5176				    union kvm_mmu_page_role root_role)
   5177{
   5178	if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
   5179	    root_role.word == context->root_role.word)
   5180		return;
   5181
   5182	context->cpu_role.as_u64 = cpu_role.as_u64;
   5183	context->root_role.word = root_role.word;
   5184
   5185	if (!is_cr0_pg(context))
   5186		nonpaging_init_context(context);
   5187	else if (is_cr4_pae(context))
   5188		paging64_init_context(context);
   5189	else
   5190		paging32_init_context(context);
   5191
   5192	reset_guest_paging_metadata(vcpu, context);
   5193	reset_shadow_zero_bits_mask(vcpu, context);
   5194}
   5195
   5196static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
   5197				union kvm_cpu_role cpu_role)
   5198{
   5199	struct kvm_mmu *context = &vcpu->arch.root_mmu;
   5200	union kvm_mmu_page_role root_role;
   5201
   5202	root_role = cpu_role.base;
   5203
   5204	/* KVM uses PAE paging whenever the guest isn't using 64-bit paging. */
   5205	root_role.level = max_t(u32, root_role.level, PT32E_ROOT_LEVEL);
   5206
   5207	/*
   5208	 * KVM forces EFER.NX=1 when TDP is disabled, reflect it in the MMU role.
   5209	 * KVM uses NX when TDP is disabled to handle a variety of scenarios,
   5210	 * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
   5211	 * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
   5212	 * The iTLB multi-hit workaround can be toggled at any time, so assume
   5213	 * NX can be used by any non-nested shadow MMU to avoid having to reset
   5214	 * MMU contexts.
   5215	 */
   5216	root_role.efer_nx = true;
   5217
   5218	shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
   5219}
   5220
   5221void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
   5222			     unsigned long cr4, u64 efer, gpa_t nested_cr3)
   5223{
   5224	struct kvm_mmu *context = &vcpu->arch.guest_mmu;
   5225	struct kvm_mmu_role_regs regs = {
   5226		.cr0 = cr0,
   5227		.cr4 = cr4 & ~X86_CR4_PKE,
   5228		.efer = efer,
   5229	};
   5230	union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
   5231	union kvm_mmu_page_role root_role;
   5232
   5233	/* NPT requires CR0.PG=1. */
   5234	WARN_ON_ONCE(cpu_role.base.direct);
   5235
   5236	root_role = cpu_role.base;
   5237	root_role.level = kvm_mmu_get_tdp_level(vcpu);
   5238	if (root_role.level == PT64_ROOT_5LEVEL &&
   5239	    cpu_role.base.level == PT64_ROOT_4LEVEL)
   5240		root_role.passthrough = 1;
   5241
   5242	shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
   5243	kvm_mmu_new_pgd(vcpu, nested_cr3);
   5244}
   5245EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
   5246
   5247static union kvm_cpu_role
   5248kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
   5249				   bool execonly, u8 level)
   5250{
   5251	union kvm_cpu_role role = {0};
   5252
   5253	/*
   5254	 * KVM does not support SMM transfer monitors, and consequently does not
   5255	 * support the "entry to SMM" control either.  role.base.smm is always 0.
   5256	 */
   5257	WARN_ON_ONCE(is_smm(vcpu));
   5258	role.base.level = level;
   5259	role.base.has_4_byte_gpte = false;
   5260	role.base.direct = false;
   5261	role.base.ad_disabled = !accessed_dirty;
   5262	role.base.guest_mode = true;
   5263	role.base.access = ACC_ALL;
   5264
   5265	role.ext.word = 0;
   5266	role.ext.execonly = execonly;
   5267	role.ext.valid = 1;
   5268
   5269	return role;
   5270}
   5271
   5272void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
   5273			     int huge_page_level, bool accessed_dirty,
   5274			     gpa_t new_eptp)
   5275{
   5276	struct kvm_mmu *context = &vcpu->arch.guest_mmu;
   5277	u8 level = vmx_eptp_page_walk_level(new_eptp);
   5278	union kvm_cpu_role new_mode =
   5279		kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
   5280						   execonly, level);
   5281
   5282	if (new_mode.as_u64 != context->cpu_role.as_u64) {
   5283		/* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */
   5284		context->cpu_role.as_u64 = new_mode.as_u64;
   5285		context->root_role.word = new_mode.base.word;
   5286
   5287		context->page_fault = ept_page_fault;
   5288		context->gva_to_gpa = ept_gva_to_gpa;
   5289		context->sync_page = ept_sync_page;
   5290		context->invlpg = ept_invlpg;
   5291
   5292		update_permission_bitmask(context, true);
   5293		context->pkru_mask = 0;
   5294		reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level);
   5295		reset_ept_shadow_zero_bits_mask(context, execonly);
   5296	}
   5297
   5298	kvm_mmu_new_pgd(vcpu, new_eptp);
   5299}
   5300EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
   5301
   5302static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
   5303			     union kvm_cpu_role cpu_role)
   5304{
   5305	struct kvm_mmu *context = &vcpu->arch.root_mmu;
   5306
   5307	kvm_init_shadow_mmu(vcpu, cpu_role);
   5308
   5309	context->get_guest_pgd     = get_cr3;
   5310	context->get_pdptr         = kvm_pdptr_read;
   5311	context->inject_page_fault = kvm_inject_page_fault;
   5312}
   5313
   5314static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu,
   5315				union kvm_cpu_role new_mode)
   5316{
   5317	struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
   5318
   5319	if (new_mode.as_u64 == g_context->cpu_role.as_u64)
   5320		return;
   5321
   5322	g_context->cpu_role.as_u64   = new_mode.as_u64;
   5323	g_context->get_guest_pgd     = get_cr3;
   5324	g_context->get_pdptr         = kvm_pdptr_read;
   5325	g_context->inject_page_fault = kvm_inject_page_fault;
   5326
   5327	/*
   5328	 * L2 page tables are never shadowed, so there is no need to sync
   5329	 * SPTEs.
   5330	 */
   5331	g_context->invlpg            = NULL;
   5332
   5333	/*
   5334	 * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
   5335	 * L1's nested page tables (e.g. EPT12). The nested translation
   5336	 * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
   5337	 * L2's page tables as the first level of translation and L1's
   5338	 * nested page tables as the second level of translation. Basically
   5339	 * the gva_to_gpa functions between mmu and nested_mmu are swapped.
   5340	 */
   5341	if (!is_paging(vcpu))
   5342		g_context->gva_to_gpa = nonpaging_gva_to_gpa;
   5343	else if (is_long_mode(vcpu))
   5344		g_context->gva_to_gpa = paging64_gva_to_gpa;
   5345	else if (is_pae(vcpu))
   5346		g_context->gva_to_gpa = paging64_gva_to_gpa;
   5347	else
   5348		g_context->gva_to_gpa = paging32_gva_to_gpa;
   5349
   5350	reset_guest_paging_metadata(vcpu, g_context);
   5351}
   5352
   5353void kvm_init_mmu(struct kvm_vcpu *vcpu)
   5354{
   5355	struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
   5356	union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
   5357
   5358	if (mmu_is_nested(vcpu))
   5359		init_kvm_nested_mmu(vcpu, cpu_role);
   5360	else if (tdp_enabled)
   5361		init_kvm_tdp_mmu(vcpu, cpu_role);
   5362	else
   5363		init_kvm_softmmu(vcpu, cpu_role);
   5364}
   5365EXPORT_SYMBOL_GPL(kvm_init_mmu);
   5366
   5367void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
   5368{
   5369	/*
   5370	 * Invalidate all MMU roles to force them to reinitialize as CPUID
   5371	 * information is factored into reserved bit calculations.
   5372	 *
   5373	 * Correctly handling multiple vCPU models (with respect to paging and
   5374	 * physical address properties) in a single VM would require tracking
   5375	 * all relevant CPUID information in kvm_mmu_page_role. That is very
   5376	 * undesirable as it would increase the memory requirements for
   5377	 * gfn_track (see struct kvm_mmu_page_role comments).  For now that
   5378	 * problem is swept under the rug; KVM's CPUID API is horrific and
   5379	 * it's all but impossible to solve it without introducing a new API.
   5380	 */
   5381	vcpu->arch.root_mmu.root_role.word = 0;
   5382	vcpu->arch.guest_mmu.root_role.word = 0;
   5383	vcpu->arch.nested_mmu.root_role.word = 0;
   5384	vcpu->arch.root_mmu.cpu_role.ext.valid = 0;
   5385	vcpu->arch.guest_mmu.cpu_role.ext.valid = 0;
   5386	vcpu->arch.nested_mmu.cpu_role.ext.valid = 0;
   5387	kvm_mmu_reset_context(vcpu);
   5388
   5389	/*
   5390	 * Changing guest CPUID after KVM_RUN is forbidden, see the comment in
   5391	 * kvm_arch_vcpu_ioctl().
   5392	 */
   5393	KVM_BUG_ON(vcpu->arch.last_vmentry_cpu != -1, vcpu->kvm);
   5394}
   5395
   5396void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
   5397{
   5398	kvm_mmu_unload(vcpu);
   5399	kvm_init_mmu(vcpu);
   5400}
   5401EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
   5402
   5403int kvm_mmu_load(struct kvm_vcpu *vcpu)
   5404{
   5405	int r;
   5406
   5407	r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->root_role.direct);
   5408	if (r)
   5409		goto out;
   5410	r = mmu_alloc_special_roots(vcpu);
   5411	if (r)
   5412		goto out;
   5413	if (vcpu->arch.mmu->root_role.direct)
   5414		r = mmu_alloc_direct_roots(vcpu);
   5415	else
   5416		r = mmu_alloc_shadow_roots(vcpu);
   5417	if (r)
   5418		goto out;
   5419
   5420	kvm_mmu_sync_roots(vcpu);
   5421
   5422	kvm_mmu_load_pgd(vcpu);
   5423
   5424	/*
   5425	 * Flush any TLB entries for the new root, the provenance of the root
   5426	 * is unknown.  Even if KVM ensures there are no stale TLB entries
   5427	 * for a freed root, in theory another hypervisor could have left
   5428	 * stale entries.  Flushing on alloc also allows KVM to skip the TLB
   5429	 * flush when freeing a root (see kvm_tdp_mmu_put_root()).
   5430	 */
   5431	static_call(kvm_x86_flush_tlb_current)(vcpu);
   5432out:
   5433	return r;
   5434}
   5435
   5436void kvm_mmu_unload(struct kvm_vcpu *vcpu)
   5437{
   5438	struct kvm *kvm = vcpu->kvm;
   5439
   5440	kvm_mmu_free_roots(kvm, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
   5441	WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root.hpa));
   5442	kvm_mmu_free_roots(kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
   5443	WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa));
   5444	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
   5445}
   5446
   5447static bool is_obsolete_root(struct kvm *kvm, hpa_t root_hpa)
   5448{
   5449	struct kvm_mmu_page *sp;
   5450
   5451	if (!VALID_PAGE(root_hpa))
   5452		return false;
   5453
   5454	/*
   5455	 * When freeing obsolete roots, treat roots as obsolete if they don't
   5456	 * have an associated shadow page.  This does mean KVM will get false
   5457	 * positives and free roots that don't strictly need to be freed, but
   5458	 * such false positives are relatively rare:
   5459	 *
   5460	 *  (a) only PAE paging and nested NPT have roots without shadow pages
   5461	 *  (b) remote reloads due to a memslot update obsolete _all_ roots
   5462	 *  (c) KVM doesn't track previous roots for PAE paging, and the guest
   5463	 *      is unlikely to zap an in-use PGD.
   5464	 */
   5465	sp = to_shadow_page(root_hpa);
   5466	return !sp || is_obsolete_sp(kvm, sp);
   5467}
   5468
   5469static void __kvm_mmu_free_obsolete_roots(struct kvm *kvm, struct kvm_mmu *mmu)
   5470{
   5471	unsigned long roots_to_free = 0;
   5472	int i;
   5473
   5474	if (is_obsolete_root(kvm, mmu->root.hpa))
   5475		roots_to_free |= KVM_MMU_ROOT_CURRENT;
   5476
   5477	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
   5478		if (is_obsolete_root(kvm, mmu->prev_roots[i].hpa))
   5479			roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
   5480	}
   5481
   5482	if (roots_to_free)
   5483		kvm_mmu_free_roots(kvm, mmu, roots_to_free);
   5484}
   5485
   5486void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu)
   5487{
   5488	__kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu);
   5489	__kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu);
   5490}
   5491
   5492static bool need_remote_flush(u64 old, u64 new)
   5493{
   5494	if (!is_shadow_present_pte(old))
   5495		return false;
   5496	if (!is_shadow_present_pte(new))
   5497		return true;
   5498	if ((old ^ new) & PT64_BASE_ADDR_MASK)
   5499		return true;
   5500	old ^= shadow_nx_mask;
   5501	new ^= shadow_nx_mask;
   5502	return (old & ~new & PT64_PERM_MASK) != 0;
   5503}
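
       /*
        * Examples: changing the target pfn, dropping the present bit, or
        * revoking a permission (e.g. write) on a present SPTE all return
        * true, because remote TLBs may still hold the old, more permissive
        * translation; merely adding permissions returns false.  The
        * shadow_nx_mask XOR inverts the NX bit so that setting NX (removing
        * execute permission) is treated like dropping any other permission.
        */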
   5504
   5505static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
   5506				    int *bytes)
   5507{
   5508	u64 gentry = 0;
   5509	int r;
   5510
   5511	/*
   5512	 * Assume that the pte write is on a page table of the same type
   5513	 * as the current vcpu paging mode, since we only update sptes
   5514	 * when they have the same mode.
   5515	 */
   5516	if (is_pae(vcpu) && *bytes == 4) {
   5517		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
   5518		*gpa &= ~(gpa_t)7;
   5519		*bytes = 8;
   5520	}
   5521
   5522	if (*bytes == 4 || *bytes == 8) {
   5523		r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
   5524		if (r)
   5525			gentry = 0;
   5526	}
   5527
   5528	return gentry;
   5529}
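
       /*
        * Example: a 32-bit store to gpa 0x1004 by a guest using 8-byte gptes
        * is widened here to an aligned 8-byte read of the whole gpte at
        * 0x1000, so the caller works with a complete guest entry; writes
        * that are neither 4 nor 8 bytes simply yield gentry == 0.
        */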
   5530
   5531/*
   5532 * If we're seeing too many writes to a page, it may no longer be a page table,
   5533 * or we may be forking, in which case it is better to unmap the page.
   5534 */
   5535static bool detect_write_flooding(struct kvm_mmu_page *sp)
   5536{
   5537	/*
   5538	 * Skip write-flooding detection for sps whose level is 1, because
   5539	 * they can become unsync, and then the guest page is not write-protected.
   5540	 */
   5541	if (sp->role.level == PG_LEVEL_4K)
   5542		return false;
   5543
   5544	atomic_inc(&sp->write_flooding_count);
   5545	return atomic_read(&sp->write_flooding_count) >= 3;
   5546}
   5547
   5548/*
   5549 * Misaligned accesses are too much trouble to fix up; also, they usually
   5550 * indicate a page is not used as a page table.
   5551 */
   5552static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
   5553				    int bytes)
   5554{
   5555	unsigned offset, pte_size, misaligned;
   5556
   5557	pgprintk("misaligned: gpa %llx bytes %d role %x\n",
   5558		 gpa, bytes, sp->role.word);
   5559
   5560	offset = offset_in_page(gpa);
   5561	pte_size = sp->role.has_4_byte_gpte ? 4 : 8;
   5562
   5563	/*
   5564	 * Sometimes the OS only writes the last byte to update status bits;
   5565	 * for example, Linux uses the andb instruction in clear_bit().
   5566	 */
   5567	if (!(offset & (pte_size - 1)) && bytes == 1)
   5568		return false;
   5569
   5570	misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
   5571	misaligned |= bytes < 4;
   5572
   5573	return misaligned;
   5574}
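
       /*
        * Example, assuming 8-byte gptes: a 4-byte write at page offset 0x4
        * stays within one gpte ((0x4 ^ 0x7) & ~7 == 0) and is not considered
        * misaligned, while a 4-byte write at offset 0x6 straddles two gptes
        * ((0x6 ^ 0x9) & ~7 == 0x8) and is.  Writes shorter than 4 bytes are
        * always treated as misaligned, except for the aligned 1-byte
        * status-bit update special-cased above.
        */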
   5575
   5576static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
   5577{
   5578	unsigned page_offset, quadrant;
   5579	u64 *spte;
   5580	int level;
   5581
   5582	page_offset = offset_in_page(gpa);
   5583	level = sp->role.level;
   5584	*nspte = 1;
   5585	if (sp->role.has_4_byte_gpte) {
   5586		page_offset <<= 1;	/* 32->64 */
   5587		/*
   5588		 * A 32-bit pde maps 4MB while the shadow pdes map
   5589		 * only 2MB.  So we need to double the offset again
   5590		 * and zap two pdes instead of one.
   5591		 */
   5592		if (level == PT32_ROOT_LEVEL) {
   5593			page_offset &= ~7; /* kill rounding error */
   5594			page_offset <<= 1;
   5595			*nspte = 2;
   5596		}
   5597		quadrant = page_offset >> PAGE_SHIFT;
   5598		page_offset &= ~PAGE_MASK;
   5599		if (quadrant != sp->role.quadrant)
   5600			return NULL;
   5601	}
   5602
   5603	spte = &sp->spt[page_offset / sizeof(*spte)];
   5604	return spte;
   5605}
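
       /*
        * Example: with 4-byte gptes a single guest page table page is
        * shadowed by two shadow pages (four at PT32_ROOT_LEVEL, where each
        * gpde additionally expands into two spdes), so the offset is
        * rescaled to 8-byte strides and the resulting quadrant must match
        * sp->role.quadrant for the write to touch this particular sp.
        */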
   5606
   5607static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
   5608			      const u8 *new, int bytes,
   5609			      struct kvm_page_track_notifier_node *node)
   5610{
   5611	gfn_t gfn = gpa >> PAGE_SHIFT;
   5612	struct kvm_mmu_page *sp;
   5613	LIST_HEAD(invalid_list);
   5614	u64 entry, gentry, *spte;
   5615	int npte;
   5616	bool flush = false;
   5617
   5618	/*
   5619	 * If we don't have indirect shadow pages, it means no page is
   5620	 * write-protected, so we can simply exit.
   5621	 */
   5622	if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
   5623		return;
   5624
   5625	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
   5626
   5627	/*
   5628	 * No need to care whether the memory allocation succeeds, since
   5629	 * pte prefetch is skipped if the cache does not have enough
   5630	 * objects.
   5631	 */
   5632	mmu_topup_memory_caches(vcpu, true);
   5633
   5634	write_lock(&vcpu->kvm->mmu_lock);
   5635
   5636	gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
   5637
   5638	++vcpu->kvm->stat.mmu_pte_write;
   5639
   5640	for_each_gfn_valid_sp_with_gptes(vcpu->kvm, sp, gfn) {
   5641		if (detect_write_misaligned(sp, gpa, bytes) ||
   5642		      detect_write_flooding(sp)) {
   5643			kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
   5644			++vcpu->kvm->stat.mmu_flooded;
   5645			continue;
   5646		}
   5647
   5648		spte = get_written_sptes(sp, gpa, &npte);
   5649		if (!spte)
   5650			continue;
   5651
   5652		while (npte--) {
   5653			entry = *spte;
   5654			mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL);
   5655			if (gentry && sp->role.level != PG_LEVEL_4K)
   5656				++vcpu->kvm->stat.mmu_pde_zapped;
   5657			if (need_remote_flush(entry, *spte))
   5658				flush = true;
   5659			++spte;
   5660		}
   5661	}
   5662	kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
   5663	write_unlock(&vcpu->kvm->mmu_lock);
   5664}
   5665
   5666int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
   5667		       void *insn, int insn_len)
   5668{
   5669	int r, emulation_type = EMULTYPE_PF;
   5670	bool direct = vcpu->arch.mmu->root_role.direct;
   5671
   5672	if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
   5673		return RET_PF_RETRY;
   5674
   5675	r = RET_PF_INVALID;
   5676	if (unlikely(error_code & PFERR_RSVD_MASK)) {
   5677		r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
   5678		if (r == RET_PF_EMULATE)
   5679			goto emulate;
   5680	}
   5681
   5682	if (r == RET_PF_INVALID) {
   5683		r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
   5684					  lower_32_bits(error_code), false);
   5685		if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
   5686			return -EIO;
   5687	}
   5688
   5689	if (r < 0)
   5690		return r;
   5691	if (r != RET_PF_EMULATE)
   5692		return 1;
   5693
   5694	/*
   5695	 * Before emulating the instruction, check if the error code
   5696	 * was due to a RO violation while translating the guest page.
   5697	 * This can occur when using nested virtualization with nested
   5698	 * paging in both guests. If true, we simply unprotect the page
   5699	 * and resume the guest.
   5700	 */
   5701	if (vcpu->arch.mmu->root_role.direct &&
   5702	    (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
   5703		kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
   5704		return 1;
   5705	}
   5706
   5707	/*
   5708	 * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
   5709	 * optimistically try to just unprotect the page and let the processor
   5710	 * re-execute the instruction that caused the page fault.  Do not allow
   5711	 * retrying MMIO emulation, as it's not only pointless but could also
   5712	 * cause us to enter an infinite loop because the processor will keep
   5713	 * faulting on the non-existent MMIO address.  Retrying an instruction
   5714	 * from a nested guest is also pointless and dangerous as we are only
   5715	 * explicitly shadowing L1's page tables, i.e. unprotecting something
   5716	 * for L1 isn't going to magically fix whatever issue caused L2 to fail.
   5717	 */
   5718	if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
   5719		emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
   5720emulate:
   5721	return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
   5722				       insn_len);
   5723}
   5724EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
   5725
   5726void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
   5727			    gva_t gva, hpa_t root_hpa)
   5728{
   5729	int i;
   5730
   5731	/* It's actually a GPA for vcpu->arch.guest_mmu.  */
   5732	if (mmu != &vcpu->arch.guest_mmu) {
   5733		/* INVLPG on a non-canonical address is a NOP according to the SDM.  */
   5734		if (is_noncanonical_address(gva, vcpu))
   5735			return;
   5736
   5737		static_call(kvm_x86_flush_tlb_gva)(vcpu, gva);
   5738	}
   5739
   5740	if (!mmu->invlpg)
   5741		return;
   5742
   5743	if (root_hpa == INVALID_PAGE) {
   5744		mmu->invlpg(vcpu, gva, mmu->root.hpa);
   5745
   5746		/*
   5747		 * INVLPG is required to invalidate any global mappings for the VA,
   5748		 * irrespective of PCID. Since it would take roughly the same amount
   5749		 * of work to determine whether any of the prev_root mappings of the
   5750		 * VA is marked global as to just sync it blindly, we might as well
   5751		 * always sync it.
   5752		 *
   5753		 * Mappings not reachable via the current cr3 or the prev_roots will be
   5754		 * synced when switching to that cr3, so nothing needs to be done here
   5755		 * for them.
   5756		 */
   5757		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
   5758			if (VALID_PAGE(mmu->prev_roots[i].hpa))
   5759				mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
   5760	} else {
   5761		mmu->invlpg(vcpu, gva, root_hpa);
   5762	}
   5763}
   5764
   5765void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
   5766{
   5767	kvm_mmu_invalidate_gva(vcpu, vcpu->arch.walk_mmu, gva, INVALID_PAGE);
   5768	++vcpu->stat.invlpg;
   5769}
   5770EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
   5771
   5772
   5773void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
   5774{
   5775	struct kvm_mmu *mmu = vcpu->arch.mmu;
   5776	bool tlb_flush = false;
   5777	uint i;
   5778
   5779	if (pcid == kvm_get_active_pcid(vcpu)) {
   5780		if (mmu->invlpg)
   5781			mmu->invlpg(vcpu, gva, mmu->root.hpa);
   5782		tlb_flush = true;
   5783	}
   5784
   5785	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
   5786		if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
   5787		    pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) {
   5788			if (mmu->invlpg)
   5789				mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
   5790			tlb_flush = true;
   5791		}
   5792	}
   5793
   5794	if (tlb_flush)
   5795		static_call(kvm_x86_flush_tlb_gva)(vcpu, gva);
   5796
   5797	++vcpu->stat.invlpg;
   5798
   5799	/*
   5800	 * Mappings not reachable via the current cr3 or the prev_roots will be
   5801	 * synced when switching to that cr3, so nothing needs to be done here
   5802	 * for them.
   5803	 */
   5804}
   5805
   5806void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
   5807		       int tdp_max_root_level, int tdp_huge_page_level)
   5808{
   5809	tdp_enabled = enable_tdp;
   5810	tdp_root_level = tdp_forced_root_level;
   5811	max_tdp_level = tdp_max_root_level;
   5812
   5813	/*
   5814	 * max_huge_page_level reflects KVM's MMU capabilities irrespective
   5815	 * of kernel support, e.g. KVM may be capable of using 1GB pages when
   5816	 * the kernel is not.  But, KVM never creates a page size greater than
   5817	 * what is used by the kernel for any given HVA, i.e. the kernel's
   5818	 * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust().
   5819	 */
   5820	if (tdp_enabled)
   5821		max_huge_page_level = tdp_huge_page_level;
   5822	else if (boot_cpu_has(X86_FEATURE_GBPAGES))
   5823		max_huge_page_level = PG_LEVEL_1G;
   5824	else
   5825		max_huge_page_level = PG_LEVEL_2M;
   5826}
   5827EXPORT_SYMBOL_GPL(kvm_configure_mmu);
   5828
   5829/* The return value indicates if tlb flush on all vcpus is needed. */
   5830typedef bool (*slot_level_handler) (struct kvm *kvm,
   5831				    struct kvm_rmap_head *rmap_head,
   5832				    const struct kvm_memory_slot *slot);
   5833
   5834/* The caller should hold mmu-lock before calling this function. */
   5835static __always_inline bool
   5836slot_handle_level_range(struct kvm *kvm, const struct kvm_memory_slot *memslot,
   5837			slot_level_handler fn, int start_level, int end_level,
   5838			gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield,
   5839			bool flush)
   5840{
   5841	struct slot_rmap_walk_iterator iterator;
   5842
   5843	for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
   5844			end_gfn, &iterator) {
   5845		if (iterator.rmap)
   5846			flush |= fn(kvm, iterator.rmap, memslot);
   5847
   5848		if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
   5849			if (flush && flush_on_yield) {
   5850				kvm_flush_remote_tlbs_with_address(kvm,
   5851						start_gfn,
   5852						iterator.gfn - start_gfn + 1);
   5853				flush = false;
   5854			}
   5855			cond_resched_rwlock_write(&kvm->mmu_lock);
   5856		}
   5857	}
   5858
   5859	return flush;
   5860}
   5861
   5862static __always_inline bool
   5863slot_handle_level(struct kvm *kvm, const struct kvm_memory_slot *memslot,
   5864		  slot_level_handler fn, int start_level, int end_level,
   5865		  bool flush_on_yield)
   5866{
   5867	return slot_handle_level_range(kvm, memslot, fn, start_level,
   5868			end_level, memslot->base_gfn,
   5869			memslot->base_gfn + memslot->npages - 1,
   5870			flush_on_yield, false);
   5871}
   5872
   5873static __always_inline bool
   5874slot_handle_level_4k(struct kvm *kvm, const struct kvm_memory_slot *memslot,
   5875		     slot_level_handler fn, bool flush_on_yield)
   5876{
   5877	return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
   5878				 PG_LEVEL_4K, flush_on_yield);
   5879}
   5880
   5881static void free_mmu_pages(struct kvm_mmu *mmu)
   5882{
   5883	if (!tdp_enabled && mmu->pae_root)
   5884		set_memory_encrypted((unsigned long)mmu->pae_root, 1);
   5885	free_page((unsigned long)mmu->pae_root);
   5886	free_page((unsigned long)mmu->pml4_root);
   5887	free_page((unsigned long)mmu->pml5_root);
   5888}
   5889
   5890static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
   5891{
   5892	struct page *page;
   5893	int i;
   5894
   5895	mmu->root.hpa = INVALID_PAGE;
   5896	mmu->root.pgd = 0;
   5897	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
   5898		mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
   5899
   5900	/* vcpu->arch.guest_mmu isn't used when !tdp_enabled. */
   5901	if (!tdp_enabled && mmu == &vcpu->arch.guest_mmu)
   5902		return 0;
   5903
   5904	/*
   5905	 * When using PAE paging, the four PDPTEs are treated as 'root' pages,
   5906	 * while the PDP table is a per-vCPU construct that's allocated at MMU
   5907	 * creation.  When emulating 32-bit mode, cr3 is only 32 bits even on
   5908	 * x86_64.  Therefore we need to allocate the PDP table in the first
   5909	 * 4GB of memory, which happens to fit the DMA32 zone.  TDP paging
   5910	 * generally doesn't use PAE paging and can skip allocating the PDP
   5911	 * table.  The main exception, handled here, is SVM's 32-bit NPT.  The
   5912	 * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit
   5913	 * KVM; that horror is handled on-demand by mmu_alloc_special_roots().
   5914	 */
   5915	if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
   5916		return 0;
   5917
   5918	page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
   5919	if (!page)
   5920		return -ENOMEM;
   5921
   5922	mmu->pae_root = page_address(page);
   5923
   5924	/*
   5925	 * CR3 is only 32 bits when PAE paging is used, thus it's impossible to
   5926	 * get the CPU to treat the PDPTEs as encrypted.  Decrypt the page so
   5927	 * that KVM's writes and the CPU's reads get along.  Note, this is
   5928	 * only necessary when using shadow paging, as 64-bit NPT can get at
   5929	 * the C-bit even when shadowing 32-bit NPT, and SME isn't supported
   5930	 * by 32-bit kernels (when KVM itself uses 32-bit NPT).
   5931	 */
   5932	if (!tdp_enabled)
   5933		set_memory_decrypted((unsigned long)mmu->pae_root, 1);
   5934	else
   5935		WARN_ON_ONCE(shadow_me_value);
   5936
   5937	for (i = 0; i < 4; ++i)
   5938		mmu->pae_root[i] = INVALID_PAE_ROOT;
   5939
   5940	return 0;
   5941}
   5942
   5943int kvm_mmu_create(struct kvm_vcpu *vcpu)
   5944{
   5945	int ret;
   5946
   5947	vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache;
   5948	vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO;
   5949
   5950	vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
   5951	vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
   5952
   5953	vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
   5954
   5955	vcpu->arch.mmu = &vcpu->arch.root_mmu;
   5956	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
   5957
   5958	ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu);
   5959	if (ret)
   5960		return ret;
   5961
   5962	ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu);
   5963	if (ret)
   5964		goto fail_allocate_root;
   5965
   5966	return ret;
   5967 fail_allocate_root:
   5968	free_mmu_pages(&vcpu->arch.guest_mmu);
   5969	return ret;
   5970}
   5971
   5972#define BATCH_ZAP_PAGES	10
   5973static void kvm_zap_obsolete_pages(struct kvm *kvm)
   5974{
   5975	struct kvm_mmu_page *sp, *node;
   5976	int nr_zapped, batch = 0;
   5977	bool unstable;
   5978
   5979restart:
   5980	list_for_each_entry_safe_reverse(sp, node,
   5981	      &kvm->arch.active_mmu_pages, link) {
   5982		/*
   5983		 * No obsolete valid page exists before a newly created page
   5984		 * since active_mmu_pages is a FIFO list.
   5985		 */
   5986		if (!is_obsolete_sp(kvm, sp))
   5987			break;
   5988
   5989		/*
   5990		 * Invalid pages should never land back on the list of active
   5991		 * pages.  Skip the bogus page, otherwise we'll get stuck in an
   5992		 * infinite loop if the page gets put back on the list (again).
   5993		 */
   5994		if (WARN_ON(sp->role.invalid))
   5995			continue;
   5996
   5997		/*
   5998		 * No need to flush the TLB since we're only zapping shadow
   5999		 * pages with an obsolete generation number and all vCPUs have
   6000		 * loaded a new root, i.e. the shadow pages being zapped cannot
   6001		 * be in active use by the guest.
   6002		 */
   6003		if (batch >= BATCH_ZAP_PAGES &&
   6004		    cond_resched_rwlock_write(&kvm->mmu_lock)) {
   6005			batch = 0;
   6006			goto restart;
   6007		}
   6008
   6009		unstable = __kvm_mmu_prepare_zap_page(kvm, sp,
   6010				&kvm->arch.zapped_obsolete_pages, &nr_zapped);
   6011		batch += nr_zapped;
   6012
   6013		if (unstable)
   6014			goto restart;
   6015	}
   6016
   6017	/*
   6018	 * Kick all vCPUs (via remote TLB flush) before freeing the page tables
   6019	 * to ensure KVM is not in the middle of a lockless shadow page table
   6020	 * walk, which may reference the pages.  The remote TLB flush itself is
   6021	 * not required and is simply a convenient way to kick vCPUs as needed.
   6022	 * KVM performs a local TLB flush when allocating a new root (see
   6023	 * kvm_mmu_load()), and the reload in the caller ensures no vCPUs are
   6024	 * running with an obsolete MMU.
   6025	 */
   6026	kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
   6027}
   6028
   6029/*
   6030 * Fast invalidate all shadow pages and use lock-break technique
   6031 * to zap obsolete pages.
   6032 *
   6033 * It's required when a memslot is being deleted or the VM is being
   6034 * destroyed; in these cases, we must ensure that the KVM MMU does
   6035 * not use any resource of the slot being deleted (or of any slot)
   6036 * after this function returns.
   6037 */
   6038static void kvm_mmu_zap_all_fast(struct kvm *kvm)
   6039{
   6040	lockdep_assert_held(&kvm->slots_lock);
   6041
   6042	write_lock(&kvm->mmu_lock);
   6043	trace_kvm_mmu_zap_all_fast(kvm);
   6044
   6045	/*
   6046	 * Toggle mmu_valid_gen between '0' and '1'.  Because slots_lock is
   6047	 * held for the entire duration of zapping obsolete pages, it's
   6048	 * impossible for there to be multiple invalid generations associated
   6049	 * with *valid* shadow pages at any given time, i.e. there is exactly
   6050	 * one valid generation and (at most) one invalid generation.
   6051	 */
   6052	kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
   6053
   6054	/*
   6055	 * In order to ensure all vCPUs drop their soon-to-be invalid roots,
   6056	 * invalidating TDP MMU roots must be done while holding mmu_lock for
   6057	 * write and in the same critical section as making the reload request,
   6058	 * e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield.
   6059	 */
   6060	if (is_tdp_mmu_enabled(kvm))
   6061		kvm_tdp_mmu_invalidate_all_roots(kvm);
   6062
   6063	/*
   6064	 * Notify all vcpus to reload their shadow page tables and flush their
   6065	 * TLBs. All vcpus will then switch to new shadow page tables with the new
   6066	 * mmu_valid_gen.
   6067	 *
   6068	 * Note: we need to do this under the protection of mmu_lock,
   6069	 * otherwise a vcpu could purge its shadow pages but miss the TLB flush.
   6070	 */
   6071	kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
   6072
   6073	kvm_zap_obsolete_pages(kvm);
   6074
   6075	write_unlock(&kvm->mmu_lock);
   6076
   6077	/*
   6078	 * Zap the invalidated TDP MMU roots: all SPTEs must be dropped before
   6079	 * returning to the caller. E.g. if the zap is in response to a memslot
   6080	 * deletion, mmu_notifier callbacks will be unable to reach the SPTEs
   6081	 * associated with the deleted memslot once the update completes, and
   6082	 * deferring the zap until the final reference to the root is put would
   6083	 * lead to a use-after-free.
   6084	 */
   6085	if (is_tdp_mmu_enabled(kvm))
   6086		kvm_tdp_mmu_zap_invalidated_roots(kvm);
   6087}
   6088
   6089static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
   6090{
   6091	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
   6092}
   6093
   6094static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
   6095			struct kvm_memory_slot *slot,
   6096			struct kvm_page_track_notifier_node *node)
   6097{
   6098	kvm_mmu_zap_all_fast(kvm);
   6099}
   6100
   6101int kvm_mmu_init_vm(struct kvm *kvm)
   6102{
   6103	struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
   6104	int r;
   6105
   6106	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
   6107	INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
   6108	INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
   6109	spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
   6110
   6111	r = kvm_mmu_init_tdp_mmu(kvm);
   6112	if (r < 0)
   6113		return r;
   6114
   6115	node->track_write = kvm_mmu_pte_write;
   6116	node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
   6117	kvm_page_track_register_notifier(kvm, node);
   6118	return 0;
   6119}
   6120
   6121void kvm_mmu_uninit_vm(struct kvm *kvm)
   6122{
   6123	struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
   6124
   6125	kvm_page_track_unregister_notifier(kvm, node);
   6126
   6127	kvm_mmu_uninit_tdp_mmu(kvm);
   6128}
   6129
   6130static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
   6131{
   6132	const struct kvm_memory_slot *memslot;
   6133	struct kvm_memslots *slots;
   6134	struct kvm_memslot_iter iter;
   6135	bool flush = false;
   6136	gfn_t start, end;
   6137	int i;
   6138
   6139	if (!kvm_memslots_have_rmaps(kvm))
   6140		return flush;
   6141
   6142	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
   6143		slots = __kvm_memslots(kvm, i);
   6144
   6145		kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) {
   6146			memslot = iter.slot;
   6147			start = max(gfn_start, memslot->base_gfn);
   6148			end = min(gfn_end, memslot->base_gfn + memslot->npages);
   6149			if (WARN_ON_ONCE(start >= end))
   6150				continue;
   6151
   6152			flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
   6154							PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
   6155							start, end - 1, true, flush);
   6156		}
   6157	}
   6158
   6159	return flush;
   6160}
   6161
   6162/*
   6163 * Invalidate (zap) SPTEs that cover GFNs from gfn_start and up to gfn_end
   6164 * (not including it)
   6165 */
   6166void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
   6167{
   6168	bool flush;
   6169	int i;
   6170
   6171	if (WARN_ON_ONCE(gfn_end <= gfn_start))
   6172		return;
   6173
   6174	write_lock(&kvm->mmu_lock);
   6175
   6176	kvm_inc_notifier_count(kvm, gfn_start, gfn_end);
   6177
   6178	flush = __kvm_zap_rmaps(kvm, gfn_start, gfn_end);
   6179
   6180	if (is_tdp_mmu_enabled(kvm)) {
   6181		for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
   6182			flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start,
   6183						      gfn_end, true, flush);
   6184	}
   6185
   6186	if (flush)
   6187		kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
   6188						   gfn_end - gfn_start);
   6189
   6190	kvm_dec_notifier_count(kvm, gfn_start, gfn_end);
   6191
   6192	write_unlock(&kvm->mmu_lock);
   6193}
   6194
   6195static bool slot_rmap_write_protect(struct kvm *kvm,
   6196				    struct kvm_rmap_head *rmap_head,
   6197				    const struct kvm_memory_slot *slot)
   6198{
   6199	return rmap_write_protect(rmap_head, false);
   6200}
   6201
   6202void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
   6203				      const struct kvm_memory_slot *memslot,
   6204				      int start_level)
   6205{
   6206	bool flush = false;
   6207
   6208	if (kvm_memslots_have_rmaps(kvm)) {
   6209		write_lock(&kvm->mmu_lock);
   6210		flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
   6211					  start_level, KVM_MAX_HUGEPAGE_LEVEL,
   6212					  false);
   6213		write_unlock(&kvm->mmu_lock);
   6214	}
   6215
   6216	if (is_tdp_mmu_enabled(kvm)) {
   6217		read_lock(&kvm->mmu_lock);
   6218		flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);
   6219		read_unlock(&kvm->mmu_lock);
   6220	}
   6221
   6222	/*
   6223	 * Flush TLBs if any SPTEs had to be write-protected to ensure that
   6224	 * guest writes are reflected in the dirty bitmap before the memslot
   6225	 * update completes, i.e. before enabling dirty logging is visible to
   6226	 * userspace.
   6227	 *
   6228	 * Perform the TLB flush outside the mmu_lock to reduce the amount of
   6229	 * time the lock is held. However, this does mean that another CPU can
   6230	 * now grab mmu_lock and encounter a write-protected SPTE while CPUs
   6231	 * still have a writable mapping for the associated GFN in their TLB.
   6232	 *
   6233	 * This is safe but requires KVM to be careful when making decisions
   6234	 * based on the write-protection status of an SPTE. Specifically, KVM
   6235	 * also write-protects SPTEs to monitor changes to guest page tables
   6236	 * during shadow paging, and must guarantee no CPUs can write to those
   6237	 * during shadow paging, and must guarantee no CPUs can write to those
   6238	 * pages before the lock is dropped. As mentioned in the previous
   6239	 * paragraph, a write-protected SPTE is no guarantee that the CPU cannot
   6240	 * will clear a separate software-only bit (MMU-writable) and skip the
   6241	 * flush if-and-only-if this bit was already clear.
   6242	 *
   6243	 * See is_writable_pte() for more details.
   6244	 */
   6245	if (flush)
   6246		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
   6247}
   6248
   6249/* Must be called with the mmu_lock held in write-mode. */
   6250void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
   6251				   const struct kvm_memory_slot *memslot,
   6252				   u64 start, u64 end,
   6253				   int target_level)
   6254{
   6255	if (is_tdp_mmu_enabled(kvm))
   6256		kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end,
   6257						 target_level, false);
   6258
   6259	/*
   6260	 * A TLB flush is unnecessary at this point for the same reasons as in
   6261	 * kvm_mmu_slot_try_split_huge_pages().
   6262	 */
   6263}
   6264
   6265void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
   6266					const struct kvm_memory_slot *memslot,
   6267					int target_level)
   6268{
   6269	u64 start = memslot->base_gfn;
   6270	u64 end = start + memslot->npages;
   6271
   6272	if (is_tdp_mmu_enabled(kvm)) {
   6273		read_lock(&kvm->mmu_lock);
   6274		kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true);
   6275		read_unlock(&kvm->mmu_lock);
   6276	}
   6277
   6278	/*
   6279	 * No TLB flush is necessary here. KVM will flush TLBs after
   6280	 * write-protecting and/or clearing dirty on the newly split SPTEs to
   6281	 * ensure that guest writes are reflected in the dirty log before the
   6282	 * ioctl to enable dirty logging on this memslot completes. Since the
   6283	 * split SPTEs retain the write and dirty bits of the huge SPTE, it is
   6284	 * safe for KVM to decide if a TLB flush is necessary based on the split
   6285	 * SPTEs.
   6286	 */
   6287}
   6288
   6289static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
   6290					 struct kvm_rmap_head *rmap_head,
   6291					 const struct kvm_memory_slot *slot)
   6292{
   6293	u64 *sptep;
   6294	struct rmap_iterator iter;
   6295	int need_tlb_flush = 0;
   6296	kvm_pfn_t pfn;
   6297	struct kvm_mmu_page *sp;
   6298
   6299restart:
   6300	for_each_rmap_spte(rmap_head, &iter, sptep) {
   6301		sp = sptep_to_sp(sptep);
   6302		pfn = spte_to_pfn(*sptep);
   6303
   6304		/*
   6305		 * We cannot do huge page mapping for indirect shadow pages,
   6306		 * which are found on the last rmap (level = 1) when not using
   6307		 * tdp; such shadow pages are synced with the guest page table,
   6308		 * and the guest page table uses a 4K page size mapping if the
   6309		 * indirect sp has level = 1.
   6310		 */
   6311		if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
   6312		    sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
   6313							       pfn, PG_LEVEL_NUM)) {
   6314			pte_list_remove(kvm, rmap_head, sptep);
   6315
   6316			if (kvm_available_flush_tlb_with_range())
   6317				kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
   6318					KVM_PAGES_PER_HPAGE(sp->role.level));
   6319			else
   6320				need_tlb_flush = 1;
   6321
   6322			goto restart;
   6323		}
   6324	}
   6325
   6326	return need_tlb_flush;
   6327}
   6328EXPORT_SYMBOL_GPL(kvm_zap_gfn_range);
   6329
   6330void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
   6331				   const struct kvm_memory_slot *slot)
   6332{
   6333	if (kvm_memslots_have_rmaps(kvm)) {
   6334		write_lock(&kvm->mmu_lock);
   6335		/*
   6336		 * Zap only 4k SPTEs since the legacy MMU only supports dirty
   6337		 * logging at a 4k granularity and never creates collapsible
   6338		 * 2m SPTEs during dirty logging.
   6339		 */
   6340		if (slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true))
   6341			kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
   6342		write_unlock(&kvm->mmu_lock);
   6343	}
   6344
   6345	if (is_tdp_mmu_enabled(kvm)) {
   6346		read_lock(&kvm->mmu_lock);
   6347		kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
   6348		read_unlock(&kvm->mmu_lock);
   6349	}
   6350}
   6351
   6352void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
   6353					const struct kvm_memory_slot *memslot)
   6354{
   6355	/*
   6356	 * All current use cases for flushing the TLBs for a specific memslot
   6357	 * are related to dirty logging, and many do the TLB flush out of mmu_lock.
   6358	 * The interaction between the various operations on the memslot must be
   6359	 * serialized by slots_lock to ensure the TLB flush from one operation
   6360	 * is observed by any other operation on the same memslot.
   6361	 */
   6362	lockdep_assert_held(&kvm->slots_lock);
   6363	kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
   6364					   memslot->npages);
   6365}
   6366
   6367void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
   6368				   const struct kvm_memory_slot *memslot)
   6369{
   6370	bool flush = false;
   6371
   6372	if (kvm_memslots_have_rmaps(kvm)) {
   6373		write_lock(&kvm->mmu_lock);
   6374		/*
   6375		 * Clear dirty bits only on 4k SPTEs since the legacy MMU only
   6376		 * supports dirty logging at a 4k granularity.
   6377		 */
   6378		flush = slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false);
   6379		write_unlock(&kvm->mmu_lock);
   6380	}
   6381
   6382	if (is_tdp_mmu_enabled(kvm)) {
   6383		read_lock(&kvm->mmu_lock);
   6384		flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
   6385		read_unlock(&kvm->mmu_lock);
   6386	}
   6387
   6388	/*
   6389	 * It's also safe to flush TLBs out of mmu lock here as currently this
   6390	 * function is only used for dirty logging, in which case flushing TLB
   6391	 * out of mmu lock also guarantees no dirty pages will be lost in
   6392	 * dirty_bitmap.
   6393	 */
   6394	if (flush)
   6395		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
   6396}
   6397
   6398void kvm_mmu_zap_all(struct kvm *kvm)
   6399{
   6400	struct kvm_mmu_page *sp, *node;
   6401	LIST_HEAD(invalid_list);
   6402	int ign;
   6403
   6404	write_lock(&kvm->mmu_lock);
   6405restart:
   6406	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
   6407		if (WARN_ON(sp->role.invalid))
   6408			continue;
   6409		if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
   6410			goto restart;
   6411		if (cond_resched_rwlock_write(&kvm->mmu_lock))
   6412			goto restart;
   6413	}
   6414
   6415	kvm_mmu_commit_zap_page(kvm, &invalid_list);
   6416
   6417	if (is_tdp_mmu_enabled(kvm))
   6418		kvm_tdp_mmu_zap_all(kvm);
   6419
   6420	write_unlock(&kvm->mmu_lock);
   6421}
   6422
   6423void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
   6424{
   6425	WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
   6426
   6427	gen &= MMIO_SPTE_GEN_MASK;
   6428
   6429	/*
   6430	 * Generation numbers are incremented in multiples of the number of
   6431	 * address spaces in order to provide unique generations across all
   6432	 * address spaces.  Strip what is effectively the address space
   6433	 * modifier prior to checking for a wrap of the MMIO generation so
   6434	 * that a wrap in any address space is detected.
   6435	 */
   6436	gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
   6437
   6438	/*
   6439	 * The very rare case: if the MMIO generation number has wrapped,
   6440	 * zap all shadow pages.
   6441	 */
   6442	if (unlikely(gen == 0)) {
   6443		kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
   6444		kvm_mmu_zap_all_fast(kvm);
   6445	}
   6446}
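
       /*
        * Example: with KVM_ADDRESS_SPACE_NUM == 2 (regular + SMM on x86) the
        * low generation bit acts as the address-space modifier and is masked
        * off above, so a wrap in either address space lands on gen == 0 and
        * triggers the fast zap.
        */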
   6447
   6448static unsigned long
   6449mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
   6450{
   6451	struct kvm *kvm;
   6452	int nr_to_scan = sc->nr_to_scan;
   6453	unsigned long freed = 0;
   6454
   6455	mutex_lock(&kvm_lock);
   6456
   6457	list_for_each_entry(kvm, &vm_list, vm_list) {
   6458		int idx;
   6459		LIST_HEAD(invalid_list);
   6460
   6461		/*
   6462		 * Never scan more than sc->nr_to_scan VM instances.
   6463		 * In practice we will not hit this condition, since we do not try
   6464		 * to shrink more than one VM and it is very unlikely to see
   6465		 * !n_used_mmu_pages so many times.
   6466		 */
   6467		if (!nr_to_scan--)
   6468			break;
   6469		/*
   6470		 * n_used_mmu_pages is accessed without holding kvm->mmu_lock
   6471		 * here. We may skip a VM instance erroneously, but we do not
   6472		 * want to shrink a VM that only started to populate its MMU
   6473		 * anyway.
   6474		 */
   6475		if (!kvm->arch.n_used_mmu_pages &&
   6476		    !kvm_has_zapped_obsolete_pages(kvm))
   6477			continue;
   6478
   6479		idx = srcu_read_lock(&kvm->srcu);
   6480		write_lock(&kvm->mmu_lock);
   6481
   6482		if (kvm_has_zapped_obsolete_pages(kvm)) {
   6483			kvm_mmu_commit_zap_page(kvm,
   6484			      &kvm->arch.zapped_obsolete_pages);
   6485			goto unlock;
   6486		}
   6487
   6488		freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
   6489
   6490unlock:
   6491		write_unlock(&kvm->mmu_lock);
   6492		srcu_read_unlock(&kvm->srcu, idx);
   6493
   6494		/*
   6495		 * unfair on small ones
   6496		 * per-vm shrinkers cry out
   6497		 * sadness comes quickly
   6498		 */
   6499		list_move_tail(&kvm->vm_list, &vm_list);
   6500		break;
   6501	}
   6502
   6503	mutex_unlock(&kvm_lock);
   6504	return freed;
   6505}
   6506
   6507static unsigned long
   6508mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
   6509{
   6510	return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
   6511}
   6512
   6513static struct shrinker mmu_shrinker = {
   6514	.count_objects = mmu_shrink_count,
   6515	.scan_objects = mmu_shrink_scan,
   6516	.seeks = DEFAULT_SEEKS * 10,
   6517};
   6518
   6519static void mmu_destroy_caches(void)
   6520{
   6521	kmem_cache_destroy(pte_list_desc_cache);
   6522	kmem_cache_destroy(mmu_page_header_cache);
   6523}
   6524
   6525static bool get_nx_auto_mode(void)
   6526{
   6527	/* Return true when CPU has the bug, and mitigations are ON */
   6528	return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
   6529}
   6530
   6531static void __set_nx_huge_pages(bool val)
   6532{
   6533	nx_huge_pages = itlb_multihit_kvm_mitigation = val;
   6534}
   6535
   6536static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
   6537{
   6538	bool old_val = nx_huge_pages;
   6539	bool new_val;
   6540
   6541	/* In "auto" mode deploy workaround only if CPU has the bug. */
   6542	if (sysfs_streq(val, "off"))
   6543		new_val = 0;
   6544	else if (sysfs_streq(val, "force"))
   6545		new_val = 1;
   6546	else if (sysfs_streq(val, "auto"))
   6547		new_val = get_nx_auto_mode();
   6548	else if (strtobool(val, &new_val) < 0)
   6549		return -EINVAL;
   6550
   6551	__set_nx_huge_pages(new_val);
   6552
   6553	if (new_val != old_val) {
   6554		struct kvm *kvm;
   6555
   6556		mutex_lock(&kvm_lock);
   6557
   6558		list_for_each_entry(kvm, &vm_list, vm_list) {
   6559			mutex_lock(&kvm->slots_lock);
   6560			kvm_mmu_zap_all_fast(kvm);
   6561			mutex_unlock(&kvm->slots_lock);
   6562
   6563			wake_up_process(kvm->arch.nx_lpage_recovery_thread);
   6564		}
   6565		mutex_unlock(&kvm_lock);
   6566	}
   6567
   6568	return 0;
   6569}
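/*
 * Usage example (assuming kvm.ko is loaded and the usual module parameter
 * sysfs layout): the mitigation can be toggled at runtime with, e.g.,
 *
 *	echo force > /sys/module/kvm/parameters/nx_huge_pages
 *
 * Any change of the effective value zaps all shadow pages in every VM and
 * wakes each VM's NX recovery thread, as implemented above.
 */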
   6570
   6571/*
   6572 * nx_huge_pages needs to be resolved to true/false when kvm.ko is loaded, as
   6573 * its default value of -1 is technically undefined behavior for a boolean.
   6574 * Forward the module init call to SPTE code so that it too can handle module
   6575 * params that need to be resolved/snapshot.
   6576 */
   6577void __init kvm_mmu_x86_module_init(void)
   6578{
   6579	if (nx_huge_pages == -1)
   6580		__set_nx_huge_pages(get_nx_auto_mode());
   6581
   6582	kvm_mmu_spte_module_init();
   6583}
   6584
   6585/*
   6586 * The bulk of the MMU initialization is deferred until the vendor module is
   6587 * loaded as many of the masks/values may be modified by VMX or SVM, i.e. need
   6588 * to be reset when a potentially different vendor module is loaded.
   6589 */
   6590int kvm_mmu_vendor_module_init(void)
   6591{
   6592	int ret = -ENOMEM;
   6593
   6594	/*
    6595	 * MMU roles use union aliasing which is, generally speaking, undefined
    6596	 * behavior. However, we supposedly know how compilers behave and the
    6597	 * current status quo is unlikely to change. The BUILD_BUG_ON() guards
    6598	 * below are supposed to let us know if that assumption ever becomes false.
   6599	 */
   6600	BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
   6601	BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
   6602	BUILD_BUG_ON(sizeof(union kvm_cpu_role) != sizeof(u64));
   6603
   6604	kvm_mmu_reset_all_pte_masks();
   6605
   6606	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
   6607					    sizeof(struct pte_list_desc),
   6608					    0, SLAB_ACCOUNT, NULL);
   6609	if (!pte_list_desc_cache)
   6610		goto out;
   6611
   6612	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
   6613						  sizeof(struct kvm_mmu_page),
   6614						  0, SLAB_ACCOUNT, NULL);
   6615	if (!mmu_page_header_cache)
   6616		goto out;
   6617
   6618	if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
   6619		goto out;
   6620
   6621	ret = register_shrinker(&mmu_shrinker);
   6622	if (ret)
   6623		goto out;
   6624
   6625	return 0;
   6626
   6627out:
   6628	mmu_destroy_caches();
   6629	return ret;
   6630}
   6631
   6632void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
   6633{
   6634	kvm_mmu_unload(vcpu);
   6635	free_mmu_pages(&vcpu->arch.root_mmu);
   6636	free_mmu_pages(&vcpu->arch.guest_mmu);
   6637	mmu_free_memory_caches(vcpu);
   6638}
   6639
   6640void kvm_mmu_vendor_module_exit(void)
   6641{
   6642	mmu_destroy_caches();
   6643	percpu_counter_destroy(&kvm_total_used_mmu_pages);
   6644	unregister_shrinker(&mmu_shrinker);
   6645}
   6646
   6647/*
   6648 * Calculate the effective recovery period, accounting for '0' meaning "let KVM
   6649 * select a halving time of 1 hour".  Returns true if recovery is enabled.
   6650 */
   6651static bool calc_nx_huge_pages_recovery_period(uint *period)
   6652{
   6653	/*
   6654	 * Use READ_ONCE to get the params, this may be called outside of the
   6655	 * param setters, e.g. by the kthread to compute its next timeout.
   6656	 */
   6657	bool enabled = READ_ONCE(nx_huge_pages);
   6658	uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
   6659
   6660	if (!enabled || !ratio)
   6661		return false;
   6662
   6663	*period = READ_ONCE(nx_huge_pages_recovery_period_ms);
   6664	if (!*period) {
   6665		/* Make sure the period is not less than one second.  */
   6666		ratio = min(ratio, 3600u);
   6667		*period = 60 * 60 * 1000 / ratio;
   6668	}
   6669	return true;
   6670}
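/*
 * Worked example with illustrative values: with nx_huge_pages enabled,
 * nx_huge_pages_recovery_ratio == 60 and nx_huge_pages_recovery_period_ms
 * left at 0, the derived period is 60 * 60 * 1000 / 60 = 60000 ms, i.e. the
 * recovery worker runs once per minute.  The min(ratio, 3600u) clamp keeps
 * the derived period at or above 3600000 / 3600 = 1000 ms.
 */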
   6671
   6672static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp)
   6673{
   6674	bool was_recovery_enabled, is_recovery_enabled;
   6675	uint old_period, new_period;
   6676	int err;
   6677
   6678	was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period);
   6679
   6680	err = param_set_uint(val, kp);
   6681	if (err)
   6682		return err;
   6683
   6684	is_recovery_enabled = calc_nx_huge_pages_recovery_period(&new_period);
   6685
   6686	if (is_recovery_enabled &&
   6687	    (!was_recovery_enabled || old_period > new_period)) {
   6688		struct kvm *kvm;
   6689
   6690		mutex_lock(&kvm_lock);
   6691
   6692		list_for_each_entry(kvm, &vm_list, vm_list)
   6693			wake_up_process(kvm->arch.nx_lpage_recovery_thread);
   6694
   6695		mutex_unlock(&kvm_lock);
   6696	}
   6697
   6698	return err;
   6699}
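/*
 * The wake-up above matters when recovery becomes more aggressive, e.g. the
 * period shrinks from 60000 ms to 1000 ms while a worker is asleep.  Waking
 * the per-VM threads forces them to recompute their timeout with the new
 * parameters instead of sleeping out the old, longer interval.
 */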
   6700
   6701static void kvm_recover_nx_lpages(struct kvm *kvm)
   6702{
   6703	unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits;
   6704	int rcu_idx;
   6705	struct kvm_mmu_page *sp;
   6706	unsigned int ratio;
   6707	LIST_HEAD(invalid_list);
   6708	bool flush = false;
   6709	ulong to_zap;
   6710
   6711	rcu_idx = srcu_read_lock(&kvm->srcu);
   6712	write_lock(&kvm->mmu_lock);
   6713
   6714	/*
   6715	 * Zapping TDP MMU shadow pages, including the remote TLB flush, must
   6716	 * be done under RCU protection, because the pages are freed via RCU
   6717	 * callback.
   6718	 */
   6719	rcu_read_lock();
   6720
   6721	ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
   6722	to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
   6723	for ( ; to_zap; --to_zap) {
   6724		if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages))
   6725			break;
   6726
   6727		/*
   6728		 * We use a separate list instead of just using active_mmu_pages
   6729		 * because the number of lpage_disallowed pages is expected to
   6730		 * be relatively small compared to the total.
   6731		 */
   6732		sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
   6733				      struct kvm_mmu_page,
   6734				      lpage_disallowed_link);
   6735		WARN_ON_ONCE(!sp->lpage_disallowed);
   6736		if (is_tdp_mmu_page(sp)) {
   6737			flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
   6738		} else {
   6739			kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
   6740			WARN_ON_ONCE(sp->lpage_disallowed);
   6741		}
   6742
   6743		if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
   6744			kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
   6745			rcu_read_unlock();
   6746
   6747			cond_resched_rwlock_write(&kvm->mmu_lock);
   6748			flush = false;
   6749
   6750			rcu_read_lock();
   6751		}
   6752	}
   6753	kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
   6754
   6755	rcu_read_unlock();
   6756
   6757	write_unlock(&kvm->mmu_lock);
   6758	srcu_read_unlock(&kvm->srcu, rcu_idx);
   6759}
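/*
 * Sizing example with illustrative numbers: with 1000 NX-split huge pages
 * and a recovery ratio of 60, to_zap = DIV_ROUND_UP(1000, 60) = 17, so each
 * pass reclaims roughly 1/ratio of the lpage_disallowed pages.
 */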
   6760
   6761static long get_nx_lpage_recovery_timeout(u64 start_time)
   6762{
   6763	bool enabled;
   6764	uint period;
   6765
   6766	enabled = calc_nx_huge_pages_recovery_period(&period);
   6767
   6768	return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
   6769		       : MAX_SCHEDULE_TIMEOUT;
   6770}
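/*
 * Example: if recovery is enabled with a 60000 ms period and the current
 * pass started 45 seconds ago, this returns roughly msecs_to_jiffies(15000);
 * once the result drops to zero or below, the worker loop stops sleeping and
 * runs kvm_recover_nx_lpages().  When recovery is disabled, the worker
 * sleeps with MAX_SCHEDULE_TIMEOUT until woken by a parameter change.
 */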
   6771
   6772static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
   6773{
   6774	u64 start_time;
   6775	long remaining_time;
   6776
   6777	while (true) {
   6778		start_time = get_jiffies_64();
   6779		remaining_time = get_nx_lpage_recovery_timeout(start_time);
   6780
   6781		set_current_state(TASK_INTERRUPTIBLE);
   6782		while (!kthread_should_stop() && remaining_time > 0) {
   6783			schedule_timeout(remaining_time);
   6784			remaining_time = get_nx_lpage_recovery_timeout(start_time);
   6785			set_current_state(TASK_INTERRUPTIBLE);
   6786		}
   6787
   6788		set_current_state(TASK_RUNNING);
   6789
   6790		if (kthread_should_stop())
   6791			return 0;
   6792
   6793		kvm_recover_nx_lpages(kvm);
   6794	}
   6795}
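/*
 * Note on the loop above: set_current_state(TASK_INTERRUPTIBLE) is called
 * before re-checking kthread_should_stop() and the remaining time, so a
 * wake-up (from kthread_stop() or a parameter change) that races with the
 * checks is not lost; schedule_timeout() then returns immediately instead of
 * sleeping the full interval.
 */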
   6796
   6797int kvm_mmu_post_init_vm(struct kvm *kvm)
   6798{
   6799	int err;
   6800
   6801	err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
   6802					  "kvm-nx-lpage-recovery",
   6803					  &kvm->arch.nx_lpage_recovery_thread);
   6804	if (!err)
   6805		kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
   6806
   6807	return err;
   6808}
   6809
   6810void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
   6811{
   6812	if (kvm->arch.nx_lpage_recovery_thread)
   6813		kthread_stop(kvm->arch.nx_lpage_recovery_thread);
   6814}