cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

paging_tmpl.h (31634B)


      1/* SPDX-License-Identifier: GPL-2.0-only */
      2/*
      3 * Kernel-based Virtual Machine driver for Linux
      4 *
      5 * This module enables machines with Intel VT-x extensions to run virtual
      6 * machines without emulation or binary translation.
      7 *
      8 * MMU support
      9 *
     10 * Copyright (C) 2006 Qumranet, Inc.
     11 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
     12 *
     13 * Authors:
     14 *   Yaniv Kamay  <yaniv@qumranet.com>
     15 *   Avi Kivity   <avi@qumranet.com>
     16 */
     17
     18/*
     19 * We need the mmu code to access both 32-bit and 64-bit guest ptes,
     20 * so the code in this file is compiled twice, once per pte size.
     21 */
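        /*
         * For example, with PTTYPE == 64 the FNAME() macro below expands
         * FNAME(walk_addr) to paging64_walk_addr, while the EPT instantiation
         * produces ept_walk_addr, so each variant gets its own copy of every
         * function in this file.
         */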
     22
     23#if PTTYPE == 64
     24	#define pt_element_t u64
     25	#define guest_walker guest_walker64
     26	#define FNAME(name) paging##64_##name
     27	#define PT_BASE_ADDR_MASK GUEST_PT64_BASE_ADDR_MASK
     28	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
     29	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
     30	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
     31	#define PT_LEVEL_BITS PT64_LEVEL_BITS
     32	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
     33	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
     34	#define PT_HAVE_ACCESSED_DIRTY(mmu) true
     35	#ifdef CONFIG_X86_64
     36	#define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
     37	#define CMPXCHG "cmpxchgq"
     38	#else
     39	#define PT_MAX_FULL_LEVELS 2
     40	#endif
     41#elif PTTYPE == 32
     42	#define pt_element_t u32
     43	#define guest_walker guest_walker32
     44	#define FNAME(name) paging##32_##name
     45	#define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
     46	#define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
     47	#define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
     48	#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
     49	#define PT_LEVEL_BITS PT32_LEVEL_BITS
     50	#define PT_MAX_FULL_LEVELS 2
     51	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
     52	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
     53	#define PT_HAVE_ACCESSED_DIRTY(mmu) true
     54	#define CMPXCHG "cmpxchgl"
     55#elif PTTYPE == PTTYPE_EPT
     56	#define pt_element_t u64
     57	#define guest_walker guest_walkerEPT
     58	#define FNAME(name) ept_##name
     59	#define PT_BASE_ADDR_MASK GUEST_PT64_BASE_ADDR_MASK
     60	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
     61	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
     62	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
     63	#define PT_LEVEL_BITS PT64_LEVEL_BITS
     64	#define PT_GUEST_DIRTY_SHIFT 9
     65	#define PT_GUEST_ACCESSED_SHIFT 8
     66	#define PT_HAVE_ACCESSED_DIRTY(mmu) (!(mmu)->cpu_role.base.ad_disabled)
     67	#ifdef CONFIG_X86_64
     68	#define CMPXCHG "cmpxchgq"
     69	#endif
     70	#define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
     71#else
     72	#error Invalid PTTYPE value
     73#endif
     74
     75#define PT_GUEST_DIRTY_MASK    (1 << PT_GUEST_DIRTY_SHIFT)
     76#define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT)
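        /*
         * E.g. for 64-bit and 32-bit paging the accessed and dirty bits are
         * bits 5 and 6, so these masks evaluate to 0x20 and 0x40; for EPT
         * (shifts 8 and 9 above) they are 0x100 and 0x200.
         */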
     77
     78#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
     79#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PG_LEVEL_4K)
     80
     81/*
     82 * The guest_walker structure emulates the behavior of the hardware page
     83 * table walker.
     84 */
     85struct guest_walker {
     86	int level;
     87	unsigned max_level;
     88	gfn_t table_gfn[PT_MAX_FULL_LEVELS];
     89	pt_element_t ptes[PT_MAX_FULL_LEVELS];
     90	pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
     91	gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
     92	pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
     93	bool pte_writable[PT_MAX_FULL_LEVELS];
     94	unsigned int pt_access[PT_MAX_FULL_LEVELS];
     95	unsigned int pte_access;
     96	gfn_t gfn;
     97	struct x86_exception fault;
     98};
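        /*
         * The per-level arrays above are indexed by (level - 1): in a 4-level
         * 64-bit walk, ptes[3] holds the PML4E and ptes[0] the final 4K PTE.
         */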
     99
    100static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
    101{
    102	return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
    103}
    104
    105static inline void FNAME(protect_clean_gpte)(struct kvm_mmu *mmu, unsigned *access,
    106					     unsigned gpte)
    107{
    108	unsigned mask;
    109
    110	/* dirty bit is not supported, so no need to track it */
    111	if (!PT_HAVE_ACCESSED_DIRTY(mmu))
    112		return;
    113
    114	BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
    115
    116	mask = (unsigned)~ACC_WRITE_MASK;
    117	/* Allow write access to dirty gptes */
    118	mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
    119		PT_WRITABLE_MASK;
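        	/*
        	 * With PTTYPE == 64 this is (gpte >> 5) & 0x2: the dirty bit
        	 * (bit 6) lands in the writable position (bit 1), so write
        	 * permission is kept only for gptes that are already dirty.
        	 */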
    120	*access &= mask;
    121}
    122
    123static inline int FNAME(is_present_gpte)(unsigned long pte)
    124{
    125#if PTTYPE != PTTYPE_EPT
    126	return pte & PT_PRESENT_MASK;
    127#else
    128	return pte & 7;
    129#endif
    130}
    131
    132static bool FNAME(is_bad_mt_xwr)(struct rsvd_bits_validate *rsvd_check, u64 gpte)
    133{
    134#if PTTYPE != PTTYPE_EPT
    135	return false;
    136#else
    137	return __is_bad_mt_xwr(rsvd_check, gpte);
    138#endif
    139}
    140
    141static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
    142{
    143	return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level) ||
    144	       FNAME(is_bad_mt_xwr)(&mmu->guest_rsvd_check, gpte);
    145}
    146
    147static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
    148				  struct kvm_mmu_page *sp, u64 *spte,
    149				  u64 gpte)
    150{
    151	if (!FNAME(is_present_gpte)(gpte))
    152		goto no_present;
    153
    154	/* Prefetch only accessed entries (unless A/D bits are disabled). */
    155	if (PT_HAVE_ACCESSED_DIRTY(vcpu->arch.mmu) &&
    156	    !(gpte & PT_GUEST_ACCESSED_MASK))
    157		goto no_present;
    158
    159	if (FNAME(is_rsvd_bits_set)(vcpu->arch.mmu, gpte, PG_LEVEL_4K))
    160		goto no_present;
    161
    162	return false;
    163
    164no_present:
    165	drop_spte(vcpu->kvm, spte);
    166	return true;
    167}
    168
    169/*
    170 * For PTTYPE_EPT, a page table can be executable but not readable
    171 * on supported processors. Therefore, set_spte does not automatically
    172 * set bit 0 if execute only is supported. Here, we repurpose ACC_USER_MASK
     173 * to signify readability since it isn't used in the EPT case.
    174 */
    175static inline unsigned FNAME(gpte_access)(u64 gpte)
    176{
    177	unsigned access;
    178#if PTTYPE == PTTYPE_EPT
    179	access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
    180		((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
    181		((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0);
    182#else
    183	BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK);
    184	BUILD_BUG_ON(ACC_EXEC_MASK != 1);
    185	access = gpte & (PT_WRITABLE_MASK | PT_USER_MASK | PT_PRESENT_MASK);
    186	/* Combine NX with P (which is set here) to get ACC_EXEC_MASK.  */
    187	access ^= (gpte >> PT64_NX_SHIFT);
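        	/*
        	 * E.g. a present, writable, user, non-NX gpte yields access 0x7;
        	 * if the NX bit (bit 63) is set, the XOR above clears bit 0,
        	 * i.e. ACC_EXEC_MASK, making the entry non-executable.
        	 */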
    188#endif
    189
    190	return access;
    191}
    192
    193static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
    194					     struct kvm_mmu *mmu,
    195					     struct guest_walker *walker,
    196					     gpa_t addr, int write_fault)
    197{
    198	unsigned level, index;
    199	pt_element_t pte, orig_pte;
    200	pt_element_t __user *ptep_user;
    201	gfn_t table_gfn;
    202	int ret;
    203
    204	/* dirty/accessed bits are not supported, so no need to update them */
    205	if (!PT_HAVE_ACCESSED_DIRTY(mmu))
    206		return 0;
    207
    208	for (level = walker->max_level; level >= walker->level; --level) {
    209		pte = orig_pte = walker->ptes[level - 1];
    210		table_gfn = walker->table_gfn[level - 1];
    211		ptep_user = walker->ptep_user[level - 1];
    212		index = offset_in_page(ptep_user) / sizeof(pt_element_t);
    213		if (!(pte & PT_GUEST_ACCESSED_MASK)) {
    214			trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
    215			pte |= PT_GUEST_ACCESSED_MASK;
    216		}
    217		if (level == walker->level && write_fault &&
    218				!(pte & PT_GUEST_DIRTY_MASK)) {
    219			trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
    220#if PTTYPE == PTTYPE_EPT
    221			if (kvm_x86_ops.nested_ops->write_log_dirty(vcpu, addr))
    222				return -EINVAL;
    223#endif
    224			pte |= PT_GUEST_DIRTY_MASK;
    225		}
    226		if (pte == orig_pte)
    227			continue;
    228
    229		/*
    230		 * If the slot is read-only, simply do not process the accessed
    231		 * and dirty bits.  This is the correct thing to do if the slot
    232		 * is ROM, and page tables in read-as-ROM/write-as-MMIO slots
    233		 * are only supported if the accessed and dirty bits are already
    234		 * set in the ROM (so that MMIO writes are never needed).
    235		 *
    236		 * Note that NPT does not allow this at all and faults, since
    237		 * it always wants nested page table entries for the guest
    238		 * page tables to be writable.  And EPT works but will simply
    239		 * overwrite the read-only memory to set the accessed and dirty
    240		 * bits.
    241		 */
    242		if (unlikely(!walker->pte_writable[level - 1]))
    243			continue;
    244
    245		ret = __try_cmpxchg_user(ptep_user, &orig_pte, pte, fault);
    246		if (ret)
    247			return ret;
    248
    249		kvm_vcpu_mark_page_dirty(vcpu, table_gfn);
    250		walker->ptes[level - 1] = pte;
    251	}
    252	return 0;
    253}
    254
    255static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
    256{
    257	unsigned pkeys = 0;
    258#if PTTYPE == 64
    259	pte_t pte = {.pte = gpte};
    260
    261	pkeys = pte_flags_pkey(pte_flags(pte));
    262#endif
    263	return pkeys;
    264}
    265
    266static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu,
    267				       unsigned int level, unsigned int gpte)
    268{
    269	/*
    270	 * For EPT and PAE paging (both variants), bit 7 is either reserved at
     271	 * all levels or indicates a huge page (ignoring CR3/EPTP).  In either
    272	 * case, bit 7 being set terminates the walk.
    273	 */
    274#if PTTYPE == 32
    275	/*
    276	 * 32-bit paging requires special handling because bit 7 is ignored if
    277	 * CR4.PSE=0, not reserved.  Clear bit 7 in the gpte if the level is
    278	 * greater than the last level for which bit 7 is the PAGE_SIZE bit.
    279	 *
    280	 * The RHS has bit 7 set iff level < (2 + PSE).  If it is clear, bit 7
    281	 * is not reserved and does not indicate a large page at this level,
    282	 * so clear PT_PAGE_SIZE_MASK in gpte if that is the case.
    283	 */
    284	gpte &= level - (PT32_ROOT_LEVEL + mmu->cpu_role.ext.cr4_pse);
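        	/*
        	 * E.g. at level 2 with CR4.PSE=0 the RHS is 0, so the gpte is
        	 * cleared and bit 7 can never signal a large page; with
        	 * CR4.PSE=1 the RHS is -1 and the gpte is left untouched.
        	 */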
    285#endif
    286	/*
    287	 * PG_LEVEL_4K always terminates.  The RHS has bit 7 set
    288	 * iff level <= PG_LEVEL_4K, which for our purpose means
    289	 * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then.
    290	 */
    291	gpte |= level - PG_LEVEL_4K - 1;
    292
    293	return gpte & PT_PAGE_SIZE_MASK;
    294}
    295/*
    296 * Fetch a guest pte for a guest virtual address, or for an L2's GPA.
    297 */
    298static int FNAME(walk_addr_generic)(struct guest_walker *walker,
    299				    struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
    300				    gpa_t addr, u64 access)
    301{
    302	int ret;
    303	pt_element_t pte;
    304	pt_element_t __user *ptep_user;
    305	gfn_t table_gfn;
    306	u64 pt_access, pte_access;
    307	unsigned index, accessed_dirty, pte_pkey;
    308	u64 nested_access;
    309	gpa_t pte_gpa;
    310	bool have_ad;
    311	int offset;
    312	u64 walk_nx_mask = 0;
    313	const int write_fault = access & PFERR_WRITE_MASK;
    314	const int user_fault  = access & PFERR_USER_MASK;
    315	const int fetch_fault = access & PFERR_FETCH_MASK;
    316	u16 errcode = 0;
    317	gpa_t real_gpa;
    318	gfn_t gfn;
    319
    320	trace_kvm_mmu_pagetable_walk(addr, access);
    321retry_walk:
    322	walker->level = mmu->cpu_role.base.level;
    323	pte           = mmu->get_guest_pgd(vcpu);
    324	have_ad       = PT_HAVE_ACCESSED_DIRTY(mmu);
    325
    326#if PTTYPE == 64
    327	walk_nx_mask = 1ULL << PT64_NX_SHIFT;
    328	if (walker->level == PT32E_ROOT_LEVEL) {
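        		/* Bits 31:30 of the address select one of the four PAE PDPTEs. */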
    329		pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
    330		trace_kvm_mmu_paging_element(pte, walker->level);
    331		if (!FNAME(is_present_gpte)(pte))
    332			goto error;
    333		--walker->level;
    334	}
    335#endif
    336	walker->max_level = walker->level;
    337	ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu)));
    338
    339	/*
    340	 * FIXME: on Intel processors, loads of the PDPTE registers for PAE paging
    341	 * by the MOV to CR instruction are treated as reads and do not cause the
    342	 * processor to set the dirty flag in any EPT paging-structure entry.
    343	 */
    344	nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK;
    345
    346	pte_access = ~0;
    347	++walker->level;
    348
    349	do {
    350		unsigned long host_addr;
    351
    352		pt_access = pte_access;
    353		--walker->level;
    354
    355		index = PT_INDEX(addr, walker->level);
    356		table_gfn = gpte_to_gfn(pte);
    357		offset    = index * sizeof(pt_element_t);
    358		pte_gpa   = gfn_to_gpa(table_gfn) + offset;
    359
    360		BUG_ON(walker->level < 1);
    361		walker->table_gfn[walker->level - 1] = table_gfn;
    362		walker->pte_gpa[walker->level - 1] = pte_gpa;
    363
    364		real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(table_gfn),
    365					     nested_access, &walker->fault);
    366
    367		/*
     368		 * FIXME: This can happen if emulation (of an INS/OUTS
    369		 * instruction) triggers a nested page fault.  The exit
    370		 * qualification / exit info field will incorrectly have
    371		 * "guest page access" as the nested page fault's cause,
    372		 * instead of "guest page structure access".  To fix this,
    373		 * the x86_exception struct should be augmented with enough
    374		 * information to fix the exit_qualification or exit_info_1
    375		 * fields.
    376		 */
    377		if (unlikely(real_gpa == UNMAPPED_GVA))
    378			return 0;
    379
    380		host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gpa_to_gfn(real_gpa),
    381					    &walker->pte_writable[walker->level - 1]);
    382		if (unlikely(kvm_is_error_hva(host_addr)))
    383			goto error;
    384
    385		ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
    386		if (unlikely(__get_user(pte, ptep_user)))
    387			goto error;
    388		walker->ptep_user[walker->level - 1] = ptep_user;
    389
    390		trace_kvm_mmu_paging_element(pte, walker->level);
    391
    392		/*
     393		 * Inverting the NX bit lets us AND it like the other
    394		 * permission bits.
    395		 */
    396		pte_access = pt_access & (pte ^ walk_nx_mask);
    397
    398		if (unlikely(!FNAME(is_present_gpte)(pte)))
    399			goto error;
    400
    401		if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, walker->level))) {
    402			errcode = PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
    403			goto error;
    404		}
    405
    406		walker->ptes[walker->level - 1] = pte;
    407
    408		/* Convert to ACC_*_MASK flags for struct guest_walker.  */
    409		walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
    410	} while (!FNAME(is_last_gpte)(mmu, walker->level, pte));
    411
    412	pte_pkey = FNAME(gpte_pkeys)(vcpu, pte);
    413	accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0;
    414
    415	/* Convert to ACC_*_MASK flags for struct guest_walker.  */
    416	walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask);
    417	errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access);
    418	if (unlikely(errcode))
    419		goto error;
    420
    421	gfn = gpte_to_gfn_lvl(pte, walker->level);
    422	gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;
    423
    424	if (PTTYPE == 32 && walker->level > PG_LEVEL_4K && is_cpuid_PSE36())
    425		gfn += pse36_gfn_delta(pte);
    426
    427	real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), access, &walker->fault);
    428	if (real_gpa == UNMAPPED_GVA)
    429		return 0;
    430
    431	walker->gfn = real_gpa >> PAGE_SHIFT;
    432
    433	if (!write_fault)
    434		FNAME(protect_clean_gpte)(mmu, &walker->pte_access, pte);
    435	else
    436		/*
    437		 * On a write fault, fold the dirty bit into accessed_dirty.
     438		 * For modes without A/D bit support, accessed_dirty will
     439		 * always be clear.
    440		 */
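        		/*
        		 * In every supported mode the dirty bit sits one position
        		 * above the accessed bit (6 vs. 5 for legacy paging, 9 vs. 8
        		 * for EPT), so a single right shift lines them up.
        		 */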
    441		accessed_dirty &= pte >>
    442			(PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);
    443
    444	if (unlikely(!accessed_dirty)) {
    445		ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker,
    446							addr, write_fault);
    447		if (unlikely(ret < 0))
    448			goto error;
    449		else if (ret)
    450			goto retry_walk;
    451	}
    452
    453	pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
    454		 __func__, (u64)pte, walker->pte_access,
    455		 walker->pt_access[walker->level - 1]);
    456	return 1;
    457
    458error:
    459	errcode |= write_fault | user_fault;
    460	if (fetch_fault && (is_efer_nx(mmu) || is_cr4_smep(mmu)))
    461		errcode |= PFERR_FETCH_MASK;
    462
    463	walker->fault.vector = PF_VECTOR;
    464	walker->fault.error_code_valid = true;
    465	walker->fault.error_code = errcode;
    466
    467#if PTTYPE == PTTYPE_EPT
    468	/*
     469	 * Use PFERR_RSVD_MASK in error_code to tell whether an EPT
     470	 * misconfiguration needs to be injected. The detection is
    471	 * done by is_rsvd_bits_set() above.
    472	 *
    473	 * We set up the value of exit_qualification to inject:
     474	 * [2:0] - Derived from the access bits. The exit_qualification might be
    475	 *         out of date if it is serving an EPT misconfiguration.
    476	 * [5:3] - Calculated by the page walk of the guest EPT page tables
     477	 * [8:7] - Derived from bits [8:7] of the real exit_qualification
    478	 *
    479	 * The other bits are set to 0.
    480	 */
    481	if (!(errcode & PFERR_RSVD_MASK)) {
    482		vcpu->arch.exit_qualification &= (EPT_VIOLATION_GVA_IS_VALID |
    483						  EPT_VIOLATION_GVA_TRANSLATED);
    484		if (write_fault)
    485			vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_WRITE;
    486		if (user_fault)
    487			vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_READ;
    488		if (fetch_fault)
    489			vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_INSTR;
    490
    491		/*
    492		 * Note, pte_access holds the raw RWX bits from the EPTE, not
    493		 * ACC_*_MASK flags!
    494		 */
    495		vcpu->arch.exit_qualification |= (pte_access & VMX_EPT_RWX_MASK) <<
    496						 EPT_VIOLATION_RWX_SHIFT;
    497	}
    498#endif
    499	walker->fault.address = addr;
    500	walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
    501	walker->fault.async_page_fault = false;
    502
    503	trace_kvm_mmu_walker_error(walker->fault.error_code);
    504	return 0;
    505}
    506
    507static int FNAME(walk_addr)(struct guest_walker *walker,
    508			    struct kvm_vcpu *vcpu, gpa_t addr, u64 access)
    509{
    510	return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu, addr,
    511					access);
    512}
    513
    514static bool
    515FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
    516		     u64 *spte, pt_element_t gpte, bool no_dirty_log)
    517{
    518	struct kvm_memory_slot *slot;
    519	unsigned pte_access;
    520	gfn_t gfn;
    521	kvm_pfn_t pfn;
    522
    523	if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
    524		return false;
    525
    526	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
    527
    528	gfn = gpte_to_gfn(gpte);
    529	pte_access = sp->role.access & FNAME(gpte_access)(gpte);
    530	FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
    531
    532	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn,
    533			no_dirty_log && (pte_access & ACC_WRITE_MASK));
    534	if (!slot)
    535		return false;
    536
    537	pfn = gfn_to_pfn_memslot_atomic(slot, gfn);
    538	if (is_error_pfn(pfn))
    539		return false;
    540
    541	mmu_set_spte(vcpu, slot, spte, pte_access, gfn, pfn, NULL);
    542	kvm_release_pfn_clean(pfn);
    543	return true;
    544}
    545
    546static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
    547				struct guest_walker *gw, int level)
    548{
    549	pt_element_t curr_pte;
    550	gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
    551	u64 mask;
    552	int r, index;
    553
    554	if (level == PG_LEVEL_4K) {
    555		mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
    556		base_gpa = pte_gpa & ~mask;
    557		index = (pte_gpa - base_gpa) / sizeof(pt_element_t);
    558
    559		r = kvm_vcpu_read_guest_atomic(vcpu, base_gpa,
    560				gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
    561		curr_pte = gw->prefetch_ptes[index];
    562	} else
    563		r = kvm_vcpu_read_guest_atomic(vcpu, pte_gpa,
    564				  &curr_pte, sizeof(curr_pte));
    565
    566	return r || curr_pte != gw->ptes[level - 1];
    567}
    568
    569static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
    570				u64 *sptep)
    571{
    572	struct kvm_mmu_page *sp;
    573	pt_element_t *gptep = gw->prefetch_ptes;
    574	u64 *spte;
    575	int i;
    576
    577	sp = sptep_to_sp(sptep);
    578
    579	if (sp->role.level > PG_LEVEL_4K)
    580		return;
    581
    582	/*
    583	 * If addresses are being invalidated, skip prefetching to avoid
    584	 * accidentally prefetching those addresses.
    585	 */
    586	if (unlikely(vcpu->kvm->mmu_notifier_count))
    587		return;
    588
    589	if (sp->role.direct)
    590		return __direct_pte_prefetch(vcpu, sp, sptep);
    591
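        	/* Start at the first spte of the PTE_PREFETCH_NUM-aligned window. */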
    592	i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
    593	spte = sp->spt + i;
    594
    595	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
    596		if (spte == sptep)
    597			continue;
    598
    599		if (is_shadow_present_pte(*spte))
    600			continue;
    601
    602		if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true))
    603			break;
    604	}
    605}
    606
    607/*
    608 * Fetch a shadow pte for a specific level in the paging hierarchy.
    609 * If the guest tries to write a write-protected page, we need to
     610 * emulate this operation; return 1 to indicate this case.
    611 */
    612static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
    613			 struct guest_walker *gw)
    614{
    615	struct kvm_mmu_page *sp = NULL;
    616	struct kvm_shadow_walk_iterator it;
    617	unsigned int direct_access, access;
    618	int top_level, ret;
    619	gfn_t base_gfn = fault->gfn;
    620
    621	WARN_ON_ONCE(gw->gfn != base_gfn);
    622	direct_access = gw->pte_access;
    623
    624	top_level = vcpu->arch.mmu->cpu_role.base.level;
    625	if (top_level == PT32E_ROOT_LEVEL)
    626		top_level = PT32_ROOT_LEVEL;
    627	/*
    628	 * Verify that the top-level gpte is still there.  Since the page
    629	 * is a root page, it is either write protected (and cannot be
    630	 * changed from now on) or it is invalid (in which case, we don't
    631	 * really care if it changes underneath us after this point).
    632	 */
    633	if (FNAME(gpte_changed)(vcpu, gw, top_level))
    634		goto out_gpte_changed;
    635
    636	if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
    637		goto out_gpte_changed;
    638
    639	for (shadow_walk_init(&it, vcpu, fault->addr);
    640	     shadow_walk_okay(&it) && it.level > gw->level;
    641	     shadow_walk_next(&it)) {
    642		gfn_t table_gfn;
    643
    644		clear_sp_write_flooding_count(it.sptep);
    645		drop_large_spte(vcpu, it.sptep);
    646
    647		sp = NULL;
    648		if (!is_shadow_present_pte(*it.sptep)) {
    649			table_gfn = gw->table_gfn[it.level - 2];
    650			access = gw->pt_access[it.level - 2];
    651			sp = kvm_mmu_get_page(vcpu, table_gfn, fault->addr,
    652					      it.level-1, false, access);
    653			/*
     654			 * We must synchronize the page table before linking it
     655			 * because the guest doesn't need to flush the TLB when
    656			 * the gpte is changed from non-present to present.
    657			 * Otherwise, the guest may use the wrong mapping.
    658			 *
    659			 * For PG_LEVEL_4K, kvm_mmu_get_page() has already
    660			 * synchronized it transiently via kvm_sync_page().
    661			 *
     662			 * For a higher-level page table, we synchronize it via
    663			 * the slower mmu_sync_children().  If it needs to
    664			 * break, some progress has been made; return
    665			 * RET_PF_RETRY and retry on the next #PF.
    666			 * KVM_REQ_MMU_SYNC is not necessary but it
    667			 * expedites the process.
    668			 */
    669			if (sp->unsync_children &&
    670			    mmu_sync_children(vcpu, sp, false))
    671				return RET_PF_RETRY;
    672		}
    673
    674		/*
     675		 * Verify that the gpte in the page we've just write-protected
     676		 * is still there.
    677		 */
    678		if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
    679			goto out_gpte_changed;
    680
    681		if (sp)
    682			link_shadow_page(vcpu, it.sptep, sp);
    683	}
    684
    685	kvm_mmu_hugepage_adjust(vcpu, fault);
    686
    687	trace_kvm_mmu_spte_requested(fault);
    688
    689	for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
    690		clear_sp_write_flooding_count(it.sptep);
    691
    692		/*
    693		 * We cannot overwrite existing page tables with an NX
    694		 * large page, as the leaf could be executable.
    695		 */
    696		if (fault->nx_huge_page_workaround_enabled)
    697			disallowed_hugepage_adjust(fault, *it.sptep, it.level);
    698
    699		base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
    700		if (it.level == fault->goal_level)
    701			break;
    702
    703		validate_direct_spte(vcpu, it.sptep, direct_access);
    704
    705		drop_large_spte(vcpu, it.sptep);
    706
    707		if (!is_shadow_present_pte(*it.sptep)) {
    708			sp = kvm_mmu_get_page(vcpu, base_gfn, fault->addr,
    709					      it.level - 1, true, direct_access);
    710			link_shadow_page(vcpu, it.sptep, sp);
    711			if (fault->huge_page_disallowed &&
    712			    fault->req_level >= it.level)
    713				account_huge_nx_page(vcpu->kvm, sp);
    714		}
    715	}
    716
    717	if (WARN_ON_ONCE(it.level != fault->goal_level))
    718		return -EFAULT;
    719
    720	ret = mmu_set_spte(vcpu, fault->slot, it.sptep, gw->pte_access,
    721			   base_gfn, fault->pfn, fault);
    722	if (ret == RET_PF_SPURIOUS)
    723		return ret;
    724
    725	FNAME(pte_prefetch)(vcpu, gw, it.sptep);
    726	return ret;
    727
    728out_gpte_changed:
    729	return RET_PF_RETRY;
    730}
    731
     732/*
     733 * Check whether the mapped gfn can write its own page table through the
     734 * current mapping.
     735 *
     736 * This is a helper for FNAME(page_fault).  When the guest uses a large
     737 * page to map a writable gfn that is currently in use as a page table,
     738 * force KVM to map it with small pages: a new shadow page will be created
     739 * anyway when KVM shadows that page table, which stops KVM from using the
     740 * large page.  Doing this early avoids unnecessary #PFs and emulation.
     741 *
     742 * @write_fault_to_shadow_pgtable is set to true if the faulting gfn is
     743 * currently in use as a page table.
     744 *
     745 * Note: the PDPT is not checked for PAE 32-bit guests.  That is fine: the
     746 * PDPT is always shadowed, so a large page can never be used to map the
     747 * gfn that holds the PDPT.
     748 */
    749static bool
    750FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
    751			      struct guest_walker *walker, bool user_fault,
    752			      bool *write_fault_to_shadow_pgtable)
    753{
    754	int level;
    755	gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1);
    756	bool self_changed = false;
    757
    758	if (!(walker->pte_access & ACC_WRITE_MASK ||
    759	    (!is_cr0_wp(vcpu->arch.mmu) && !user_fault)))
    760		return false;
    761
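        	/*
        	 * The fault is "self changing" if the faulting gfn shares a huge
        	 * page (at walker->level) with any of the guest page-table gfns,
        	 * and it writes the shadowed page tables directly if it hits one
        	 * of those gfns exactly.
        	 */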
    762	for (level = walker->level; level <= walker->max_level; level++) {
    763		gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1];
    764
    765		self_changed |= !(gfn & mask);
    766		*write_fault_to_shadow_pgtable |= !gfn;
    767	}
    768
    769	return self_changed;
    770}
    771
    772/*
    773 * Page fault handler.  There are several causes for a page fault:
    774 *   - there is no shadow pte for the guest pte
    775 *   - write access through a shadow pte marked read only so that we can set
    776 *     the dirty bit
    777 *   - write access to a shadow pte marked read only so we can update the page
    778 *     dirty bitmap, when userspace requests it
    779 *   - mmio access; in this case we will never install a present shadow pte
    780 *   - normal guest page fault due to the guest pte marked not present, not
    781 *     writable, or not executable
    782 *
    783 *  Returns: 1 if we need to emulate the instruction, 0 otherwise, or
    784 *           a negative value on error.
    785 */
    786static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
    787{
    788	struct guest_walker walker;
    789	int r;
    790	unsigned long mmu_seq;
    791	bool is_self_change_mapping;
    792
    793	pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code);
    794	WARN_ON_ONCE(fault->is_tdp);
    795
    796	/*
    797	 * Look up the guest pte for the faulting address.
    798	 * If PFEC.RSVD is set, this is a shadow page fault.
    799	 * The bit needs to be cleared before walking guest page tables.
    800	 */
    801	r = FNAME(walk_addr)(&walker, vcpu, fault->addr,
    802			     fault->error_code & ~PFERR_RSVD_MASK);
    803
    804	/*
    805	 * The page is not mapped by the guest.  Let the guest handle it.
    806	 */
    807	if (!r) {
    808		pgprintk("%s: guest page fault\n", __func__);
    809		if (!fault->prefetch)
    810			kvm_inject_emulated_page_fault(vcpu, &walker.fault);
    811
    812		return RET_PF_RETRY;
    813	}
    814
    815	fault->gfn = walker.gfn;
    816	fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn);
    817
    818	if (page_fault_handle_page_track(vcpu, fault)) {
    819		shadow_page_table_clear_flood(vcpu, fault->addr);
    820		return RET_PF_EMULATE;
    821	}
    822
    823	r = mmu_topup_memory_caches(vcpu, true);
    824	if (r)
    825		return r;
    826
    827	vcpu->arch.write_fault_to_shadow_pgtable = false;
    828
    829	is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
    830	      &walker, fault->user, &vcpu->arch.write_fault_to_shadow_pgtable);
    831
    832	if (is_self_change_mapping)
    833		fault->max_level = PG_LEVEL_4K;
    834	else
    835		fault->max_level = walker.level;
    836
    837	mmu_seq = vcpu->kvm->mmu_notifier_seq;
    838	smp_rmb();
    839
    840	r = kvm_faultin_pfn(vcpu, fault);
    841	if (r != RET_PF_CONTINUE)
    842		return r;
    843
    844	r = handle_abnormal_pfn(vcpu, fault, walker.pte_access);
    845	if (r != RET_PF_CONTINUE)
    846		return r;
    847
    848	/*
     849	 * Do not change pte_access if the pfn is an mmio page, otherwise
     850	 * we will cache the incorrect access into the mmio spte.
    851	 */
    852	if (fault->write && !(walker.pte_access & ACC_WRITE_MASK) &&
    853	    !is_cr0_wp(vcpu->arch.mmu) && !fault->user && fault->slot) {
    854		walker.pte_access |= ACC_WRITE_MASK;
    855		walker.pte_access &= ~ACC_USER_MASK;
    856
    857		/*
    858		 * If we converted a user page to a kernel page,
    859		 * so that the kernel can write to it when cr0.wp=0,
    860		 * then we should prevent the kernel from executing it
    861		 * if SMEP is enabled.
    862		 */
    863		if (is_cr4_smep(vcpu->arch.mmu))
    864			walker.pte_access &= ~ACC_EXEC_MASK;
    865	}
    866
    867	r = RET_PF_RETRY;
    868	write_lock(&vcpu->kvm->mmu_lock);
    869
    870	if (is_page_fault_stale(vcpu, fault, mmu_seq))
    871		goto out_unlock;
    872
    873	r = make_mmu_pages_available(vcpu);
    874	if (r)
    875		goto out_unlock;
    876	r = FNAME(fetch)(vcpu, fault, &walker);
    877
    878out_unlock:
    879	write_unlock(&vcpu->kvm->mmu_lock);
    880	kvm_release_pfn_clean(fault->pfn);
    881	return r;
    882}
    883
    884static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
    885{
    886	int offset = 0;
    887
    888	WARN_ON(sp->role.level != PG_LEVEL_4K);
    889
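        	/*
        	 * A 32-bit guest page table holds 1024 4-byte entries but a shadow
        	 * page holds 512 sptes, so role.quadrant selects which half of the
        	 * guest page this shadow page covers.
        	 */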
    890	if (PTTYPE == 32)
    891		offset = sp->role.quadrant << PT64_LEVEL_BITS;
    892
    893	return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
    894}
    895
    896static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
    897{
    898	struct kvm_shadow_walk_iterator iterator;
    899	struct kvm_mmu_page *sp;
    900	u64 old_spte;
    901	int level;
    902	u64 *sptep;
    903
    904	vcpu_clear_mmio_info(vcpu, gva);
    905
    906	/*
     907	 * No need to check the return value here; rmap_can_add() lets
     908	 * us skip the pte prefetch later.
    909	 */
    910	mmu_topup_memory_caches(vcpu, true);
    911
    912	if (!VALID_PAGE(root_hpa)) {
    913		WARN_ON(1);
    914		return;
    915	}
    916
    917	write_lock(&vcpu->kvm->mmu_lock);
    918	for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) {
    919		level = iterator.level;
    920		sptep = iterator.sptep;
    921
    922		sp = sptep_to_sp(sptep);
    923		old_spte = *sptep;
    924		if (is_last_spte(old_spte, level)) {
    925			pt_element_t gpte;
    926			gpa_t pte_gpa;
    927
    928			if (!sp->unsync)
    929				break;
    930
    931			pte_gpa = FNAME(get_level1_sp_gpa)(sp);
    932			pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
    933
    934			mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL);
    935			if (is_shadow_present_pte(old_spte))
    936				kvm_flush_remote_tlbs_with_address(vcpu->kvm,
    937					sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
    938
    939			if (!rmap_can_add(vcpu))
    940				break;
    941
    942			if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
    943						       sizeof(pt_element_t)))
    944				break;
    945
    946			FNAME(prefetch_gpte)(vcpu, sp, sptep, gpte, false);
    947		}
    948
    949		if (!sp->unsync_children)
    950			break;
    951	}
    952	write_unlock(&vcpu->kvm->mmu_lock);
    953}
    954
    955/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
    956static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
    957			       gpa_t addr, u64 access,
    958			       struct x86_exception *exception)
    959{
    960	struct guest_walker walker;
    961	gpa_t gpa = UNMAPPED_GVA;
    962	int r;
    963
    964#ifndef CONFIG_X86_64
    965	/* A 64-bit GVA should be impossible on 32-bit KVM. */
    966	WARN_ON_ONCE((addr >> 32) && mmu == vcpu->arch.walk_mmu);
    967#endif
    968
    969	r = FNAME(walk_addr_generic)(&walker, vcpu, mmu, addr, access);
    970
    971	if (r) {
    972		gpa = gfn_to_gpa(walker.gfn);
    973		gpa |= addr & ~PAGE_MASK;
    974	} else if (exception)
    975		*exception = walker.fault;
    976
    977	return gpa;
    978}
    979
    980/*
    981 * Using the cached information from sp->gfns is safe because:
    982 * - The spte has a reference to the struct page, so the pfn for a given gfn
    983 *   can't change unless all sptes pointing to it are nuked first.
    984 *
    985 * Returns
    986 * < 0: the sp should be zapped
    987 *   0: the sp is synced and no tlb flushing is required
    988 * > 0: the sp is synced and tlb flushing is required
    989 */
    990static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
    991{
    992	union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role;
    993	int i;
    994	bool host_writable;
    995	gpa_t first_pte_gpa;
    996	bool flush = false;
    997
    998	/*
    999	 * Ignore various flags when verifying that it's safe to sync a shadow
   1000	 * page using the current MMU context.
   1001	 *
   1002	 *  - level: not part of the overall MMU role and will never match as the MMU's
   1003	 *           level tracks the root level
   1004	 *  - access: updated based on the new guest PTE
   1005	 *  - quadrant: not part of the overall MMU role (similar to level)
   1006	 */
   1007	const union kvm_mmu_page_role sync_role_ign = {
   1008		.level = 0xf,
   1009		.access = 0x7,
   1010		.quadrant = 0x3,
   1011		.passthrough = 0x1,
   1012	};
   1013
   1014	/*
   1015	 * Direct pages can never be unsync, and KVM should never attempt to
   1016	 * sync a shadow page for a different MMU context, e.g. if the role
   1017	 * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the
   1018	 * reserved bits checks will be wrong, etc...
   1019	 */
   1020	if (WARN_ON_ONCE(sp->role.direct ||
   1021			 (sp->role.word ^ root_role.word) & ~sync_role_ign.word))
   1022		return -1;
   1023
   1024	first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
   1025
   1026	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
   1027		u64 *sptep, spte;
   1028		struct kvm_memory_slot *slot;
   1029		unsigned pte_access;
   1030		pt_element_t gpte;
   1031		gpa_t pte_gpa;
   1032		gfn_t gfn;
   1033
   1034		if (!sp->spt[i])
   1035			continue;
   1036
   1037		pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
   1038
   1039		if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
   1040					       sizeof(pt_element_t)))
   1041			return -1;
   1042
   1043		if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
   1044			flush = true;
   1045			continue;
   1046		}
   1047
   1048		gfn = gpte_to_gfn(gpte);
   1049		pte_access = sp->role.access;
   1050		pte_access &= FNAME(gpte_access)(gpte);
   1051		FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
   1052
   1053		if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access))
   1054			continue;
   1055
   1056		if (gfn != sp->gfns[i]) {
   1057			drop_spte(vcpu->kvm, &sp->spt[i]);
   1058			flush = true;
   1059			continue;
   1060		}
   1061
   1062		sptep = &sp->spt[i];
   1063		spte = *sptep;
   1064		host_writable = spte & shadow_host_writable_mask;
   1065		slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
   1066		make_spte(vcpu, sp, slot, pte_access, gfn,
   1067			  spte_to_pfn(spte), spte, true, false,
   1068			  host_writable, &spte);
   1069
   1070		flush |= mmu_spte_update(sptep, spte);
   1071	}
   1072
   1073	return flush;
   1074}
   1075
   1076#undef pt_element_t
   1077#undef guest_walker
   1078#undef FNAME
   1079#undef PT_BASE_ADDR_MASK
   1080#undef PT_INDEX
   1081#undef PT_LVL_ADDR_MASK
   1082#undef PT_LVL_OFFSET_MASK
   1083#undef PT_LEVEL_BITS
   1084#undef PT_MAX_FULL_LEVELS
   1085#undef gpte_to_gfn
   1086#undef gpte_to_gfn_lvl
   1087#undef CMPXCHG
   1088#undef PT_GUEST_ACCESSED_MASK
   1089#undef PT_GUEST_DIRTY_MASK
   1090#undef PT_GUEST_DIRTY_SHIFT
   1091#undef PT_GUEST_ACCESSED_SHIFT
   1092#undef PT_HAVE_ACCESSED_DIRTY