spte.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
spte.c (14294B)
      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * Kernel-based Virtual Machine driver for Linux
      4 *
      5 * Macros and functions to access KVM PTEs (also known as SPTEs)
      6 *
      7 * Copyright (C) 2006 Qumranet, Inc.
      8 * Copyright 2020 Red Hat, Inc. and/or its affiliates.
      9 */
     10
     11
     12#include <linux/kvm_host.h>
     13#include "mmu.h"
     14#include "mmu_internal.h"
     15#include "x86.h"
     16#include "spte.h"
     17
     18#include <asm/e820/api.h>
     19#include <asm/memtype.h>
     20#include <asm/vmx.h>
     21
     22bool __read_mostly enable_mmio_caching = true;
     23static bool __ro_after_init allow_mmio_caching;
     24module_param_named(mmio_caching, enable_mmio_caching, bool, 0444);
     25EXPORT_SYMBOL_GPL(enable_mmio_caching);
     26
     27u64 __read_mostly shadow_host_writable_mask;
     28u64 __read_mostly shadow_mmu_writable_mask;
     29u64 __read_mostly shadow_nx_mask;
     30u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
     31u64 __read_mostly shadow_user_mask;
     32u64 __read_mostly shadow_accessed_mask;
     33u64 __read_mostly shadow_dirty_mask;
     34u64 __read_mostly shadow_mmio_value;
     35u64 __read_mostly shadow_mmio_mask;
     36u64 __read_mostly shadow_mmio_access_mask;
     37u64 __read_mostly shadow_present_mask;
     38u64 __read_mostly shadow_me_value;
     39u64 __read_mostly shadow_me_mask;
     40u64 __read_mostly shadow_acc_track_mask;
     41
     42u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
     43u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
     44
     45u8 __read_mostly shadow_phys_bits;
     46
     47void __init kvm_mmu_spte_module_init(void)
     48{
     49	/*
     50	 * Snapshot userspace's desire to allow MMIO caching.  Whether or not
     51	 * KVM can actually enable MMIO caching depends on vendor-specific
     52	 * hardware capabilities and other module params that can't be resolved
     53	 * until the vendor module is loaded, i.e. enable_mmio_caching can and
     54	 * will change when the vendor module is (re)loaded.
     55	 */
     56	allow_mmio_caching = enable_mmio_caching;
     57}
     58
     59static u64 generation_mmio_spte_mask(u64 gen)
     60{
     61	u64 mask;
     62
     63	WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
     64
     65	mask = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK;
     66	mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK;
     67	return mask;
     68}
     69
     70u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
     71{
     72	u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
     73	u64 spte = generation_mmio_spte_mask(gen);
     74	u64 gpa = gfn << PAGE_SHIFT;
     75
     76	WARN_ON_ONCE(!shadow_mmio_value);
     77
     78	access &= shadow_mmio_access_mask;
     79	spte |= shadow_mmio_value | access;
     80	spte |= gpa | shadow_nonpresent_or_rsvd_mask;
     81	spte |= (gpa & shadow_nonpresent_or_rsvd_mask)
     82		<< SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;
     83
     84	return spte;
     85}
     86
     87static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
     88{
     89	if (pfn_valid(pfn))
     90		return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
     91			/*
     92			 * Some reserved pages, such as those from NVDIMM
     93			 * DAX devices, are not for MMIO, and can be mapped
     94			 * with cached memory type for better performance.
     95			 * However, the above check misconceives those pages
     96			 * as MMIO, and results in KVM mapping them with UC
     97			 * memory type, which would hurt the performance.
     98			 * Therefore, we check the host memory type in addition
     99			 * and only treat UC/UC-/WC pages as MMIO.
    100			 */
    101			(!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn));
    102
    103	return !e820__mapped_raw_any(pfn_to_hpa(pfn),
    104				     pfn_to_hpa(pfn + 1) - 1,
    105				     E820_TYPE_RAM);
    106}
    107
    108/*
    109 * Returns true if the SPTE has bits that may be set without holding mmu_lock.
    110 * The caller is responsible for checking if the SPTE is shadow-present, and
    111 * for determining whether or not the caller cares about non-leaf SPTEs.
    112 */
    113bool spte_has_volatile_bits(u64 spte)
    114{
    115	/*
    116	 * Always atomically update spte if it can be updated
    117	 * out of mmu-lock, it can ensure dirty bit is not lost,
    118	 * also, it can help us to get a stable is_writable_pte()
    119	 * to ensure tlb flush is not missed.
    120	 */
    121	if (!is_writable_pte(spte) && is_mmu_writable_spte(spte))
    122		return true;
    123
    124	if (is_access_track_spte(spte))
    125		return true;
    126
    127	if (spte_ad_enabled(spte)) {
    128		if (!(spte & shadow_accessed_mask) ||
    129		    (is_writable_pte(spte) && !(spte & shadow_dirty_mask)))
    130			return true;
    131	}
    132
    133	return false;
    134}
    135
    136bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
    137	       const struct kvm_memory_slot *slot,
    138	       unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
    139	       u64 old_spte, bool prefetch, bool can_unsync,
    140	       bool host_writable, u64 *new_spte)
    141{
    142	int level = sp->role.level;
    143	u64 spte = SPTE_MMU_PRESENT_MASK;
    144	bool wrprot = false;
    145
    146	if (sp->role.ad_disabled)
    147		spte |= SPTE_TDP_AD_DISABLED_MASK;
    148	else if (kvm_mmu_page_ad_need_write_protect(sp))
    149		spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK;
    150
    151	/*
    152	 * For the EPT case, shadow_present_mask is 0 if hardware
    153	 * supports exec-only page table entries.  In that case,
    154	 * ACC_USER_MASK and shadow_user_mask are used to represent
    155	 * read access.  See FNAME(gpte_access) in paging_tmpl.h.
    156	 */
    157	spte |= shadow_present_mask;
    158	if (!prefetch)
    159		spte |= spte_shadow_accessed_mask(spte);
    160
    161	if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) &&
    162	    is_nx_huge_page_enabled()) {
    163		pte_access &= ~ACC_EXEC_MASK;
    164	}
    165
    166	if (pte_access & ACC_EXEC_MASK)
    167		spte |= shadow_x_mask;
    168	else
    169		spte |= shadow_nx_mask;
    170
    171	if (pte_access & ACC_USER_MASK)
    172		spte |= shadow_user_mask;
    173
    174	if (level > PG_LEVEL_4K)
    175		spte |= PT_PAGE_SIZE_MASK;
    176	if (tdp_enabled)
    177		spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn,
    178			kvm_is_mmio_pfn(pfn));
    179
    180	if (host_writable)
    181		spte |= shadow_host_writable_mask;
    182	else
    183		pte_access &= ~ACC_WRITE_MASK;
    184
    185	if (shadow_me_value && !kvm_is_mmio_pfn(pfn))
    186		spte |= shadow_me_value;
    187
    188	spte |= (u64)pfn << PAGE_SHIFT;
    189
    190	if (pte_access & ACC_WRITE_MASK) {
    191		spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask;
    192
    193		/*
    194		 * Optimization: for pte sync, if spte was writable the hash
    195		 * lookup is unnecessary (and expensive). Write protection
    196		 * is responsibility of kvm_mmu_get_page / kvm_mmu_sync_roots.
    197		 * Same reasoning can be applied to dirty page accounting.
    198		 */
    199		if (is_writable_pte(old_spte))
    200			goto out;
    201
    202		/*
    203		 * Unsync shadow pages that are reachable by the new, writable
    204		 * SPTE.  Write-protect the SPTE if the page can't be unsync'd,
    205		 * e.g. it's write-tracked (upper-level SPs) or has one or more
    206		 * shadow pages and unsync'ing pages is not allowed.
    207		 */
    208		if (mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, can_unsync, prefetch)) {
    209			pgprintk("%s: found shadow page for %llx, marking ro\n",
    210				 __func__, gfn);
    211			wrprot = true;
    212			pte_access &= ~ACC_WRITE_MASK;
    213			spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
    214		}
    215	}
    216
    217	if (pte_access & ACC_WRITE_MASK)
    218		spte |= spte_shadow_dirty_mask(spte);
    219
    220out:
    221	if (prefetch)
    222		spte = mark_spte_for_access_track(spte);
    223
    224	WARN_ONCE(is_rsvd_spte(&vcpu->arch.mmu->shadow_zero_check, spte, level),
    225		  "spte = 0x%llx, level = %d, rsvd bits = 0x%llx", spte, level,
    226		  get_rsvd_bits(&vcpu->arch.mmu->shadow_zero_check, spte, level));
    227
    228	if ((spte & PT_WRITABLE_MASK) && kvm_slot_dirty_track_enabled(slot)) {
    229		/* Enforced by kvm_mmu_hugepage_adjust. */
    230		WARN_ON(level > PG_LEVEL_4K);
    231		mark_page_dirty_in_slot(vcpu->kvm, slot, gfn);
    232	}
    233
    234	*new_spte = spte;
    235	return wrprot;
    236}
    237
    238static u64 make_spte_executable(u64 spte)
    239{
    240	bool is_access_track = is_access_track_spte(spte);
    241
    242	if (is_access_track)
    243		spte = restore_acc_track_spte(spte);
    244
    245	spte &= ~shadow_nx_mask;
    246	spte |= shadow_x_mask;
    247
    248	if (is_access_track)
    249		spte = mark_spte_for_access_track(spte);
    250
    251	return spte;
    252}
    253
    254/*
    255 * Construct an SPTE that maps a sub-page of the given huge page SPTE where
    256 * `index` identifies which sub-page.
    257 *
    258 * This is used during huge page splitting to build the SPTEs that make up the
    259 * new page table.
    260 */
    261u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index)
    262{
    263	u64 child_spte;
    264	int child_level;
    265
    266	if (WARN_ON_ONCE(!is_shadow_present_pte(huge_spte)))
    267		return 0;
    268
    269	if (WARN_ON_ONCE(!is_large_pte(huge_spte)))
    270		return 0;
    271
    272	child_spte = huge_spte;
    273	child_level = huge_level - 1;
    274
    275	/*
    276	 * The child_spte already has the base address of the huge page being
    277	 * split. So we just have to OR in the offset to the page at the next
    278	 * lower level for the given index.
    279	 */
    280	child_spte |= (index * KVM_PAGES_PER_HPAGE(child_level)) << PAGE_SHIFT;
    281
    282	if (child_level == PG_LEVEL_4K) {
    283		child_spte &= ~PT_PAGE_SIZE_MASK;
    284
    285		/*
    286		 * When splitting to a 4K page, mark the page executable as the
    287		 * NX hugepage mitigation no longer applies.
    288		 */
    289		if (is_nx_huge_page_enabled())
    290			child_spte = make_spte_executable(child_spte);
    291	}
    292
    293	return child_spte;
    294}
    295
    296
    297u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
    298{
    299	u64 spte = SPTE_MMU_PRESENT_MASK;
    300
    301	spte |= __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
    302		shadow_user_mask | shadow_x_mask | shadow_me_value;
    303
    304	if (ad_disabled)
    305		spte |= SPTE_TDP_AD_DISABLED_MASK;
    306	else
    307		spte |= shadow_accessed_mask;
    308
    309	return spte;
    310}
    311
    312u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn)
    313{
    314	u64 new_spte;
    315
    316	new_spte = old_spte & ~PT64_BASE_ADDR_MASK;
    317	new_spte |= (u64)new_pfn << PAGE_SHIFT;
    318
    319	new_spte &= ~PT_WRITABLE_MASK;
    320	new_spte &= ~shadow_host_writable_mask;
    321	new_spte &= ~shadow_mmu_writable_mask;
    322
    323	new_spte = mark_spte_for_access_track(new_spte);
    324
    325	return new_spte;
    326}
    327
    328u64 mark_spte_for_access_track(u64 spte)
    329{
    330	if (spte_ad_enabled(spte))
    331		return spte & ~shadow_accessed_mask;
    332
    333	if (is_access_track_spte(spte))
    334		return spte;
    335
    336	check_spte_writable_invariants(spte);
    337
    338	WARN_ONCE(spte & (SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
    339			  SHADOW_ACC_TRACK_SAVED_BITS_SHIFT),
    340		  "kvm: Access Tracking saved bit locations are not zero\n");
    341
    342	spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) <<
    343		SHADOW_ACC_TRACK_SAVED_BITS_SHIFT;
    344	spte &= ~shadow_acc_track_mask;
    345
    346	return spte;
    347}
    348
    349void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask)
    350{
    351	BUG_ON((u64)(unsigned)access_mask != access_mask);
    352	WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
    353
    354	/*
    355	 * Reset to the original module param value to honor userspace's desire
    356	 * to (dis)allow MMIO caching.  Update the param itself so that
    357	 * userspace can see whether or not KVM is actually using MMIO caching.
    358	 */
    359	enable_mmio_caching = allow_mmio_caching;
    360	if (!enable_mmio_caching)
    361		mmio_value = 0;
    362
    363	/*
    364	 * Disable MMIO caching if the MMIO value collides with the bits that
    365	 * are used to hold the relocated GFN when the L1TF mitigation is
    366	 * enabled.  This should never fire as there is no known hardware that
    367	 * can trigger this condition, e.g. SME/SEV CPUs that require a custom
    368	 * MMIO value are not susceptible to L1TF.
    369	 */
    370	if (WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask <<
    371				  SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)))
    372		mmio_value = 0;
    373
    374	/*
    375	 * The masked MMIO value must obviously match itself and a removed SPTE
    376	 * must not get a false positive.  Removed SPTEs and MMIO SPTEs should
    377	 * never collide as MMIO must set some RWX bits, and removed SPTEs must
    378	 * not set any RWX bits.
    379	 */
    380	if (WARN_ON((mmio_value & mmio_mask) != mmio_value) ||
    381	    WARN_ON(mmio_value && (REMOVED_SPTE & mmio_mask) == mmio_value))
    382		mmio_value = 0;
    383
    384	if (!mmio_value)
    385		enable_mmio_caching = false;
    386
    387	shadow_mmio_value = mmio_value;
    388	shadow_mmio_mask  = mmio_mask;
    389	shadow_mmio_access_mask = access_mask;
    390}
    391EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
    392
    393void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask)
    394{
    395	/* shadow_me_value must be a subset of shadow_me_mask */
    396	if (WARN_ON(me_value & ~me_mask))
    397		me_value = me_mask = 0;
    398
    399	shadow_me_value = me_value;
    400	shadow_me_mask = me_mask;
    401}
    402EXPORT_SYMBOL_GPL(kvm_mmu_set_me_spte_mask);
    403
    404void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
    405{
    406	shadow_user_mask	= VMX_EPT_READABLE_MASK;
    407	shadow_accessed_mask	= has_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull;
    408	shadow_dirty_mask	= has_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull;
    409	shadow_nx_mask		= 0ull;
    410	shadow_x_mask		= VMX_EPT_EXECUTABLE_MASK;
    411	shadow_present_mask	= has_exec_only ? 0ull : VMX_EPT_READABLE_MASK;
    412	shadow_acc_track_mask	= VMX_EPT_RWX_MASK;
    413	shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE;
    414	shadow_mmu_writable_mask  = EPT_SPTE_MMU_WRITABLE;
    415
    416	/*
    417	 * EPT Misconfigurations are generated if the value of bits 2:0
    418	 * of an EPT paging-structure entry is 110b (write/execute).
    419	 */
    420	kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE,
    421				   VMX_EPT_RWX_MASK, 0);
    422}
    423EXPORT_SYMBOL_GPL(kvm_mmu_set_ept_masks);
    424
    425void kvm_mmu_reset_all_pte_masks(void)
    426{
    427	u8 low_phys_bits;
    428	u64 mask;
    429
    430	shadow_phys_bits = kvm_get_shadow_phys_bits();
    431
    432	/*
    433	 * If the CPU has 46 or less physical address bits, then set an
    434	 * appropriate mask to guard against L1TF attacks. Otherwise, it is
    435	 * assumed that the CPU is not vulnerable to L1TF.
    436	 *
    437	 * Some Intel CPUs address the L1 cache using more PA bits than are
    438	 * reported by CPUID. Use the PA width of the L1 cache when possible
    439	 * to achieve more effective mitigation, e.g. if system RAM overlaps
    440	 * the most significant bits of legal physical address space.
    441	 */
    442	shadow_nonpresent_or_rsvd_mask = 0;
    443	low_phys_bits = boot_cpu_data.x86_phys_bits;
    444	if (boot_cpu_has_bug(X86_BUG_L1TF) &&
    445	    !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >=
    446			  52 - SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)) {
    447		low_phys_bits = boot_cpu_data.x86_cache_bits
    448			- SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;
    449		shadow_nonpresent_or_rsvd_mask =
    450			rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1);
    451	}
    452
    453	shadow_nonpresent_or_rsvd_lower_gfn_mask =
    454		GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
    455
    456	shadow_user_mask	= PT_USER_MASK;
    457	shadow_accessed_mask	= PT_ACCESSED_MASK;
    458	shadow_dirty_mask	= PT_DIRTY_MASK;
    459	shadow_nx_mask		= PT64_NX_MASK;
    460	shadow_x_mask		= 0;
    461	shadow_present_mask	= PT_PRESENT_MASK;
    462	shadow_acc_track_mask	= 0;
    463	shadow_me_mask		= 0;
    464	shadow_me_value		= 0;
    465
    466	shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITABLE;
    467	shadow_mmu_writable_mask  = DEFAULT_SPTE_MMU_WRITABLE;
    468
    469	/*
    470	 * Set a reserved PA bit in MMIO SPTEs to generate page faults with
    471	 * PFEC.RSVD=1 on MMIO accesses.  64-bit PTEs (PAE, x86-64, and EPT
    472	 * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports
    473	 * 52-bit physical addresses then there are no reserved PA bits in the
    474	 * PTEs and so the reserved PA approach must be disabled.
    475	 */
    476	if (shadow_phys_bits < 52)
    477		mask = BIT_ULL(51) | PT_PRESENT_MASK;
    478	else
    479		mask = 0;
    480
    481	kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK);
    482}