commit d505f8bebab8214981a7b4ad63e2595fa497074c
parent 0e89d3b1b7c45ff9a3916b01ab56f177d4b64f8c
Author: Louis Burda <quent.burda@gmail.com>
Date: Thu, 6 Oct 2022 09:53:35 +0200
Merge sevstep into the cachepc directory and consolidate the cachepc headers
Diffstat:
18 files changed, 1130 insertions(+), 1151 deletions(-)
diff --git a/Makefile b/Makefile
@@ -6,13 +6,10 @@ all: build test/eviction test/access test/kvm test/sev test/sev-es test/sevstep
clean:
$(MAKE) -C $(LINUX) SUBDIRS=arch/x86/kvm clean
-$(LINUX)/arch/x86/kvm/svm/cachepc:
+$(LINUX)/arch/x86/kvm/cachepc:
ln -sf $(PWD)/cachepc $@
-$(LINUX)/arch/x86/kvm/sevstep:
- ln -sf $(PWD)/sevstep $@
-
-build: $(LINUX)/arch/x86/kvm/svm/cachepc $(LINUX)/arch/x86/kvm/sevstep
+build: $(LINUX)/arch/x86/kvm/cachepc
$(MAKE) -C $(LINUX) -j6 M=arch/x86/kvm
load:
@@ -21,7 +18,7 @@ load:
sudo insmod $(LINUX)/arch/x86/kvm/kvm.ko
sudo insmod $(LINUX)/arch/x86/kvm/kvm-amd.ko
-test/%: test/%.c cachepc/uapi.h sevstep/uapi.h
+test/%: test/%.c cachepc/uapi.h
clang -o $@ $< -fsanitize=address -I . -Wunused-variable
diff --git a/cachepc/cache_types.h b/cachepc/cache_types.h
@@ -1,66 +0,0 @@
-#pragma once
-
-#include "device_conf.h"
-
-#define SET_MASK(SETS) (((((uintptr_t) SETS) * CACHELINE_SIZE) - 1) ^ (CACHELINE_SIZE - 1))
-
-#define REMOVE_PAGE_OFFSET(ptr) ((void *) (((uintptr_t) ptr) & PAGE_MASK))
-
-#define GET_BIT(b, i) (((b) >> (i)) & 1)
-#define SET_BIT(b, i) ((b) | (1 << (i)))
-
-/* Operate cacheline flags
- * Used flags:
- * 32 2 1 0
- * | | ... | cache group initialized | last | first |
- */
-#define DEFAULT_FLAGS 0
-#define SET_FIRST(flags) SET_BIT(flags, 0)
-#define SET_LAST(flags) SET_BIT(flags, 1)
-#define SET_CACHE_GROUP_INIT(flags) SET_BIT(flags, 2)
-#define IS_FIRST(flags) GET_BIT(flags, 0)
-#define IS_LAST(flags) GET_BIT(flags, 1)
-#define IS_CACHE_GROUP_INIT(flags) GET_BIT(flags, 2)
-
-#define CL_NEXT_OFFSET offsetof(struct cacheline, next)
-#define CL_PREV_OFFSET offsetof(struct cacheline, prev)
-
-typedef enum cache_level cache_level;
-typedef enum addressing_type addressing_type;
-typedef struct cacheline cacheline;
-typedef struct cache_ctx cache_ctx;
-
-enum cache_level {L1, L2};
-enum addressing_type {VIRTUAL, PHYSICAL};
-
-struct cache_ctx {
- cache_level cache_level;
- addressing_type addressing;
-
- uint32_t sets;
- uint32_t associativity;
- uint32_t access_time;
- uint32_t nr_of_cachelines;
- uint32_t set_size;
- uint32_t cache_size;
-};
-
-struct cacheline {
- // Doubly linked list inside same set
- // Attention: CL_NEXT_OFFSET and CL_PREV_OFFSET
- // must be kept up to date
- cacheline *next;
- cacheline *prev;
-
- uint32_t cache_set;
- uint32_t cache_line;
- uint32_t flags;
-
- // Unused padding to fill cache line
- uint64_t count;
-
- char padding[24];
-};
-
-static_assert(sizeof(struct cacheline) == CACHELINE_SIZE, "Bad cache line struct size");
-static_assert(CL_NEXT_OFFSET == 0 && CL_PREV_OFFSET == 8);
diff --git a/cachepc/cachepc.c b/cachepc/cachepc.c
@@ -6,6 +6,10 @@
#include <linux/delay.h>
#include <linux/ioctl.h>
+#define SET_MASK(SETS) (((((uintptr_t) SETS) * CACHELINE_SIZE) - 1) ^ (CACHELINE_SIZE - 1))
+
+#define REMOVE_PAGE_OFFSET(ptr) ((void *) (((uintptr_t) ptr) & PAGE_MASK))
+
static void cl_insert(cacheline *last_cl, cacheline *new_cl);
static void *remove_cache_set(cache_ctx *ctx, void *ptr);
static void *remove_cache_group_set(void *ptr);
@@ -47,20 +51,20 @@ cachepc_init_pmc(uint8_t index, uint8_t event_no, uint8_t event_mask,
}
cache_ctx *
-cachepc_get_ctx(cache_level cache_level)
+cachepc_get_ctx(int cache_level)
{
cache_ctx *ctx;
ctx = kzalloc(sizeof(cache_ctx), GFP_KERNEL);
BUG_ON(ctx == NULL);
- BUG_ON(cache_level != L1);
- if (cache_level == L1) {
+ BUG_ON(cache_level != L1_CACHE);
+ if (cache_level == L1_CACHE) {
ctx->addressing = L1_ADDRESSING;
ctx->sets = L1_SETS;
ctx->associativity = L1_ASSOCIATIVITY;
ctx->access_time = L1_ACCESS_TIME;
- } else if (cache_level == L2) {
+ } else if (cache_level == L2_CACHE) {
ctx->addressing = L2_ADDRESSING;
ctx->sets = L2_SETS;
ctx->associativity = L2_ASSOCIATIVITY;
@@ -120,7 +124,7 @@ cachepc_prepare_victim(cache_ctx *ctx, uint32_t set)
victim_cl = victim_set;
// Free the other lines in the same set that are not used.
- if (ctx->addressing == PHYSICAL) {
+ if (ctx->addressing == PHYSICAL_ADDRESSING) {
curr_cl = victim_cl->next;
do {
next_cl = curr_cl->next;
@@ -162,7 +166,7 @@ cachepc_save_msrmts(cacheline *head)
curr_cl = head;
do {
- if (IS_FIRST(curr_cl->flags)) {
+ if (CL_IS_FIRST(curr_cl->flags)) {
BUG_ON(curr_cl->cache_set >= cachepc_msrmts_count);
cachepc_msrmts[curr_cl->cache_set] = curr_cl->count;
}
@@ -178,7 +182,7 @@ cachepc_print_msrmts(cacheline *head)
curr_cl = head;
do {
- if (IS_FIRST(curr_cl->flags)) {
+ if (CL_IS_FIRST(curr_cl->flags)) {
printk(KERN_WARNING "CachePC: Count for cache set %i: %llu\n",
curr_cl->cache_set, curr_cl->count);
}
@@ -238,14 +242,14 @@ prepare_cache_set_ds(cache_ctx *ctx, uint32_t *sets, uint32_t sets_len)
do {
next_cl = curr_cl->next;
- if (IS_FIRST(curr_cl->flags)) {
+ if (CL_IS_FIRST(curr_cl->flags)) {
first_cl_in_sets[curr_cl->cache_set] = curr_cl;
}
- if (IS_LAST(curr_cl->flags)) {
+ if (CL_IS_LAST(curr_cl->flags)) {
last_cl_in_sets[curr_cl->cache_set] = curr_cl;
}
- if (ctx->addressing == PHYSICAL && !is_in_arr(
+ if (ctx->addressing == PHYSICAL_ADDRESSING && !is_in_arr(
curr_cl->cache_set / CACHE_GROUP_SIZE, cache_groups, cache_groups_len))
{
// Already free all unused blocks of the cache ds for physical
@@ -255,7 +259,7 @@ prepare_cache_set_ds(cache_ctx *ctx, uint32_t *sets, uint32_t sets_len)
}
curr_cl = next_cl;
- } while(curr_cl != cache_ds);
+ } while (curr_cl != cache_ds);
// Fix partial cache set ds
for (i = 0; i < sets_len; ++i) {
@@ -265,7 +269,7 @@ prepare_cache_set_ds(cache_ctx *ctx, uint32_t *sets, uint32_t sets_len)
cache_set_ds = first_cl_in_sets[sets[0]];
// Free unused cache lines
- if (ctx->addressing == PHYSICAL) {
+ if (ctx->addressing == PHYSICAL_ADDRESSING) {
cachepc_release_ds(ctx, to_del_cls);
}
@@ -359,9 +363,9 @@ cacheline *build_cache_ds(cache_ctx *ctx, cacheline **cl_ptr_arr) {
for (j = 0; j < ctx->nr_of_cachelines; ++j) {
curr_cl = cl_ptr_arr_sorted[j];
- if (IS_FIRST(curr_cl->flags))
+ if (CL_IS_FIRST(curr_cl->flags))
first_cl_in_sets[curr_cl->cache_set] = curr_cl;
- if (IS_LAST(curr_cl->flags))
+ if (CL_IS_LAST(curr_cl->flags))
last_cl_in_sets[curr_cl->cache_set] = curr_cl;
}
@@ -402,10 +406,10 @@ void build_randomized_list_for_cache_set(cache_ctx *ctx, cacheline **cacheline_p
curr_cl->prev = cacheline_ptr_arr[idx_map[(len - 1 + i) % len]];
if (idx_map[i] == 0) {
- curr_cl->flags = SET_FIRST(DEFAULT_FLAGS);
- curr_cl->prev->flags = SET_LAST(DEFAULT_FLAGS);
+ curr_cl->flags = CL_SET_FIRST(CL_DEFAULT_FLAGS);
+ curr_cl->prev->flags = CL_SET_LAST(CL_DEFAULT_FLAGS);
} else {
- curr_cl->flags |= DEFAULT_FLAGS;
+ curr_cl->flags |= CL_DEFAULT_FLAGS;
}
}
@@ -425,7 +429,7 @@ allocate_cache_ds(cache_ctx *ctx)
cl_ptr_arr = kzalloc(ctx->nr_of_cachelines * sizeof(cacheline *), GFP_KERNEL);
BUG_ON(cl_ptr_arr == NULL);
- BUG_ON(ctx->addressing != VIRTUAL);
+ BUG_ON(ctx->addressing != VIRTUAL_ADDRESSING);
// For virtual addressing, allocating a consecutive chunk of memory is enough
cl_arr = cachepc_aligned_alloc(PAGE_SIZE, ctx->cache_size);
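For reference, SET_MASK selects the set-index bits of an address: with the L1 parameters defined in cachepc.h below (64 sets, 64-byte lines) the mask works out to 0xfc0, i.e. bits 6-11. A minimal userspace sketch of the address-to-set mapping, with the macro copied from above and a hypothetical address:

    #include <stdint.h>
    #include <stdio.h>

    #define CACHELINE_SIZE 64
    #define L1_SETS 64
    #define SET_MASK(SETS) (((((uintptr_t) SETS) * CACHELINE_SIZE) - 1) ^ (CACHELINE_SIZE - 1))

    int main(void)
    {
        uintptr_t addr = 0x7ffd12345678; /* hypothetical virtual address */
        /* bits 6..11 select one of the 64 L1 sets; bits 0..5 are the line offset */
        uintptr_t set = (addr & SET_MASK(L1_SETS)) >> 6;
        printf("address %#lx maps to L1 set %lu\n",
            (unsigned long) addr, (unsigned long) set);
        return 0;
    }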
diff --git a/cachepc/cachepc.h b/cachepc/cachepc.h
@@ -1,19 +1,94 @@
#pragma once
#include "asm.h"
-#include "cache_types.h"
#include "uapi.h"
-#define PMC_KERNEL 2
-#define PMC_USER 1
+#define CACHELINE_SIZE 64
+#define CACHE_GROUP_SIZE (PAGE_SIZE / CACHELINE_SIZE)
-#define PMC_HOST 2
-#define PMC_GUEST 1
+#define L1_CACHE 0
+#define L2_CACHE 1
+
+#define VIRTUAL_ADDRESSING 0
+#define PHYSICAL_ADDRESSING 1
+
+#define L1_ADDRESSING VIRTUAL_ADDRESSING
+#define L1_SETS 64
+#define L1_ASSOCIATIVITY 8
+#define L1_ACCESS_TIME 4
+
+#define L2_ADDRESSING PHYSICAL_ADDRESSING
+#define L2_SETS 512
+#define L2_ASSOCIATIVITY 8
+#define L2_ACCESS_TIME 12
+
+#define L3_ADDRESSING PHYSICAL_ADDRESSING
+#define L3_SETS 4096
+#define L3_ASSOCIATIVITY 16
+#define L3_ACCESS_TIME 30
+
+#define CACHEPC_GET_BIT(b, i) (((b) >> (i)) & 1)
+#define CACHEPC_SET_BIT(b, i) ((b) | (1 << (i)))
+
+/* Operate cacheline flags
+ * Used flags:
+ * 32 2 1 0
+ * | | ... | cache group initialized | last | first |
+ */
+#define CL_DEFAULT_FLAGS 0
+#define CL_SET_FIRST(flags) CACHEPC_SET_BIT(flags, 0)
+#define CL_SET_LAST(flags) CACHEPC_SET_BIT(flags, 1)
+#define CL_SET_GROUP_INIT(flags) CACHEPC_SET_BIT(flags, 2)
+#define CL_IS_FIRST(flags) CACHEPC_GET_BIT(flags, 0)
+#define CL_IS_LAST(flags) CACHEPC_GET_BIT(flags, 1)
+#define CL_IS_GROUP_INIT(flags) CACHEPC_GET_BIT(flags, 2)
+
+#define CL_NEXT_OFFSET offsetof(struct cacheline, next)
+#define CL_PREV_OFFSET offsetof(struct cacheline, prev)
+
+#define PMC_KERNEL (1 << 1)
+#define PMC_USER (1 << 0)
+
+#define PMC_HOST (1 << 1)
+#define PMC_GUEST (1 << 0)
+
+typedef struct cacheline cacheline;
+typedef struct cache_ctx cache_ctx;
+
+struct cache_ctx {
+ int cache_level;
+ int addressing;
+
+ uint32_t sets;
+ uint32_t associativity;
+ uint32_t access_time;
+ uint32_t nr_of_cachelines;
+ uint32_t set_size;
+ uint32_t cache_size;
+};
+
+struct cacheline {
+ /* Doubly linked cache lines inside same cache set */
+ cacheline *next;
+ cacheline *prev;
+
+ uint32_t cache_set;
+ uint32_t cache_line;
+ uint32_t flags;
+
+ uint64_t count;
+
+ /* padding to fill cache line */
+ char padding[24];
+};
+
+static_assert(sizeof(struct cacheline) == CACHELINE_SIZE, "Bad cache line struct size");
+static_assert(CL_NEXT_OFFSET == 0 && CL_PREV_OFFSET == 8);
void cachepc_init_pmc(uint8_t index, uint8_t event_no, uint8_t event_mask,
int host_guest, int kernel_user);
-cache_ctx *cachepc_get_ctx(cache_level cl);
+cache_ctx *cachepc_get_ctx(int cache_level);
void cachepc_release_ctx(cache_ctx *ctx);
cacheline *cachepc_prepare_ds(cache_ctx *ctx);
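The renamed flag macros are pure bit operations, so their composition can be checked in isolation; a minimal userspace sketch with the macros copied from the header above (bit 0 = first, bit 1 = last, bit 2 = cache group initialized):

    #include <assert.h>
    #include <stdint.h>

    #define CACHEPC_GET_BIT(b, i) (((b) >> (i)) & 1)
    #define CACHEPC_SET_BIT(b, i) ((b) | (1 << (i)))

    #define CL_DEFAULT_FLAGS 0
    #define CL_SET_FIRST(flags) CACHEPC_SET_BIT(flags, 0)
    #define CL_SET_LAST(flags) CACHEPC_SET_BIT(flags, 1)
    #define CL_IS_FIRST(flags) CACHEPC_GET_BIT(flags, 0)
    #define CL_IS_LAST(flags) CACHEPC_GET_BIT(flags, 1)

    int main(void)
    {
        /* a set with a single cache line: the line is both first and last */
        uint32_t flags = CL_SET_LAST(CL_SET_FIRST(CL_DEFAULT_FLAGS));
        assert(CL_IS_FIRST(flags) && CL_IS_LAST(flags));
        assert(flags == 0x3);
        return 0;
    }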
diff --git a/cachepc/device_conf.h b/cachepc/device_conf.h
@@ -1,29 +0,0 @@
-#pragma once
-
-// TODO: Read from kernel headers
-
-// General settings
-// #define PAGE_SIZE 4096
-#define PROCESSOR_FREQ 2900000000
-
-// Cache related settings
-#define CACHELINE_SIZE 64
-#define CACHE_GROUP_SIZE (PAGE_SIZE / CACHELINE_SIZE)
-
-// Addressing:
-// - virtual: 0
-// - physical: 1
-#define L1_ADDRESSING 0
-#define L1_SETS 64
-#define L1_ASSOCIATIVITY 8
-#define L1_ACCESS_TIME 4
-
-#define L2_ADDRESSING 1
-#define L2_SETS 512
-#define L2_ASSOCIATIVITY 8
-#define L2_ACCESS_TIME 12
-
-#define L3_ADDRESSING 1
-#define L3_SETS 4096
-#define L3_ASSOCIATIVITY 16
-#define L3_ACCESS_TIME 30
diff --git a/cachepc/kvm.c b/cachepc/kvm.c
@@ -225,7 +225,7 @@ cachepc_kvm_single_eviction_test(void *p)
evicted = NULL;
cl = head = cachepc_ds;
do {
- if (IS_FIRST(cl->flags) && cl->count > 0) {
+ if (CL_IS_FIRST(cl->flags) && cl->count > 0) {
evicted = cl;
count += cl->count;
}
@@ -350,7 +350,7 @@ cachepc_kvm_setup_test(void *p)
printk(KERN_WARNING "CachePC: Running on core %i\n", cpu);
- cachepc_ctx = cachepc_get_ctx(L1);
+ cachepc_ctx = cachepc_get_ctx(L1_CACHE);
cachepc_ds = cachepc_prepare_ds(cachepc_ctx);
cachepc_kvm_system_setup();
diff --git a/cachepc/mmu.c b/cachepc/mmu.c
@@ -0,0 +1,135 @@
+#include "../cachepc/sevstep.h"
+#include "../cachepc/uspt.h"
+
+static void
+sevstep_uspt_page_fault_handle(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ const int modes[] = {
+ KVM_PAGE_TRACK_WRITE,
+ KVM_PAGE_TRACK_ACCESS,
+ KVM_PAGE_TRACK_EXEC
+ };
+ uint64_t current_rip;
+ bool was_tracked;
+ int have_rip, i;
+ int send_err;
+
+ was_tracked = false;
+ for (i = 0; i < sizeof(modes) / sizeof(modes[0]); i++) {
+ if (kvm_slot_page_track_is_active(vcpu->kvm,
+ fault->slot, fault->gfn, modes[i])) {
+ sevstep_untrack_single_page(vcpu, fault->gfn, modes[i]);
+ was_tracked = true;
+ }
+ }
+
+ if (was_tracked) {
+ have_rip = false;
+ if (sevstep_uspt_should_get_rip())
+ have_rip = sevstep_get_rip_kvm_vcpu(vcpu, &current_rip) == 0;
+ if (sevstep_uspt_batch_tracking_in_progress()) {
+ send_err = sevstep_uspt_batch_tracking_save(fault->gfn << PAGE_SHIFT,
+ fault->error_code, have_rip, current_rip);
+ if (send_err) {
+ printk_ratelimited(
+ "sevstep_uspt_batch_tracking_save failed with %d\n"
+ "##########################\n", send_err);
+ }
+ sevstep_uspt_batch_tracking_handle_retrack(vcpu, fault->gfn);
+ sevstep_uspt_batch_tracking_inc_event_idx();
+ } else {
+ send_err = sevstep_uspt_send_and_block(fault->gfn << PAGE_SHIFT,
+ fault->error_code, have_rip, current_rip);
+ if (send_err) {
+ printk("sevstep_uspt_send_and_block failed with %d\n"
+ "##########################\n", send_err);
+ }
+ }
+ }
+}
+
+bool
+sevstep_spte_protect(u64 *sptep, bool pt_protect, enum kvm_page_track_mode mode)
+{
+ u64 spte;
+ bool flush;
+
+ spte = *sptep;
+ if (!is_writable_pte(spte) && !(pt_protect && is_mmu_writable_spte(spte)))
+ return false;
+
+ rmap_printk("spte %p %llx\n", sptep, *sptep);
+
+ if (pt_protect)
+ spte &= ~EPT_SPTE_MMU_WRITABLE;
+
+ flush = false;
+ if (mode == KVM_PAGE_TRACK_WRITE) {
+ spte = spte & ~PT_WRITABLE_MASK;
+ flush = true;
+ } else if (mode == KVM_PAGE_TRACK_RESET_ACCESSED) {
+ spte = spte & ~PT_ACCESSED_MASK;
+ } else if (mode == KVM_PAGE_TRACK_ACCESS) {
+ spte = spte & ~PT_PRESENT_MASK;
+ spte = spte & ~PT_WRITABLE_MASK;
+ spte = spte & ~PT_USER_MASK;
+ spte = spte | (0x1ULL << PT64_NX_SHIFT);
+ flush = true;
+ } else if (mode == KVM_PAGE_TRACK_EXEC) {
+ spte = spte | (0x1ULL << PT64_NX_SHIFT);
+ flush = true;
+ } else if (mode == KVM_PAGE_TRACK_RESET_EXEC) {
+ spte = spte & ~(0x1ULL << PT64_NX_SHIFT);
+ flush = true;
+ } else {
+ printk(KERN_WARNING "spte_protect was called with invalid mode"
+ "parameter %d\n",mode);
+ }
+ flush |= mmu_spte_update(sptep, spte);
+
+ return flush;
+}
+EXPORT_SYMBOL(sevstep_spte_protect);
+
+bool sevstep_rmap_protect(struct kvm_rmap_head *rmap_head,
+ bool pt_protect, enum kvm_page_track_mode mode)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+ bool flush = false;
+
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
+ flush |= sevstep_spte_protect(sptep, pt_protect, mode);
+ }
+
+ return flush;
+}
+EXPORT_SYMBOL(sevstep_rmap_protect);
+
+bool
+sevstep_kvm_mmu_slot_gfn_protect(struct kvm *kvm, struct kvm_memory_slot *slot,
+ uint64_t gfn, int min_level, enum kvm_page_track_mode mode)
+{
+ struct kvm_rmap_head *rmap_head;
+ bool protected;
+ int i;
+
+ protected = false;
+
+ if (kvm_memslots_have_rmaps(kvm)) {
+ for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
+ rmap_head = gfn_to_rmap(gfn, i, slot);
+ protected |= sevstep_rmap_protect(rmap_head, true, mode);
+ }
+ }
+
+ if (is_tdp_mmu_enabled(kvm)) {
+ protected |= kvm_tdp_mmu_write_protect_gfn(kvm,
+ slot, gfn, min_level);
+ }
+
+ return protected;
+}
+EXPORT_SYMBOL(sevstep_kvm_mmu_slot_gfn_protect);
+
diff --git a/cachepc/sevstep.c b/cachepc/sevstep.c
@@ -0,0 +1,263 @@
+#include "sevstep.h"
+#include "cachepc.h"
+
+#include "mmu/mmu_internal.h"
+#include "mmu.h"
+
+#include "irq.h"
+#include "ioapic.h"
+#include "mmu.h"
+#include "mmu/tdp_mmu.h"
+#include "x86.h"
+#include "kvm_cache_regs.h"
+#include "kvm_emulate.h"
+#include "cpuid.h"
+#include "mmu/spte.h"
+
+#include <linux/kvm_host.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/moduleparam.h>
+#include <linux/export.h>
+#include <linux/swap.h>
+#include <linux/hugetlb.h>
+#include <linux/compiler.h>
+#include <linux/srcu.h>
+#include <linux/slab.h>
+#include <linux/sched/signal.h>
+#include <linux/uaccess.h>
+#include <linux/hash.h>
+#include <linux/kern_levels.h>
+#include <linux/kthread.h>
+#include <linux/sev.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+
+#include "kvm_cache_regs.h"
+#include "svm/svm.h"
+
+struct kvm* main_vm;
+EXPORT_SYMBOL(main_vm);
+
+bool
+sevstep_track_single_page(struct kvm_vcpu *vcpu, gfn_t gfn,
+ enum kvm_page_track_mode mode)
+{
+ int idx;
+ bool ret;
+ struct kvm_memory_slot *slot;
+
+ ret = false;
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+ if (mode == KVM_PAGE_TRACK_ACCESS) {
+ pr_warn("Adding gfn: %016llx to access page track pool\n", gfn);
+ }
+
+ if (mode == KVM_PAGE_TRACK_WRITE) {
+ pr_warn("Adding gfn: %016llx to write page track pool\n", gfn);
+ }
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ if (slot != NULL && !kvm_slot_page_track_is_active(vcpu->kvm,slot, gfn, mode)) {
+ write_lock(&vcpu->kvm->mmu_lock);
+ kvm_slot_page_track_add_page(vcpu->kvm, slot, gfn, mode);
+ write_unlock(&vcpu->kvm->mmu_lock);
+ ret = true;
+ } else {
+ pr_warn("Failed to track %016llx because ", gfn);
+ if (slot == NULL) {
+ printk(KERN_CONT "slot was null");
+ }
+ if (kvm_slot_page_track_is_active(vcpu->kvm, slot, gfn, mode)) {
+ printk(KERN_CONT "page is already tracked");
+ }
+ printk(KERN_CONT "\n");
+ }
+
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+ return ret;
+}
+EXPORT_SYMBOL(sevstep_track_single_page);
+
+bool
+sevstep_untrack_single_page(struct kvm_vcpu *vcpu, gfn_t gfn,
+ enum kvm_page_track_mode mode)
+{
+ int idx;
+ bool ret;
+ struct kvm_memory_slot *slot;
+
+ ret = false;
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+ if (mode == KVM_PAGE_TRACK_ACCESS) {
+ pr_warn("Removing gfn: %016llx from acess page track pool\n", gfn);
+ }
+ if (mode == KVM_PAGE_TRACK_WRITE) {
+ pr_warn("Removing gfn: %016llx from write page track pool\n", gfn);
+ }
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ if (slot != NULL && kvm_slot_page_track_is_active(vcpu->kvm, slot, gfn, mode)) {
+ write_lock(&vcpu->kvm->mmu_lock);
+ kvm_slot_page_track_remove_page(vcpu->kvm, slot, gfn, mode);
+ write_unlock(&vcpu->kvm->mmu_lock);
+ ret = true;
+ } else {
+ pr_warn("Failed to untrack %016llx because ", gfn);
+ if (slot == NULL) {
+ printk(KERN_CONT "slot was null");
+ } else if (!kvm_slot_page_track_is_active(vcpu->kvm, slot, gfn, mode)) {
+ printk(KERN_CONT "page track was not active");
+ }
+ printk(KERN_CONT "\n");
+ }
+
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+ return ret;
+}
+EXPORT_SYMBOL(sevstep_untrack_single_page);
+
+bool
+sevstep_reset_accessed_on_page(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ int idx;
+ bool ret;
+ struct kvm_memory_slot *slot;
+
+ ret = false;
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ if (slot != NULL) {
+ write_lock(&vcpu->kvm->mmu_lock);
+ // Vincent: The kvm mmu function now requires min_level
+ // We want all pages to be protected, so we use PG_LEVEL_4K
+ // https://patchwork.kernel.org/project/kvm/patch/20210416082511.2856-2-zhukeqian1@huawei.com/
+ sevstep_kvm_mmu_slot_gfn_protect(vcpu->kvm, slot, gfn,
+ PG_LEVEL_4K, KVM_PAGE_TRACK_RESET_ACCESSED);
+ write_unlock(&vcpu->kvm->mmu_lock);
+ ret = true;
+ }
+
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+ return ret;
+}
+EXPORT_SYMBOL(sevstep_reset_accessed_on_page);
+
+bool
+sevstep_clear_nx_on_page(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ int idx;
+ bool ret;
+ struct kvm_memory_slot *slot;
+
+ ret = false;
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ if (slot != NULL) {
+ write_lock(&vcpu->kvm->mmu_lock);
+ // Vincent: The kvm mmu function now requires min_level
+ // We want all pages to be protected, so we use PG_LEVEL_4K
+ // https://patchwork.kernel.org/project/kvm/patch/20210416082511.2856-2-zhukeqian1@huawei.com/
+ sevstep_kvm_mmu_slot_gfn_protect(vcpu->kvm, slot, gfn,
+ PG_LEVEL_4K, KVM_PAGE_TRACK_RESET_EXEC);
+ write_unlock(&vcpu->kvm->mmu_lock);
+ ret = true;
+ }
+
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+ return ret;
+}
+EXPORT_SYMBOL(sevstep_clear_nx_on_page);
+
+long
+sevstep_start_tracking(struct kvm_vcpu *vcpu, enum kvm_page_track_mode mode)
+{
+ struct kvm_memory_slot *slot;
+ struct kvm_memory_slot *first_memslot;
+ struct rb_node *node;
+ u64 iterator, iterat_max;
+ long count = 0;
+ int idx;
+
+ // Vincent: The memslots interface changed into an rb tree, see
+ // here: https://lwn.net/Articles/856392/
+ // and here: https://lore.kernel.org/all/cover.1632171478.git.maciej.szmigiero@oracle.com/T/#u
+ // Thus we use the last gfn-tree node instead of
+ // iterat_max = vcpu->kvm->memslots[0]->memslots[0].base_gfn
+ // + vcpu->kvm->memslots[0]->memslots[0].npages;
+ node = rb_last(&(vcpu->kvm->memslots[0]->gfn_tree));
+ first_memslot = container_of(node, struct kvm_memory_slot, gfn_node[0]);
+ iterat_max = first_memslot->base_gfn + first_memslot->npages;
+ for (iterator = 0; iterator < iterat_max; iterator++)
+ {
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, iterator);
+ if (slot != NULL && !kvm_slot_page_track_is_active(vcpu->kvm, slot, iterator, mode)) {
+ write_lock(&vcpu->kvm->mmu_lock);
+ kvm_slot_page_track_add_page(vcpu->kvm, slot, iterator, mode);
+ write_unlock(&vcpu->kvm->mmu_lock);
+ count++;
+ }
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ }
+
+ return count;
+}
+EXPORT_SYMBOL(sevstep_start_tracking);
+
+long
+sevstep_stop_tracking(struct kvm_vcpu *vcpu, enum kvm_page_track_mode mode)
+{
+ struct kvm_memory_slot *slot;
+ struct kvm_memory_slot *first_memslot;
+ struct rb_node *node;
+ u64 iterator, iterat_max;
+ long count = 0;
+ int idx;
+
+ // Vincent: The memslots interface changed into an rb tree, see
+ // here: https://lwn.net/Articles/856392/
+ // and here: https://lore.kernel.org/all/cover.1632171478.git.maciej.szmigiero@oracle.com/T/#u
+ // Thus we use the last gfn-tree node instead of
+ // iterat_max = vcpu->kvm->memslots[0]->memslots[0].base_gfn
+ // + vcpu->kvm->memslots[0]->memslots[0].npages;
+ node = rb_last(&(vcpu->kvm->memslots[0]->gfn_tree));
+ first_memslot = container_of(node, struct kvm_memory_slot, gfn_node[0]);
+ iterat_max = first_memslot->base_gfn + first_memslot->npages;
+ for (iterator = 0; iterator < iterat_max; iterator++)
+ {
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, iterator);
+ // Vincent: I think this is relevant:
+ // https://patchwork.kernel.org/project/kvm/patch/20210924163152.289027-22-pbonzini@redhat.com/
+ if (slot != NULL && kvm_slot_page_track_is_active(vcpu->kvm, slot, iterator, mode)) {
+ write_lock(&vcpu->kvm->mmu_lock);
+ kvm_slot_page_track_remove_page(vcpu->kvm, slot, iterator, mode);
+ write_unlock(&vcpu->kvm->mmu_lock);
+ count++;
+ }
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ }
+
+ return count;
+}
+EXPORT_SYMBOL(sevstep_stop_tracking);
+
+int
+sevstep_get_rip_kvm_vcpu(struct kvm_vcpu *vcpu, uint64_t *rip)
+{
+ return 0;
+}
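The tracking helpers above are meant to be called from KVM context with a valid vcpu. A hypothetical in-kernel caller might look like the following sketch (function name and include path are illustrative, not part of this patch):

    /* sketch only: track a single gfn for any access, then undo it */
    #include <linux/kvm_host.h>
    #include "cachepc/sevstep.h"

    static void example_track_one_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
    {
        /* fault on the next read/write/exec of the page ... */
        if (!sevstep_track_single_page(vcpu, gfn, KVM_PAGE_TRACK_ACCESS))
            pr_warn("example: could not track gfn %llx\n", gfn);

        /* ... and later restore the original mapping */
        sevstep_untrack_single_page(vcpu, gfn, KVM_PAGE_TRACK_ACCESS);
    }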
diff --git a/sevstep/sevstep.h b/cachepc/sevstep.h
diff --git a/cachepc/uapi.h b/cachepc/uapi.h
@@ -7,3 +7,94 @@
#define CACHEPC_IOCTL_TEST_ACCESS _IOWR(CACHEPC_IOCTL_MAGIC, 0, __u32)
#define CACHEPC_IOCTL_TEST_EVICTION _IOWR(CACHEPC_IOCTL_MAGIC, 1, __u32)
#define CACHEPC_IOCTL_INIT_PMC _IOW(CACHEPC_IOCTL_MAGIC, 2, __u32)
+
+#define KVM_TRACK_PAGE _IOWR(KVMIO, 0x20, track_page_param_t)
+#define KVM_USPT_REGISTER_PID _IOWR(KVMIO, 0x21, userspace_ctx_t)
+#define KVM_USPT_WAIT_AND_SEND _IO(KVMIO, 0x22)
+#define KVM_USPT_POLL_EVENT _IOWR(KVMIO, 0x23, page_fault_event_t)
+#define KVM_USPT_ACK_EVENT _IOWR(KVMIO, 0x24, ack_event_t)
+#define KVM_READ_GUEST_MEMORY _IOWR(KVMIO, 0x25, read_guest_memory_t)
+#define KVM_USPT_RESET _IO(KVMIO, 0x26)
+#define KVM_USPT_TRACK_ALL _IOWR(KVMIO, 0x27, track_all_pages_t)
+#define KVM_USPT_UNTRACK_ALL _IOWR(KVMIO, 0x28, track_all_pages_t)
+#define KVM_USPT_SETUP_RETINSTR_PERF _IOWR(KVMIO, 0x30, retired_instr_perf_config_t)
+#define KVM_USPT_READ_RETINSTR_PERF _IOWR(KVMIO, 0x31, retired_instr_perf_t)
+#define KVM_USPT_BATCH_TRACK_START _IOWR(KVMIO, 0x32, batch_track_config_t)
+#define KVM_USPT_BATCH_TRACK_STOP _IOWR(KVMIO, 0x33, batch_track_stop_and_get_t)
+#define KVM_USPT_BATCH_TRACK_EVENT_COUNT _IOWR(KVMIO, 0x34, batch_track_event_count_t)
+
+#define KVM_USPT_POLL_EVENT_NO_EVENT 1000
+#define KVM_USPT_POLL_EVENT_GOT_EVENT 0
+
+enum kvm_page_track_mode {
+ KVM_PAGE_TRACK_WRITE,
+ KVM_PAGE_TRACK_ACCESS,
+ KVM_PAGE_TRACK_RESET_ACCESSED,
+ KVM_PAGE_TRACK_EXEC,
+ KVM_PAGE_TRACK_RESET_EXEC,
+ KVM_PAGE_TRACK_MAX,
+};
+
+typedef struct {
+ __u64 id; // filled automatically
+ __u64 faulted_gpa;
+ __u32 error_code;
+ __u8 have_rip_info;
+ __u64 rip;
+ __u64 ns_timestamp;
+ __u8 have_retired_instructions;
+ __u64 retired_instructions;
+} page_fault_event_t;
+
+typedef struct {
+ __s32 tracking_type;
+ __u64 expected_events;
+ __s32 perf_cpu;
+ __u8 retrack;
+} batch_track_config_t;
+
+typedef struct {
+ __u64 event_count;
+} batch_track_event_count_t;
+
+typedef struct {
+ page_fault_event_t* out_buf;
+ __u64 len;
+ __u8 error_during_batch;
+} batch_track_stop_and_get_t;
+
+typedef struct {
+ __s32 cpu; // cpu on which we want to read the counter
+ __u64 retired_instruction_count; // result param
+} retired_instr_perf_t;
+
+typedef struct {
+ __s32 cpu; // cpu on which counter should be programmed
+} retired_instr_perf_config_t;
+
+typedef struct {
+ __u64 gpa;
+ __u64 len;
+ __u8 decrypt_with_host_key;
+ __s32 wbinvd_cpu; // -1: do not flush; else logical cpu on which we flush
+ void *output_buffer;
+} read_guest_memory_t;
+
+typedef struct {
+ __s32 pid;
+ __u8 get_rip;
+} userspace_ctx_t;
+
+typedef struct {
+ __u64 id;
+} ack_event_t;
+
+typedef struct {
+ __u64 gpa;
+ __s32 track_mode;
+} track_page_param_t;
+
+typedef struct {
+ __s32 track_mode;
+} track_all_pages_t;
+
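The ioctls and structs above form the userspace side of the single-step event channel; the corresponding handlers are wired into kvm_dev_ioctl in the kvm_main.c hunk further below, so they should be reachable through /dev/kvm. A minimal poll-and-ack sketch, assuming cachepc/uapi.h is on the include path and that <linux/kvm.h> provides KVMIO:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    #include "cachepc/uapi.h"

    int main(void)
    {
        page_fault_event_t event;
        ack_event_t ack;
        int fd, ret;

        fd = open("/dev/kvm", O_RDONLY);
        if (fd < 0) return 1;

        /* spin until the kernel reports a fault on a tracked page */
        while ((ret = ioctl(fd, KVM_USPT_POLL_EVENT, &event)) == KVM_USPT_POLL_EVENT_NO_EVENT)
            usleep(100);

        if (ret == KVM_USPT_POLL_EVENT_GOT_EVENT) {
            printf("fault at gpa %llx, error code %x\n",
                (unsigned long long) event.faulted_gpa, event.error_code);
            /* ack so the vcpu blocked in sevstep_uspt_send_and_block can continue */
            ack.id = event.id;
            ioctl(fd, KVM_USPT_ACK_EVENT, &ack);
        }

        close(fd);
        return 0;
    }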
diff --git a/cachepc/uspt.c b/cachepc/uspt.c
@@ -0,0 +1,488 @@
+#include "uspt.h"
+#include "sevstep.h"
+#include "cachepc.h"
+
+#include <linux/kvm.h>
+#include <linux/timekeeping.h>
+#include <linux/uaccess.h>
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+#include <linux/printk.h>
+#include <linux/ratelimit.h>
+
+#define ARRLEN(x) (sizeof(x)/sizeof((x)[0]))
+
+typedef struct {
+ bool is_active;
+ int tracking_type;
+ bool retrack;
+
+ int perf_cpu;
+
+ uint64_t gfn_retrack_backlog[10];
+ int gfn_retrack_backlog_next_idx;
+
+ page_fault_event_t * events;
+ uint64_t event_next_idx;
+ uint64_t events_size;
+
+ bool error_occured;
+} batch_track_state_t;
+
+typedef struct {
+ uint64_t idx_for_last_perf_reading;
+ uint64_t last_perf_reading;
+ uint64_t delta_valid_idx;
+ uint64_t delta;
+} perf_state_t;
+
+// crude sync mechanism. don't know a good way to act on errors yet.
+static uint64_t last_sent_event_id = 1;
+static uint64_t last_acked_event_id = 1;
+DEFINE_RWLOCK(event_lock);
+
+static page_fault_event_t sent_event;
+static int have_event = 0;
+
+static bool get_rip = true;
+
+static int inited = 0;
+
+DEFINE_SPINLOCK(batch_track_state_lock);
+static batch_track_state_t batch_track_state;
+
+static perf_state_t perf_state;
+
+static uint64_t perf_state_update_and_get_delta(uint64_t current_event_idx);
+
+void
+sevstep_uspt_clear(void)
+{
+ write_lock(&event_lock);
+ inited = 0;
+ last_sent_event_id = 1;
+ last_acked_event_id = 1;
+ have_event = 0;
+ get_rip = false;
+ write_unlock(&event_lock);
+}
+
+int
+sevstep_uspt_initialize(int pid, bool should_get_rip)
+{
+ write_lock(&event_lock);
+ inited = 1;
+ last_sent_event_id = 1;
+ last_acked_event_id = 1;
+ have_event = 0;
+ get_rip = should_get_rip;
+ write_unlock(&event_lock);
+
+ return 0;
+}
+
+int
+sevstep_uspt_is_initialiized()
+{
+ return inited;
+}
+
+bool
+sevstep_uspt_should_get_rip()
+{
+ bool tmp;
+
+ read_lock(&event_lock);
+ tmp = get_rip;
+ read_unlock(&event_lock);
+
+ return tmp;
+}
+
+int
+sevstep_uspt_send_and_block(uint64_t faulted_gpa, uint32_t error_code,
+ bool have_rip, uint64_t rip)
+{
+ ktime_t abort_after;
+ page_fault_event_t message_for_user;
+
+ read_lock(&event_lock);
+ if (!sevstep_uspt_is_initialiized()) {
+ pr_warn("sevstep_uspt_send_and_block: ctx not initialized!\n");
+ read_unlock(&event_lock);
+ return 1;
+ }
+ read_unlock(&event_lock);
+
+ write_lock(&event_lock);
+ if (last_sent_event_id != last_acked_event_id) {
+ pr_warn("sevstep_uspt_send_and_block: "
+ "event id_s out of sync, aborting. Fix this later\n");
+ write_unlock(&event_lock);
+ return 1;
+ } else {
+ // TODO: handle overflow
+ last_sent_event_id++;
+ }
+ message_for_user.id = last_sent_event_id;
+ message_for_user.faulted_gpa = faulted_gpa;
+ message_for_user.error_code = error_code;
+ message_for_user.have_rip_info = have_rip;
+ message_for_user.rip = rip;
+ message_for_user.ns_timestamp = ktime_get_real_ns();
+ message_for_user.have_retired_instructions = false;
+
+ // for poll based system;
+ have_event = 1;
+ sent_event = message_for_user;
+ // printk("sevstep_uspt_send_and_block sending event %llu\n", sent_event.id);
+
+ write_unlock(&event_lock);
+
+ // wait for ack, but with timeout. Otherwise small bugs in userland
+ // easily lead to a kernel hang
+ abort_after = ktime_get() + 1000000000ULL; // 1 second in nanoseconds
+ while (!sevstep_uspt_is_event_done(sent_event.id)) {
+ if (ktime_get() > abort_after) {
+ pr_warn("sevstep_uspt_send_and_block: "
+ "Waiting for ack of event %llu timed out, "
+ "continuing\n",sent_event.id);
+ return 3;
+ }
+ }
+
+ return 0;
+}
+
+int
+sevstep_uspt_is_event_done(uint64_t id)
+{
+ int res;
+
+ read_lock(&event_lock);
+ res = last_acked_event_id >= id;
+ read_unlock(&event_lock);
+
+ return res;
+}
+
+int
+sevstep_uspt_handle_poll_event(page_fault_event_t* userspace_mem)
+{
+ int err;
+
+ // most of the time we won't have an event
+ read_lock(&event_lock);
+ if (!have_event) {
+ read_unlock(&event_lock);
+ return KVM_USPT_POLL_EVENT_NO_EVENT;
+ }
+ read_unlock(&event_lock);
+
+ write_lock(&event_lock);
+ if (have_event) {
+ err = copy_to_user(userspace_mem,
+ &sent_event, sizeof(page_fault_event_t));
+ have_event = 0;
+ } else {
+ err = KVM_USPT_POLL_EVENT_NO_EVENT;
+ }
+ write_unlock(&event_lock);
+
+ return err;
+}
+
+int
+sevstep_uspt_handle_ack_event_ioctl(ack_event_t event)
+{
+ int err = 0;
+
+ write_lock(&event_lock);
+ if (event.id == last_sent_event_id) {
+ last_acked_event_id = last_sent_event_id;
+ } else {
+ err = 1;
+ pr_warn("sevstep_uspt_handle_ack_event_ioctl: "
+ "last sent event id is %llu but received ack for %llu\n",
+ last_sent_event_id, event.id);
+ }
+ write_unlock(&event_lock);
+
+ return err;
+}
+
+// get retired instructions between current_event_idx-1 and current_event_idx
+// value is cached for multiple calls to the same current_event_idx
+uint64_t
+perf_state_update_and_get_delta(uint64_t current_event_idx)
+{
+ uint64_t current_value;
+
+ /* check if value is "cached" */
+ if (perf_state.delta_valid_idx == current_event_idx) {
+ if (current_event_idx == 0) {
+ perf_state.idx_for_last_perf_reading = current_event_idx;
+ perf_state.last_perf_reading = cachepc_read_pmc(0);
+ }
+ return perf_state.delta;
+ }
+
+ /* otherwise update, but logic is only valid for two consecutive events */
+ if (current_event_idx != perf_state.idx_for_last_perf_reading+1) {
+ pr_warn("perf_state_update_and_get_delta: "
+ "last reading was for idx %llu but was queried for %llu\n",
+ perf_state.idx_for_last_perf_reading, current_event_idx);
+ }
+
+ current_value = cachepc_read_pmc(0);
+ perf_state.delta = (current_value - perf_state.last_perf_reading);
+ perf_state.delta_valid_idx = current_event_idx;
+
+ perf_state.idx_for_last_perf_reading = current_event_idx;
+ perf_state.last_perf_reading = current_value;
+
+ return perf_state.delta;
+}
+
+void
+sevstep_uspt_batch_tracking_inc_event_idx(void)
+{
+ spin_lock(&batch_track_state_lock);
+ batch_track_state.event_next_idx++;
+ spin_unlock(&batch_track_state_lock);
+}
+
+int
+sevstep_uspt_batch_tracking_start(int tracking_type, uint64_t expected_events,
+ int perf_cpu, bool retrack)
+{
+ page_fault_event_t* events;
+ uint64_t buffer_size, i;
+
+ spin_lock(&batch_track_state_lock);
+ if (batch_track_state.is_active) {
+ pr_warn("sevstep_uspt_batch_tracking_start: "
+ "overwriting active batch track config!\n");
+ if (batch_track_state.events != NULL) {
+ vfree(batch_track_state.events);
+ }
+ }
+ batch_track_state.is_active = false;
+ spin_unlock(&batch_track_state_lock);
+
+ buffer_size = expected_events * sizeof(page_fault_event_t);
+ pr_warn("sevstep_uspt_batch_tracking_start: "
+ "trying to alloc %llu bytes buffer for events\n",
+ buffer_size);
+ events = vmalloc(buffer_size);
+ if (events == NULL) {
+ pr_warn("sevstep_uspt_batch_tracking_start: "
+ "faperf_cpuiled to alloc %llu bytes for event buffer\n",
+ buffer_size);
+ return 1; // note: lock not held here
+ }
+
+ // access each element once to force them into memory, improving performance
+ // during tracking
+ for (i = 0; i < expected_events * sizeof(page_fault_event_t); i++) {
+ ((volatile uint8_t*)events)[i] = 0;
+ }
+
+ perf_state.idx_for_last_perf_reading = 0;
+ perf_state.last_perf_reading = 0;
+ perf_state.delta_valid_idx = 0;
+ perf_state.delta = 0;
+ cachepc_init_pmc(0, 0xc0, 0x00, PMC_GUEST, PMC_KERNEL | PMC_USER);
+
+ spin_lock(&batch_track_state_lock);
+
+ batch_track_state.perf_cpu = perf_cpu;
+ batch_track_state.retrack = retrack;
+
+ batch_track_state.events = events;
+ batch_track_state.event_next_idx = 0;
+ batch_track_state.events_size = expected_events;
+
+ batch_track_state.gfn_retrack_backlog_next_idx = 0;
+ batch_track_state.tracking_type = tracking_type;
+ batch_track_state.error_occured = false;
+
+ batch_track_state.is_active = true;
+
+ spin_unlock(&batch_track_state_lock);
+
+ return 0;
+}
+
+void
+sevstep_uspt_batch_tracking_handle_retrack(struct kvm_vcpu* vcpu,
+ uint64_t current_fault_gfn)
+{
+ uint64_t ret_instr_delta;
+ int i, next_idx;
+
+ spin_lock(&batch_track_state_lock);
+
+ if (!batch_track_state.retrack) {
+ spin_unlock(&batch_track_state_lock);
+ return;
+ }
+
+ if (smp_processor_id() != batch_track_state.perf_cpu) {
+ pr_warn("sevstep_uspt_batch_tracking_handle_retrack: perf was "
+ "programmed on logical cpu %d but handler was called "
+ "on %d. Did you forget to pin the vcpu thread?\n",
+ batch_track_state.perf_cpu, smp_processor_id());
+ }
+ ret_instr_delta = perf_state_update_and_get_delta(batch_track_state.event_next_idx);
+
+ // the faulting instruction is probably the same as on the last fault:
+ // try to add the current fault to the retrack log and return.
+ // for the first event idx we do not have a valid ret_instr_delta.
+ // Retracking for the first time is fine; if we loop, we end up here
+ // again but with a valid delta on one of the next events
+ if ((ret_instr_delta < 2) && (batch_track_state.event_next_idx != 0)) {
+ next_idx = batch_track_state.gfn_retrack_backlog_next_idx;
+ if (next_idx >= ARRLEN(batch_track_state.gfn_retrack_backlog)) {
+ pr_warn("sevstep_uspt_batch_tracking_handle_retrack: "
+ "retrack backlog full, dropping retrack for fault "
+ "at 0x%llx\n", current_fault_gfn);
+ } else {
+ batch_track_state.gfn_retrack_backlog[next_idx] = current_fault_gfn;
+ batch_track_state.gfn_retrack_backlog_next_idx++;
+ }
+
+ spin_unlock(&batch_track_state_lock);
+ return;
+ }
+
+ /* made progress, retrack everything in backlog and reset idx */
+ for (i = 0; i < batch_track_state.gfn_retrack_backlog_next_idx; i++) {
+ sevstep_track_single_page(vcpu,
+ batch_track_state.gfn_retrack_backlog[i],
+ batch_track_state.tracking_type);
+ }
+
+ /* add current fault to list */
+ batch_track_state.gfn_retrack_backlog[0] = current_fault_gfn;
+ batch_track_state.gfn_retrack_backlog_next_idx = 1;
+
+ spin_unlock(&batch_track_state_lock);
+
+}
+
+int
+sevstep_uspt_batch_tracking_save(uint64_t faulted_gpa, uint32_t error_code,
+ bool have_rip, uint64_t rip)
+{
+ uint64_t ret_instr_delta;
+ page_fault_event_t* event;
+
+ spin_lock(&batch_track_state_lock);
+
+ if (!batch_track_state.is_active) {
+ pr_warn("sevstep_uspt_batch_tracking_save: "
+ "got save but batch tracking is not active!\n");
+ batch_track_state.error_occured = true;
+ spin_unlock(&batch_track_state_lock);
+ return 1;
+ }
+
+
+ if (batch_track_state.event_next_idx >= batch_track_state.events_size) {
+ pr_warn("sevstep_uspt_batch_tracking_save: events buffer is full!\n");
+ batch_track_state.error_occured = true;
+ spin_unlock(&batch_track_state_lock);
+ return 1;
+ }
+
+ if (smp_processor_id() != batch_track_state.perf_cpu) {
+ pr_warn("sevstep_uspt_batch_tracking_save: perf was "
+ "programmed on logical cpu %d but handler was called "
+ "on %d. Did you forget to pin the vcpu thread?\n",
+ batch_track_state.perf_cpu, smp_processor_id());
+ }
+ ret_instr_delta = perf_state_update_and_get_delta(batch_track_state.event_next_idx);
+
+
+ if (batch_track_state.events == NULL) {
+ pr_warn("sevstep_uspt_batch_tracking_save: events buf was "
+ "NULL but \"is_active\" was set! This should never happen!!!\n");
+ spin_unlock(&batch_track_state_lock);
+ return 1;
+ }
+
+ event = &batch_track_state.events[batch_track_state.event_next_idx];
+ event->id = batch_track_state.event_next_idx;
+ event->faulted_gpa = faulted_gpa;
+ event->error_code = error_code;
+ event->have_rip_info = have_rip;
+ event->rip = rip;
+ event->ns_timestamp = ktime_get_real_ns();
+ event->have_retired_instructions = true;
+ event->retired_instructions = ret_instr_delta;
+
+ // old inc was here
+
+ if (batch_track_state.gfn_retrack_backlog_next_idx
+ > ARRLEN(batch_track_state.gfn_retrack_backlog)) {
+ pr_warn("sevstep_uspt_batch_tracking_save: "
+ "gfn retrack backlog overflow!\n");
+ batch_track_state.error_occured = true;
+ spin_unlock(&batch_track_state_lock);
+ return 1;
+ }
+
+ spin_unlock(&batch_track_state_lock);
+
+ return 0;
+}
+
+int
+sevstep_uspt_batch_tracking_stop(page_fault_event_t* results,
+ uint64_t len, __u8* error_occured)
+{
+ spin_lock(&batch_track_state_lock);
+ if (!batch_track_state.is_active) {
+ pr_warn("sevstep_uspt: batch tracking not active\n");
+ spin_unlock(&batch_track_state_lock);
+ return 1;
+
+ }
+ batch_track_state.is_active = false;
+
+ if (len > batch_track_state.event_next_idx) {
+ pr_warn("sevstep_uspt_batch_tracking_stop: "
+ "requested %llu events but got only %llu\n",
+ len, batch_track_state.event_next_idx);
+ spin_unlock(&batch_track_state_lock);
+ return 1;
+ }
+
+ memcpy(results, batch_track_state.events, len * sizeof(page_fault_event_t));
+ vfree(batch_track_state.events);
+
+ *error_occured = batch_track_state.error_occured;
+
+ spin_unlock(&batch_track_state_lock);
+
+ return 0;
+}
+
+uint64_t
+sevstep_uspt_batch_tracking_get_events_count()
+{
+ uint64_t buf;
+
+ spin_lock(&batch_track_state_lock);
+ buf = batch_track_state.event_next_idx;
+ spin_unlock(&batch_track_state_lock);
+
+ return buf;
+}
+
+bool
+sevstep_uspt_batch_tracking_in_progress()
+{
+ return batch_track_state.is_active;
+}
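Taken together, the batch-tracking entry points above implement a start / count / stop lifecycle that userspace drives through the KVM_USPT_BATCH_TRACK_* ioctls from uapi.h. A hypothetical driver loop might look like this sketch (fd is an open /dev/kvm descriptor; error handling is omitted and all names outside uapi.h are illustrative):

    #include <stdint.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    #include "cachepc/uapi.h"

    static page_fault_event_t *
    collect_batch(int fd, int vcpu_cpu, uint64_t expected, uint64_t *out_len)
    {
        batch_track_config_t conf = {
            .tracking_type = KVM_PAGE_TRACK_ACCESS,
            .expected_events = expected,
            .perf_cpu = vcpu_cpu, /* must be the cpu the vcpu thread is pinned to */
            .retrack = 1,
        };
        batch_track_event_count_t count;
        batch_track_stop_and_get_t stop;

        ioctl(fd, KVM_USPT_BATCH_TRACK_START, &conf);

        /* ... let the guest run its workload here ... */

        ioctl(fd, KVM_USPT_BATCH_TRACK_EVENT_COUNT, &count);
        stop.len = count.event_count;
        stop.out_buf = calloc(count.event_count, sizeof(page_fault_event_t));
        ioctl(fd, KVM_USPT_BATCH_TRACK_STOP, &stop);

        *out_len = stop.len;
        return stop.out_buf; /* stop.error_during_batch flags dropped events */
    }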
diff --git a/sevstep/uspt.h b/cachepc/uspt.h
diff --git a/patch.diff b/patch.diff
@@ -1,5 +1,5 @@
diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h
-index eb186bc57f6a..3f767a27045e 100644
+index eb186bc57f6a..b96e80934005 100644
--- a/arch/x86/include/asm/kvm_page_track.h
+++ b/arch/x86/include/asm/kvm_page_track.h
@@ -2,10 +2,9 @@
@@ -12,12 +12,12 @@ index eb186bc57f6a..3f767a27045e 100644
-};
+#include<linux/srcu.h>
+
-+#include "../../kvm/sevstep/uapi.h"
++#include "../../kvm/cachepc/uapi.h"
/*
* The notifier represented by @kvm_page_track_notifier_node is linked into
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
-index 30f244b64523..3c5f65040878 100644
+index 30f244b64523..e0eeffd340e8 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -1,6 +1,6 @@
@@ -35,8 +35,8 @@ index 30f244b64523..3c5f65040878 100644
- hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \
- mmu/spte.o
+ hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o mmu/spte.o \
-+ svm/cachepc/cachepc.o svm/cachepc/kvm.o \
-+ sevstep/sevstep.o sevstep/uspt.o
++ cachepc/cachepc.o cachepc/kvm.o \
++ cachepc/sevstep.o cachepc/uspt.o
ifdef CONFIG_HYPERV
kvm-y += kvm_onhyperv.o
@@ -45,20 +45,20 @@ index 30f244b64523..3c5f65040878 100644
kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o
-kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o
-+kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o \
-+ svm/cachepc/cachepc.o
++kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o \
++ svm/avic.o svm/sev.o cachepc/cachepc.o
ifdef CONFIG_HYPERV
kvm-amd-y += svm/svm_onhyperv.o
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
-index d871b8dee7b3..32900ef5ee0b 100644
+index d871b8dee7b3..3b7720aebbc6 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1152,6 +1152,8 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
}
}
-+#include "../sevstep/mmu.c"
++#include "../cachepc/mmu.c"
+
/*
* Write-protect on the specified @sptep, @pt_protect indicates whether
@@ -154,14 +154,14 @@ index d871b8dee7b3..32900ef5ee0b 100644
return false;
}
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
-index 2e09d1b6249f..17b69a1f2b40 100644
+index 2e09d1b6249f..9b40e71564bf 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -19,6 +19,8 @@
#include "mmu.h"
#include "mmu_internal.h"
-+#include "../sevstep/sevstep.h"
++#include "../cachepc/sevstep.h"
+
bool kvm_page_track_write_tracking_enabled(struct kvm *kvm)
{
@@ -180,22 +180,6 @@ index 2e09d1b6249f..17b69a1f2b40 100644
}
EXPORT_SYMBOL_GPL(kvm_slot_page_track_add_page);
-diff --git a/arch/x86/kvm/sevstep b/arch/x86/kvm/sevstep
-new file mode 120000
-index 000000000000..642ea24bf098
---- /dev/null
-+++ b/arch/x86/kvm/sevstep
-@@ -0,0 +1 @@
-+/home/louis/kvm-prime-count/sevstep
-\ No newline at end of file
-diff --git a/arch/x86/kvm/svm/cachepc b/arch/x86/kvm/svm/cachepc
-new file mode 120000
-index 000000000000..9119e44af1f0
---- /dev/null
-+++ b/arch/x86/kvm/svm/cachepc
-@@ -0,0 +1 @@
-+/home/louis/kvm-prime-count/cachepc
-\ No newline at end of file
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index cf0bf456d520..4dbb8041541f 100644
--- a/arch/x86/kvm/svm/svm.c
@@ -382,14 +366,14 @@ index dfaeb47fcf2a..0626f3fdddfd 100644
2: cli
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
-index d9adf79124f9..082dc8553566 100644
+index d9adf79124f9..3e5c55f9bef0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -82,6 +82,8 @@
#include <asm/sgx.h>
#include <clocksource/hyperv_timer.h>
-+#include "sevstep/sevstep.h"
++#include "cachepc/sevstep.h"
+
#define CREATE_TRACE_POINTS
#include "trace.h"
@@ -427,21 +411,29 @@ index e089fbf9017f..7899e1efe852
static int __sev_init_locked(int *error)
{
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
-index f2a63cb2658b..bd26b7a29c9e 100644
+index f2a63cb2658b..0d1c1d8c72ea 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
-@@ -70,6 +70,10 @@
+@@ -13,6 +13,7 @@
+ * Yaniv Kamay <yaniv@qumranet.com>
+ */
+
++#include <asm-generic/errno-base.h>
+ #include <kvm/iodev.h>
+
+ #include <linux/kvm_host.h>
+@@ -70,6 +71,10 @@
/* Worst case buffer size needed for holding an integer. */
#define ITOA_MAX_LEN 12
-+#include "../../arch/x86/kvm/svm/cachepc/kvm.h"
-+#include "../../arch/x86/kvm/sevstep/sevstep.h"
-+#include "../../arch/x86/kvm/sevstep/uspt.h"
++#include "../../arch/x86/kvm/cachepc/kvm.h"
++#include "../../arch/x86/kvm/cachepc/sevstep.h"
++#include "../../arch/x86/kvm/cachepc/uspt.h"
+
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
-@@ -159,6 +163,267 @@ static unsigned long long kvm_active_vms;
+@@ -159,6 +164,267 @@ static unsigned long long kvm_active_vms;
static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);
@@ -709,7 +701,7 @@ index f2a63cb2658b..bd26b7a29c9e 100644
__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
unsigned long start, unsigned long end)
{
-@@ -1261,6 +1526,9 @@ static void kvm_destroy_vm(struct kvm *kvm)
+@@ -1261,6 +1527,9 @@ static void kvm_destroy_vm(struct kvm *kvm)
hardware_disable_all();
mmdrop(mm);
module_put(kvm_chardev_ops.owner);
@@ -719,7 +711,7 @@ index f2a63cb2658b..bd26b7a29c9e 100644
}
void kvm_get_kvm(struct kvm *kvm)
-@@ -1360,7 +1628,7 @@ static void kvm_insert_gfn_node(struct kvm_memslots *slots,
+@@ -1360,7 +1629,7 @@ static void kvm_insert_gfn_node(struct kvm_memslots *slots,
int idx = slots->node_idx;
parent = NULL;
@@ -728,7 +720,7 @@ index f2a63cb2658b..bd26b7a29c9e 100644
struct kvm_memory_slot *tmp;
tmp = container_of(*node, struct kvm_memory_slot, gfn_node[idx]);
-@@ -4823,6 +5091,9 @@ static int kvm_dev_ioctl_create_vm(unsigned long type)
+@@ -4823,6 +5092,9 @@ static int kvm_dev_ioctl_create_vm(unsigned long type)
kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
fd_install(r, file);
@@ -738,7 +730,7 @@ index f2a63cb2658b..bd26b7a29c9e 100644
return r;
put_kvm:
-@@ -4836,6 +5107,315 @@ static long kvm_dev_ioctl(struct file *filp,
+@@ -4836,6 +5108,315 @@ static long kvm_dev_ioctl(struct file *filp,
long r = -EINVAL;
switch (ioctl) {
@@ -1054,7 +1046,18 @@ index f2a63cb2658b..bd26b7a29c9e 100644
case KVM_GET_API_VERSION:
if (arg)
goto out;
-@@ -5792,6 +6372,8 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
+@@ -4864,7 +5445,9 @@ static long kvm_dev_ioctl(struct file *filp,
+ r = -EOPNOTSUPP;
+ break;
+ default:
+- return kvm_arch_dev_ioctl(filp, ioctl, arg);
++ //r = cachepc_kvm_ioctl(filp, ioctl, arg);
++ //if (r == -EINVAL)
++ return kvm_arch_dev_ioctl(filp, ioctl, arg);
+ }
+ out:
+ return r;
+@@ -5792,6 +6375,8 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
r = kvm_vfio_ops_init();
WARN_ON(r);
@@ -1063,7 +1066,7 @@ index f2a63cb2658b..bd26b7a29c9e 100644
return 0;
out_unreg:
-@@ -5821,6 +6403,8 @@ void kvm_exit(void)
+@@ -5821,6 +6406,8 @@ void kvm_exit(void)
{
int cpu;
diff --git a/sevstep/mmu.c b/sevstep/mmu.c
@@ -1,135 +0,0 @@
-#include "../sevstep/sevstep.h"
-#include "../sevstep/uspt.h"
-
-static void
-sevstep_uspt_page_fault_handle(struct kvm_vcpu *vcpu,
- struct kvm_page_fault *fault)
-{
- const int modes[] = {
- KVM_PAGE_TRACK_WRITE,
- KVM_PAGE_TRACK_ACCESS,
- KVM_PAGE_TRACK_EXEC
- };
- uint64_t current_rip;
- bool was_tracked;
- int have_rip, i;
- int send_err;
-
- was_tracked = false;
- for (i = 0; i < sizeof(modes) / sizeof(modes[0]); i++) {
- if (kvm_slot_page_track_is_active(vcpu->kvm,
- fault->slot, fault->gfn, modes[i])) {
- sevstep_untrack_single_page(vcpu, fault->gfn, modes[i]);
- was_tracked = true;
- }
- }
-
- if (was_tracked) {
- have_rip = false;
- if (sevstep_uspt_should_get_rip())
- have_rip = sevstep_get_rip_kvm_vcpu(vcpu, &current_rip) == 0;
- if (sevstep_uspt_batch_tracking_in_progress()) {
- send_err = sevstep_uspt_batch_tracking_save(fault->gfn << PAGE_SHIFT,
- fault->error_code, have_rip, current_rip);
- if (send_err) {
- printk_ratelimited(
- "sevstep_uspt_batch_tracking_save failed with %d\n"
- "##########################\n", send_err);
- }
- sevstep_uspt_batch_tracking_handle_retrack(vcpu, fault->gfn);
- sevstep_uspt_batch_tracking_inc_event_idx();
- } else {
- send_err = sevstep_uspt_send_and_block(fault->gfn << PAGE_SHIFT,
- fault->error_code, have_rip, current_rip);
- if (send_err) {
- printk("sevstep_uspt_send_and_block failed with %d\n"
- "##########################\n", send_err);
- }
- }
- }
-}
-
-bool
-sevstep_spte_protect(u64 *sptep, bool pt_protect, enum kvm_page_track_mode mode)
-{
- u64 spte;
- bool flush;
-
- spte = *sptep;
- if (!is_writable_pte(spte) && !(pt_protect && is_mmu_writable_spte(spte)))
- return false;
-
- rmap_printk("spte %p %llx\n", sptep, *sptep);
-
- if (pt_protect)
- spte &= ~EPT_SPTE_MMU_WRITABLE;
-
- flush = false;
- if (mode == KVM_PAGE_TRACK_WRITE) {
- spte = spte & ~PT_WRITABLE_MASK;
- flush = true;
- } else if (mode == KVM_PAGE_TRACK_RESET_ACCESSED) {
- spte = spte & ~PT_ACCESSED_MASK;
- } else if (mode == KVM_PAGE_TRACK_ACCESS) {
- spte = spte & ~PT_PRESENT_MASK;
- spte = spte & ~PT_WRITABLE_MASK;
- spte = spte & ~PT_USER_MASK;
- spte = spte | (0x1ULL << PT64_NX_SHIFT);
- flush = true;
- } else if (mode == KVM_PAGE_TRACK_EXEC) {
- spte = spte | (0x1ULL << PT64_NX_SHIFT);
- flush = true;
- } else if (mode == KVM_PAGE_TRACK_RESET_EXEC) {
- spte = spte & ~(0x1ULL << PT64_NX_SHIFT);
- flush = true;
- } else {
- printk(KERN_WARNING "spte_protect was called with invalid mode"
- "parameter %d\n",mode);
- }
- flush |= mmu_spte_update(sptep, spte);
-
- return flush;
-}
-EXPORT_SYMBOL(sevstep_spte_protect);
-
-bool sevstep_rmap_protect(struct kvm_rmap_head *rmap_head,
- bool pt_protect, enum kvm_page_track_mode mode)
-{
- u64 *sptep;
- struct rmap_iterator iter;
- bool flush = false;
-
- for_each_rmap_spte(rmap_head, &iter, sptep) {
- flush |= sevstep_spte_protect(sptep, pt_protect, mode);
- }
-
- return flush;
-}
-EXPORT_SYMBOL(sevstep_rmap_protect);
-
-bool
-sevstep_kvm_mmu_slot_gfn_protect(struct kvm *kvm, struct kvm_memory_slot *slot,
- uint64_t gfn, int min_level, enum kvm_page_track_mode mode)
-{
- struct kvm_rmap_head *rmap_head;
- bool protected;
- int i;
-
- protected = false;
-
- if (kvm_memslots_have_rmaps(kvm)) {
- for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
- rmap_head = gfn_to_rmap(gfn, i, slot);
- protected |= sevstep_rmap_protect(rmap_head, true, mode);
- }
- }
-
- if (is_tdp_mmu_enabled(kvm)) {
- protected |= kvm_tdp_mmu_write_protect_gfn(kvm,
- slot, gfn, min_level);
- }
-
- return protected;
-}
-EXPORT_SYMBOL(sevstep_kvm_mmu_slot_gfn_protect);
-
diff --git a/sevstep/sevstep.c b/sevstep/sevstep.c
@@ -1,263 +0,0 @@
-#include "sevstep.h"
-#include "svm/cachepc/cachepc.h"
-
-#include "mmu/mmu_internal.h"
-#include "mmu.h"
-
-#include "irq.h"
-#include "ioapic.h"
-#include "mmu.h"
-#include "mmu/tdp_mmu.h"
-#include "x86.h"
-#include "kvm_cache_regs.h"
-#include "kvm_emulate.h"
-#include "cpuid.h"
-#include "mmu/spte.h"
-
-#include <linux/kvm_host.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/moduleparam.h>
-#include <linux/export.h>
-#include <linux/swap.h>
-#include <linux/hugetlb.h>
-#include <linux/compiler.h>
-#include <linux/srcu.h>
-#include <linux/slab.h>
-#include <linux/sched/signal.h>
-#include <linux/uaccess.h>
-#include <linux/hash.h>
-#include <linux/kern_levels.h>
-#include <linux/kthread.h>
-#include <linux/sev.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/vmalloc.h>
-#include <linux/slab.h>
-#include <linux/sched.h>
-
-#include "kvm_cache_regs.h"
-#include "svm/svm.h"
-
-struct kvm* main_vm;
-EXPORT_SYMBOL(main_vm);
-
-bool
-sevstep_track_single_page(struct kvm_vcpu *vcpu, gfn_t gfn,
- enum kvm_page_track_mode mode)
-{
- int idx;
- bool ret;
- struct kvm_memory_slot *slot;
-
- ret = false;
- idx = srcu_read_lock(&vcpu->kvm->srcu);
-
- if (mode == KVM_PAGE_TRACK_ACCESS) {
- pr_warn("Adding gfn: %016llx to access page track pool\n", gfn);
- }
-
- if (mode == KVM_PAGE_TRACK_WRITE) {
- pr_warn("Adding gfn: %016llx to write page track pool\n", gfn);
- }
-
- slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
- if (slot != NULL && !kvm_slot_page_track_is_active(vcpu->kvm,slot, gfn, mode)) {
- write_lock(&vcpu->kvm->mmu_lock);
- kvm_slot_page_track_add_page(vcpu->kvm, slot, gfn, mode);
- write_unlock(&vcpu->kvm->mmu_lock);
- ret = true;
- } else {
- pr_warn("Failed to track %016llx because ", gfn);
- if (slot == NULL) {
- printk(KERN_CONT "slot was null");
- }
- if (kvm_slot_page_track_is_active(vcpu->kvm, slot, gfn, mode)) {
- printk(KERN_CONT "page is already tracked");
- }
- printk(KERN_CONT "\n");
- }
-
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
-
- return ret;
-}
-EXPORT_SYMBOL(sevstep_track_single_page);
-
-bool
-sevstep_untrack_single_page(struct kvm_vcpu *vcpu, gfn_t gfn,
- enum kvm_page_track_mode mode)
-{
- int idx;
- bool ret;
- struct kvm_memory_slot *slot;
-
- ret = false;
- idx = srcu_read_lock(&vcpu->kvm->srcu);
-
- if (mode == KVM_PAGE_TRACK_ACCESS) {
- pr_warn("Removing gfn: %016llx from acess page track pool\n", gfn);
- }
- if (mode == KVM_PAGE_TRACK_WRITE) {
- pr_warn("Removing gfn: %016llx from write page track pool\n", gfn);
- }
-
- slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
- if (slot != NULL && kvm_slot_page_track_is_active(vcpu->kvm, slot, gfn, mode)) {
- write_lock(&vcpu->kvm->mmu_lock);
- kvm_slot_page_track_remove_page(vcpu->kvm, slot, gfn, mode);
- write_unlock(&vcpu->kvm->mmu_lock);
- ret = true;
- } else {
- pr_warn("Failed to untrack %016llx because ", gfn);
- if (slot == NULL) {
- printk(KERN_CONT "slot was null");
- } else if (!kvm_slot_page_track_is_active(vcpu->kvm, slot, gfn, mode)) {
- printk(KERN_CONT "page track was not active");
- }
- printk(KERN_CONT "\n");
- }
-
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
-
- return ret;
-}
-EXPORT_SYMBOL(sevstep_untrack_single_page);
-
-bool
-sevstep_reset_accessed_on_page(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- int idx;
- bool ret;
- struct kvm_memory_slot *slot;
-
- ret = false;
- idx = srcu_read_lock(&vcpu->kvm->srcu);
-
- slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
- if (slot != NULL) {
- write_lock(&vcpu->kvm->mmu_lock);
- // Vincent: The kvm mmu function now requires min_level
- // We want all pages to protected so we do PG_LEVEL_4K
- // https:// patchwork.kernel.org/project/kvm/patch/20210416082511.2856-2-zhukeqian1@huawei.com/
- sevstep_kvm_mmu_slot_gfn_protect(vcpu->kvm, slot, gfn,
- PG_LEVEL_4K, KVM_PAGE_TRACK_RESET_ACCESSED);
- write_unlock(&vcpu->kvm->mmu_lock);
- ret = true;
- }
-
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
-
- return ret;
-}
-EXPORT_SYMBOL(sevstep_reset_accessed_on_page);
-
-bool
-sevstep_clear_nx_on_page(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- int idx;
- bool ret;
- struct kvm_memory_slot *slot;
-
- ret = false;
- idx = srcu_read_lock(&vcpu->kvm->srcu);
-
- slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
- if (slot != NULL) {
- write_lock(&vcpu->kvm->mmu_lock);
- // Vincent: The kvm mmu function now requires min_level
- // We want all pages to protected so we do PG_LEVEL_4K
- // https:// patchwork.kernel.org/project/kvm/patch/20210416082511.2856-2-zhukeqian1@huawei.com/
- sevstep_kvm_mmu_slot_gfn_protect(vcpu->kvm, slot, gfn,
- PG_LEVEL_4K, KVM_PAGE_TRACK_RESET_EXEC);
- write_unlock(&vcpu->kvm->mmu_lock);
- ret = true;
- }
-
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
-
- return ret;
-}
-EXPORT_SYMBOL(sevstep_clear_nx_on_page);
-
-long
-sevstep_start_tracking(struct kvm_vcpu *vcpu, enum kvm_page_track_mode mode)
-{
- struct kvm_memory_slot *slot;
- struct kvm_memory_slot *first_memslot;
- struct rb_node *node;
- u64 iterator, iterat_max;
- long count = 0;
- int idx;
-
- // Vincent: Memslots interface changed into a rb tree, see
- // here: https:// lwn.net/Articles/856392/
- // and here: https:// lore.kernel.org/all/cover.1632171478.git.maciej.szmigiero@oracle.com/T/#u
- // Thus we use instead of
- // iterat_max = vcpu->kvm->memslots[0]->memslots[0].base_gfn
- // + vcpu->kvm->memslots[0]->memslots[0].npages;
- node = rb_last(&(vcpu->kvm->memslots[0]->gfn_tree));
- first_memslot = container_of(node, struct kvm_memory_slot, gfn_node[0]);
- iterat_max = first_memslot->base_gfn + first_memslot->npages;
- for (iterator = 0; iterator < iterat_max; iterator++)
- {
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- slot = kvm_vcpu_gfn_to_memslot(vcpu, iterator);
- if (slot != NULL && !kvm_slot_page_track_is_active(vcpu->kvm, slot, iterator, mode)) {
- write_lock(&vcpu->kvm->mmu_lock);
- kvm_slot_page_track_add_page(vcpu->kvm, slot, iterator, mode);
- write_unlock(&vcpu->kvm->mmu_lock);
- count++;
- }
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
- }
-
- return count;
-}
-EXPORT_SYMBOL(sevstep_start_tracking);
-
-long
-sevstep_stop_tracking(struct kvm_vcpu *vcpu, enum kvm_page_track_mode mode)
-{
- struct kvm_memory_slot *slot;
- struct kvm_memory_slot *first_memslot;
- struct rb_node *node;
- u64 iterator, iterat_max;
- long count = 0;
- int idx;
-
- // Vincent: Memslots interface changed into a rb tree, see
- // here: https:// lwn.net/Articles/856392/
- // and here: https:// lore.kernel.org/all/cover.1632171478.git.maciej.szmigiero@oracle.com/T/#u
- // Thus we use instead of
- // iterat_max = vcpu->kvm->memslots[0]->memslots[0].base_gfn
- // + vcpu->kvm->memslots[0]->memslots[0].npages;
- node = rb_last(&(vcpu->kvm->memslots[0]->gfn_tree));
- first_memslot = container_of(node, struct kvm_memory_slot, gfn_node[0]);
- iterat_max = first_memslot->base_gfn + first_memslot->npages;
- for (iterator=0; iterator < iterat_max; iterator++)
- {
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- slot = kvm_vcpu_gfn_to_memslot(vcpu, iterator);
- // Vincent: I think see here
- // https:// patchwork.kernel.org/project/kvm/patch/20210924163152.289027-22-pbonzini@redhat.com/
- if (slot != NULL && kvm_slot_page_track_is_active(vcpu->kvm, slot, iterator, mode)) {
- write_lock(&vcpu->kvm->mmu_lock);
- kvm_slot_page_track_remove_page(vcpu->kvm, slot, iterator, mode);
- write_unlock(&vcpu->kvm->mmu_lock);
- count++;
- }
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
- }
-
- return count;
-}
-EXPORT_SYMBOL(sevstep_stop_tracking);
-
-int
-sevstep_get_rip_kvm_vcpu(struct kvm_vcpu *vcpu, uint64_t *rip)
-{
- return 0;
-}
diff --git a/sevstep/uapi.h b/sevstep/uapi.h
@@ -1,95 +0,0 @@
-#pragma once
-
-#include <linux/ioctl.h>
-#include <linux/types.h>
-
-#define KVM_TRACK_PAGE _IOWR(KVMIO, 0x20, track_page_param_t)
-#define KVM_USPT_REGISTER_PID _IOWR(KVMIO, 0x21, userspace_ctx_t)
-#define KVM_USPT_WAIT_AND_SEND _IO(KVMIO, 0x22)
-#define KVM_USPT_POLL_EVENT _IOWR(KVMIO, 0x23, page_fault_event_t)
-#define KVM_USPT_ACK_EVENT _IOWR(KVMIO, 0x24, ack_event_t)
-#define KVM_READ_GUEST_MEMORY _IOWR(KVMIO, 0x25, read_guest_memory_t)
-#define KVM_USPT_RESET _IO(KVMIO, 0x26)
-#define KVM_USPT_TRACK_ALL _IOWR(KVMIO, 0x27, track_all_pages_t)
-#define KVM_USPT_UNTRACK_ALL _IOWR(KVMIO, 0x28, track_all_pages_t)
-#define KVM_USPT_SETUP_RETINSTR_PERF _IOWR(KVMIO, 0x30, retired_instr_perf_config_t)
-#define KVM_USPT_READ_RETINSTR_PERF _IOWR(KVMIO, 0x31, retired_instr_perf_t)
-#define KVM_USPT_BATCH_TRACK_START _IOWR(KVMIO, 0x32, batch_track_config_t)
-#define KVM_USPT_BATCH_TRACK_STOP _IOWR(KVMIO, 0x33, batch_track_stop_and_get_t)
-#define KVM_USPT_BATCH_TRACK_EVENT_COUNT _IOWR(KVMIO, 0x34, batch_track_event_count_t)
-
-#define KVM_USPT_POLL_EVENT_NO_EVENT 1000
-#define KVM_USPT_POLL_EVENT_GOT_EVENT 0
-
-enum kvm_page_track_mode {
- KVM_PAGE_TRACK_WRITE,
- KVM_PAGE_TRACK_ACCESS,
- KVM_PAGE_TRACK_RESET_ACCESSED,
- KVM_PAGE_TRACK_EXEC,
- KVM_PAGE_TRACK_RESET_EXEC,
- KVM_PAGE_TRACK_MAX,
-};
-
-typedef struct {
- __u64 id; // filled automatically
- __u64 faulted_gpa;
- __u32 error_code;
- __u8 have_rip_info;
- __u64 rip;
- __u64 ns_timestamp;
- __u8 have_retired_instructions;
- __u64 retired_instructions;
-} page_fault_event_t;
-
-typedef struct {
- __s32 tracking_type;
- __u64 expected_events;
- __s32 perf_cpu;
- __u8 retrack;
-} batch_track_config_t;
-
-typedef struct {
- __u64 event_count;
-} batch_track_event_count_t;
-
-typedef struct {
- page_fault_event_t* out_buf;
- __u64 len;
- __u8 error_during_batch;
-} batch_track_stop_and_get_t;
-
-typedef struct {
- __s32 cpu; // cpu on which we want to read the counter
- __u64 retired_instruction_count; // result param
-} retired_instr_perf_t;
-
-typedef struct {
- __s32 cpu; // cpu on which counter should be programmed
-} retired_instr_perf_config_t;
-
-typedef struct {
- __u64 gpa;
- __u64 len;
- __u8 decrypt_with_host_key;
- __s32 wbinvd_cpu; // -1: do not flush; else logical cpu on which we flush
- void *output_buffer;
-} read_guest_memory_t;
-
-typedef struct {
- __s32 pid;
- __u8 get_rip;
-} userspace_ctx_t;
-
-typedef struct {
- __u64 id;
-} ack_event_t;
-
-typedef struct {
- __u64 gpa;
- __s32 track_mode;
-} track_page_param_t;
-
-typedef struct {
- __s32 track_mode;
-} track_all_pages_t;
-
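
The removed sevstep/uapi.h above is the ioctl ABI that userspace tooling builds against; after this merge the same definitions are expected to come from cachepc/uapi.h, as the test/sevstep.c hunk at the end of this diff suggests. As a hypothetical usage sketch only: the snippet below tracks all guest pages, waits for a single page-fault event, and acknowledges it. The choice of /dev/kvm as the target file descriptor, the include path, the sleep interval, and the assumption that the poll handler's return value reaches userspace unchanged are illustrative, not something this patch specifies.

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

#include "cachepc/uapi.h"

int main(void)
{
	track_all_pages_t track = { .track_mode = KVM_PAGE_TRACK_ACCESS };
	page_fault_event_t event;
	ack_event_t ack;
	int fd, ret;

	fd = open("/dev/kvm", O_RDONLY);
	if (fd < 0) return 1;

	/* mark all guest pages so the next access of each page faults */
	ret = ioctl(fd, KVM_USPT_TRACK_ALL, &track);
	if (ret) { close(fd); return 1; }

	/* KVM_USPT_POLL_EVENT_NO_EVENT (1000) is returned while no event
	 * is pending (assumed to be propagated as the ioctl return value) */
	while (ioctl(fd, KVM_USPT_POLL_EVENT, &event) == KVM_USPT_POLL_EVENT_NO_EVENT)
		usleep(100);
	printf("fault at gpa 0x%llx (id %llu)\n",
		(unsigned long long) event.faulted_gpa,
		(unsigned long long) event.id);

	/* acknowledge the event so the kernel side can continue */
	ack.id = event.id;
	ioctl(fd, KVM_USPT_ACK_EVENT, &ack);

	ret = ioctl(fd, KVM_USPT_UNTRACK_ALL, &track);
	close(fd);

	return ret ? 1 : 0;
}
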
diff --git a/sevstep/uspt.c b/sevstep/uspt.c
@@ -1,489 +0,0 @@
-#include "uspt.h"
-#include "sevstep.h"
-
-#include "svm/cachepc/cachepc.h"
-
-#include <linux/kvm.h>
-#include <linux/timekeeping.h>
-#include <linux/uaccess.h>
-#include <linux/types.h>
-#include <linux/vmalloc.h>
-#include <linux/printk.h>
-#include <linux/ratelimit.h>
-
-#define ARRLEN(x) (sizeof(x)/sizeof((x)[0]))
-
-typedef struct {
- bool is_active;
- int tracking_type;
- bool retrack;
-
- int perf_cpu;
-
- uint64_t gfn_retrack_backlog[10];
- int gfn_retrack_backlog_next_idx;
-
- page_fault_event_t * events;
- uint64_t event_next_idx;
- uint64_t events_size;
-
- bool error_occured;
-} batch_track_state_t;
-
-typedef struct {
- uint64_t idx_for_last_perf_reading;
- uint64_t last_perf_reading;
- uint64_t delta_valid_idx;
- uint64_t delta;
-} perf_state_t;
-
-// Crude sync mechanism; we do not have a good way to act on errors yet.
-static uint64_t last_sent_event_id = 1;
-static uint64_t last_acked_event_id = 1;
-DEFINE_RWLOCK(event_lock);
-
-static page_fault_event_t sent_event;
-static int have_event = 0;
-
-static bool get_rip = true;
-
-static int inited = 0;
-
-DEFINE_SPINLOCK(batch_track_state_lock);
-static batch_track_state_t batch_track_state;
-
-static perf_state_t perf_state;
-
-static uint64_t perf_state_update_and_get_delta(uint64_t current_event_idx);
-
-void
-sevstep_uspt_clear(void)
-{
- write_lock(&event_lock);
- inited = 0;
- last_sent_event_id = 1;
- last_acked_event_id = 1;
- have_event = 0;
- get_rip = false;
- write_unlock(&event_lock);
-}
-
-int
-sevstep_uspt_initialize(int pid, bool should_get_rip)
-{
- write_lock(&event_lock);
- inited = 1;
- last_sent_event_id = 1;
- last_acked_event_id = 1;
- have_event = 0;
- get_rip = should_get_rip;
- write_unlock(&event_lock);
-
- return 0;
-}
-
-int
-sevstep_uspt_is_initialiized()
-{
- return inited;
-}
-
-bool
-sevstep_uspt_should_get_rip()
-{
- bool tmp;
-
- read_lock(&event_lock);
- tmp = get_rip;
- read_unlock(&event_lock);
-
- return tmp;
-}
-
-int
-sevstep_uspt_send_and_block(uint64_t faulted_gpa, uint32_t error_code,
- bool have_rip, uint64_t rip)
-{
- ktime_t abort_after;
- page_fault_event_t message_for_user;
-
- read_lock(&event_lock);
- if (!sevstep_uspt_is_initialiized()) {
- pr_warn("sevstep_uspt_send_and_block: ctx not initialized!\n");
- read_unlock(&event_lock);
- return 1;
- }
- read_unlock(&event_lock);
-
- write_lock(&event_lock);
- if (last_sent_event_id != last_acked_event_id) {
- pr_warn("sevstep_uspt_send_and_block: "
- "event id_s out of sync, aborting. Fix this later\n");
- write_unlock(&event_lock);
- return 1;
- } else {
- // TODO: handle overflow
- last_sent_event_id++;
- }
- message_for_user.id = last_sent_event_id;
- message_for_user.faulted_gpa = faulted_gpa;
- message_for_user.error_code = error_code;
- message_for_user.have_rip_info = have_rip;
- message_for_user.rip = rip;
- message_for_user.ns_timestamp = ktime_get_real_ns();
- message_for_user.have_retired_instructions = false;
-
-	// for the poll-based interface
- have_event = 1;
- sent_event = message_for_user;
- // printk("sevstep_uspt_send_and_block sending event %llu\n", sent_event.id);
-
- write_unlock(&event_lock);
-
- // wait for ack, but with timeout. Otherwise small bugs in userland
- // easily lead to a kernel hang
-	abort_after = ktime_get() + 1000000000ULL; // 1 sec in nanoseconds
- while (!sevstep_uspt_is_event_done(sent_event.id)) {
- if (ktime_get() > abort_after) {
- pr_warn("sevstep_uspt_send_and_block: "
- "Waiting for ack of event %llu timed out, "
- "continuing\n",sent_event.id);
- return 3;
- }
- }
-
- return 0;
-}
-
-int
-sevstep_uspt_is_event_done(uint64_t id)
-{
- int res;
-
- read_lock(&event_lock);
- res = last_acked_event_id >= id;
- read_unlock(&event_lock);
-
- return res;
-}
-
-int
-sevstep_uspt_handle_poll_event(page_fault_event_t* userspace_mem)
-{
- int err;
-
- // most of the time we won't have an event
- read_lock(&event_lock);
- if (!have_event) {
- read_unlock(&event_lock);
- return KVM_USPT_POLL_EVENT_NO_EVENT;
- }
- read_unlock(&event_lock);
-
- write_lock(&event_lock);
- if (have_event) {
-		err = copy_to_user(userspace_mem,
- &sent_event, sizeof(page_fault_event_t));
- have_event = 0;
- } else {
- err = KVM_USPT_POLL_EVENT_NO_EVENT;
- }
- write_unlock(&event_lock);
-
- return err;
-}
-
-int
-sevstep_uspt_handle_ack_event_ioctl(ack_event_t event)
-{
- int err = 0;
-
- write_lock(&event_lock);
- if (event.id == last_sent_event_id) {
- last_acked_event_id = last_sent_event_id;
- } else {
- err = 1;
- pr_warn("sevstep_uspt_handle_ack_event_ioctl: "
- "last sent event id is %llu but received ack for %llu\n",
- last_sent_event_id, event.id);
- }
- write_unlock(&event_lock);
-
- return err;
-}
-
-// get retired instructions between current_event_idx-1 and current_event_idx
-// value is cached for multiple calls to the same current_event_idx
-uint64_t
-perf_state_update_and_get_delta(uint64_t current_event_idx)
-{
- uint64_t current_value;
-
- /* check if value is "cached" */
- if (perf_state.delta_valid_idx == current_event_idx) {
- if (current_event_idx == 0) {
- perf_state.idx_for_last_perf_reading = current_event_idx;
- perf_state.last_perf_reading = cachepc_read_pmc(0);
- }
- return perf_state.delta;
- }
-
- /* otherwise update, but logic is only valid for two consecutive events */
- if (current_event_idx != perf_state.idx_for_last_perf_reading+1) {
- pr_warn("perf_state_update_and_get_delta: "
- "last reading was for idx %llu but was queried for %llu\n",
- perf_state.idx_for_last_perf_reading, current_event_idx);
- }
-
- current_value = cachepc_read_pmc(0);
- perf_state.delta = (current_value - perf_state.last_perf_reading);
- perf_state.delta_valid_idx = current_event_idx;
-
- perf_state.idx_for_last_perf_reading = current_event_idx;
- perf_state.last_perf_reading = current_value;
-
- return perf_state.delta;
-}
-
-void
-sevstep_uspt_batch_tracking_inc_event_idx(void)
-{
- spin_lock(&batch_track_state_lock);
- batch_track_state.event_next_idx++;
- spin_unlock(&batch_track_state_lock);
-}
-
-int
-sevstep_uspt_batch_tracking_start(int tracking_type,uint64_t expected_events,
- int perf_cpu, bool retrack)
-{
- page_fault_event_t* events;
- uint64_t buffer_size, i;
-
- spin_lock(&batch_track_state_lock);
- if (batch_track_state.is_active) {
- pr_warn("sevstep_uspt_batch_tracking_start: "
- "overwriting active batch track config!\n");
- if (batch_track_state.events != NULL ) {
- vfree(batch_track_state.events);
- }
- }
- batch_track_state.is_active = false;
- spin_unlock(&batch_track_state_lock);
-
- buffer_size = expected_events * sizeof(page_fault_event_t);
- pr_warn("sevstep_uspt_batch_tracking_start: "
- "trying to alloc %llu bytes buffer for events\n",
- buffer_size);
- events = vmalloc(buffer_size);
- if (events == NULL) {
- pr_warn("sevstep_uspt_batch_tracking_start: "
- "faperf_cpuiled to alloc %llu bytes for event buffer\n",
- buffer_size);
- return 1; // note: lock not held here
- }
-
- // access each element once to force them into memory, improving performance
- // during tracking
- for (i = 0; i < expected_events * sizeof(page_fault_event_t); i++) {
- ((volatile uint8_t*)events)[i] = 0;
- }
-
- perf_state.idx_for_last_perf_reading = 0;
- perf_state.last_perf_reading = 0;
- perf_state.delta_valid_idx = 0;
- perf_state.delta = 0;
- cachepc_init_pmc(0, 0xc0, 0x00, PMC_GUEST, PMC_KERNEL | PMC_USER);
-
- spin_lock(&batch_track_state_lock);
-
- batch_track_state.perf_cpu = perf_cpu;
- batch_track_state.retrack = retrack;
-
- batch_track_state.events = events;
- batch_track_state.event_next_idx = 0;
- batch_track_state.events_size = expected_events;
-
- batch_track_state.gfn_retrack_backlog_next_idx = 0;
- batch_track_state.tracking_type = tracking_type;
- batch_track_state.error_occured = false;
-
- batch_track_state.is_active = true;
-
- spin_unlock(&batch_track_state_lock);
-
- return 0;
-}
-
-void
-sevstep_uspt_batch_tracking_handle_retrack(struct kvm_vcpu* vcpu,
- uint64_t current_fault_gfn)
-{
- uint64_t ret_instr_delta;
- int i, next_idx;
-
- spin_lock(&batch_track_state_lock);
-
- if (!batch_track_state.retrack) {
- spin_unlock(&batch_track_state_lock);
- return;
- }
-
- if (smp_processor_id() != batch_track_state.perf_cpu) {
- pr_warn("sevstep_uspt_batch_tracking_handle_retrack: perf was "
- "programmed on logical cpu %d but handler was called "
- "on %d. Did you forget to pin the vcpu thread?\n",
- batch_track_state.perf_cpu, smp_processor_id());
- }
- ret_instr_delta = perf_state_update_and_get_delta(batch_track_state.event_next_idx);
-
-	// The faulting instruction is probably the same as on the last fault:
-	// try to add the current fault to the retrack backlog and return.
-	// For the first event idx we do not have a valid ret_instr_delta.
-	// Retracking for the first time is fine; if we loop, we end up here
-	// again, but with a valid delta, on one of the next events.
- if ((ret_instr_delta < 2) && ( batch_track_state.event_next_idx != 0)) {
- next_idx = batch_track_state.gfn_retrack_backlog_next_idx;
- if (next_idx >= ARRLEN(batch_track_state.gfn_retrack_backlog)) {
- pr_warn("sevstep_uspt_batch_tracking_handle_retrack: "
- "retrack backlog full, dropping retrack for fault "
- "at 0x%llx\n", current_fault_gfn);
- } else {
- batch_track_state.gfn_retrack_backlog[next_idx] = current_fault_gfn;
- batch_track_state.gfn_retrack_backlog_next_idx++;
- }
-
- spin_unlock(&batch_track_state_lock);
- return;
- }
-
- /* made progress, retrack everything in backlog and reset idx */
- for (i = 0; i < batch_track_state.gfn_retrack_backlog_next_idx; i++) {
- sevstep_track_single_page(vcpu,
- batch_track_state.gfn_retrack_backlog[i],
- batch_track_state.tracking_type);
- }
-
- /* add current fault to list */
- batch_track_state.gfn_retrack_backlog[0] = current_fault_gfn;
- batch_track_state.gfn_retrack_backlog_next_idx = 1;
-
- spin_unlock(&batch_track_state_lock);
-
-}
-
-int
-sevstep_uspt_batch_tracking_save(uint64_t faulted_gpa, uint32_t error_code,
- bool have_rip, uint64_t rip)
-{
- uint64_t ret_instr_delta;
- page_fault_event_t* event;
-
- spin_lock(&batch_track_state_lock);
-
- if (!batch_track_state.is_active) {
- pr_warn("sevstep_uspt_batch_tracking_save: "
- "got save but batch tracking is not active!\n");
- batch_track_state.error_occured = true;
- spin_unlock(&batch_track_state_lock);
- return 1;
- }
-
-
- if (batch_track_state.event_next_idx >= batch_track_state.events_size) {
- pr_warn("sevstep_uspt_batch_tracking_save: events buffer is full!\n");
- batch_track_state.error_occured = true;
- spin_unlock(&batch_track_state_lock);
- return 1;
- }
-
- if (smp_processor_id() != batch_track_state.perf_cpu) {
- pr_warn("sevstep_uspt_batch_tracking_save: perf was "
- "programmed on logical cpu %d but handler was called "
- "on %d. Did you forget to pin the vcpu thread?\n",
- batch_track_state.perf_cpu, smp_processor_id());
- }
- ret_instr_delta = perf_state_update_and_get_delta(batch_track_state.event_next_idx);
-
-
- if (batch_track_state.events == NULL) {
- pr_warn("sevstep_uspt_batch_tracking_save: events buf was "
- "NULL but \"is_active\" was set! This should never happen!!!\n");
- spin_unlock(&batch_track_state_lock);
- return 1;
- }
-
- event = &batch_track_state.events[batch_track_state.event_next_idx];
- event->id = batch_track_state.event_next_idx;
- event->faulted_gpa = faulted_gpa;
- event->error_code = error_code;
- event->have_rip_info = have_rip;
- event->rip = rip;
- event->ns_timestamp = ktime_get_real_ns();
- event->have_retired_instructions = true;
- event->retired_instructions = ret_instr_delta;
-
-	// note: the event index is no longer incremented here but via
-	// sevstep_uspt_batch_tracking_inc_event_idx()
-
- if (batch_track_state.gfn_retrack_backlog_next_idx
- > ARRLEN(batch_track_state.gfn_retrack_backlog)) {
- pr_warn("sevstep_uspt_batch_tracking_save: "
- "gfn retrack backlog overflow!\n");
- batch_track_state.error_occured = true;
- spin_unlock(&batch_track_state_lock);
- return 1;
- }
-
- spin_unlock(&batch_track_state_lock);
-
- return 0;
-}
-
-int
-sevstep_uspt_batch_tracking_stop(page_fault_event_t* results,
- uint64_t len, __u8* error_occured)
-{
- spin_lock(&batch_track_state_lock);
- if (!batch_track_state.is_active) {
- pr_warn("sevstep_uspt: batch tracking not active\n");
- spin_unlock(&batch_track_state_lock);
- return 1;
-
- }
- batch_track_state.is_active = false;
-
- if (len > batch_track_state.event_next_idx) {
- pr_warn("sevstep_uspt_batch_tracking_stop: "
- "requested %llu events but got only %llu\n",
- len, batch_track_state.event_next_idx);
- spin_unlock(&batch_track_state_lock);
- return 1;
- }
-
- memcpy(results,batch_track_state.events, len*sizeof(page_fault_event_t));
- vfree(batch_track_state.events);
-
- *error_occured = batch_track_state.error_occured;
-
- spin_unlock(&batch_track_state_lock);
-
- return 0;
-}
-
-uint64_t
-sevstep_uspt_batch_tracking_get_events_count()
-{
- uint64_t buf;
-
- spin_lock(&batch_track_state_lock);
- buf = batch_track_state.event_next_idx;
- spin_unlock(&batch_track_state_lock);
-
- return buf;
-}
-
-bool
-sevstep_uspt_batch_tracking_in_progress()
-{
- return batch_track_state.is_active;
-}
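
The removed uspt.c implements two reporting paths: the blocking send/poll/ack channel above and a batch-tracking mode that preallocates an event buffer, records one page_fault_event_t per fault together with the retired-instruction delta read from PMC 0, and copies everything out when tracking is stopped. The sketch below is a hypothetical userspace driver for that batch mode through the ioctls from the removed uapi.h; the /dev/kvm target fd, the perf_cpu value, the expected_events bound, and the include path are illustrative assumptions rather than anything this patch defines.

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

#include "cachepc/uapi.h"

int main(void)
{
	batch_track_config_t config = {
		.tracking_type = KVM_PAGE_TRACK_ACCESS,
		.expected_events = 4096, /* upper bound on recorded faults */
		.perf_cpu = 0,           /* logical cpu the vcpu thread is pinned to */
		.retrack = 1,
	};
	batch_track_event_count_t count;
	batch_track_stop_and_get_t stop;
	page_fault_event_t *events;
	uint64_t i;
	int fd;

	fd = open("/dev/kvm", O_RDONLY);
	if (fd < 0) return 1;

	if (ioctl(fd, KVM_USPT_BATCH_TRACK_START, &config)) return 1;

	/* ... run the victim workload in the guest here ... */

	if (ioctl(fd, KVM_USPT_BATCH_TRACK_EVENT_COUNT, &count)) return 1;

	/* fetch exactly as many events as were recorded */
	events = calloc(count.event_count, sizeof(*events));
	if (!events) return 1;

	stop.out_buf = events;
	stop.len = count.event_count;
	stop.error_during_batch = 0;
	if (ioctl(fd, KVM_USPT_BATCH_TRACK_STOP, &stop)) return 1;

	for (i = 0; i < count.event_count; i++)
		printf("gpa 0x%llx retired %llu\n",
			(unsigned long long) events[i].faulted_gpa,
			(unsigned long long) events[i].retired_instructions);

	free(events);
	close(fd);

	return 0;
}
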
diff --git a/test/sevstep.c b/test/sevstep.c
@@ -1,4 +1,4 @@
-#include "sevstep/uapi.h"
+#include "cachepc/uapi.h"
#include <linux/kvm.h>
#include <sys/ioctl.h>