From 58d8565f015f9e06e1e51a0fe4654b966b2c27c0 Mon Sep 17 00:00:00 2001 From: Louis Burda Date: Wed, 5 Oct 2022 15:05:19 +0200 Subject: Refactor sevstep kernel patch into repository --- sevstep/kvm.c | 205 ++++++++++++++++++++++ sevstep/kvm.h | 4 + sevstep/mmu.c | 132 ++++++++++++++ sevstep/sevstep.c | 129 ++++++++++++++ sevstep/sevstep.h | 67 ++++++++ sevstep/uapi.h | 86 ++++++++++ sevstep/uspt.c | 503 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ sevstep/uspt.h | 49 ++++++ 8 files changed, 1175 insertions(+) create mode 100644 sevstep/kvm.c create mode 100644 sevstep/kvm.h create mode 100644 sevstep/mmu.c create mode 100644 sevstep/sevstep.c create mode 100644 sevstep/sevstep.h create mode 100644 sevstep/uapi.h create mode 100644 sevstep/uspt.c create mode 100644 sevstep/uspt.h (limited to 'sevstep') diff --git a/sevstep/kvm.c b/sevstep/kvm.c new file mode 100644 index 0000000..b6b0d49 --- /dev/null +++ b/sevstep/kvm.c @@ -0,0 +1,205 @@ +#include "kvm.h" + +#include + +bool +__untrack_single_page(struct kvm_vcpu *vcpu, gfn_t gfn, + enum kvm_page_track_mode mode) +{ + int idx; + bool ret; + struct kvm_memory_slot *slot; + + ret = false; + idx = srcu_read_lock(&vcpu->kvm->srcu); + if (mode == KVM_PAGE_TRACK_ACCESS) { + //printk("Removing gfn: %016llx from acess page track pool\n", gfn); + } + if (mode == KVM_PAGE_TRACK_WRITE) { + //printk("Removing gfn: %016llx from write page track pool\n", gfn); + } + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + + if (slot != NULL && kvm_slot_page_track_is_active(vcpu->kvm, slot, gfn, mode)) { + write_lock(&vcpu->kvm->mmu_lock); + kvm_slot_page_track_remove_page(vcpu->kvm, slot, gfn, mode); + write_unlock(&vcpu->kvm->mmu_lock); + ret = true; + } else { + printk("Failed to untrack %016llx because ", gfn); + if (slot == NULL) { + printk(KERN_CONT "slot was null"); + } else if (!kvm_slot_page_track_is_active(vcpu->kvm, slot, gfn, mode)) { + printk(KERN_CONT "page track was not active"); + } + printk(KERN_CONT "\n"); + } + srcu_read_unlock(&vcpu->kvm->srcu, idx); + return ret; +} +EXPORT_SYMBOL(__untrack_single_page); + +bool +__reset_accessed_on_page(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + int idx; + bool ret; + struct kvm_memory_slot *slot; + + ret = false; + idx = srcu_read_lock(&vcpu->kvm->srcu); + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + if( slot != NULL ) { + write_lock(&vcpu->kvm->mmu_lock); + //Vincent: The kvm mmu function now requires min_level + //We want all pages to protected so we do PG_LEVEL_4K + //https://patchwork.kernel.org/project/kvm/patch/20210416082511.2856-2-zhukeqian1@huawei.com/ + sevstep_kvm_mmu_slot_gfn_protect(vcpu->kvm,slot,gfn,PG_LEVEL_4K,KVM_PAGE_TRACK_RESET_ACCESSED); + write_unlock(&vcpu->kvm->mmu_lock); + ret = true; + } + srcu_read_unlock(&vcpu->kvm->srcu, idx); + return ret; +} +EXPORT_SYMBOL(__reset_accessed_on_page); + +bool +__clear_nx_on_page(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + int idx; + bool ret; + struct kvm_memory_slot *slot; + + ret = false; + idx = srcu_read_lock(&vcpu->kvm->srcu); + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + if( slot != NULL ) { + write_lock(&vcpu->kvm->mmu_lock); + //Vincent: The kvm mmu function now requires min_level + //We want all pages to protected so we do PG_LEVEL_4K + //https://patchwork.kernel.org/project/kvm/patch/20210416082511.2856-2-zhukeqian1@huawei.com/ + sevstep_kvm_mmu_slot_gfn_protect(vcpu->kvm, slot, gfn, + PG_LEVEL_4K, KVM_PAGE_TRACK_RESET_EXEC); + write_unlock(&vcpu->kvm->mmu_lock); + ret = true; + } + srcu_read_unlock(&vcpu->kvm->srcu, idx); + return ret; 
+} +EXPORT_SYMBOL(__clear_nx_on_page); + +bool +__track_single_page(struct kvm_vcpu *vcpu, gfn_t gfn, + enum kvm_page_track_mode mode) +{ + int idx; + bool ret; + struct kvm_memory_slot *slot; + + ret = false; + idx = srcu_read_lock(&vcpu->kvm->srcu); + if (mode == KVM_PAGE_TRACK_ACCESS) { + //printk_ratelimited("Adding gfn: %016llx to acess page track pool\n", gfn); + //printk("Adding gfn: %016llx to acess page track pool\n", gfn); + } + if (mode == KVM_PAGE_TRACK_WRITE) { + //printk_ratelimited("Adding gfn: %016llx to write page track pool\n", gfn); + } + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + if (slot != NULL && !kvm_slot_page_track_is_active(vcpu->kvm,slot, gfn, mode)) { + + write_lock(&vcpu->kvm->mmu_lock); + kvm_slot_page_track_add_page(vcpu->kvm, slot, gfn, mode); + write_unlock(&vcpu->kvm->mmu_lock); + ret = true; + + } else { + + printk("Failed to track %016llx because ", gfn); + if (slot == NULL) { + printk(KERN_CONT "slot was null"); + } + if (kvm_slot_page_track_is_active(vcpu->kvm, slot, gfn, mode)) { + printk(KERN_CONT "page is already tracked"); + } + printk(KERN_CONT "\n"); + } + srcu_read_unlock(&vcpu->kvm->srcu, idx); + return ret; +} +EXPORT_SYMBOL(__track_single_page); + +long +kvm_start_tracking(struct kvm_vcpu *vcpu,enum kvm_page_track_mode mode ) +{ + long count = 0; + u64 iterator, iterat_max; + struct kvm_memory_slot *slot; + int idx; + + //Vincent: Memslots interface changed into a rb tree, see + //here: https://lwn.net/Articles/856392/ + //and here: https://lore.kernel.org/all/cover.1632171478.git.maciej.szmigiero@oracle.com/T/#u + //Thus we use instead of + //iterat_max = vcpu->kvm->memslots[0]->memslots[0].base_gfn + // + vcpu->kvm->memslots[0]->memslots[0].npages; + struct rb_node *node; + struct kvm_memory_slot *first_memslot; + node = rb_last(&(vcpu->kvm->memslots[0]->gfn_tree)); + first_memslot = container_of(node, struct kvm_memory_slot, gfn_node[0]); + iterat_max = first_memslot->base_gfn + first_memslot->npages; + for (iterator=0; iterator < iterat_max; iterator++) + { + idx = srcu_read_lock(&vcpu->kvm->srcu); + slot = kvm_vcpu_gfn_to_memslot(vcpu, iterator); + if ( slot != NULL && !kvm_slot_page_track_is_active(vcpu->kvm, slot, iterator, mode)) { + write_lock(&vcpu->kvm->mmu_lock); + kvm_slot_page_track_add_page(vcpu->kvm, slot, iterator, mode); + write_unlock(&vcpu->kvm->mmu_lock); + count++; + } + srcu_read_unlock(&vcpu->kvm->srcu, idx); + } + + return count; +} +EXPORT_SYMBOL(kvm_start_tracking); + +long +kvm_stop_tracking(struct kvm_vcpu *vcpu,enum kvm_page_track_mode mode) +{ + long count = 0; + u64 iterator, iterat_max; + struct kvm_memory_slot *slot; + int idx; + + + //Vincent: Memslots interface changed into a rb tree, see + //here: https://lwn.net/Articles/856392/ + //and here: https://lore.kernel.org/all/cover.1632171478.git.maciej.szmigiero@oracle.com/T/#u + //Thus we use instead of + //iterat_max = vcpu->kvm->memslots[0]->memslots[0].base_gfn + // + vcpu->kvm->memslots[0]->memslots[0].npages; + struct rb_node *node; + struct kvm_memory_slot *first_memslot; + node = rb_last(&(vcpu->kvm->memslots[0]->gfn_tree)); + first_memslot = container_of(node, struct kvm_memory_slot, gfn_node[0]); + iterat_max = first_memslot->base_gfn + first_memslot->npages; + for (iterator=0; iterator < iterat_max; iterator++) + { + idx = srcu_read_lock(&vcpu->kvm->srcu); + slot = kvm_vcpu_gfn_to_memslot(vcpu, iterator); + //Vincent: I think see here https://patchwork.kernel.org/project/kvm/patch/20210924163152.289027-22-pbonzini@redhat.com/ + if ( slot != NULL 
&& kvm_slot_page_track_is_active(vcpu->kvm, slot, iterator, mode)) { + write_lock(&vcpu->kvm->mmu_lock); + kvm_slot_page_track_remove_page(vcpu->kvm, slot, iterator, mode); + write_unlock(&vcpu->kvm->mmu_lock); + count++; + } + srcu_read_unlock(&vcpu->kvm->srcu, idx); + } + + return count; +} +EXPORT_SYMBOL(kvm_stop_tracking); + diff --git a/sevstep/kvm.h b/sevstep/kvm.h new file mode 100644 index 0000000..35cb4d5 --- /dev/null +++ b/sevstep/kvm.h @@ -0,0 +1,4 @@ +#pragma once + +#include "sev-step.h" +#include "uapi.h" diff --git a/sevstep/mmu.c b/sevstep/mmu.c new file mode 100644 index 0000000..4eefea2 --- /dev/null +++ b/sevstep/mmu.c @@ -0,0 +1,132 @@ +#include "../sevstep/sevstep.h" +#include "../sevstep/uspt.h" + +void +sevstep_uspt_page_fault_handle(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) +{ + const int modes[] = { + KVM_PAGE_TRACK_WRITE, + KVM_PAGE_TRACK_ACCESS, + KVM_PAGE_TRACK_EXEC + }; + uint64_t current_rip; + bool was_tracked; + int have_rip, i; + int send_err; + + was_tracked = false; + for (i = 0; i < sizeof(modes) / sizeof(modes[0]); i++) { + if (kvm_slot_page_track_is_active(vcpu->kvm, + fault->slot, fault->gfn, modes[i])) { + __untrack_single_page(vcpu, fault->gfn, modes[i]); + was_tracked = true; + } + } + + if (was_tracked) { + have_rip = false; + if (uspt_should_get_rip()) + have_rip = sev_step_get_rip_kvm_vcpu(vcpu,¤t_rip) == 0; + if (uspt_batch_tracking_in_progress()) { + send_err = uspt_batch_tracking_save(fault->gfn << PAGE_SHIFT, + fault->error_code, have_rip, current_rip); + if (send_err) { + printk_ratelimited( + "uspt_batch_tracking_save failed with %d\n" + "##########################\n", send_err); + } + uspt_batch_tracking_handle_retrack(vcpu, fault->gfn); + uspt_batch_tracking_inc_event_idx(); + } else { + send_err = uspt_send_and_block(fault->gfn << PAGE_SHIFT, + fault->error_code, have_rip, current_rip); + if (send_err) { + printk("uspt_send_and_block failed with %d\n" + "##########################\n", send_err); + } + } + } +} + +bool +sevstep_spte_protect(u64 *sptep, bool pt_protect, enum kvm_page_track_mode mode) +{ + u64 spte = *sptep; + bool shouldFlush = false; + + if (!is_writable_pte(spte) && !(pt_protect && is_mmu_writable_spte(spte))) + return false; + + rmap_printk("spte %p %llx\n", sptep, *sptep); + + if (pt_protect) + spte &= ~EPT_SPTE_MMU_WRITABLE; + + if (mode == KVM_PAGE_TRACK_WRITE) { + spte = spte & ~PT_WRITABLE_MASK; + shouldFlush = true; + } else if (mode == KVM_PAGE_TRACK_RESET_ACCESSED) { + spte = spte & ~PT_ACCESSED_MASK; + } else if (mode == KVM_PAGE_TRACK_ACCESS) { + spte = spte & ~PT_PRESENT_MASK; + spte = spte & ~PT_WRITABLE_MASK; + spte = spte & ~PT_USER_MASK; + spte = spte | (0x1ULL << PT64_NX_SHIFT); + shouldFlush = true; + } else if (mode == KVM_PAGE_TRACK_EXEC) { + spte = spte | (0x1ULL << PT64_NX_SHIFT); + shouldFlush = true; + } else if (mode == KVM_PAGE_TRACK_RESET_EXEC) { + spte = spte & ~(0x1ULL << PT64_NX_SHIFT); + shouldFlush = true; + } else { + printk(KERN_WARNING "spte_protect was called with invalid mode" + "parameter %d\n",mode); + } + shouldFlush |= mmu_spte_update(sptep, spte); + return shouldFlush; +} +EXPORT_SYMBOL(sevstep_spte_protect); + +bool sevstep_rmap_protect(struct kvm_rmap_head *rmap_head, + bool pt_protect, enum kvm_page_track_mode mode) +{ + u64 *sptep; + struct rmap_iterator iter; + bool flush = false; + + for_each_rmap_spte(rmap_head, &iter, sptep) { + flush |= sevstep_spte_protect(sptep, pt_protect, mode); + } + + return flush; +} +EXPORT_SYMBOL(sevstep_rmap_protect); + +bool 
+sevstep_kvm_mmu_slot_gfn_protect(struct kvm *kvm, struct kvm_memory_slot *slot, + uint64_t gfn, int min_level, enum kvm_page_track_mode mode) +{ + struct kvm_rmap_head *rmap_head; + bool protected; + int i; + + protected = false; + + if (kvm_memslots_have_rmaps(kvm)) { + for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { + rmap_head = gfn_to_rmap(gfn, i, slot); + protected |= sevstep_rmap_protect(rmap_head, true, mode); + } + } + + if (is_tdp_mmu_enabled(kvm)) { + protected |= kvm_tdp_mmu_write_protect_gfn(kvm, + slot, gfn, min_level); + } + + return protected; +} +EXPORT_SYMBOL(sevstep_kvm_mmu_slot_gfn_protect); + diff --git a/sevstep/sevstep.c b/sevstep/sevstep.c new file mode 100644 index 0000000..3345e04 --- /dev/null +++ b/sevstep/sevstep.c @@ -0,0 +1,129 @@ +#include "sevstep.h" + +#include "mmu/mmu_internal.h" +#include "mmu.h" + +#include "irq.h" +#include "ioapic.h" +#include "mmu.h" +#include "mmu/tdp_mmu.h" +#include "x86.h" +#include "kvm_cache_regs.h" +#include "kvm_emulate.h" +#include "cpuid.h" +#include "mmu/spte.h" + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kvm_cache_regs.h" +#include "svm/svm.h" + +struct kvm* main_vm; +EXPORT_SYMBOL(main_vm); + +// used to store performance counter values; 6 counters, 2 readings per counter +// TODO: static! +uint64_t perf_reads[6][2]; +perf_ctl_config_t perf_configs[6]; +int perf_cpu; + + +uint64_t +perf_ctl_to_u64(perf_ctl_config_t * config) +{ + uint64_t result; + + result = 0; + result |= config->EventSelect & 0xffULL; + result |= (config->UintMask & 0xffULL) << 8; + result |= (config->OsUserMode & 0x3ULL) << 16; + result |= (config->Edge & 0x1ULL ) << 18; + result |= (config->Int & 0x1ULL ) << 20; + result |= (config->En & 0x1ULL ) << 22; + result |= (config->Inv & 0x1ULL ) << 23; + result |= (config->CntMask & 0xffULL) << 24; + result |= ((config->EventSelect & 0xf00ULL) >> 8) << 32; + result |= (config->HostGuestOnly & 0x3ULL) << 40; + + return result; + +} + +void +write_ctl(perf_ctl_config_t * config, int cpu, uint64_t ctl_msr) +{ + wrmsrl_on_cpu(cpu, ctl_msr, perf_ctl_to_u64(config)); +} + +void +read_ctr(uint64_t ctr_msr, int cpu, uint64_t* result) +{ + uint64_t tmp; + + rdmsrl_on_cpu(cpu, ctr_msr, &tmp); + *result = tmp & ( (0x1ULL << 48) - 1); +} + +void +setup_perfs() +{ + int i; + + perf_cpu = smp_processor_id(); + + for (i = 0; i < 6; i++) { + perf_configs[i].HostGuestOnly = 0x1; /* count only guest */ + perf_configs[i].CntMask = 0x0; + perf_configs[i].Inv = 0x0; + perf_configs[i].En = 0x0; + perf_configs[i].Int = 0x0; + perf_configs[i].Edge = 0x0; + perf_configs[i].OsUserMode = 0x3; /* count userland and kernel events */ + } + + perf_configs[0].EventSelect = 0x0c0; + perf_configs[0].UintMask = 0x0; + perf_configs[0].En = 0x1; + write_ctl(&perf_configs[0],perf_cpu, CTL_MSR_0); + + /* + * programm l2d hit from data cache miss perf for + * cpu_probe_pointer_chasing_inplace without counting thread. + * N.B. 
that this time we count host events + */ + perf_configs[1].EventSelect = 0x064; + perf_configs[1].UintMask = 0x70; + perf_configs[1].En = 0x1; + perf_configs[1].HostGuestOnly = 0x2; /* count only host events */ + write_ctl(&perf_configs[1],perf_cpu,CTL_MSR_1); +} +EXPORT_SYMBOL(setup_perfs); + +int +sev_step_get_rip_kvm_vcpu(struct kvm_vcpu* vcpu,uint64_t *rip) +{ + return 0; +} diff --git a/sevstep/sevstep.h b/sevstep/sevstep.h new file mode 100644 index 0000000..86d25f7 --- /dev/null +++ b/sevstep/sevstep.h @@ -0,0 +1,67 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include + + +#define CTL_MSR_0 0xc0010200ULL +#define CTL_MSR_1 0xc0010202ULL +#define CTL_MSR_2 0xc0010204ULL +#define CTL_MSR_3 0xc0010206ULL +#define CTL_MSR_4 0xc0010208ULL +#define CTL_MSR_5 0xc001020aULL + +#define CTR_MSR_0 0xc0010201ULL +#define CTR_MSR_1 0xc0010203ULL +#define CTR_MSR_2 0xc0010205ULL +#define CTR_MSR_3 0xc0010207ULL +#define CTR_MSR_4 0xc0010209ULL +#define CTR_MSR_5 0xc001020bULL + +typedef struct { + uint64_t HostGuestOnly; + uint64_t CntMask; + uint64_t Inv; + uint64_t En; + uint64_t Int; + uint64_t Edge; + uint64_t OsUserMode; + uint64_t UintMask; + uint64_t EventSelect; //12 bits in total split in [11:8] and [7:0] + +} perf_ctl_config_t; + +extern struct kvm* main_vm; + +bool sevstep_spte_protect(u64 *sptep, + bool pt_protect, enum kvm_page_track_mode mode); +bool sevstep_rmap_protect(struct kvm_rmap_head *rmap_head, + bool pt_protect, enum kvm_page_track_mode mode); +bool sevstep_kvm_mmu_slot_gfn_protect(struct kvm *kvm, struct kvm_memory_slot *slot, + uint64_t gfn, int min_level, enum kvm_page_track_mode mode); + +bool __untrack_single_page(struct kvm_vcpu *vcpu, gfn_t gfn, + enum kvm_page_track_mode mode); +bool __track_single_page(struct kvm_vcpu *vcpu, gfn_t gfn, + enum kvm_page_track_mode mode); +bool __reset_accessed_on_page(struct kvm_vcpu *vcpu, gfn_t gfn); +bool __clear_nx_on_page(struct kvm_vcpu *vcpu, gfn_t gfn); + +long kvm_start_tracking(struct kvm_vcpu *vcpu, enum kvm_page_track_mode mode); +long kvm_stop_tracking(struct kvm_vcpu *vcpu, enum kvm_page_track_mode mode); +void sev_step_handle_callback(void); + +uint64_t perf_ctl_to_u64(perf_ctl_config_t *config); +void write_ctl(perf_ctl_config_t *config, int cpu, uint64_t ctl_msr); +void read_ctr(uint64_t ctr_msr, int cpu, uint64_t *result); + +void setup_perfs(void); + +int sev_step_get_rip_kvm_vcpu(struct kvm_vcpu *vcpu, uint64_t *rip); diff --git a/sevstep/uapi.h b/sevstep/uapi.h new file mode 100644 index 0000000..e41a036 --- /dev/null +++ b/sevstep/uapi.h @@ -0,0 +1,86 @@ +#pragma once + +#include +#include + +#define KVM_TRACK_PAGE _IOWR(KVMIO, 0x20, track_page_param_t) +#define KVM_USPT_REGISTER_PID _IOWR(KVMIO, 0x21, userspace_ctx_t) +#define KVM_USPT_WAIT_AND_SEND _IO(KVMIO, 0x22) +#define KVM_USPT_POLL_EVENT _IOWR(KVMIO, 0x23, page_fault_event_t) +#define KVM_USPT_ACK_EVENT _IOWR(KVMIO, 0x24, ack_event_t) +#define KVM_READ_GUEST_MEMORY _IOWR(KVMIO, 0x25, read_guest_memory_t) +#define KVM_USPT_RESET _IO(KVMIO, 0x26) +#define KVM_USPT_TRACK_ALL _IOWR(KVMIO, 0x27, track_all_pages_t) +#define KVM_USPT_UNTRACK_ALL _IOWR(KVMIO, 0x28, track_all_pages_t) +#define KVM_USPT_SETUP_RETINSTR_PERF _IOWR(KVMIO, 0x30,retired_instr_perf_config_t) +#define KVM_USPT_READ_RETINSTR_PERF _IOWR(KVMIO,0x31, retired_instr_perf_t) +#define KVM_USPT_BATCH_TRACK_START _IOWR(KVMIO,0x32,batch_track_config_t) +#define KVM_USPT_BATCH_TRACK_STOP _IOWR(KVMIO,0x33,batch_track_stop_and_get_t) +#define 
KVM_USPT_BATCH_TRACK_EVENT_COUNT _IOWR(KVMIO,0x34,batch_track_event_count_t) + +#define KVM_USPT_POLL_EVENT_NO_EVENT 1000 +#define KVM_USPT_POLL_EVENT_GOT_EVENT 0 + +typedef struct { + uint64_t id; // filled automatically + uint64_t faulted_gpa; + uint32_t error_code; + bool have_rip_info; + uint64_t rip; + uint64_t ns_timestamp; + bool have_retired_instructions; + uint64_t retired_instructions; +} page_fault_event_t; + +typedef struct { + int tracking_type; + uint64_t expected_events; + int perf_cpu; + bool retrack; +} batch_track_config_t; + +typedef struct { + uint64_t event_count; +} batch_track_event_count_t; + +typedef struct { + page_fault_event_t* out_buf; + uint64_t len; + bool error_during_batch; +} batch_track_stop_and_get_t; + +typedef struct { + int cpu; // cpu on which we want to read the counter + uint64_t retired_instruction_count; // result param +} retired_instr_perf_t; + +typedef struct { + int cpu; // cpu on which counter should be programmed +} retired_instr_perf_config_t; + +typedef struct { + uint64_t gpa; + uint64_t len; + bool decrypt_with_host_key; + int wbinvd_cpu; // -1: do not flush; else logical cpu on which we flush + void* output_buffer; +} read_guest_memory_t; + +typedef struct { + int pid; + bool get_rip; +} userspace_ctx_t; + +typedef struct { + uint64_t id; +} ack_event_t; + +typedef struct { + uint64_t gpa; + int track_mode; +} track_page_param_t; + +typedef struct { + int track_mode; +} track_all_pages_t; + diff --git a/sevstep/uspt.c b/sevstep/uspt.c new file mode 100644 index 0000000..f7b329d --- /dev/null +++ b/sevstep/uspt.c @@ -0,0 +1,503 @@ +#include "uspt.h" +#include "sevstep.h" + +#include +#include +#include +#include +#include +#include +#include + +#define ARRLEN(x) (sizeof(x)/sizeof((x)[0])) + +typedef struct { + bool is_active; + int tracking_type; + bool retrack; + + int perf_cpu; + + uint64_t gfn_retrack_backlog[10]; + int gfn_retrack_backlog_next_idx; + + page_fault_event_t * events; + uint64_t event_next_idx; + uint64_t events_size; + + bool error_occured; +} batch_track_state_t; + +// crude sync mechanism. don't know a good way to act on errors yet. 
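+//
+// Handshake for the single-event (non-batch) path, all guarded by event_lock:
+//  1. uspt_send_and_block() (vCPU context) requires the previous event to be
+//     acked (last_sent_event_id == last_acked_event_id), bumps
+//     last_sent_event_id, publishes the event in sent_event/have_event and
+//     busy-waits (with a ~1s timeout) until the new id has been acked.
+//  2. Userspace fetches the event (KVM_USPT_POLL_EVENT path);
+//     uspt_handle_poll_event() copies sent_event out and clears have_event.
+//  3. Userspace acks the id (KVM_USPT_ACK_EVENT path); _uspt_handle_ack_event()
+//     updates last_acked_event_id, which lets uspt_send_and_block() return.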
+uint64_t last_sent_event_id = 1; +uint64_t last_acked_event_id = 1; +DEFINE_RWLOCK(event_lock); + +page_fault_event_t sent_event; +static int have_event = 0; + +static bool get_rip = true; + +static int inited = 0; + +DEFINE_SPINLOCK(batch_track_state_lock); +static batch_track_state_t batch_track_state; + +typedef struct { + uint64_t idx_for_last_perf_reading; + uint64_t last_perf_reading; + uint64_t delta_valid_idx; + uint64_t delta; +} perf_state_t; + +perf_state_t perf_state; + + +void +uspt_clear(void) +{ + write_lock(&event_lock); + inited = 0; + last_sent_event_id = 1; + last_acked_event_id = 1; + have_event = 0; + get_rip = false; + write_unlock(&event_lock); +} + +int +uspt_initialize(int pid,bool should_get_rip) +{ + write_lock(&event_lock); + inited = 1; + last_sent_event_id = 1; + last_acked_event_id = 1; + have_event = 0; + get_rip = should_get_rip; + write_unlock(&event_lock); + + return 0; +} + +int +uspt_is_initialiized() +{ + return inited; +} + +bool +uspt_should_get_rip() +{ + bool tmp; + + read_lock(&event_lock); + tmp = get_rip; + read_unlock(&event_lock); + + return tmp; +} + +int +uspt_send_and_block(uint64_t faulted_gpa, uint32_t error_code, + bool have_rip, uint64_t rip) +{ + ktime_t abort_after; + page_fault_event_t message_for_user; + + read_lock(&event_lock); + if (!uspt_is_initialiized()) { + printk("userspace_page_track_signals: " + "uspt_send_and_block : ctx not initialized!\n"); + read_unlock(&event_lock); + return 1; + } + read_unlock(&event_lock); + + write_lock(&event_lock); + if (last_sent_event_id != last_acked_event_id) { + printk("event id_s out of sync, aborting. Fix this later\n"); + write_unlock(&event_lock); + return 1; + } else { + // TODO: handle overflow + last_sent_event_id++; + } + message_for_user.id = last_sent_event_id; + message_for_user.faulted_gpa = faulted_gpa; + message_for_user.error_code = error_code; + message_for_user.have_rip_info = have_rip; + message_for_user.rip = rip; + message_for_user.ns_timestamp = ktime_get_real_ns(); + message_for_user.have_retired_instructions = false; + + // for poll based system; + have_event = 1; + sent_event = message_for_user; + // printk("uspt_send_and_block sending event %llu\n",sent_event.id); + + write_unlock(&event_lock); + + // wait for ack, but with timeout. 
Otherwise small bugs in userland + // easily lead to a kernel hang + abort_after = ktime_get() + 1000000000ULL; // 1 sec in nanosecond + while (!uspt_is_event_done(sent_event.id)) { + if (ktime_get() > abort_after) { + printk("Waiting for ack of event %llu timed out, continuing\n",sent_event.id); + return 3; + } + } + + return 0; +} + +int +uspt_is_event_done(uint64_t id) +{ + int res; + + read_lock(&event_lock); + res = last_acked_event_id >= id; + read_unlock(&event_lock); + + return res; +} + +int +uspt_handle_poll_event(page_fault_event_t* userpace_mem) +{ + int err; + + // most of the time we won't have an event + read_lock(&event_lock); + if (!have_event) { + read_unlock(&event_lock); + return KVM_USPT_POLL_EVENT_NO_EVENT; + } + read_unlock(&event_lock); + + write_lock(&event_lock); + if (have_event) { + err = copy_to_user(userpace_mem, + &sent_event, sizeof(page_fault_event_t)); + have_event = 0; + } else { + err = KVM_USPT_POLL_EVENT_NO_EVENT; + } + write_unlock(&event_lock); + + return err; +} + +static int +_uspt_handle_ack_event(uint64_t id) +{ + int err = 0; + + write_lock(&event_lock); + if (id == last_sent_event_id) { + last_acked_event_id = last_sent_event_id; + } else { + err = 1; + printk("last sent event id is %llu but received ack for %llu\n",last_sent_event_id,id); + } + write_unlock(&event_lock); + + return err; +} + +int +uspt_handle_ack_event_ioctl(ack_event_t event) +{ + return _uspt_handle_ack_event(event.id); +} + +// setup perf_state and program retired instruction performance counter +void +_perf_state_setup_retired_instructions(void) +{ + perf_ctl_config_t retired_instructions_perf_config; + retired_instructions_perf_config.HostGuestOnly = 0x1; // 0x1 means: count only guest + retired_instructions_perf_config.CntMask = 0x0; + retired_instructions_perf_config.Inv = 0x0; + retired_instructions_perf_config.Int = 0x0; + retired_instructions_perf_config.Edge = 0x0; + retired_instructions_perf_config.OsUserMode = 0x3; // 0x3 means: count kern and user events + retired_instructions_perf_config.EventSelect = 0x0c0; + retired_instructions_perf_config.UintMask = 0x0; + retired_instructions_perf_config.En = 0x1; + write_ctl(&retired_instructions_perf_config,batch_track_state.perf_cpu, CTL_MSR_0); +} + + +// get retired instructions between current_event_idx-1 and current_event_idx +// value is cached for multiple calls to the same current_event_idx +uint64_t +_perf_state_update_and_get_delta(uint64_t current_event_idx) +{ + uint64_t current_value; + + // check if value is "cached" + if (perf_state.delta_valid_idx == current_event_idx) { + if (current_event_idx == 0) { + read_ctr(CTR_MSR_0, batch_track_state.perf_cpu, ¤t_value); + perf_state.idx_for_last_perf_reading = current_event_idx; + perf_state.last_perf_reading = current_event_idx; + } + return perf_state.delta; + } + + // otherwise update, but logic is only valid for two consecutive events + if (current_event_idx != perf_state.idx_for_last_perf_reading+1) { + printk_ratelimited(KERN_CRIT "_perf_state_update_and_get_delta: " + "last reading was for idx %llu but was queried for %llu\n", + perf_state.idx_for_last_perf_reading, current_event_idx); + } + + read_ctr(CTR_MSR_0, batch_track_state.perf_cpu, ¤t_value); + perf_state.delta = (current_value - perf_state.last_perf_reading); + perf_state.delta_valid_idx = current_event_idx; + + perf_state.idx_for_last_perf_reading = current_event_idx; + perf_state.last_perf_reading = current_value; + + return perf_state.delta; +} + +void +uspt_batch_tracking_inc_event_idx(void) 
+{ + spin_lock(&batch_track_state_lock); + batch_track_state.event_next_idx++; + spin_unlock(&batch_track_state_lock); +} + +int +uspt_batch_tracking_start(int tracking_type,uint64_t expected_events, + int perf_cpu, bool retrack) +{ + page_fault_event_t* events; + uint64_t buffer_size, i; + + spin_lock(&batch_track_state_lock); + if (batch_track_state.is_active) { + printk("userspace_page_track_signals: overwriting " + "active batch track config!\n"); + if (batch_track_state.events != NULL ) { + vfree(batch_track_state.events); + } + } + batch_track_state.is_active = false; + spin_unlock(&batch_track_state_lock); + + buffer_size = expected_events * sizeof(page_fault_event_t); + printk("uspt_batch_tracking_start trying to alloc %llu " + "bytes buffer for events\n", buffer_size); + events = vmalloc(buffer_size); + if (events == NULL) { + printk("userspace_page_track_signals: " + "faperf_cpuiled to alloc %llu bytes for event buffer\n", + buffer_size); + return 1; // note: lock not held here + } + + // access each element once to force them into memory, improving performance + // during tracking + for (i = 0; i < expected_events * sizeof(page_fault_event_t); i++) { + ((volatile uint8_t*)events)[i] = 0; + } + + perf_state.idx_for_last_perf_reading = 0; + perf_state.last_perf_reading = 0; + perf_state.delta_valid_idx = 0; + perf_state.delta = 0; + _perf_state_setup_retired_instructions(); + + spin_lock(&batch_track_state_lock); + + batch_track_state.perf_cpu = perf_cpu; + batch_track_state.retrack = retrack; + + batch_track_state.events = events; + batch_track_state.event_next_idx = 0; + batch_track_state.events_size = expected_events; + + batch_track_state.gfn_retrack_backlog_next_idx = 0; + batch_track_state.tracking_type = tracking_type; + batch_track_state.error_occured = false; + + batch_track_state.is_active = true; + + spin_unlock(&batch_track_state_lock); + + return 0; +} + +void +uspt_batch_tracking_handle_retrack(struct kvm_vcpu* vcpu, + uint64_t current_fault_gfn) +{ + uint64_t ret_instr_delta; + int i, next_idx; + + spin_lock(&batch_track_state_lock); + + if (!batch_track_state.retrack) { + spin_unlock(&batch_track_state_lock); + return; + } + + if (smp_processor_id() != batch_track_state.perf_cpu) { + printk("uspt_batch_tracking_handle_retrack: perf was " + "programmed on logical cpu %d but handler was called " + "on %d. Did you forget to pin the vcpu thread?\n", + batch_track_state.perf_cpu, smp_processor_id()); + } + ret_instr_delta = _perf_state_update_and_get_delta(batch_track_state.event_next_idx); + + + // faulting instructions is probably the same as on last fault + // try to add current fault to retrack log and return + // for first event idx we do not have a valid ret_instr_delta. 
+ // Retracking for the frist time is fine, if we loop, we end up here + // again but with a valid delta on one of the next event + if( (ret_instr_delta < 2) && ( batch_track_state.event_next_idx != 0) ) { + next_idx = batch_track_state.gfn_retrack_backlog_next_idx; + if (next_idx >= ARRLEN(batch_track_state.gfn_retrack_backlog)) { + printk("uspt_batch_tracking_handle_retrack: retrack " + "backlog full, dropping retrack for fault " + "at 0x%llx\n", current_fault_gfn); + } else { + batch_track_state.gfn_retrack_backlog[next_idx] = current_fault_gfn; + batch_track_state.gfn_retrack_backlog_next_idx++; + } + + spin_unlock(&batch_track_state_lock); + return; + } + + /* made progress, retrack everything in backlog and reset idx */ + for (i = 0; i < batch_track_state.gfn_retrack_backlog_next_idx; i++) { + __track_single_page(vcpu, + batch_track_state.gfn_retrack_backlog[i], + batch_track_state.tracking_type); + } + + /* add current fault to list */ + batch_track_state.gfn_retrack_backlog[0] = current_fault_gfn; + batch_track_state.gfn_retrack_backlog_next_idx = 1; + + spin_unlock(&batch_track_state_lock); + +} + +int +uspt_batch_tracking_save(uint64_t faulted_gpa, uint32_t error_code, + bool have_rip, uint64_t rip) +{ + uint64_t ret_instr_delta; + page_fault_event_t* event; + + spin_lock(&batch_track_state_lock); + + if (!batch_track_state.is_active) { + printk_ratelimited("userspace_page_track_signals: got save but batch tracking is not active!\n"); + batch_track_state.error_occured = true; + spin_unlock(&batch_track_state_lock); + return 1; + } + + + if (batch_track_state.event_next_idx >= batch_track_state.events_size) { + printk_ratelimited("userspace_page_track_signals: events buffer is full!\n"); + batch_track_state.error_occured = true; + spin_unlock(&batch_track_state_lock); + return 1; + } + + if (smp_processor_id() != batch_track_state.perf_cpu) { + printk("uspt_batch_tracking_handle_retrack: perf was " + "programmed on logical cpu %d but handler was called " + "on %d. Did you forget to pin the vcpu thread?\n", + batch_track_state.perf_cpu, smp_processor_id()); + } + ret_instr_delta = _perf_state_update_and_get_delta(batch_track_state.event_next_idx); + + + if (batch_track_state.events == NULL) { + printk(KERN_CRIT "userspace_page_track_signals: events buf was " + "NULL but \"is_active\" was set! 
This should never happen!!!\n"); + spin_unlock(&batch_track_state_lock); + return 1; + } + + event = &batch_track_state.events[batch_track_state.event_next_idx]; + event->id = batch_track_state.event_next_idx; + event->faulted_gpa = faulted_gpa; + event->error_code = error_code; + event->have_rip_info = have_rip; + event->rip = rip; + event->ns_timestamp = ktime_get_real_ns(); + event->have_retired_instructions = true; + event->retired_instructions = ret_instr_delta; + + // old inc was here + + if (batch_track_state.gfn_retrack_backlog_next_idx + > ARRLEN(batch_track_state.gfn_retrack_backlog)) { + printk_ratelimited("userspace_page_track_signals: " + "gfn retrack backlog overflow!\n"); + batch_track_state.error_occured = true; + spin_unlock(&batch_track_state_lock); + return 1; + } + + spin_unlock(&batch_track_state_lock); + return 0; +} + +int +uspt_batch_tracking_stop(page_fault_event_t* results, uint64_t len, bool* error_occured) +{ + spin_lock(&batch_track_state_lock); + if (!batch_track_state.is_active) { + printk("userspace_page_track_signals: batch tracking not active\n"); + spin_unlock(&batch_track_state_lock); + return 1; + + } + batch_track_state.is_active = false; + + if (len > batch_track_state.event_next_idx) { + printk("userspace_page_track_signals: requested %llu " + "events but got only %llu\n", + len, batch_track_state.event_next_idx); + spin_unlock(&batch_track_state_lock); + return 1; + } + + memcpy(results,batch_track_state.events, len*sizeof(page_fault_event_t)); + vfree(batch_track_state.events); + + *error_occured = batch_track_state.error_occured; + + spin_unlock(&batch_track_state_lock); + + return 0; +} + +uint64_t +uspt_batch_tracking_get_events_count() +{ + uint64_t buf; + spin_lock(&batch_track_state_lock); + buf = batch_track_state.event_next_idx; + spin_unlock(&batch_track_state_lock); + + return buf; +} + +bool +uspt_batch_tracking_in_progress() +{ + return batch_track_state.is_active; +} diff --git a/sevstep/uspt.h b/sevstep/uspt.h new file mode 100644 index 0000000..7c34996 --- /dev/null +++ b/sevstep/uspt.h @@ -0,0 +1,49 @@ +#pragma once + +#include "uapi.h" + +#include +#include +#include + + +int uspt_initialize(int pid,bool should_get_rip); +int uspt_is_initialiized(void); +void uspt_clear(void); + +bool uspt_should_get_rip(void); + +int uspt_send_and_block(uint64_t faulted_gpa, uint32_t error_code, + bool have_rip, uint64_t rip); + +int uspt_is_event_done(uint64_t id); + +/* prepare next event based on faulted_gpa and error_code. Notify process + * behind pid_number. Event must be polled id is result param with the id + * used for the event. Can be used to call uspt_is_event_done */ +int uspt_send_notification(int pid_number, uint64_t faulted_gpa, + uint32_t error_code, uint64_t *id); + +/* copy next event to userpace_mem */ +int uspt_handle_poll_event(page_fault_event_t* userpace_mem); + +/* acknowledge receival of event to event handling logic */ +int uspt_handle_ack_event_ioctl(ack_event_t event); + +/* should be called after "uspt_batch_tracking_save", + * "uspt_batch_tracking_handle_retrack" and any future custom logic + * for an event is processed */ +void uspt_batch_tracking_inc_event_idx(void); +int uspt_batch_tracking_start(int tracking_type, uint64_t expected_events, int perf_cpu, bool retrack); +int uspt_batch_tracking_save(uint64_t faulted_gpa, uint32_t error_code, bool have_rip, uint64_t rip); +uint64_t uspt_batch_tracking_get_events_count(void); + +/* Stops batch tracking on copies the first @len events into @result. 
+ * If an error occurred at some point during the batch tracking,
+ * error_occured is set (there should also be a dmesg entry, but this allows programmatic access).
+ * The caller can use uspt_batch_tracking_get_events_count() to determine the amount
+ * of memory they should allocate for @results */
+int uspt_batch_tracking_stop(page_fault_event_t *results, uint64_t len, bool *error_occured);
+void uspt_batch_tracking_handle_retrack(struct kvm_vcpu *vcpu, uint64_t current_fault_gfn);
+void uspt_batch_tracking_get_retrack_gfns(uint64_t **gfns, uint64_t *len, int *tracking_type);
+bool uspt_batch_tracking_in_progress(void);
-- cgit v1.2.3-71-gd317
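
For reference, a minimal userspace sketch of how the single-event interface exported through uapi.h might be driven. This is an illustration only: it assumes the ioctls are issued on the main /dev/kvm file descriptor (suggested by the KVMIO ioctl numbers), that the handler return codes (KVM_USPT_POLL_EVENT_NO_EVENT / 0) are passed straight back to userspace, that uapi.h compiles in userspace together with <stdbool.h>/<stdint.h>, and that the numeric track_mode below is a placeholder for the desired kvm_page_track_mode value.

#include <fcntl.h>
#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>

#include "uapi.h" /* ioctl numbers, page_fault_event_t, ack_event_t, ... */

int main(void)
{
	int kvm_fd; /* assumption: the ioctls are served by /dev/kvm */
	userspace_ctx_t ctx = { .pid = getpid(), .get_rip = true };
	track_all_pages_t track = { .track_mode = 0 /* placeholder kvm_page_track_mode */ };
	page_fault_event_t event;
	ack_event_t ack;

	kvm_fd = open("/dev/kvm", O_RDWR);
	if (kvm_fd < 0)
		return 1;

	/* register this process as the event consumer, then track all guest pages */
	if (ioctl(kvm_fd, KVM_USPT_REGISTER_PID, &ctx) < 0)
		return 1;
	if (ioctl(kvm_fd, KVM_USPT_TRACK_ALL, &track) < 0)
		return 1;

	for (;;) {
		/* poll until the vCPU side publishes an event via uspt_send_and_block() */
		if (ioctl(kvm_fd, KVM_USPT_POLL_EVENT, &event) ==
		    KVM_USPT_POLL_EVENT_NO_EVENT) {
			usleep(100);
			continue;
		}

		printf("fault: gpa=0x%" PRIx64 " err=0x%" PRIx32 " rip=0x%" PRIx64 "\n",
		       event.faulted_gpa, event.error_code,
		       event.have_rip_info ? event.rip : 0);

		/* ack the event so the vCPU thread stops spinning in uspt_send_and_block() */
		ack.id = event.id;
		ioctl(kvm_fd, KVM_USPT_ACK_EVENT, &ack);
	}
}

The batch interface (KVM_USPT_BATCH_TRACK_START/_STOP) avoids this per-fault round trip: events are buffered kernel-side in the vmalloc'd array and copied out in one go when tracking is stopped.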