From 58d8565f015f9e06e1e51a0fe4654b966b2c27c0 Mon Sep 17 00:00:00 2001 From: Louis Burda Date: Wed, 5 Oct 2022 15:05:19 +0200 Subject: Refactor sevstep kernel patch into repository --- Makefile | 31 +- cachepc/asm.h | 66 ++ cachepc/cache_types.h | 66 ++ cachepc/cachepc.c | 445 ++++++++++++ cachepc/cachepc.h | 188 +++++ cachepc/device_conf.h | 29 + cachepc/kvm.c | 392 +++++++++++ cachepc/kvm.h | 6 + cachepc/uapi.h | 8 + cachepc/util.c | 38 + cachepc/util.h | 8 + kmod/asm.h | 66 -- kmod/cache_types.h | 66 -- kmod/cachepc.c | 445 ------------ kmod/cachepc.h | 188 ----- kmod/cachepc_user.h | 8 - kmod/device_conf.h | 29 - kmod/kvm.c | 392 ----------- kmod/kvm.h | 6 - kmod/util.c | 38 - kmod/util.h | 8 - patch.diff | 1864 +++---------------------------------------------- sevstep/kvm.c | 205 ++++++ sevstep/kvm.h | 4 + sevstep/mmu.c | 132 ++++ sevstep/sevstep.c | 129 ++++ sevstep/sevstep.h | 67 ++ sevstep/uapi.h | 86 +++ sevstep/uspt.c | 503 +++++++++++++ sevstep/uspt.h | 49 ++ test/access.c | 2 +- test/eviction.c | 2 +- test/kvm.c | 3 +- test/sev-es.c | 3 +- test/sev.c | 3 +- test/sevstep.c | 32 + 36 files changed, 2574 insertions(+), 3033 deletions(-) create mode 100644 cachepc/asm.h create mode 100644 cachepc/cache_types.h create mode 100644 cachepc/cachepc.c create mode 100644 cachepc/cachepc.h create mode 100644 cachepc/device_conf.h create mode 100644 cachepc/kvm.c create mode 100644 cachepc/kvm.h create mode 100644 cachepc/uapi.h create mode 100644 cachepc/util.c create mode 100644 cachepc/util.h delete mode 100644 kmod/asm.h delete mode 100644 kmod/cache_types.h delete mode 100644 kmod/cachepc.c delete mode 100644 kmod/cachepc.h delete mode 100644 kmod/cachepc_user.h delete mode 100644 kmod/device_conf.h delete mode 100644 kmod/kvm.c delete mode 100644 kmod/kvm.h delete mode 100644 kmod/util.c delete mode 100644 kmod/util.h create mode 100644 sevstep/kvm.c create mode 100644 sevstep/kvm.h create mode 100644 sevstep/mmu.c create mode 100644 sevstep/sevstep.c create mode 100644 sevstep/sevstep.h create mode 100644 sevstep/uapi.h create mode 100644 sevstep/uspt.c create mode 100644 sevstep/uspt.h mode change 100755 => 100644 test/access.c mode change 100755 => 100644 test/eviction.c mode change 100755 => 100644 test/kvm.c mode change 100755 => 100644 test/sev-es.c mode change 100755 => 100644 test/sev.c create mode 100644 test/sevstep.c diff --git a/Makefile b/Makefile index 5f394db..3388608 100755 --- a/Makefile +++ b/Makefile @@ -1,28 +1,31 @@ -KERNEL_SOURCE ?= /usr/src/linux +LINUX ?= /usr/src/linux PWD := $(shell pwd) -all: build test/eviction test/access test/kvm test/sev test/sev-es +all: build test/eviction test/access test/kvm test/sev test/sev-es test/sevstep clean: - $(MAKE) -C $(KERNEL_SOURCE) SUBDIRS=arch/x86/kvm clean + $(MAKE) -C $(LINUX) SUBDIRS=arch/x86/kvm clean -$(KERNEL_SOURCE)/arch/x86/kvm/svm/cachepc: - ln -sf $(PWD)/kmod $@ +$(LINUX)/arch/x86/kvm/svm/cachepc: + ln -sf $(PWD)/cachepc $@ -build: - $(MAKE) -C $(KERNEL_SOURCE) -j6 M=arch/x86/kvm +$(LINUX)/arch/x86/kvm/sevstep: + ln -sf $(PWD)/sevstep $@ + +build: $(LINUX)/arch/x86/kvm/svm/cachepc $(LINUX)/arch/x86/kvm/sevstep + $(MAKE) -C $(LINUX) -j6 M=arch/x86/kvm load: sudo rmmod kvm_amd || true sudo rmmod kvm || true - sudo insmod $(KERNEL_SOURCE)/arch/x86/kvm/kvm.ko - sudo insmod $(KERNEL_SOURCE)/arch/x86/kvm/kvm-amd.ko + sudo insmod $(LINUX)/arch/x86/kvm/kvm.ko + sudo insmod $(LINUX)/arch/x86/kvm/kvm-amd.ko + +test/%: test/%.c cachepc/cachepc_user.h + clang -o $@ $< -fsanitize=address -I . 
-Wunused-variable -test/%: test/%.c kmod/cachepc_user.h -# $(CC) -o $@ $< -I kmod - clang -fsanitize=address -o $@ $< -I kmod -Wunused-variable -update: - git -C $(KERNEL_SOURCE) diff 0aaa1e599bee256b3b15643bbb95e80ce7aa9be5 -G. > patch.diff +update: + git -C $(LINUX) diff 0aaa1e599bee256b3b15643bbb95e80ce7aa9be5 -G. > patch.diff .PHONY: all clean build load update diff --git a/cachepc/asm.h b/cachepc/asm.h new file mode 100644 index 0000000..9e9385a --- /dev/null +++ b/cachepc/asm.h @@ -0,0 +1,66 @@ +#pragma once + +#include + +#define CPUID_AFFECTED_REGS "rax", "rbx", "rcx", "rdx" + +__attribute__((always_inline)) +static inline void cachepc_cpuid(void); + +__attribute__((always_inline)) +static inline void cachepc_lfence(void); + +__attribute__((always_inline)) +static inline void cachepc_sfence(void); + +__attribute__((always_inline)) +static inline void cachepc_mfence(void); + +__attribute__((always_inline)) +static inline void cachepc_readq(void *p); + +void +cachepc_cpuid(void) +{ + asm volatile( + "mov $0x80000005, %%eax\n\t" + "cpuid\n\t" + ::: CPUID_AFFECTED_REGS + ); +} + +void +cachepc_lfence(void) +{ + asm volatile( + "lfence\n\t" + ::: "memory" + ); +} + +void +cachepc_sfence(void) +{ + asm volatile( + "sfence\n\t" + ::: "memory" + ); +} + +void +cachepc_mfence(void) +{ + asm volatile( + "mfence\n\t" + ::: "memory" + ); +} + +void +cachepc_readq(void *p) +{ + asm volatile ( + "movq (%0), %%r10\n\t" + : : "r" (p) : "r10" + ); +} diff --git a/cachepc/cache_types.h b/cachepc/cache_types.h new file mode 100644 index 0000000..b337d55 --- /dev/null +++ b/cachepc/cache_types.h @@ -0,0 +1,66 @@ +#pragma once + +#include "device_conf.h" + +#define SET_MASK(SETS) (((((uintptr_t) SETS) * CACHELINE_SIZE) - 1) ^ (CACHELINE_SIZE - 1)) + +#define REMOVE_PAGE_OFFSET(ptr) ((void *) (((uintptr_t) ptr) & PAGE_MASK)) + +#define GET_BIT(b, i) (((b) >> (i)) & 1) +#define SET_BIT(b, i) ((b) | (1 << (i))) + +/* Operate cacheline flags + * Used flags: + * 32 2 1 0 + * | | ... 
| cache group initialized | last | first |
+ */
+#define DEFAULT_FLAGS 0
+#define SET_FIRST(flags) SET_BIT(flags, 0)
+#define SET_LAST(flags) SET_BIT(flags, 1)
+#define SET_CACHE_GROUP_INIT(flags) SET_BIT(flags, 2)
+#define IS_FIRST(flags) GET_BIT(flags, 0)
+#define IS_LAST(flags) GET_BIT(flags, 1)
+#define IS_CACHE_GROUP_INIT(flags) GET_BIT(flags, 2)
+
+#define CL_NEXT_OFFSET offsetof(struct cacheline, next)
+#define CL_PREV_OFFSET offsetof(struct cacheline, prev)
+
+typedef enum cache_level cache_level;
+typedef enum addressing_type addressing_type;
+typedef struct cacheline cacheline;
+typedef struct cache_ctx cache_ctx;
+
+enum cache_level {L1, L2};
+enum addressing_type {VIRTUAL, PHYSICAL};
+
+struct cache_ctx {
+	cache_level cache_level;
+	addressing_type addressing;
+
+	uint32_t sets;
+	uint32_t associativity;
+	uint32_t access_time;
+	uint32_t nr_of_cachelines;
+	uint32_t set_size;
+	uint32_t cache_size;
+};
+
+struct cacheline {
+	// Doubly linked list inside same set
+	// Attention: CL_NEXT_OFFSET and CL_PREV_OFFSET
+	// must be kept up to date
+	cacheline *next;
+	cacheline *prev;
+
+	uint32_t cache_set;
+	uint32_t cache_line;
+	uint32_t flags;
+
+	// Unused padding to fill cache line
+	uint64_t count;
+
+	char padding[24];
+};
+
+static_assert(sizeof(struct cacheline) == CACHELINE_SIZE, "Bad cache line struct size");
+static_assert(CL_NEXT_OFFSET == 0 && CL_PREV_OFFSET == 8);
diff --git a/cachepc/cachepc.c b/cachepc/cachepc.c
new file mode 100644
index 0000000..09ed705
--- /dev/null
+++ b/cachepc/cachepc.c
@@ -0,0 +1,445 @@
+#include "cachepc.h"
+
+#include
+#include
+#include
+#include
+#include
+
+static void cl_insert(cacheline *last_cl, cacheline *new_cl);
+static void *remove_cache_set(cache_ctx *ctx, void *ptr);
+static void *remove_cache_group_set(void *ptr);
+
+static cacheline *prepare_cache_set_ds(cache_ctx *ctx, uint32_t *set, uint32_t sets_len);
+static cacheline *build_cache_ds(cache_ctx *ctx, cacheline **cacheline_ptr_arr);
+static void build_randomized_list_for_cache_set(cache_ctx *ctx, cacheline **cacheline_ptr_arr);
+static cacheline **allocate_cache_ds(cache_ctx *ctx);
+static uint16_t get_virt_cache_set(cache_ctx *ctx, void *ptr);
+
+void __attribute__((optimize(1))) // prevent instruction reordering
+cachepc_prime_vcall(uintptr_t ret, cacheline *cl)
+{
+	cachepc_prime(cl);
+	asm volatile ("mov %0, %%rax; jmp *%%rax" : : "r"(ret) : "rax");
+}
+
+void __attribute__((optimize(1))) // prevent instruction reordering
+cachepc_probe_vcall(uintptr_t ret, cacheline *cl)
+{
+	cachepc_probe(cl);
+	asm volatile ("mov %0, %%rax; jmp *%%rax" : : "r"(ret) : "rax");
+}
+
+void
+cachepc_init_pmc(uint8_t index, uint8_t event_no, uint8_t event_mask)
+{
+	uint64_t event;
+	uint64_t reg_addr;
+
+	/* REF: https://developer.amd.com/resources/developer-guides-manuals (PPR 17H 31H, P.166)
+	 *
+	 * performance event selection via 0xC001_020X with X = (0..A)[::2]
+	 * performance event reading via 0xC001_020X with X = (1..B)[::2]
+	 */
+
+	WARN_ON(index >= 6);
+	if (index >= 6) return;
+
+	reg_addr = 0xc0010200 + index * 2;
+	event = event_no | (event_mask << 8);
+	event |= (1ULL << 17); /* OS (kernel) events only */
+	event |= (1ULL << 22); /* enable performance counter */
+	event |= (1ULL << 40); /* Host events only */
+	printk(KERN_WARNING "CachePC: Initialized %i. 
PMC %02X:%02X\n", + index, event_no, event_mask); + asm volatile ("wrmsr" : : "c"(reg_addr), "a"(event), "d"(0x00)); +} + +cache_ctx * +cachepc_get_ctx(cache_level cache_level) +{ + cache_ctx *ctx; + + ctx = kzalloc(sizeof(cache_ctx), GFP_KERNEL); + BUG_ON(ctx == NULL); + + BUG_ON(cache_level != L1); + if (cache_level == L1) { + ctx->addressing = L1_ADDRESSING; + ctx->sets = L1_SETS; + ctx->associativity = L1_ASSOCIATIVITY; + ctx->access_time = L1_ACCESS_TIME; + } else if (cache_level == L2) { + ctx->addressing = L2_ADDRESSING; + ctx->sets = L2_SETS; + ctx->associativity = L2_ASSOCIATIVITY; + ctx->access_time = L2_ACCESS_TIME; + } else { + return NULL; + } + + ctx->cache_level = cache_level; + ctx->nr_of_cachelines = ctx->sets * ctx->associativity; + ctx->set_size = CACHELINE_SIZE * ctx->associativity; + ctx->cache_size = ctx->sets * ctx->set_size; + + return ctx; +} + +void +cachepc_release_ctx(cache_ctx *ctx) +{ + kfree(ctx); +} + + +/* + * Initialises the complete cache data structure for the given context + */ +cacheline * +cachepc_prepare_ds(cache_ctx *ctx) +{ + cacheline **cacheline_ptr_arr; + cacheline *cache_ds; + + //printk(KERN_WARNING "CachePC: Preparing ds..\n"); + + cacheline_ptr_arr = allocate_cache_ds(ctx); + cache_ds = build_cache_ds(ctx, cacheline_ptr_arr); + kfree(cacheline_ptr_arr); + + // printk(KERN_WARNING "CachePC: Preparing ds done\n"); + + return cache_ds; +} + +void +cachepc_release_ds(cache_ctx *ctx, cacheline *ds) +{ + kfree(remove_cache_set(ctx, ds)); +} + +cacheline * +cachepc_prepare_victim(cache_ctx *ctx, uint32_t set) +{ + cacheline *victim_set, *victim_cl; + cacheline *curr_cl, *next_cl; + + victim_set = prepare_cache_set_ds(ctx, &set, 1); + victim_cl = victim_set; + + // Free the other lines in the same set that are not used. + if (ctx->addressing == PHYSICAL) { + curr_cl = victim_cl->next; + do { + next_cl = curr_cl->next; + // Here, it is ok to free them directly, as every line in the same + // set is from a different page anyway. 
+			kfree(remove_cache_group_set(curr_cl));
+			curr_cl = next_cl;
+		} while(curr_cl != victim_cl);
+	}
+
+	return victim_cl;
+}
+
+void
+cachepc_release_victim(cache_ctx *ctx, cacheline *victim)
+{
+	kfree(remove_cache_set(ctx, victim));
+}
+
+void
+cachepc_save_msrmts(cacheline *head)
+{
+	cacheline *curr_cl;
+
+	// printk(KERN_WARNING "CachePC: Updating /proc/cachepc\n");
+
+	curr_cl = head;
+	do {
+		if (IS_FIRST(curr_cl->flags)) {
+			BUG_ON(curr_cl->cache_set >= cachepc_msrmts_count);
+			cachepc_msrmts[curr_cl->cache_set] = curr_cl->count;
+		}
+
+		curr_cl = curr_cl->prev;
+	} while (curr_cl != head);
+}
+
+void
+cachepc_print_msrmts(cacheline *head)
+{
+	cacheline *curr_cl;
+
+	curr_cl = head;
+	do {
+		if (IS_FIRST(curr_cl->flags)) {
+			printk(KERN_WARNING "CachePC: Count for cache set %i: %llu\n",
+				curr_cl->cache_set, curr_cl->count);
+		}
+
+		curr_cl = curr_cl->prev;
+	} while (curr_cl != head);
+}
+
+
+cacheline *
+prepare_cache_set_ds(cache_ctx *ctx, uint32_t *sets, uint32_t sets_len)
+{
+	cacheline *cache_ds, **first_cl_in_sets, **last_cl_in_sets;
+	cacheline *to_del_cls, *curr_cl, *next_cl, *cache_set_ds;
+	uint32_t i, cache_groups_len, cache_groups_max_len;
+	uint32_t *cache_groups;
+
+	cache_ds = cachepc_prepare_ds(ctx);
+
+	first_cl_in_sets = kzalloc(ctx->sets * sizeof(cacheline *), GFP_KERNEL);
+	BUG_ON(first_cl_in_sets == NULL);
+
+	last_cl_in_sets = kzalloc(ctx->sets * sizeof(cacheline *), GFP_KERNEL);
+	BUG_ON(last_cl_in_sets == NULL);
+
+	// Find the cache groups that are used, so that we can delete the other ones
+	// later (to avoid memory leaks)
+	cache_groups_max_len = ctx->sets / CACHE_GROUP_SIZE;
+	cache_groups = kmalloc(cache_groups_max_len * sizeof(uint32_t), GFP_KERNEL);
+	BUG_ON(cache_groups == NULL);
+
+	cache_groups_len = 0;
+	for (i = 0; i < sets_len; ++i) {
+		if (!is_in_arr(sets[i] / CACHE_GROUP_SIZE, cache_groups, cache_groups_len)) {
+			cache_groups[cache_groups_len] = sets[i] / CACHE_GROUP_SIZE;
+			++cache_groups_len;
+		}
+	}
+
+	to_del_cls = NULL;
+	curr_cl = cache_ds;
+
+	// Extract the partial data structure for the cache sets and ensure correct freeing
+	do {
+		next_cl = curr_cl->next;
+
+		if (IS_FIRST(curr_cl->flags)) {
+			first_cl_in_sets[curr_cl->cache_set] = curr_cl;
+		}
+		if (IS_LAST(curr_cl->flags)) {
+			last_cl_in_sets[curr_cl->cache_set] = curr_cl;
+		}
+
+		if (ctx->addressing == PHYSICAL && !is_in_arr(
+			curr_cl->cache_set / CACHE_GROUP_SIZE, cache_groups, cache_groups_len))
+		{
+			// Already free all unused blocks of the cache ds for physical
+			// addressing, because we lose their refs
+			cl_insert(to_del_cls, curr_cl);
+			to_del_cls = curr_cl;
+		}
+		curr_cl = next_cl;
+
+	} while(curr_cl != cache_ds);
+
+	// Fix partial cache set ds
+	for (i = 0; i < sets_len; ++i) {
+		last_cl_in_sets[sets[i]]->next = first_cl_in_sets[sets[(i + 1) % sets_len]];
+		first_cl_in_sets[sets[(i + 1) % sets_len]]->prev = last_cl_in_sets[sets[i]];
+	}
+	cache_set_ds = first_cl_in_sets[sets[0]];
+
+	// Free unused cache lines
+	if (ctx->addressing == PHYSICAL) {
+		cachepc_release_ds(ctx, to_del_cls);
+	}
+
+	kfree(first_cl_in_sets);
+	kfree(last_cl_in_sets);
+	kfree(cache_groups);
+
+	return cache_set_ds;
+}
+
+void
+cl_insert(cacheline *last_cl, cacheline *new_cl)
+{
+	if (last_cl == NULL) {
+		// Adding the first entry is a special case
+		new_cl->next = new_cl;
+		new_cl->prev = new_cl;
+	} else {
+		new_cl->next = last_cl->next;
+		new_cl->prev = last_cl;
+		last_cl->next->prev = new_cl;
+		last_cl->next = new_cl;
+	}
+}
+
+void *
+remove_cache_set(cache_ctx *ctx, void *ptr)
+{
+	
return (void *) (((uintptr_t) ptr) & ~SET_MASK(ctx->sets)); +} + +void * +remove_cache_group_set(void *ptr) +{ + return (void *) (((uintptr_t) ptr) & ~SET_MASK(CACHE_GROUP_SIZE)); +} + + +/* + * Create a randomized doubly linked list with the following structure: + * set A <--> set B <--> ... <--> set X <--> set A + * where each set is one of the cache sets, in a random order. + * The sets are a doubly linked list of cachelines themselves: + * set A: + * line[A + x0 * #sets] <--> line[A + x1 * #sets] <--> ... + * where x0, x1, ..., xD is a random permutation of 1, 2, ..., D + * and D = Associativity = | cache set | + */ +cacheline *build_cache_ds(cache_ctx *ctx, cacheline **cl_ptr_arr) { + cacheline **first_cl_in_sets, **last_cl_in_sets; + cacheline **cl_ptr_arr_sorted; + cacheline *curr_cl; + cacheline *cache_ds; + uint32_t *idx_per_set; + uint32_t idx_curr_set, set_offset; + uint32_t i, j, set, set_len; + uint32_t *idx_map; + + idx_per_set = kzalloc(ctx->sets * sizeof(uint32_t), GFP_KERNEL); + BUG_ON(idx_per_set == NULL); + + cl_ptr_arr_sorted = kzalloc(ctx->nr_of_cachelines * sizeof(cacheline *), GFP_KERNEL); + BUG_ON(cl_ptr_arr_sorted == NULL); + + set_len = ctx->associativity; + for (i = 0; i < ctx->nr_of_cachelines; ++i) { + set_offset = cl_ptr_arr[i]->cache_set * set_len; + idx_curr_set = idx_per_set[cl_ptr_arr[i]->cache_set]; + + cl_ptr_arr_sorted[set_offset + idx_curr_set] = cl_ptr_arr[i]; + idx_per_set[cl_ptr_arr[i]->cache_set] += 1; + } + + // Build doubly linked list for every set + for (set = 0; set < ctx->sets; ++set) { + set_offset = set * set_len; + build_randomized_list_for_cache_set(ctx, cl_ptr_arr_sorted + set_offset); + } + + // Relink the sets among each other + idx_map = kzalloc(ctx->sets * sizeof(uint32_t), GFP_KERNEL); + BUG_ON(idx_map == NULL); + + gen_random_indices(idx_map, ctx->sets); + + first_cl_in_sets = kzalloc(ctx->sets * sizeof(cacheline *), GFP_KERNEL); + BUG_ON(first_cl_in_sets == NULL); + + last_cl_in_sets = kzalloc(ctx->sets * sizeof(cacheline *), GFP_KERNEL); + BUG_ON(last_cl_in_sets == NULL); + + for (j = 0; j < ctx->nr_of_cachelines; ++j) { + curr_cl = cl_ptr_arr_sorted[j]; + if (IS_FIRST(curr_cl->flags)) + first_cl_in_sets[curr_cl->cache_set] = curr_cl; + if (IS_LAST(curr_cl->flags)) + last_cl_in_sets[curr_cl->cache_set] = curr_cl; + } + + /* connect up sets */ + for (i = 0; i < ctx->sets; ++i) { + last_cl_in_sets[idx_map[i]]->next = first_cl_in_sets[idx_map[(i + 1) % ctx->sets]]; + first_cl_in_sets[idx_map[(i + 1) % ctx->sets]]->prev = last_cl_in_sets[idx_map[i]]; + } + cache_ds = first_cl_in_sets[idx_map[0]]; + + kfree(cl_ptr_arr_sorted); + kfree(first_cl_in_sets); + kfree(last_cl_in_sets); + kfree(idx_per_set); + kfree(idx_map); + + return cache_ds; +} + +/* + * Helper function to build a randomised list of cacheline structs for a set + */ +void build_randomized_list_for_cache_set(cache_ctx *ctx, cacheline **cacheline_ptr_arr) +{ + cacheline *curr_cl; + uint32_t len, *idx_map; + uint16_t i; + + len = ctx->associativity; + idx_map = kzalloc(len * sizeof(uint32_t), GFP_KERNEL); + BUG_ON(idx_map == NULL); + + gen_random_indices(idx_map, len); + + for (i = 0; i < len; ++i) { + curr_cl = cacheline_ptr_arr[idx_map[i]]; + curr_cl->next = cacheline_ptr_arr[idx_map[(i + 1) % len]]; + curr_cl->prev = cacheline_ptr_arr[idx_map[(len - 1 + i) % len]]; + + if (idx_map[i] == 0) { + curr_cl->flags = SET_FIRST(DEFAULT_FLAGS); + curr_cl->prev->flags = SET_LAST(DEFAULT_FLAGS); + } else { + curr_cl->flags |= DEFAULT_FLAGS; + } + } + + kfree(idx_map); +} + +/* + * 
Allocate a data structure that fills the complete cache, i.e. consisting + * of `associativity` many cache lines for each cache set. + */ +cacheline ** +allocate_cache_ds(cache_ctx *ctx) +{ + cacheline **cl_ptr_arr, *cl_arr; + uint32_t i; + + cl_ptr_arr = kzalloc(ctx->nr_of_cachelines * sizeof(cacheline *), GFP_KERNEL); + BUG_ON(cl_ptr_arr == NULL); + + BUG_ON(ctx->addressing != VIRTUAL); + + // For virtual addressing, allocating a consecutive chunk of memory is enough + cl_arr = cachepc_aligned_alloc(PAGE_SIZE, ctx->cache_size); + BUG_ON(cl_arr == NULL); + + for (i = 0; i < ctx->nr_of_cachelines; ++i) { + cl_ptr_arr[i] = cl_arr + i; + cl_ptr_arr[i]->cache_set = get_virt_cache_set(ctx, cl_ptr_arr[i]); + cl_ptr_arr[i]->cache_line = i / ctx->sets; + cl_ptr_arr[i]->count = 0; + } + + return cl_ptr_arr; +} + +uint16_t +get_virt_cache_set(cache_ctx *ctx, void *ptr) +{ + return (uint16_t) ((((uintptr_t) ptr) & SET_MASK(ctx->sets)) / CACHELINE_SIZE); +} + +void * +cachepc_aligned_alloc(size_t alignment, size_t size) +{ + void *p; + + if (size % alignment != 0) + size = size - (size % alignment) + alignment; + p = kzalloc(size, GFP_KERNEL); + BUG_ON(((uintptr_t) p) % alignment != 0); + + return p; +} + diff --git a/cachepc/cachepc.h b/cachepc/cachepc.h new file mode 100644 index 0000000..ad2dff1 --- /dev/null +++ b/cachepc/cachepc.h @@ -0,0 +1,188 @@ +#pragma once + +#include "asm.h" +#include "cache_types.h" +#include "util.h" +#include "uapi.h" + +void cachepc_init_pmc(uint8_t index, uint8_t event_no, uint8_t event_mask); + +cache_ctx *cachepc_get_ctx(cache_level cl); +void cachepc_release_ctx(cache_ctx *ctx); + +cacheline *cachepc_prepare_ds(cache_ctx *ctx); +void cachepc_release_ds(cache_ctx *ctx, cacheline *ds); + +cacheline *cachepc_prepare_victim(cache_ctx *ctx, uint32_t set); +void cachepc_release_victim(cache_ctx *ctx, cacheline *ptr); + +void *cachepc_aligned_alloc(size_t alignment, size_t size); + +void cachepc_save_msrmts(cacheline *head); +void cachepc_print_msrmts(cacheline *head); + +void cachepc_prime_vcall(uintptr_t ret, cacheline *cl); +void cachepc_probe_vcall(uintptr_t ret, cacheline *cl); + +__attribute__((always_inline)) +static inline cacheline *cachepc_prime(cacheline *head); + +__attribute__((always_inline)) +static inline cacheline *cachepc_prime_rev(cacheline *head); + +__attribute__((always_inline)) +static inline cacheline *cachepc_probe(cacheline *head); + +__attribute__((always_inline)) +static inline void cachepc_victim(void *p); + +__attribute__((always_inline)) +static inline uint64_t cachepc_read_pmc(uint64_t event); + +extern uint16_t *cachepc_msrmts; +extern size_t cachepc_msrmts_count; + +extern cache_ctx *cachepc_ctx; +extern cacheline *cachepc_ds; + +extern uint64_t cachepc_regs_tmp[16]; +extern uint64_t cachepc_regs_vm[16]; + +/* + * Prime phase: fill the target cache (encoded in the size of the data structure) + * with the prepared data structure, i.e. with attacker data. + */ +cacheline * +cachepc_prime(cacheline *head) +{ + cacheline *curr_cl, *prev_cl; + + cachepc_mfence(); + cachepc_cpuid(); + + curr_cl = head; + do { + prev_cl = curr_cl; + curr_cl = curr_cl->next; + } while (curr_cl != head); + + cachepc_mfence(); + cachepc_cpuid(); + + return prev_cl; +} + +/* + * Same as prime, but in the reverse direction, i.e. the same direction that probe + * uses. This is beneficial for the following scenarios: + * - L1: + * - Trigger collision chain-reaction to amplify an evicted set (but this has + * the downside of more noisy measurements). 
+ * - L2: + * - Always use this for L2, otherwise the first cache sets will still reside + * in L1 unless the victim filled L1 completely. In this case, an eviction + * has randomly (depending on where the cache set is placed in the randomised + * data structure) the following effect: + * A) An evicted set is L2_ACCESS_TIME - L1_ACCESS_TIME slower + * B) An evicted set is L3_ACCESS_TIME - L2_ACCESS_TIME slower + */ +cacheline * +cachepc_prime_rev(cacheline *head) +{ + cacheline *curr_cl; + + cachepc_mfence(); + cachepc_cpuid(); + + curr_cl = head; + do { + curr_cl = curr_cl->prev; + } while(curr_cl != head); + + cachepc_mfence(); + cachepc_cpuid(); + + return curr_cl->prev; +} + +cacheline * +cachepc_probe(cacheline *start_cl) +{ + uint64_t pre, post; + cacheline *next_cl; + cacheline *curr_cl; + + cachepc_mfence(); + cachepc_cpuid(); + + curr_cl = start_cl; + + do { + pre = cachepc_read_pmc(0); + + asm volatile( + "mov 8(%[curr_cl]), %%rax \n\t" // +8 + "mov 8(%%rax), %%rcx \n\t" // +16 + "mov 8(%%rcx), %%rax \n\t" // +24 + "mov 8(%%rax), %%rcx \n\t" // +32 + "mov 8(%%rcx), %%rax \n\t" // +40 + "mov 8(%%rax), %%rcx \n\t" // +48 + "mov 8(%%rcx), %[curr_cl_out] \n\t" // +56 + "mov 8(%[curr_cl_out]), %[next_cl_out] \n\t" // +64 + : [next_cl_out] "=r" (next_cl), + [curr_cl_out] "=r" (curr_cl) + : [curr_cl] "r" (curr_cl) + : "rax", "rcx" + ); + + post = cachepc_read_pmc(0); + + /* works across size boundary */ + curr_cl->count = post - pre; + + curr_cl = next_cl; + } while (__builtin_expect(curr_cl != start_cl, 1)); + + next_cl = curr_cl->next; + + cachepc_mfence(); + cachepc_cpuid(); + + return next_cl; +} + +void +cachepc_victim(void *p) +{ + cachepc_mfence(); + cachepc_cpuid(); + + cachepc_readq(p); + + cachepc_mfence(); + cachepc_cpuid(); +} + +uint64_t +cachepc_read_pmc(uint64_t event) +{ + uint32_t lo, hi; + uint64_t res; + + cachepc_mfence(); + cachepc_cpuid(); + + event = 0xC0010201 + 2 * event; + + asm volatile ( + "rdmsr" + : "=a" (lo), "=d" (hi) + : "c"(event) + ); + res = ((uint64_t) hi << 32) | (uint64_t) lo; + + cachepc_mfence(); + cachepc_cpuid(); + + return res; +} diff --git a/cachepc/device_conf.h b/cachepc/device_conf.h new file mode 100644 index 0000000..e24d681 --- /dev/null +++ b/cachepc/device_conf.h @@ -0,0 +1,29 @@ +#pragma once + +// TODO: Read from kernel headers + +// General settings +// #define PAGE_SIZE 4096 +#define PROCESSOR_FREQ 2900000000 + +// Cache related settings +#define CACHELINE_SIZE 64 +#define CACHE_GROUP_SIZE (PAGE_SIZE / CACHELINE_SIZE) + +// Addressing: +// - virtual: 0 +// - physical: 1 +#define L1_ADDRESSING 0 +#define L1_SETS 64 +#define L1_ASSOCIATIVITY 8 +#define L1_ACCESS_TIME 4 + +#define L2_ADDRESSING 1 +#define L2_SETS 512 +#define L2_ASSOCIATIVITY 8 +#define L2_ACCESS_TIME 12 + +#define L3_ADDRESSING 1 +#define L3_SETS 4096 +#define L3_ASSOCIATIVITY 16 +#define L3_ACCESS_TIME 30 diff --git a/cachepc/kvm.c b/cachepc/kvm.c new file mode 100644 index 0000000..4deb4fa --- /dev/null +++ b/cachepc/kvm.c @@ -0,0 +1,392 @@ +#include "kvm.h" + +#include +#include +#include +#include +#include + +struct proc_ops cachepc_proc_ops; + +uint16_t *cachepc_msrmts; +size_t cachepc_msrmts_count; +EXPORT_SYMBOL(cachepc_msrmts); +EXPORT_SYMBOL(cachepc_msrmts_count); + +cache_ctx *cachepc_ctx; +cacheline *cachepc_ds; +EXPORT_SYMBOL(cachepc_ctx); +EXPORT_SYMBOL(cachepc_ds); + +uint64_t cachepc_regs_tmp[16]; +uint64_t cachepc_regs_vm[16]; +EXPORT_SYMBOL(cachepc_regs_tmp); +EXPORT_SYMBOL(cachepc_regs_vm); + +int +cachepc_kvm_proc_open(struct inode *inode, 
struct file *file)
+{
+	try_module_get(THIS_MODULE);
+
+	return 0;
+}
+
+int
+cachepc_kvm_proc_close(struct inode *inode, struct file *file)
+{
+	module_put(THIS_MODULE);
+
+	return 0;
+}
+
+ssize_t
+cachepc_kvm_proc_read(struct file *file, char *buf, size_t buflen, loff_t *off)
+{
+	size_t len, left;
+	size_t size;
+
+	printk(KERN_WARNING "CachePC: Reading entries (%lu:%lli)\n",
+		buflen, off ? *off : 0);
+
+	size = cachepc_msrmts_count * sizeof(uint16_t);
+	if (!off || *off >= size || *off < 0)
+		return 0;
+
+	len = size - *off;
+	if (len > buflen) len = buflen;
+
+	left = copy_to_user(buf, (uint8_t *) cachepc_msrmts + *off, len);
+
+	len -= left;
+	*off += len;
+
+	return len;
+}
+
+ssize_t
+cachepc_kvm_proc_write(struct file *file, const char *buf, size_t buflen, loff_t *off)
+{
+	return 0;
+}
+
+loff_t
+cachepc_kvm_proc_lseek(struct file *file, loff_t off, int mode)
+{
+	switch (mode) {
+	case SEEK_SET:
+		file->f_pos = off;
+		break;
+	case SEEK_CUR:
+		file->f_pos += off;
+		break;
+	case SEEK_END:
+		file->f_pos = cachepc_msrmts_count * sizeof(uint16_t) + off;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return file->f_pos;
+}
+
+void
+cachepc_kvm_prime_probe_test(void *p)
+{
+	cacheline *lines;
+	cacheline *cl, *head;
+	uint32_t count;
+	uint32_t *arg;
+	int i, max;
+
+	arg = p;
+
+	/* l2 data cache, hit or miss */
+	cachepc_init_pmc(0, 0x64, 0xD8);
+
+	lines = cachepc_aligned_alloc(PAGE_SIZE, cachepc_ctx->cache_size);
+	BUG_ON(lines == NULL);
+
+	max = cachepc_ctx->nr_of_cachelines;
+
+	cachepc_cpuid();
+	cachepc_mfence();
+
+	for (i = 0; i < max; i++)
+		asm volatile ("mov (%0), %%rbx" : : "r"(lines + i) : "rbx");
+
+	head = cachepc_prime(cachepc_ds);
+	cachepc_probe(head);
+
+	count = 0;
+	cl = head = cachepc_ds;
+	do {
+		count += cl->count;
+		cl = cl->next;
+	} while (cl != head);
+
+	printk(KERN_WARNING "CachePC: Prime-probe test done (%u vs. %u => %s)\n",
+		count, 0, (count == 0) ? "passed" : "failed");
+
+	if (arg) *arg = (count == 0);
+
+	kfree(lines);
+}
+
+void
+cachepc_kvm_stream_hwpf_test(void *p)
+{
+	cacheline *lines;
+	uint32_t count = 0;
+	uint32_t *arg;
+	uint32_t i, max;
+
+	arg = p;
+
+	/* TODO: accurately detect hwpf */
+
+	/* l2 data cache, hit or miss */
+	cachepc_init_pmc(0, 0x64, 0xD8);
+
+	lines = cachepc_aligned_alloc(PAGE_SIZE, cachepc_ctx->cache_size);
+	BUG_ON(lines == NULL);
+
+	max = cachepc_ctx->nr_of_cachelines;
+
+	cachepc_prime(cachepc_ds);
+
+	count -= cachepc_read_pmc(0);
+	for (i = 0; i < max; i++)
+		asm volatile ("mov (%0), %%rbx" : : "r"(lines + i) : "rbx");
+	count += cachepc_read_pmc(0);
+
+	printk(KERN_WARNING "CachePC: HWPF test done (%u vs. %u => %s)\n",
+		count, max, (count == max) ? "passed" : "failed");
+
+	if (arg) *arg = (count == max);
+
+	kfree(lines);
+}
+
+void
+cachepc_kvm_single_access_test(void *p)
+{
+	cacheline *ptr;
+	uint64_t pre, post;
+	uint32_t *arg;
+
+	/* l2 data cache, hit or miss */
+	cachepc_init_pmc(0, 0x64, 0xD8);
+
+	arg = p;
+
+	WARN_ON(arg && *arg >= L1_SETS);
+	if (arg && *arg >= L1_SETS) return;
+	ptr = cachepc_prepare_victim(cachepc_ctx, arg ? *arg : 48);
+
+	cachepc_prime(cachepc_ds);
+
+	pre = cachepc_read_pmc(0);
+	cachepc_victim(ptr);
+	post = cachepc_read_pmc(0);
+
+	printk(KERN_WARNING "CachePC: Single access test done (%llu vs %u => %s)\n",
+		post - pre, 1, (post - pre == 1) ? 
"passed" : "failed"); + + if (arg) *arg = post - pre; + + cachepc_release_victim(cachepc_ctx, ptr); +} + +void +cachepc_kvm_single_eviction_test(void *p) +{ + cacheline *head, *cl, *evicted; + cacheline *ptr; + uint32_t target; + uint32_t *arg; + int count; + + arg = p; + + /* l2 data cache, hit or miss */ + cachepc_init_pmc(0, 0x64, 0xD8); + + WARN_ON(arg && *arg >= L1_SETS); + if (arg && *arg >= L1_SETS) return; + target = arg ? *arg : 48; + + ptr = cachepc_prepare_victim(cachepc_ctx, target); + + head = cachepc_prime(cachepc_ds); + cachepc_victim(ptr); + cachepc_probe(head); + + count = 0; + evicted = NULL; + cl = head = cachepc_ds; + do { + if (IS_FIRST(cl->flags) && cl->count > 0) { + evicted = cl; + count += cl->count; + } + cl = cl->next; + } while (cl != head); + + printk(KERN_WARNING "CachePC: Single eviction test done (%u vs %u => %s)\n", + count, 1, (count == 1 && evicted->cache_set == target) ? "passed" : "failed"); + cachepc_save_msrmts(head); + + if (arg) *arg = count; + + cachepc_release_victim(cachepc_ctx, ptr); +} + +void +cachepc_kvm_system_setup(void) +{ + uint64_t reg_addr, val; + uint32_t lo, hi; + + /* disable streaming store */ + reg_addr = 0xc0011020; + asm volatile ("rdmsr" : "=a"(lo), "=d"(hi) : "c"(reg_addr)); + val = (uint64_t) lo | ((uint64_t) hi << 32); + val |= 1 << 13; + asm volatile ("wrmsr" : : "c"(reg_addr), "a"(val), "d"(0x00)); + printk("CachePC: Writing MSR %08llX: %016llX\n", reg_addr, val); + + /* disable speculative data cache tlb reloads */ + reg_addr = 0xc0011022; + asm volatile ("rdmsr" : "=a"(lo), "=d"(hi) : "c"(reg_addr)); + val = (uint64_t) lo | ((uint64_t) hi << 32); + val |= 1 << 4; + asm volatile ("wrmsr" : : "c"(reg_addr), "a"(val), "d"(0x00)); + printk("CachePC: Writing MSR %08llX: %016llX\n", reg_addr, val); + + /* disable data cache hardware prefetcher */ + reg_addr = 0xc0011022; + asm volatile ("rdmsr" : "=a"(lo), "=d"(hi) : "c"(reg_addr)); + val = (uint64_t) lo | ((uint64_t) hi << 32); + val |= 1 << 13; + asm volatile ("wrmsr" : : "c"(reg_addr), "a"(val), "d"(0x00)); + printk("CachePC: Writing MSR %08llX: %016llX\n", reg_addr, val); +} + +void +cachepc_kvm_init_pmc_ioctl(void *p) +{ + uint32_t event; + uint8_t index, event_no, event_mask; + + WARN_ON(p == NULL); + if (!p) return; + + event = *(uint32_t *)p; + + index = (event & 0xFF000000) >> 24; + event_no = (event & 0x0000FF00) >> 8; + event_mask = (event & 0x000000FF) >> 0; + + cachepc_init_pmc(index, event_no, event_mask); +} + +long +cachepc_kvm_ioctl(struct file *file, unsigned int cmd, unsigned long argp) +{ + void __user *arg_user; + uint32_t u32; + int ret; + + arg_user = (void __user *)argp; + switch (cmd) { + case CACHEPC_IOCTL_TEST_ACCESS: + printk(KERN_WARNING "CachePC: Called ioctl access test\n"); + if (!arg_user) return -EINVAL; + if (copy_from_user(&u32, arg_user, sizeof(uint32_t))) + return -EFAULT; + ret = smp_call_function_single(2, + cachepc_kvm_single_access_test, &u32, true); + WARN_ON(ret != 0); + if (copy_to_user(arg_user, &u32, sizeof(uint32_t))) + return -EFAULT; + break; + case CACHEPC_IOCTL_TEST_EVICTION: + printk(KERN_WARNING "CachePC: Called ioctl eviction test\n"); + if (!arg_user) return -EINVAL; + if (copy_from_user(&u32, arg_user, sizeof(uint32_t))) + return -EFAULT; + ret = smp_call_function_single(2, + cachepc_kvm_single_eviction_test, &u32, true); + WARN_ON(ret != 0); + if (copy_to_user(arg_user, &u32, sizeof(uint32_t))) + return -EFAULT; + break; + case CACHEPC_IOCTL_INIT_PMC: + printk(KERN_WARNING "CachePC: Called ioctl init counter\n"); + if 
(!arg_user) return -EINVAL; + if (copy_from_user(&u32, arg_user, sizeof(uint32_t))) + return -EFAULT; + ret = smp_call_function_single(2, + cachepc_kvm_init_pmc_ioctl, &u32, true); + WARN_ON(ret != 0); + break; + default: + return -EINVAL; + } + + return 0; +} + +void +cachepc_kvm_setup_test(void *p) +{ + int cpu; + + cpu = get_cpu(); + + printk(KERN_WARNING "CachePC: Running on core %i\n", cpu); + + cachepc_ctx = cachepc_get_ctx(L1); + cachepc_ds = cachepc_prepare_ds(cachepc_ctx); + + cachepc_kvm_system_setup(); + + cachepc_kvm_prime_probe_test(NULL); + cachepc_kvm_single_access_test(NULL); + cachepc_kvm_single_eviction_test(NULL); + cachepc_kvm_stream_hwpf_test(NULL); + + put_cpu(); +} + +void +cachepc_kvm_init(void) +{ + int ret; + + cachepc_msrmts_count = L1_SETS; + cachepc_msrmts = kzalloc(cachepc_msrmts_count * sizeof(uint16_t), GFP_KERNEL); + BUG_ON(cachepc_msrmts == NULL); + + ret = smp_call_function_single(2, cachepc_kvm_setup_test, NULL, true); + WARN_ON(ret != 0); + + memset(&cachepc_proc_ops, 0, sizeof(cachepc_proc_ops)); + cachepc_proc_ops.proc_open = cachepc_kvm_proc_open; + cachepc_proc_ops.proc_read = cachepc_kvm_proc_read; + cachepc_proc_ops.proc_write = cachepc_kvm_proc_write; + cachepc_proc_ops.proc_lseek = cachepc_kvm_proc_lseek; + cachepc_proc_ops.proc_release = cachepc_kvm_proc_close; + cachepc_proc_ops.proc_ioctl = cachepc_kvm_ioctl; + proc_create("cachepc", 0644, NULL, &cachepc_proc_ops); +} + +void +cachepc_kvm_exit(void) +{ + remove_proc_entry("cachepc", NULL); + kfree(cachepc_msrmts); + + cachepc_release_ds(cachepc_ctx, cachepc_ds); + cachepc_release_ctx(cachepc_ctx); +} diff --git a/cachepc/kvm.h b/cachepc/kvm.h new file mode 100644 index 0000000..a44491e --- /dev/null +++ b/cachepc/kvm.h @@ -0,0 +1,6 @@ +#pragma once + +#include "cachepc.h" + +void cachepc_kvm_init(void); +void cachepc_kvm_exit(void); diff --git a/cachepc/uapi.h b/cachepc/uapi.h new file mode 100644 index 0000000..f815839 --- /dev/null +++ b/cachepc/uapi.h @@ -0,0 +1,8 @@ +#pragma once + +#include + +#define CACHEPC_IOCTL_MAGIC 0xBF +#define CACHEPC_IOCTL_TEST_ACCESS _IOWR(CACHEPC_IOCTL_MAGIC, 0, uint32_t) +#define CACHEPC_IOCTL_TEST_EVICTION _IOWR(CACHEPC_IOCTL_MAGIC, 1, uint32_t) +#define CACHEPC_IOCTL_INIT_PMC _IOW(CACHEPC_IOCTL_MAGIC, 2, uint32_t) diff --git a/cachepc/util.c b/cachepc/util.c new file mode 100644 index 0000000..abf2b71 --- /dev/null +++ b/cachepc/util.c @@ -0,0 +1,38 @@ +#include "util.h" + +void +random_perm(uint32_t *arr, uint32_t arr_len) +{ + uint32_t i; + + /* no special ordering needed when prefetcher is disabled */ + for (i = 0; i < arr_len; i++) + arr[i] = i; + + // /* prevent stream prefetching by alternating access direction */ + // mid = arr_len / 2; + // for (i = 0; i < arr_len; i++) + // arr[i] = mid + (i % 2 ? 
-1 : 1) * ((i + 1) / 2); +} + +void +gen_random_indices(uint32_t *arr, uint32_t arr_len) +{ + uint32_t i; + + for (i = 0; i < arr_len; ++i) + arr[i] = i; + random_perm(arr, arr_len); +} + + +bool is_in_arr(uint32_t elem, uint32_t *arr, uint32_t arr_len) { + uint32_t i; + + for (i = 0; i < arr_len; ++i) { + if (arr[i] == elem) + return true; + } + + return false; +} diff --git a/cachepc/util.h b/cachepc/util.h new file mode 100644 index 0000000..a0ff8be --- /dev/null +++ b/cachepc/util.h @@ -0,0 +1,8 @@ +#pragma once + +#include + +void random_perm(uint32_t *arr, uint32_t arr_len); +void gen_random_indices(uint32_t *arr, uint32_t arr_len); + +bool is_in_arr(uint32_t elem, uint32_t *arr, uint32_t arr_len); diff --git a/kmod/asm.h b/kmod/asm.h deleted file mode 100644 index 9e9385a..0000000 --- a/kmod/asm.h +++ /dev/null @@ -1,66 +0,0 @@ -#pragma once - -#include - -#define CPUID_AFFECTED_REGS "rax", "rbx", "rcx", "rdx" - -__attribute__((always_inline)) -static inline void cachepc_cpuid(void); - -__attribute__((always_inline)) -static inline void cachepc_lfence(void); - -__attribute__((always_inline)) -static inline void cachepc_sfence(void); - -__attribute__((always_inline)) -static inline void cachepc_mfence(void); - -__attribute__((always_inline)) -static inline void cachepc_readq(void *p); - -void -cachepc_cpuid(void) -{ - asm volatile( - "mov $0x80000005, %%eax\n\t" - "cpuid\n\t" - ::: CPUID_AFFECTED_REGS - ); -} - -void -cachepc_lfence(void) -{ - asm volatile( - "lfence\n\t" - ::: "memory" - ); -} - -void -cachepc_sfence(void) -{ - asm volatile( - "sfence\n\t" - ::: "memory" - ); -} - -void -cachepc_mfence(void) -{ - asm volatile( - "mfence\n\t" - ::: "memory" - ); -} - -void -cachepc_readq(void *p) -{ - asm volatile ( - "movq (%0), %%r10\n\t" - : : "r" (p) : "r10" - ); -} diff --git a/kmod/cache_types.h b/kmod/cache_types.h deleted file mode 100644 index b337d55..0000000 --- a/kmod/cache_types.h +++ /dev/null @@ -1,66 +0,0 @@ -#pragma once - -#include "device_conf.h" - -#define SET_MASK(SETS) (((((uintptr_t) SETS) * CACHELINE_SIZE) - 1) ^ (CACHELINE_SIZE - 1)) - -#define REMOVE_PAGE_OFFSET(ptr) ((void *) (((uintptr_t) ptr) & PAGE_MASK)) - -#define GET_BIT(b, i) (((b) >> (i)) & 1) -#define SET_BIT(b, i) ((b) | (1 << (i))) - -/* Operate cacheline flags - * Used flags: - * 32 2 1 0 - * | | ... 
| cache group initialized | last | first | - */ -#define DEFAULT_FLAGS 0 -#define SET_FIRST(flags) SET_BIT(flags, 0) -#define SET_LAST(flags) SET_BIT(flags, 1) -#define SET_CACHE_GROUP_INIT(flags) SET_BIT(flags, 2) -#define IS_FIRST(flags) GET_BIT(flags, 0) -#define IS_LAST(flags) GET_BIT(flags, 1) -#define IS_CACHE_GROUP_INIT(flags) GET_BIT(flags, 2) - -#define CL_NEXT_OFFSET offsetof(struct cacheline, next) -#define CL_PREV_OFFSET offsetof(struct cacheline, prev) - -typedef enum cache_level cache_level; -typedef enum addressing_type addressing_type; -typedef struct cacheline cacheline; -typedef struct cache_ctx cache_ctx; - -enum cache_level {L1, L2}; -enum addressing_type {VIRTUAL, PHYSICAL}; - -struct cache_ctx { - cache_level cache_level; - addressing_type addressing; - - uint32_t sets; - uint32_t associativity; - uint32_t access_time; - uint32_t nr_of_cachelines; - uint32_t set_size; - uint32_t cache_size; -}; - -struct cacheline { - // Doubly linked list inside same set - // Attention: CL_NEXT_OFFSET and CL_PREV_OFFSET - // must be kept up to date - cacheline *next; - cacheline *prev; - - uint32_t cache_set; - uint32_t cache_line; - uint32_t flags; - - // Unused padding to fill cache line - uint64_t count; - - char padding[24]; -}; - -static_assert(sizeof(struct cacheline) == CACHELINE_SIZE, "Bad cache line struct size"); -static_assert(CL_NEXT_OFFSET == 0 && CL_PREV_OFFSET == 8); diff --git a/kmod/cachepc.c b/kmod/cachepc.c deleted file mode 100644 index 09ed705..0000000 --- a/kmod/cachepc.c +++ /dev/null @@ -1,445 +0,0 @@ -#include "cachepc.h" - -#include -#include -#include -#include -#include - -static void cl_insert(cacheline *last_cl, cacheline *new_cl); -static void *remove_cache_set(cache_ctx *ctx, void *ptr); -static void *remove_cache_group_set(void *ptr); - -static cacheline *prepare_cache_set_ds(cache_ctx *ctx, uint32_t *set, uint32_t sets_len); -static cacheline *build_cache_ds(cache_ctx *ctx, cacheline **cacheline_ptr_arr); -static void build_randomized_list_for_cache_set(cache_ctx *ctx, cacheline **cacheline_ptr_arr); -static cacheline **allocate_cache_ds(cache_ctx *ctx); -static uint16_t get_virt_cache_set(cache_ctx *ctx, void *ptr); - -void __attribute__((optimize(1))) // prevent instruction reordering -cachepc_prime_vcall(uintptr_t ret, cacheline *cl) -{ - cachepc_prime(cl); - asm volatile ("mov %0, %%rax; jmp *%%rax" : : "r"(ret) : "rax"); -} - -void __attribute__((optimize(1))) // prevent instruction reordering -cachepc_probe_vcall(uintptr_t ret, cacheline *cl) -{ - cachepc_probe(cl); - asm volatile ("mov %0, %%rax; jmp *%%rax" : : "r"(ret) : "rax"); -} - -void -cachepc_init_pmc(uint8_t index, uint8_t event_no, uint8_t event_mask) -{ - uint64_t event; - uint64_t reg_addr; - - /* REF: https://developer.amd.com/resources/developer-guides-manuals (PPR 17H 31H, P.166) - * - * performance event selection via 0xC001_020X with X = (0..A)[::2] - * performance event reading viea 0XC001_020X with X = (1..B)[::2] - */ - - WARN_ON(index >= 6); - if (index >= 6) return; - - reg_addr = 0xc0010200 + index * 2; - event = event_no | (event_mask << 8); - event |= (1ULL << 17); /* OS (kernel) events only */ - event |= (1ULL << 22); /* enable performance counter */ - event |= (1ULL << 40); /* Host events only */ - printk(KERN_WARNING "CachePC: Initialized %i. 
PMC %02X:%02X\n", - index, event_no, event_mask); - asm volatile ("wrmsr" : : "c"(reg_addr), "a"(event), "d"(0x00)); -} - -cache_ctx * -cachepc_get_ctx(cache_level cache_level) -{ - cache_ctx *ctx; - - ctx = kzalloc(sizeof(cache_ctx), GFP_KERNEL); - BUG_ON(ctx == NULL); - - BUG_ON(cache_level != L1); - if (cache_level == L1) { - ctx->addressing = L1_ADDRESSING; - ctx->sets = L1_SETS; - ctx->associativity = L1_ASSOCIATIVITY; - ctx->access_time = L1_ACCESS_TIME; - } else if (cache_level == L2) { - ctx->addressing = L2_ADDRESSING; - ctx->sets = L2_SETS; - ctx->associativity = L2_ASSOCIATIVITY; - ctx->access_time = L2_ACCESS_TIME; - } else { - return NULL; - } - - ctx->cache_level = cache_level; - ctx->nr_of_cachelines = ctx->sets * ctx->associativity; - ctx->set_size = CACHELINE_SIZE * ctx->associativity; - ctx->cache_size = ctx->sets * ctx->set_size; - - return ctx; -} - -void -cachepc_release_ctx(cache_ctx *ctx) -{ - kfree(ctx); -} - - -/* - * Initialises the complete cache data structure for the given context - */ -cacheline * -cachepc_prepare_ds(cache_ctx *ctx) -{ - cacheline **cacheline_ptr_arr; - cacheline *cache_ds; - - //printk(KERN_WARNING "CachePC: Preparing ds..\n"); - - cacheline_ptr_arr = allocate_cache_ds(ctx); - cache_ds = build_cache_ds(ctx, cacheline_ptr_arr); - kfree(cacheline_ptr_arr); - - // printk(KERN_WARNING "CachePC: Preparing ds done\n"); - - return cache_ds; -} - -void -cachepc_release_ds(cache_ctx *ctx, cacheline *ds) -{ - kfree(remove_cache_set(ctx, ds)); -} - -cacheline * -cachepc_prepare_victim(cache_ctx *ctx, uint32_t set) -{ - cacheline *victim_set, *victim_cl; - cacheline *curr_cl, *next_cl; - - victim_set = prepare_cache_set_ds(ctx, &set, 1); - victim_cl = victim_set; - - // Free the other lines in the same set that are not used. - if (ctx->addressing == PHYSICAL) { - curr_cl = victim_cl->next; - do { - next_cl = curr_cl->next; - // Here, it is ok to free them directly, as every line in the same - // set is from a different page anyway. 
- kfree(remove_cache_group_set(curr_cl)); - curr_cl = next_cl; - } while(curr_cl != victim_cl); - } - - return victim_cl; -} - -void -cachepc_release_victim(cache_ctx *ctx, cacheline *victim) -{ - kfree(remove_cache_set(ctx, victim)); -} - -void -cachepc_save_msrmts(cacheline *head) -{ - cacheline *curr_cl; - - // printk(KERN_WARNING "CachePC: Updating /proc/cachepc\n"); - - curr_cl = head; - do { - if (IS_FIRST(curr_cl->flags)) { - BUG_ON(curr_cl->cache_set >= cachepc_msrmts_count); - cachepc_msrmts[curr_cl->cache_set] = curr_cl->count; - } - - curr_cl = curr_cl->prev; - } while (curr_cl != head); -} - -void -cachepc_print_msrmts(cacheline *head) -{ - cacheline *curr_cl; - - curr_cl = head; - do { - if (IS_FIRST(curr_cl->flags)) { - printk(KERN_WARNING "CachePC: Count for cache set %i: %llu\n", - curr_cl->cache_set, curr_cl->count); - } - - curr_cl = curr_cl->prev; - } while (curr_cl != head); -} - - -cacheline * -prepare_cache_set_ds(cache_ctx *ctx, uint32_t *sets, uint32_t sets_len) -{ - cacheline *cache_ds, **first_cl_in_sets, **last_cl_in_sets; - cacheline *to_del_cls, *curr_cl, *next_cl, *cache_set_ds; - uint32_t i, cache_groups_len, cache_groups_max_len; - uint32_t *cache_groups; - - cache_ds = cachepc_prepare_ds(ctx); - - first_cl_in_sets = kzalloc(ctx->sets * sizeof(cacheline *), GFP_KERNEL); - BUG_ON(first_cl_in_sets == NULL); - - last_cl_in_sets = kzalloc(ctx->sets * sizeof(cacheline *), GFP_KERNEL); - BUG_ON(last_cl_in_sets == NULL); - - // Find the cache groups that are used, so that we can delete the other ones - // later (to avoid memory leaks) - cache_groups_max_len = ctx->sets / CACHE_GROUP_SIZE; - cache_groups = kmalloc(cache_groups_max_len * sizeof(uint32_t), GFP_KERNEL); - BUG_ON(cache_groups == NULL); - - cache_groups_len = 0; - for (i = 0; i < sets_len; ++i) { - if (!is_in_arr(sets[i] / CACHE_GROUP_SIZE, cache_groups, cache_groups_len)) { - cache_groups[cache_groups_len] = sets[i] / CACHE_GROUP_SIZE; - ++cache_groups_len; - } - } - - to_del_cls = NULL; - curr_cl = cache_ds; - - // Extract the partial data structure for the cache sets and ensure correct freeing - do { - next_cl = curr_cl->next; - - if (IS_FIRST(curr_cl->flags)) { - first_cl_in_sets[curr_cl->cache_set] = curr_cl; - } - if (IS_LAST(curr_cl->flags)) { - last_cl_in_sets[curr_cl->cache_set] = curr_cl; - } - - if (ctx->addressing == PHYSICAL && !is_in_arr( - curr_cl->cache_set / CACHE_GROUP_SIZE, cache_groups, cache_groups_len)) - { - // Already free all unused blocks of the cache ds for physical - // addressing, because we loose their refs - cl_insert(to_del_cls, curr_cl); - to_del_cls = curr_cl; - } - curr_cl = next_cl; - - } while(curr_cl != cache_ds); - - // Fix partial cache set ds - for (i = 0; i < sets_len; ++i) { - last_cl_in_sets[sets[i]]->next = first_cl_in_sets[sets[(i + 1) % sets_len]]; - first_cl_in_sets[sets[(i + 1) % sets_len]]->prev = last_cl_in_sets[sets[i]]; - } - cache_set_ds = first_cl_in_sets[sets[0]]; - - // Free unused cache lines - if (ctx->addressing == PHYSICAL) { - cachepc_release_ds(ctx, to_del_cls); - } - - kfree(first_cl_in_sets); - kfree(last_cl_in_sets); - kfree(cache_groups); - - return cache_set_ds; -} - -void -cl_insert(cacheline *last_cl, cacheline *new_cl) -{ - if (last_cl == NULL) { - // Adding the first entry is a special case - new_cl->next = new_cl; - new_cl->prev = new_cl; - } else { - new_cl->next = last_cl->next; - new_cl->prev = last_cl; - last_cl->next->prev = new_cl; - last_cl->next = new_cl; - } -} - -void * -remove_cache_set(cache_ctx *ctx, void *ptr) -{ - 
return (void *) (((uintptr_t) ptr) & ~SET_MASK(ctx->sets)); -} - -void * -remove_cache_group_set(void *ptr) -{ - return (void *) (((uintptr_t) ptr) & ~SET_MASK(CACHE_GROUP_SIZE)); -} - - -/* - * Create a randomized doubly linked list with the following structure: - * set A <--> set B <--> ... <--> set X <--> set A - * where each set is one of the cache sets, in a random order. - * The sets are a doubly linked list of cachelines themselves: - * set A: - * line[A + x0 * #sets] <--> line[A + x1 * #sets] <--> ... - * where x0, x1, ..., xD is a random permutation of 1, 2, ..., D - * and D = Associativity = | cache set | - */ -cacheline *build_cache_ds(cache_ctx *ctx, cacheline **cl_ptr_arr) { - cacheline **first_cl_in_sets, **last_cl_in_sets; - cacheline **cl_ptr_arr_sorted; - cacheline *curr_cl; - cacheline *cache_ds; - uint32_t *idx_per_set; - uint32_t idx_curr_set, set_offset; - uint32_t i, j, set, set_len; - uint32_t *idx_map; - - idx_per_set = kzalloc(ctx->sets * sizeof(uint32_t), GFP_KERNEL); - BUG_ON(idx_per_set == NULL); - - cl_ptr_arr_sorted = kzalloc(ctx->nr_of_cachelines * sizeof(cacheline *), GFP_KERNEL); - BUG_ON(cl_ptr_arr_sorted == NULL); - - set_len = ctx->associativity; - for (i = 0; i < ctx->nr_of_cachelines; ++i) { - set_offset = cl_ptr_arr[i]->cache_set * set_len; - idx_curr_set = idx_per_set[cl_ptr_arr[i]->cache_set]; - - cl_ptr_arr_sorted[set_offset + idx_curr_set] = cl_ptr_arr[i]; - idx_per_set[cl_ptr_arr[i]->cache_set] += 1; - } - - // Build doubly linked list for every set - for (set = 0; set < ctx->sets; ++set) { - set_offset = set * set_len; - build_randomized_list_for_cache_set(ctx, cl_ptr_arr_sorted + set_offset); - } - - // Relink the sets among each other - idx_map = kzalloc(ctx->sets * sizeof(uint32_t), GFP_KERNEL); - BUG_ON(idx_map == NULL); - - gen_random_indices(idx_map, ctx->sets); - - first_cl_in_sets = kzalloc(ctx->sets * sizeof(cacheline *), GFP_KERNEL); - BUG_ON(first_cl_in_sets == NULL); - - last_cl_in_sets = kzalloc(ctx->sets * sizeof(cacheline *), GFP_KERNEL); - BUG_ON(last_cl_in_sets == NULL); - - for (j = 0; j < ctx->nr_of_cachelines; ++j) { - curr_cl = cl_ptr_arr_sorted[j]; - if (IS_FIRST(curr_cl->flags)) - first_cl_in_sets[curr_cl->cache_set] = curr_cl; - if (IS_LAST(curr_cl->flags)) - last_cl_in_sets[curr_cl->cache_set] = curr_cl; - } - - /* connect up sets */ - for (i = 0; i < ctx->sets; ++i) { - last_cl_in_sets[idx_map[i]]->next = first_cl_in_sets[idx_map[(i + 1) % ctx->sets]]; - first_cl_in_sets[idx_map[(i + 1) % ctx->sets]]->prev = last_cl_in_sets[idx_map[i]]; - } - cache_ds = first_cl_in_sets[idx_map[0]]; - - kfree(cl_ptr_arr_sorted); - kfree(first_cl_in_sets); - kfree(last_cl_in_sets); - kfree(idx_per_set); - kfree(idx_map); - - return cache_ds; -} - -/* - * Helper function to build a randomised list of cacheline structs for a set - */ -void build_randomized_list_for_cache_set(cache_ctx *ctx, cacheline **cacheline_ptr_arr) -{ - cacheline *curr_cl; - uint32_t len, *idx_map; - uint16_t i; - - len = ctx->associativity; - idx_map = kzalloc(len * sizeof(uint32_t), GFP_KERNEL); - BUG_ON(idx_map == NULL); - - gen_random_indices(idx_map, len); - - for (i = 0; i < len; ++i) { - curr_cl = cacheline_ptr_arr[idx_map[i]]; - curr_cl->next = cacheline_ptr_arr[idx_map[(i + 1) % len]]; - curr_cl->prev = cacheline_ptr_arr[idx_map[(len - 1 + i) % len]]; - - if (idx_map[i] == 0) { - curr_cl->flags = SET_FIRST(DEFAULT_FLAGS); - curr_cl->prev->flags = SET_LAST(DEFAULT_FLAGS); - } else { - curr_cl->flags |= DEFAULT_FLAGS; - } - } - - kfree(idx_map); -} - -/* - * 
Allocate a data structure that fills the complete cache, i.e. consisting - * of `associativity` many cache lines for each cache set. - */ -cacheline ** -allocate_cache_ds(cache_ctx *ctx) -{ - cacheline **cl_ptr_arr, *cl_arr; - uint32_t i; - - cl_ptr_arr = kzalloc(ctx->nr_of_cachelines * sizeof(cacheline *), GFP_KERNEL); - BUG_ON(cl_ptr_arr == NULL); - - BUG_ON(ctx->addressing != VIRTUAL); - - // For virtual addressing, allocating a consecutive chunk of memory is enough - cl_arr = cachepc_aligned_alloc(PAGE_SIZE, ctx->cache_size); - BUG_ON(cl_arr == NULL); - - for (i = 0; i < ctx->nr_of_cachelines; ++i) { - cl_ptr_arr[i] = cl_arr + i; - cl_ptr_arr[i]->cache_set = get_virt_cache_set(ctx, cl_ptr_arr[i]); - cl_ptr_arr[i]->cache_line = i / ctx->sets; - cl_ptr_arr[i]->count = 0; - } - - return cl_ptr_arr; -} - -uint16_t -get_virt_cache_set(cache_ctx *ctx, void *ptr) -{ - return (uint16_t) ((((uintptr_t) ptr) & SET_MASK(ctx->sets)) / CACHELINE_SIZE); -} - -void * -cachepc_aligned_alloc(size_t alignment, size_t size) -{ - void *p; - - if (size % alignment != 0) - size = size - (size % alignment) + alignment; - p = kzalloc(size, GFP_KERNEL); - BUG_ON(((uintptr_t) p) % alignment != 0); - - return p; -} - diff --git a/kmod/cachepc.h b/kmod/cachepc.h deleted file mode 100644 index 6237eba..0000000 --- a/kmod/cachepc.h +++ /dev/null @@ -1,188 +0,0 @@ -#pragma once - -#include "asm.h" -#include "cache_types.h" -#include "util.h" -#include "cachepc_user.h" - -void cachepc_init_pmc(uint8_t index, uint8_t event_no, uint8_t event_mask); - -cache_ctx *cachepc_get_ctx(cache_level cl); -void cachepc_release_ctx(cache_ctx *ctx); - -cacheline *cachepc_prepare_ds(cache_ctx *ctx); -void cachepc_release_ds(cache_ctx *ctx, cacheline *ds); - -cacheline *cachepc_prepare_victim(cache_ctx *ctx, uint32_t set); -void cachepc_release_victim(cache_ctx *ctx, cacheline *ptr); - -void *cachepc_aligned_alloc(size_t alignment, size_t size); - -void cachepc_save_msrmts(cacheline *head); -void cachepc_print_msrmts(cacheline *head); - -void cachepc_prime_vcall(uintptr_t ret, cacheline *cl); -void cachepc_probe_vcall(uintptr_t ret, cacheline *cl); - -__attribute__((always_inline)) -static inline cacheline *cachepc_prime(cacheline *head); - -__attribute__((always_inline)) -static inline cacheline *cachepc_prime_rev(cacheline *head); - -__attribute__((always_inline)) -static inline cacheline *cachepc_probe(cacheline *head); - -__attribute__((always_inline)) -static inline void cachepc_victim(void *p); - -__attribute__((always_inline)) -static inline uint64_t cachepc_read_pmc(uint64_t event); - -extern uint16_t *cachepc_msrmts; -extern size_t cachepc_msrmts_count; - -extern cache_ctx *cachepc_ctx; -extern cacheline *cachepc_ds; - -extern uint64_t cachepc_regs_tmp[16]; -extern uint64_t cachepc_regs_vm[16]; - -/* - * Prime phase: fill the target cache (encoded in the size of the data structure) - * with the prepared data structure, i.e. with attacker data. - */ -cacheline * -cachepc_prime(cacheline *head) -{ - cacheline *curr_cl, *prev_cl; - - cachepc_mfence(); - cachepc_cpuid(); - - curr_cl = head; - do { - prev_cl = curr_cl; - curr_cl = curr_cl->next; - } while (curr_cl != head); - - cachepc_mfence(); - cachepc_cpuid(); - - return prev_cl; -} - -/* - * Same as prime, but in the reverse direction, i.e. the same direction that probe - * uses. This is beneficial for the following scenarios: - * - L1: - * - Trigger collision chain-reaction to amplify an evicted set (but this has - * the downside of more noisy measurements). 
- * - L2: - * - Always use this for L2, otherwise the first cache sets will still reside - * in L1 unless the victim filled L1 completely. In this case, an eviction - * has randomly (depending on where the cache set is placed in the randomised - * data structure) the following effect: - * A) An evicted set is L2_ACCESS_TIME - L1_ACCESS_TIME slower - * B) An evicted set is L3_ACCESS_TIME - L2_ACCESS_TIME slower - */ -cacheline * -cachepc_prime_rev(cacheline *head) -{ - cacheline *curr_cl; - - cachepc_mfence(); - cachepc_cpuid(); - - curr_cl = head; - do { - curr_cl = curr_cl->prev; - } while(curr_cl != head); - - cachepc_mfence(); - cachepc_cpuid(); - - return curr_cl->prev; -} - -cacheline * -cachepc_probe(cacheline *start_cl) -{ - uint64_t pre, post; - cacheline *next_cl; - cacheline *curr_cl; - - cachepc_mfence(); - cachepc_cpuid(); - - curr_cl = start_cl; - - do { - pre = cachepc_read_pmc(0); - - asm volatile( - "mov 8(%[curr_cl]), %%rax \n\t" // +8 - "mov 8(%%rax), %%rcx \n\t" // +16 - "mov 8(%%rcx), %%rax \n\t" // +24 - "mov 8(%%rax), %%rcx \n\t" // +32 - "mov 8(%%rcx), %%rax \n\t" // +40 - "mov 8(%%rax), %%rcx \n\t" // +48 - "mov 8(%%rcx), %[curr_cl_out] \n\t" // +56 - "mov 8(%[curr_cl_out]), %[next_cl_out] \n\t" // +64 - : [next_cl_out] "=r" (next_cl), - [curr_cl_out] "=r" (curr_cl) - : [curr_cl] "r" (curr_cl) - : "rax", "rcx" - ); - - post = cachepc_read_pmc(0); - - /* works across size boundary */ - curr_cl->count = post - pre; - - curr_cl = next_cl; - } while (__builtin_expect(curr_cl != start_cl, 1)); - - next_cl = curr_cl->next; - - cachepc_mfence(); - cachepc_cpuid(); - - return next_cl; -} - -void -cachepc_victim(void *p) -{ - cachepc_mfence(); - cachepc_cpuid(); - - cachepc_readq(p); - - cachepc_mfence(); - cachepc_cpuid(); -} - -uint64_t -cachepc_read_pmc(uint64_t event) -{ - uint32_t lo, hi; - uint64_t res; - - cachepc_mfence(); - cachepc_cpuid(); - - event = 0xC0010201 + 2 * event; - - asm volatile ( - "rdmsr" - : "=a" (lo), "=d" (hi) - : "c"(event) - ); - res = ((uint64_t) hi << 32) | (uint64_t) lo; - - cachepc_mfence(); - cachepc_cpuid(); - - return res; -} diff --git a/kmod/cachepc_user.h b/kmod/cachepc_user.h deleted file mode 100644 index f815839..0000000 --- a/kmod/cachepc_user.h +++ /dev/null @@ -1,8 +0,0 @@ -#pragma once - -#include - -#define CACHEPC_IOCTL_MAGIC 0xBF -#define CACHEPC_IOCTL_TEST_ACCESS _IOWR(CACHEPC_IOCTL_MAGIC, 0, uint32_t) -#define CACHEPC_IOCTL_TEST_EVICTION _IOWR(CACHEPC_IOCTL_MAGIC, 1, uint32_t) -#define CACHEPC_IOCTL_INIT_PMC _IOW(CACHEPC_IOCTL_MAGIC, 2, uint32_t) diff --git a/kmod/device_conf.h b/kmod/device_conf.h deleted file mode 100644 index e24d681..0000000 --- a/kmod/device_conf.h +++ /dev/null @@ -1,29 +0,0 @@ -#pragma once - -// TODO: Read from kernel headers - -// General settings -// #define PAGE_SIZE 4096 -#define PROCESSOR_FREQ 2900000000 - -// Cache related settings -#define CACHELINE_SIZE 64 -#define CACHE_GROUP_SIZE (PAGE_SIZE / CACHELINE_SIZE) - -// Addressing: -// - virtual: 0 -// - physical: 1 -#define L1_ADDRESSING 0 -#define L1_SETS 64 -#define L1_ASSOCIATIVITY 8 -#define L1_ACCESS_TIME 4 - -#define L2_ADDRESSING 1 -#define L2_SETS 512 -#define L2_ASSOCIATIVITY 8 -#define L2_ACCESS_TIME 12 - -#define L3_ADDRESSING 1 -#define L3_SETS 4096 -#define L3_ASSOCIATIVITY 16 -#define L3_ACCESS_TIME 30 diff --git a/kmod/kvm.c b/kmod/kvm.c deleted file mode 100644 index 4deb4fa..0000000 --- a/kmod/kvm.c +++ /dev/null @@ -1,392 +0,0 @@ -#include "kvm.h" - -#include -#include -#include -#include -#include - -struct proc_ops 
cachepc_proc_ops; - -uint16_t *cachepc_msrmts; -size_t cachepc_msrmts_count; -EXPORT_SYMBOL(cachepc_msrmts); -EXPORT_SYMBOL(cachepc_msrmts_count); - -cache_ctx *cachepc_ctx; -cacheline *cachepc_ds; -EXPORT_SYMBOL(cachepc_ctx); -EXPORT_SYMBOL(cachepc_ds); - -uint64_t cachepc_regs_tmp[16]; -uint64_t cachepc_regs_vm[16]; -EXPORT_SYMBOL(cachepc_regs_tmp); -EXPORT_SYMBOL(cachepc_regs_vm); - -int -cachepc_kvm_proc_open(struct inode *inode, struct file *file) -{ - try_module_get(THIS_MODULE); - - return 0; -} - -int -cachepc_kvm_proc_close(struct inode *inode, struct file *file) -{ - module_put(THIS_MODULE); - - return 0; -} - -ssize_t -cachepc_kvm_proc_read(struct file *file, char *buf, size_t buflen, loff_t *off) -{ - size_t len, left; - size_t size; - - printk(KERN_WARNING "CachePC: Reading entries (%lu:%lli)\n", - buflen, off ? *off : 0); - - size = cachepc_msrmts_count * sizeof(uint16_t); - if (!off || *off >= size || *off < 0) - return 0; - - len = size - *off; - if (len > buflen) len = buflen; - - left = copy_to_user(buf, (uint8_t *) cachepc_msrmts + *off, len); - - len -= left; - *off += len; - - return len; -} - -ssize_t -cachepc_kvm_proc_write(struct file *file, const char *buf, size_t buflen, loff_t *off) -{ - return 0; -} - -loff_t -cachepc_kvm_proc_lseek(struct file *file, loff_t off, int mode) -{ - switch (mode) { - case SEEK_SET: - file->f_pos = off; - break; - case SEEK_CUR: - file->f_pos += off; - break; - case SEEK_END: - file->f_pos = cachepc_msrmts_count * sizeof(uint16_t) + off; - break; - default: - return -EINVAL; - } - - return file->f_pos; -} - -void -cachepc_kvm_prime_probe_test(void *p) -{ - cacheline *lines; - cacheline *cl, *head; - uint32_t count; - uint32_t *arg; - int i, max; - - arg = p; - - /* l2 data cache, hit or miss */ - cachepc_init_pmc(0, 0x64, 0xD8); - - lines = cachepc_aligned_alloc(PAGE_SIZE, cachepc_ctx->cache_size); - BUG_ON(lines == NULL); - - max = cachepc_ctx->nr_of_cachelines; - - cachepc_cpuid(); - cachepc_mfence(); - - for (i = 0; i < max; i++) - asm volatile ("mov (%0), %%rbx" : : "r"(lines + i) : "rbx"); - - head = cachepc_prime(cachepc_ds); - cachepc_probe(head); - - count = 0; - cl = head = cachepc_ds; - do { - count += cl->count; - cl = cl->next; - } while (cl != head); - - printk(KERN_WARNING "CachePC: Prime-probe test done (%u vs. %u => %s)\n", - count, 0, (count == 0) ? "passed" : "failed"); - - if (arg) *arg = (count == 0); - - kfree(lines); -} - -void -cachepc_kvm_stream_hwpf_test(void *p) -{ - cacheline *lines; - uint32_t count; - uint32_t *arg; - uint32_t i, max; - - arg = p; - - /* TODO: accurately detect hwpf */ - - /* l2 data cache, hit or miss */ - cachepc_init_pmc(0, 0x64, 0xD8); - - lines = cachepc_aligned_alloc(PAGE_SIZE, cachepc_ctx->cache_size); - BUG_ON(lines == NULL); - - max = cachepc_ctx->nr_of_cachelines; - - cachepc_prime(cachepc_ds); - - count -= cachepc_read_pmc(0); - for (i = 0; i < max; i++) - asm volatile ("mov (%0), %%rbx" : : "r"(lines + i) : "rbx"); - count += cachepc_read_pmc(0); - - printk(KERN_WARNING "CachePC: HWPF test done (%u vs. %u => %s)\n", - count, max, (count == max) ? "passed" : "failed"); - - if (arg) *arg = (count == max); - - kfree(lines); -} - -void -cachepc_kvm_single_access_test(void *p) -{ - cacheline *ptr; - uint64_t pre, post; - uint32_t *arg; - - /* l2 data cache, hit or miss */ - cachepc_init_pmc(0, 0x64, 0xD8); - - arg = p; - - WARN_ON(arg && *arg >= L1_SETS); - if (arg && *arg >= L1_SETS) return; - ptr = cachepc_prepare_victim(cachepc_ctx, arg ? 
*arg : 48); - - cachepc_prime(cachepc_ds); - - pre = cachepc_read_pmc(0); - cachepc_victim(ptr); - post = cachepc_read_pmc(0); - - printk(KERN_WARNING "CachePC: Single access test done (%llu vs %u => %s)", - post - pre, 1, (post - pre == 1) ? "passed" : "failed"); - - if (arg) *arg = post - pre; - - cachepc_release_victim(cachepc_ctx, ptr); -} - -void -cachepc_kvm_single_eviction_test(void *p) -{ - cacheline *head, *cl, *evicted; - cacheline *ptr; - uint32_t target; - uint32_t *arg; - int count; - - arg = p; - - /* l2 data cache, hit or miss */ - cachepc_init_pmc(0, 0x64, 0xD8); - - WARN_ON(arg && *arg >= L1_SETS); - if (arg && *arg >= L1_SETS) return; - target = arg ? *arg : 48; - - ptr = cachepc_prepare_victim(cachepc_ctx, target); - - head = cachepc_prime(cachepc_ds); - cachepc_victim(ptr); - cachepc_probe(head); - - count = 0; - evicted = NULL; - cl = head = cachepc_ds; - do { - if (IS_FIRST(cl->flags) && cl->count > 0) { - evicted = cl; - count += cl->count; - } - cl = cl->next; - } while (cl != head); - - printk(KERN_WARNING "CachePC: Single eviction test done (%u vs %u => %s)\n", - count, 1, (count == 1 && evicted->cache_set == target) ? "passed" : "failed"); - cachepc_save_msrmts(head); - - if (arg) *arg = count; - - cachepc_release_victim(cachepc_ctx, ptr); -} - -void -cachepc_kvm_system_setup(void) -{ - uint64_t reg_addr, val; - uint32_t lo, hi; - - /* disable streaming store */ - reg_addr = 0xc0011020; - asm volatile ("rdmsr" : "=a"(lo), "=d"(hi) : "c"(reg_addr)); - val = (uint64_t) lo | ((uint64_t) hi << 32); - val |= 1 << 13; - asm volatile ("wrmsr" : : "c"(reg_addr), "a"(val), "d"(0x00)); - printk("CachePC: Writing MSR %08llX: %016llX\n", reg_addr, val); - - /* disable speculative data cache tlb reloads */ - reg_addr = 0xc0011022; - asm volatile ("rdmsr" : "=a"(lo), "=d"(hi) : "c"(reg_addr)); - val = (uint64_t) lo | ((uint64_t) hi << 32); - val |= 1 << 4; - asm volatile ("wrmsr" : : "c"(reg_addr), "a"(val), "d"(0x00)); - printk("CachePC: Writing MSR %08llX: %016llX\n", reg_addr, val); - - /* disable data cache hardware prefetcher */ - reg_addr = 0xc0011022; - asm volatile ("rdmsr" : "=a"(lo), "=d"(hi) : "c"(reg_addr)); - val = (uint64_t) lo | ((uint64_t) hi << 32); - val |= 1 << 13; - asm volatile ("wrmsr" : : "c"(reg_addr), "a"(val), "d"(0x00)); - printk("CachePC: Writing MSR %08llX: %016llX\n", reg_addr, val); -} - -void -cachepc_kvm_init_pmc_ioctl(void *p) -{ - uint32_t event; - uint8_t index, event_no, event_mask; - - WARN_ON(p == NULL); - if (!p) return; - - event = *(uint32_t *)p; - - index = (event & 0xFF000000) >> 24; - event_no = (event & 0x0000FF00) >> 8; - event_mask = (event & 0x000000FF) >> 0; - - cachepc_init_pmc(index, event_no, event_mask); -} - -long -cachepc_kvm_ioctl(struct file *file, unsigned int cmd, unsigned long argp) -{ - void __user *arg_user; - uint32_t u32; - int ret; - - arg_user = (void __user *)argp; - switch (cmd) { - case CACHEPC_IOCTL_TEST_ACCESS: - printk(KERN_WARNING "CachePC: Called ioctl access test\n"); - if (!arg_user) return -EINVAL; - if (copy_from_user(&u32, arg_user, sizeof(uint32_t))) - return -EFAULT; - ret = smp_call_function_single(2, - cachepc_kvm_single_access_test, &u32, true); - WARN_ON(ret != 0); - if (copy_to_user(arg_user, &u32, sizeof(uint32_t))) - return -EFAULT; - break; - case CACHEPC_IOCTL_TEST_EVICTION: - printk(KERN_WARNING "CachePC: Called ioctl eviction test\n"); - if (!arg_user) return -EINVAL; - if (copy_from_user(&u32, arg_user, sizeof(uint32_t))) - return -EFAULT; - ret = smp_call_function_single(2, - 
cachepc_kvm_single_eviction_test, &u32, true); - WARN_ON(ret != 0); - if (copy_to_user(arg_user, &u32, sizeof(uint32_t))) - return -EFAULT; - break; - case CACHEPC_IOCTL_INIT_PMC: - printk(KERN_WARNING "CachePC: Called ioctl init counter\n"); - if (!arg_user) return -EINVAL; - if (copy_from_user(&u32, arg_user, sizeof(uint32_t))) - return -EFAULT; - ret = smp_call_function_single(2, - cachepc_kvm_init_pmc_ioctl, &u32, true); - WARN_ON(ret != 0); - break; - default: - return -EINVAL; - } - - return 0; -} - -void -cachepc_kvm_setup_test(void *p) -{ - int cpu; - - cpu = get_cpu(); - - printk(KERN_WARNING "CachePC: Running on core %i\n", cpu); - - cachepc_ctx = cachepc_get_ctx(L1); - cachepc_ds = cachepc_prepare_ds(cachepc_ctx); - - cachepc_kvm_system_setup(); - - cachepc_kvm_prime_probe_test(NULL); - cachepc_kvm_single_access_test(NULL); - cachepc_kvm_single_eviction_test(NULL); - cachepc_kvm_stream_hwpf_test(NULL); - - put_cpu(); -} - -void -cachepc_kvm_init(void) -{ - int ret; - - cachepc_msrmts_count = L1_SETS; - cachepc_msrmts = kzalloc(cachepc_msrmts_count * sizeof(uint16_t), GFP_KERNEL); - BUG_ON(cachepc_msrmts == NULL); - - ret = smp_call_function_single(2, cachepc_kvm_setup_test, NULL, true); - WARN_ON(ret != 0); - - memset(&cachepc_proc_ops, 0, sizeof(cachepc_proc_ops)); - cachepc_proc_ops.proc_open = cachepc_kvm_proc_open; - cachepc_proc_ops.proc_read = cachepc_kvm_proc_read; - cachepc_proc_ops.proc_write = cachepc_kvm_proc_write; - cachepc_proc_ops.proc_lseek = cachepc_kvm_proc_lseek; - cachepc_proc_ops.proc_release = cachepc_kvm_proc_close; - cachepc_proc_ops.proc_ioctl = cachepc_kvm_ioctl; - proc_create("cachepc", 0644, NULL, &cachepc_proc_ops); -} - -void -cachepc_kvm_exit(void) -{ - remove_proc_entry("cachepc", NULL); - kfree(cachepc_msrmts); - - cachepc_release_ds(cachepc_ctx, cachepc_ds); - cachepc_release_ctx(cachepc_ctx); -} diff --git a/kmod/kvm.h b/kmod/kvm.h deleted file mode 100644 index a44491e..0000000 --- a/kmod/kvm.h +++ /dev/null @@ -1,6 +0,0 @@ -#pragma once - -#include "cachepc.h" - -void cachepc_kvm_init(void); -void cachepc_kvm_exit(void); diff --git a/kmod/util.c b/kmod/util.c deleted file mode 100644 index abf2b71..0000000 --- a/kmod/util.c +++ /dev/null @@ -1,38 +0,0 @@ -#include "util.h" - -void -random_perm(uint32_t *arr, uint32_t arr_len) -{ - uint32_t i; - - /* no special ordering needed when prefetcher is disabled */ - for (i = 0; i < arr_len; i++) - arr[i] = i; - - // /* prevent stream prefetching by alternating access direction */ - // mid = arr_len / 2; - // for (i = 0; i < arr_len; i++) - // arr[i] = mid + (i % 2 ? 
-1 : 1) * ((i + 1) / 2); -} - -void -gen_random_indices(uint32_t *arr, uint32_t arr_len) -{ - uint32_t i; - - for (i = 0; i < arr_len; ++i) - arr[i] = i; - random_perm(arr, arr_len); -} - - -bool is_in_arr(uint32_t elem, uint32_t *arr, uint32_t arr_len) { - uint32_t i; - - for (i = 0; i < arr_len; ++i) { - if (arr[i] == elem) - return true; - } - - return false; -} diff --git a/kmod/util.h b/kmod/util.h deleted file mode 100644 index a0ff8be..0000000 --- a/kmod/util.h +++ /dev/null @@ -1,8 +0,0 @@ -#pragma once - -#include - -void random_perm(uint32_t *arr, uint32_t arr_len); -void gen_random_indices(uint32_t *arr, uint32_t arr_len); - -bool is_in_arr(uint32_t elem, uint32_t *arr, uint32_t arr_len); diff --git a/patch.diff b/patch.diff index b6d69ce..fa112c0 100755 --- a/patch.diff +++ b/patch.diff @@ -1,7 +1,5 @@ diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h -old mode 100644 -new mode 100755 -index eb186bc57f6a..cefc1589e398 +index eb186bc57f6a..cefc1589e398 100644 --- a/arch/x86/include/asm/kvm_page_track.h +++ b/arch/x86/include/asm/kvm_page_track.h @@ -2,8 +2,14 @@ @@ -19,267 +17,8 @@ index eb186bc57f6a..cefc1589e398 KVM_PAGE_TRACK_MAX, }; -diff --git a/arch/x86/include/asm/sev-step.c b/arch/x86/include/asm/sev-step.c -new file mode 100755 -index 000000000000..489583f33342 ---- /dev/null -+++ b/arch/x86/include/asm/sev-step.c -@@ -0,0 +1,250 @@ -+ -+#include -+#include -+#include -+#include -+#include -+ -+#include "kvm_cache_regs.h" -+#include "svm/svm.h" -+ -+ -+ -+struct kvm* main_vm; -+EXPORT_SYMBOL(main_vm); -+ -+//used to store performance counter values; 6 counters, 2 readings per counter -+uint64_t perf_reads[6][2]; -+perf_ctl_config_t perf_configs[6]; -+int perf_cpu; -+ -+ -+uint64_t perf_ctl_to_u64(perf_ctl_config_t * config) { -+ -+ uint64_t result = 0; -+ result |= ( config->EventSelect & 0xffULL); //[7:0] in result and [7:0] in EventSelect -+ result |= ( (config->UintMask & 0xffULL) << 8 ); //[15:8] -+ result |= ( (config->OsUserMode & 0x3ULL) << 16); //[17:16] -+ result |= ( (config->Edge & 0x1ULL ) << 18 ); // 18 -+ result |= ( (config->Int & 0x1ULL ) << 20 ); // 20 -+ result |= ( (config->En & 0x1ULL ) << 22 ); //22 -+ result |= ( (config->Inv & 0x1ULL ) << 23); //23 -+ result |= ( (config->CntMask & 0xffULL) << 24); //[31:24] -+ result |= ( ( (config->EventSelect & 0xf00ULL) >> 8 ) << 32); //[35:32] in result and [11:8] in EventSelect -+ result |= ( (config->HostGuestOnly & 0x3ULL) << 40); // [41:40] -+ -+ return result; -+ -+} -+ -+void write_ctl(perf_ctl_config_t * config, int cpu, uint64_t ctl_msr){ -+ wrmsrl_on_cpu(cpu, ctl_msr, perf_ctl_to_u64(config)); //always returns zero -+} -+ -+void read_ctr(uint64_t ctr_msr, int cpu, uint64_t* result) { -+ uint64_t tmp; -+ rdmsrl_on_cpu(cpu, ctr_msr, &tmp); //always returns zero -+ *result = tmp & ( (0x1ULL << 48) - 1); -+} -+ -+void setup_perfs() { -+ int i; -+ -+ perf_cpu = smp_processor_id(); -+ -+ for( i = 0; i < 6; i++) { -+ perf_configs[i].HostGuestOnly = 0x1; //0x1 means: count only guest -+ perf_configs[i].CntMask = 0x0; -+ perf_configs[i].Inv = 0x0; -+ perf_configs[i].En = 0x0; -+ perf_configs[i].Int = 0x0; -+ perf_configs[i].Edge = 0x0; -+ perf_configs[i].OsUserMode = 0x3; //0x3 means: count userland and kernel events -+ } -+ -+ //remember to set .En to enable the individual counter -+ -+ perf_configs[0].EventSelect = 0x0c0; -+ perf_configs[0].UintMask = 0x0; -+ perf_configs[0].En = 0x1; -+ write_ctl(&perf_configs[0],perf_cpu, CTL_MSR_0); -+ -+ /*programm l2d hit from 
data cache miss perf for -+ cpu_probe_pointer_chasing_inplace without counting thread. -+ N.B. that this time we count host events -+ */ -+ perf_configs[1].EventSelect = 0x064; -+ perf_configs[1].UintMask = 0x70; -+ perf_configs[1].En = 0x1; -+ perf_configs[1].HostGuestOnly = 0x2; //0x2 means: count only host events, as we do the chase here -+ write_ctl(&perf_configs[1],perf_cpu,CTL_MSR_1); -+} -+EXPORT_SYMBOL(setup_perfs); -+ -+ -+/* -+static int __my_sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src, -+ unsigned long dst, int size, -+ int *error); -+ -+int my_sev_decrypt(struct kvm* kvm, void* dst_vaddr, void* src_vaddr, uint64_t dst_paddr, uint64_t src_paddr, uint64_t len, int* api_res) { -+ -+ int call_res; -+ call_res = 0x1337; -+ *api_res = 0x1337; -+ -+ -+ if( dst_paddr % PAGE_SIZE != 0 || src_paddr % PAGE_SIZE != 0) { -+ printk("decrypt: for now, src_paddr, and dst_paddr must be page aligned"); -+ return -1; -+ } -+ -+ if( len > PAGE_SIZE ) { -+ printk("decrypt: for now, can be at most 4096 byte"); -+ return -1; -+ } -+ -+ memset(dst_vaddr,0,PAGE_SIZE); -+ -+ //clflush_cache_range(src_vaddr, PAGE_SIZE); -+ //clflush_cache_range(dst_vaddr, PAGE_SIZE); -+ wbinvd_on_all_cpus(); -+ -+ call_res = __my_sev_issue_dbg_cmd(kvm, __sme_set(src_paddr), -+ __sme_set(dst_paddr), len, api_res); -+ -+ return call_res; -+ -+} -+EXPORT_SYMBOL(my_sev_decrypt); -+ -+static int __my_sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src, -+ unsigned long dst, int size, -+ int *error) -+{ -+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; -+ struct sev_data_dbg *data; -+ int ret; -+ -+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); -+ if (!data) -+ return -ENOMEM; -+ -+ data->handle = sev->handle; -+ data->dst_addr = dst; -+ data->src_addr = src; -+ data->len = size; -+ -+ //ret = sev_issue_cmd(kvm, -+ // SEV_CMD_DBG_DECRYPT, -+ // data, error); -+ ret = sev_do_cmd(SEV_CMD_DBG_DECRYPT, data, error); -+ kfree(data); -+ return ret; -+} -+ -+int decrypt_vmsa(struct vcpu_svm* svm, struct vmcb_save_area* save_area) { -+ -+ uint64_t src_paddr, dst_paddr; -+ void * dst_vaddr; -+ void * src_vaddr; -+ struct page * dst_page; -+ int call_res,api_res; -+ call_res = 1337; -+ api_res = 1337; -+ -+ src_vaddr = svm->vmsa; -+ src_paddr = svm->vmcb->control.vmsa_pa; -+ -+ if( src_paddr % 16 != 0) { -+ printk("decrypt_vmsa: src_paddr was not 16b aligned"); -+ } -+ -+ if( sizeof( struct vmcb_save_area) % 16 != 0 ) { -+ printk("decrypt_vmsa: size of vmcb_save_area is not 16 b aligned\n"); -+ } -+ -+ dst_page = alloc_page(GFP_KERNEL); -+ dst_vaddr = vmap(&dst_page, 1, 0, PAGE_KERNEL); -+ dst_paddr = page_to_pfn(dst_page) << PAGE_SHIFT; -+ memset(dst_vaddr,0,PAGE_SIZE); -+ -+ -+ -+ if( dst_paddr % 16 != 0 ) { -+ printk("decrypt_vmsa: dst_paddr was not 16 byte aligned"); -+ } -+ -+ //printk("src_paddr = 0x%llx dst_paddr = 0x%llx\n", __sme_clr(src_paddr), __sme_clr(dst_paddr)); -+ //printk("Sizeof vmcb_save_area is: 0x%lx\n", sizeof( struct vmcb_save_area) ); -+ -+ -+ call_res = __my_sev_issue_dbg_cmd(svm->vcpu.kvm, __sme_set(src_paddr), __sme_set(dst_paddr), sizeof(struct vmcb_save_area), &api_res); -+ -+ -+ //printk("decrypt_vmsa: result of call was %d, result of api command was %d\n",call_res, api_res); -+ -+ //todo error handling -+ if( api_res != 0 ) { -+ __free_page(dst_page); -+ return -1; -+ } -+ -+ memcpy(save_area, dst_vaddr, sizeof( struct vmcb_save_area) ); -+ -+ -+ __free_page(dst_page); -+ -+ return 0; -+ -+ -+} -+ -+ -+// -+// Contains a switch to work SEV and SEV-ES -+ // -+uint64_t 
sev_step_get_rip(struct vcpu_svm* svm) { -+ struct vmcb_save_area* save_area; -+ struct kvm * kvm; -+ struct kvm_sev_info *sev; -+ uint64_t rip; -+ -+ -+ kvm = svm->vcpu.kvm; -+ sev = &to_kvm_svm(kvm)->sev_info; -+ -+ //for sev-es we need to use the debug api, to decrypt the vmsa -+ if( sev->active && sev->es_active) { -+ int res; -+ save_area = vmalloc(sizeof(struct vmcb_save_area) ); -+ memset(save_area,0, sizeof(struct vmcb_save_area)); -+ -+ res = decrypt_vmsa(svm, save_area); -+ if( res != 0) { -+ printk("sev_step_get_rip failed to decrypt\n"); -+ return 0; -+ } -+ -+ rip = save_area->rip; -+ -+ vfree(save_area); -+ } else { //otherwise we can just access as plaintexts -+ rip = svm->vmcb->save.rip; -+ } -+ return rip; -+ -+} -+EXPORT_SYMBOL(sev_step_get_rip); -+*/ -+ -+int sev_step_get_rip_kvm_vcpu(struct kvm_vcpu* vcpu,uint64_t *rip) { -+ /* -+ struct vcpu_svm *svm = container_of(vcpu, struct vcpu_svm, vcpu); -+ if( svm == NULL ) { -+ return 1; -+ } -+ (*rip) = sev_step_get_rip(svm); -+ */ -+ return 0; -+} -\ No newline at end of file diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile -old mode 100644 -new mode 100755 -index 30f244b64523..6d4a2a6530b6 +index 30f244b64523..7992f8cce838 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -1,8 +1,10 @@ @@ -294,18 +33,19 @@ index 30f244b64523..6d4a2a6530b6 ifeq ($(CONFIG_FRAME_POINTER),y) OBJECT_FILES_NON_STANDARD_vmenter.o := y endif -@@ -11,8 +13,8 @@ include $(srctree)/virt/kvm/Makefile.kvm +@@ -11,8 +13,9 @@ include $(srctree)/virt/kvm/Makefile.kvm kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ - hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \ - mmu/spte.o + hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o mmu/spte.o \ -+ sev-step.o userspace_page_track_signals.o svm/cachepc/cachepc.o svm/cachepc/util.o svm/cachepc/kvm.o ++ svm/cachepc/cachepc.o svm/cachepc/util.o svm/cachepc/kvm.o \ ++ sevstep/sevstep.o sevstep/uspt.o sevstep/kvm.o ifdef CONFIG_HYPERV kvm-y += kvm_onhyperv.o -@@ -25,7 +27,8 @@ kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ +@@ -25,7 +28,8 @@ kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ vmx/evmcs.o vmx/nested.o vmx/posted_intr.o kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o @@ -316,529 +56,150 @@ index 30f244b64523..6d4a2a6530b6 ifdef CONFIG_HYPERV kvm-amd-y += svm/svm_onhyperv.o diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c -old mode 100644 -new mode 100755 -index d871b8dee7b3..b6e1dc265cac +index d871b8dee7b3..32900ef5ee0b 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c -@@ -56,6 +56,9 @@ - - #include "paging.h" - -+#include -+#include -+ - extern bool itlb_multihit_kvm_mitigation; - - int __read_mostly nx_huge_pages = -1; -@@ -1152,8 +1155,8 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) +@@ -1152,6 +1152,8 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) } } --/* -- * Write-protect on the specified @sptep, @pt_protect indicates whether -+/* Apply the protection mode specified in @mode to the specified @sptep, -+ * @pt_protect indicates whether ++#include "../sevstep/mmu.c" ++ + /* + * Write-protect on the specified @sptep, @pt_protect indicates whether * spte write-protection is caused by protecting shadow page table. 
- * - * Note: write protection is difference between dirty logging and spte -@@ -1165,9 +1168,10 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) +@@ -1165,34 +1167,15 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) * * Return true if tlb need be flushed. */ -static bool spte_write_protect(u64 *sptep, bool pt_protect) -+static bool spte_protect(u64 *sptep, bool pt_protect, enum kvm_page_track_mode mode) - { - u64 spte = *sptep; -+ bool shouldFlush = false; - - if (!is_writable_pte(spte) && - !(pt_protect && is_mmu_writable_spte(spte))) -@@ -1175,22 +1179,45 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect) - - rmap_printk("spte %p %llx\n", sptep, *sptep); - +-{ +- u64 spte = *sptep; +- +- if (!is_writable_pte(spte) && +- !(pt_protect && is_mmu_writable_spte(spte))) +- return false; +- +- rmap_printk("spte %p %llx\n", sptep, *sptep); +- - if (pt_protect) - spte &= ~shadow_mmu_writable_mask; - spte = spte & ~PT_WRITABLE_MASK; - - return mmu_spte_update(sptep, spte); -+ if (pt_protect){ -+ //spte &= ~shadow_mmu_writable_mask; -+ spte &= ~EPT_SPTE_MMU_WRITABLE; -+ } -+ //spte = spte & ~PT_WRITABLE_MASK; -+ if(mode == KVM_PAGE_TRACK_WRITE) { -+ spte = spte & ~PT_WRITABLE_MASK; -+ shouldFlush = true; -+ } else if( mode == KVM_PAGE_TRACK_RESET_ACCESSED) { -+ spte = spte & ~PT_ACCESSED_MASK; -+ } else if(mode == KVM_PAGE_TRACK_ACCESS) { -+ spte = spte & ~PT_PRESENT_MASK; -+ spte = spte & ~PT_WRITABLE_MASK; -+ spte = spte & ~PT_USER_MASK; -+ spte = spte | (0x1ULL << PT64_NX_SHIFT); -+ shouldFlush = true; -+ } else if( mode == KVM_PAGE_TRACK_EXEC) { -+ spte = spte | (0x1ULL << PT64_NX_SHIFT); //nx bit is set, to prevent execution, not removed -+ shouldFlush = true; -+ } else if (mode == KVM_PAGE_TRACK_RESET_EXEC) { -+ spte = spte & (~(0x1ULL << PT64_NX_SHIFT)); -+ shouldFlush = true; -+ } else { -+ printk(KERN_WARNING "spte_protect was called with invalid mode" -+ "parameter %d\n",mode); -+ } -+ shouldFlush |= mmu_spte_update(sptep, spte); -+ return shouldFlush; - } - --static bool rmap_write_protect(struct kvm_rmap_head *rmap_head, -- bool pt_protect) -+static bool rmap_protect(struct kvm_rmap_head *rmap_head, bool pt_protect, enum kvm_page_track_mode mode) +-} ++// static bool spte_write_protect(u64 *sptep, bool pt_protect) ++// { ++// return sevstep_spte_protect(sptep, pt_protect, KVM_PAGE_TRACK_WRITE); ++// } + + static bool rmap_write_protect(struct kvm_rmap_head *rmap_head, + bool pt_protect) { - u64 *sptep; - struct rmap_iterator iter; - bool flush = false; - +- u64 *sptep; +- struct rmap_iterator iter; +- bool flush = false; +- - for_each_rmap_spte(rmap_head, &iter, sptep) - flush |= spte_write_protect(sptep, pt_protect); -+ for_each_rmap_spte(rmap_head, &iter, sptep) { -+ flush |= spte_protect(sptep, pt_protect, mode); -+ } - - return flush; - } -@@ -1263,7 +1290,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, - while (mask) { - rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), - PG_LEVEL_4K, slot); -- rmap_write_protect(rmap_head, false); -+ rmap_protect(rmap_head, false, KVM_PAGE_TRACK_WRITE); - - /* clear the first set bit */ - mask &= mask - 1; -@@ -1333,13 +1360,13 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, - if (READ_ONCE(eager_page_split)) - kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K); - -- kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M); -+ kvm_mmu_slot_gfn_protect(kvm, slot, start, PG_LEVEL_2M, KVM_PAGE_TRACK_WRITE); - - /* Cross two large 
pages? */ - if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) != - ALIGN(end << PAGE_SHIFT, PMD_SIZE)) -- kvm_mmu_slot_gfn_write_protect(kvm, slot, end, -- PG_LEVEL_2M); -+ kvm_mmu_slot_gfn_protect(kvm, slot, end, -+ PG_LEVEL_2M, KVM_PAGE_TRACK_WRITE); - } - - /* Now handle 4K PTEs. */ -@@ -1354,26 +1381,29 @@ int kvm_cpu_dirty_log_size(void) - return kvm_x86_ops.cpu_dirty_log_size; +- +- return flush; ++ return sevstep_rmap_protect(rmap_head, pt_protect, KVM_PAGE_TRACK_WRITE); } --bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, -+bool kvm_mmu_slot_gfn_protect(struct kvm *kvm, + static bool spte_clear_dirty(u64 *sptep) +@@ -1358,22 +1341,8 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn, -- int min_level) -+ int min_level, enum kvm_page_track_mode mode) + int min_level) { - struct kvm_rmap_head *rmap_head; - int i; +- struct kvm_rmap_head *rmap_head; +- int i; - bool write_protected = false; -+ //bool write_protected = false; -+ bool protected = false; - - if (kvm_memslots_have_rmaps(kvm)) { - for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { - rmap_head = gfn_to_rmap(gfn, i, slot); +- +- if (kvm_memslots_have_rmaps(kvm)) { +- for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { +- rmap_head = gfn_to_rmap(gfn, i, slot); - write_protected |= rmap_write_protect(rmap_head, true); -+ //write_protected |= rmap_write_protect(rmap_head, true); -+ protected |= rmap_protect(rmap_head, true, mode); - } - } - - if (is_tdp_mmu_enabled(kvm)) +- } +- } +- +- if (is_tdp_mmu_enabled(kvm)) - write_protected |= -+ //write_protected |= -+ protected |= - kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level); - +- kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level); +- - return write_protected; -+ return protected; ++ return sevstep_kvm_mmu_slot_gfn_protect(kvm, slot, ++ gfn, min_level, KVM_PAGE_TRACK_WRITE); } static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn) -@@ -1381,7 +1411,7 @@ static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn) - struct kvm_memory_slot *slot; - - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); -- return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K); -+ return kvm_mmu_slot_gfn_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K, KVM_PAGE_TRACK_WRITE); - } - - static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, -@@ -3901,6 +3931,38 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) +@@ -3901,6 +3870,10 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { -+ int send_err; -+ uint64_t current_rip; -+ int have_rip; -+ int i; -+ bool was_tracked; -+ int modes[] = {KVM_PAGE_TRACK_WRITE,KVM_PAGE_TRACK_ACCESS,KVM_PAGE_TRACK_EXEC}; -+ was_tracked = false; -+ for( i = 0; i < sizeof(modes) / sizeof(modes[0]); i++ ) { -+ if(kvm_slot_page_track_is_active(vcpu->kvm, fault->slot, fault->gfn,modes[i])) { -+ __untrack_single_page(vcpu, fault->gfn, modes[i]); -+ was_tracked = true; -+ } -+ } -+ if( was_tracked ) { -+ have_rip = false; -+ if( uspt_should_get_rip() ) { -+ //! 
because 0 indicates "no error" but have_rip should be one if successfull -+ have_rip = (!sev_step_get_rip_kvm_vcpu(vcpu,¤t_rip)); -+ } -+ if( uspt_batch_tracking_in_progress() ) { -+ if( (send_err = uspt_batch_tracking_save(fault->gfn << PAGE_SHIFT,fault->error_code,have_rip,current_rip)) ) { -+ printk_ratelimited("uspt_batch_tracking_save failed with %d\n##########################\n",send_err); -+ } -+ uspt_batch_tracking_handle_retrack(vcpu,fault->gfn); -+ uspt_batch_tracking_inc_event_idx(); -+ } else { -+ if( (send_err = uspt_send_and_block(fault->gfn << PAGE_SHIFT,fault->error_code,have_rip,current_rip)) ) { -+ printk("uspt_send_and_block failed with %d\n##########################\n",send_err); -+ } -+ } -+ } ++ int active; ++ ++ sevstep_uspt_page_fault_handle(vcpu, fault); + if (unlikely(fault->rsvd)) return false; -@@ -3911,7 +3973,7 @@ static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu, +@@ -3911,8 +3884,11 @@ static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu, * guest is writing the page which is write tracked which can * not be fixed by page fault handler. */ - if (kvm_slot_page_track_is_active(vcpu->kvm, fault->slot, fault->gfn, KVM_PAGE_TRACK_WRITE)) -+ if (kvm_slot_page_track_is_active(vcpu->kvm, fault->slot, fault->gfn, KVM_PAGE_TRACK_WRITE) || kvm_slot_page_track_is_active(vcpu->kvm, fault->slot, fault->gfn, KVM_PAGE_TRACK_ACCESS)) - return true; +- return true; ++ active = kvm_slot_page_track_is_active(vcpu->kvm, ++ fault->slot, fault->gfn, KVM_PAGE_TRACK_WRITE); ++ active |= kvm_slot_page_track_is_active(vcpu->kvm, ++ fault->slot, fault->gfn, KVM_PAGE_TRACK_ACCESS); ++ if (active) return true; return false; -@@ -5991,7 +6053,7 @@ static bool slot_rmap_write_protect(struct kvm *kvm, - struct kvm_rmap_head *rmap_head, - const struct kvm_memory_slot *slot) - { -- return rmap_write_protect(rmap_head, false); -+ return rmap_protect(rmap_head, false, KVM_PAGE_TRACK_WRITE); } - - void kvm_mmu_slot_remove_write_access(struct kvm *kvm, -diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h -old mode 100644 -new mode 100755 -index bd2a26897b97..aa57ab1b4c89 ---- a/arch/x86/kvm/mmu/mmu_internal.h -+++ b/arch/x86/kvm/mmu/mmu_internal.h -@@ -133,9 +133,9 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, - - void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); - void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); --bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, -+bool kvm_mmu_slot_gfn_protect(struct kvm *kvm, - struct kvm_memory_slot *slot, u64 gfn, -- int min_level); -+ int min_level, enum kvm_page_track_mode mode); - void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, - u64 start_gfn, u64 pages); - unsigned int pte_list_count(struct kvm_rmap_head *rmap_head); diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c -old mode 100644 -new mode 100755 -index 2e09d1b6249f..22b631351673 +index 2e09d1b6249f..17b69a1f2b40 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c -@@ -131,9 +131,11 @@ void kvm_slot_page_track_add_page(struct kvm *kvm, +@@ -19,6 +19,8 @@ + #include "mmu.h" + #include "mmu_internal.h" + ++#include "../sevstep/sevstep.h" ++ + bool kvm_page_track_write_tracking_enabled(struct kvm *kvm) + { + return IS_ENABLED(CONFIG_KVM_EXTERNAL_WRITE_TRACKING) || +@@ -131,9 +133,10 @@ void kvm_slot_page_track_add_page(struct kvm *kvm, */ kvm_mmu_gfn_disallow_lpage(slot, gfn); - if (mode == 
KVM_PAGE_TRACK_WRITE) - if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K)) -+ //if (mode == KVM_PAGE_TRACK_WRITE) -+ // if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K)) -+ if (kvm_mmu_slot_gfn_protect(kvm, slot, gfn, PG_LEVEL_4K, mode)) { - kvm_flush_remote_tlbs(kvm); +- kvm_flush_remote_tlbs(kvm); ++ if (sevstep_kvm_mmu_slot_gfn_protect(kvm, ++ slot, gfn, PG_LEVEL_4K, mode)) { ++ kvm_flush_remote_tlbs(kvm); + } } EXPORT_SYMBOL_GPL(kvm_slot_page_track_add_page); -diff --git a/arch/x86/kvm/sev-step.c b/arch/x86/kvm/sev-step.c -new file mode 100755 -index 000000000000..489583f33342 +diff --git a/arch/x86/kvm/sevstep b/arch/x86/kvm/sevstep +new file mode 120000 +index 000000000000..642ea24bf098 --- /dev/null -+++ b/arch/x86/kvm/sev-step.c -@@ -0,0 +1,250 @@ -+ -+#include -+#include -+#include -+#include -+#include -+ -+#include "kvm_cache_regs.h" -+#include "svm/svm.h" -+ -+ -+ -+struct kvm* main_vm; -+EXPORT_SYMBOL(main_vm); -+ -+//used to store performance counter values; 6 counters, 2 readings per counter -+uint64_t perf_reads[6][2]; -+perf_ctl_config_t perf_configs[6]; -+int perf_cpu; -+ -+ -+uint64_t perf_ctl_to_u64(perf_ctl_config_t * config) { -+ -+ uint64_t result = 0; -+ result |= ( config->EventSelect & 0xffULL); //[7:0] in result and [7:0] in EventSelect -+ result |= ( (config->UintMask & 0xffULL) << 8 ); //[15:8] -+ result |= ( (config->OsUserMode & 0x3ULL) << 16); //[17:16] -+ result |= ( (config->Edge & 0x1ULL ) << 18 ); // 18 -+ result |= ( (config->Int & 0x1ULL ) << 20 ); // 20 -+ result |= ( (config->En & 0x1ULL ) << 22 ); //22 -+ result |= ( (config->Inv & 0x1ULL ) << 23); //23 -+ result |= ( (config->CntMask & 0xffULL) << 24); //[31:24] -+ result |= ( ( (config->EventSelect & 0xf00ULL) >> 8 ) << 32); //[35:32] in result and [11:8] in EventSelect -+ result |= ( (config->HostGuestOnly & 0x3ULL) << 40); // [41:40] -+ -+ return result; -+ -+} -+ -+void write_ctl(perf_ctl_config_t * config, int cpu, uint64_t ctl_msr){ -+ wrmsrl_on_cpu(cpu, ctl_msr, perf_ctl_to_u64(config)); //always returns zero -+} -+ -+void read_ctr(uint64_t ctr_msr, int cpu, uint64_t* result) { -+ uint64_t tmp; -+ rdmsrl_on_cpu(cpu, ctr_msr, &tmp); //always returns zero -+ *result = tmp & ( (0x1ULL << 48) - 1); -+} -+ -+void setup_perfs() { -+ int i; -+ -+ perf_cpu = smp_processor_id(); -+ -+ for( i = 0; i < 6; i++) { -+ perf_configs[i].HostGuestOnly = 0x1; //0x1 means: count only guest -+ perf_configs[i].CntMask = 0x0; -+ perf_configs[i].Inv = 0x0; -+ perf_configs[i].En = 0x0; -+ perf_configs[i].Int = 0x0; -+ perf_configs[i].Edge = 0x0; -+ perf_configs[i].OsUserMode = 0x3; //0x3 means: count userland and kernel events -+ } -+ -+ //remember to set .En to enable the individual counter -+ -+ perf_configs[0].EventSelect = 0x0c0; -+ perf_configs[0].UintMask = 0x0; -+ perf_configs[0].En = 0x1; -+ write_ctl(&perf_configs[0],perf_cpu, CTL_MSR_0); -+ -+ /*programm l2d hit from data cache miss perf for -+ cpu_probe_pointer_chasing_inplace without counting thread. -+ N.B. 
that this time we count host events -+ */ -+ perf_configs[1].EventSelect = 0x064; -+ perf_configs[1].UintMask = 0x70; -+ perf_configs[1].En = 0x1; -+ perf_configs[1].HostGuestOnly = 0x2; //0x2 means: count only host events, as we do the chase here -+ write_ctl(&perf_configs[1],perf_cpu,CTL_MSR_1); -+} -+EXPORT_SYMBOL(setup_perfs); -+ -+ -+/* -+static int __my_sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src, -+ unsigned long dst, int size, -+ int *error); -+ -+int my_sev_decrypt(struct kvm* kvm, void* dst_vaddr, void* src_vaddr, uint64_t dst_paddr, uint64_t src_paddr, uint64_t len, int* api_res) { -+ -+ int call_res; -+ call_res = 0x1337; -+ *api_res = 0x1337; -+ -+ -+ if( dst_paddr % PAGE_SIZE != 0 || src_paddr % PAGE_SIZE != 0) { -+ printk("decrypt: for now, src_paddr, and dst_paddr must be page aligned"); -+ return -1; -+ } -+ -+ if( len > PAGE_SIZE ) { -+ printk("decrypt: for now, can be at most 4096 byte"); -+ return -1; -+ } -+ -+ memset(dst_vaddr,0,PAGE_SIZE); -+ -+ //clflush_cache_range(src_vaddr, PAGE_SIZE); -+ //clflush_cache_range(dst_vaddr, PAGE_SIZE); -+ wbinvd_on_all_cpus(); -+ -+ call_res = __my_sev_issue_dbg_cmd(kvm, __sme_set(src_paddr), -+ __sme_set(dst_paddr), len, api_res); -+ -+ return call_res; -+ -+} -+EXPORT_SYMBOL(my_sev_decrypt); -+ -+static int __my_sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src, -+ unsigned long dst, int size, -+ int *error) -+{ -+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; -+ struct sev_data_dbg *data; -+ int ret; -+ -+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); -+ if (!data) -+ return -ENOMEM; -+ -+ data->handle = sev->handle; -+ data->dst_addr = dst; -+ data->src_addr = src; -+ data->len = size; -+ -+ //ret = sev_issue_cmd(kvm, -+ // SEV_CMD_DBG_DECRYPT, -+ // data, error); -+ ret = sev_do_cmd(SEV_CMD_DBG_DECRYPT, data, error); -+ kfree(data); -+ return ret; -+} -+ -+int decrypt_vmsa(struct vcpu_svm* svm, struct vmcb_save_area* save_area) { -+ -+ uint64_t src_paddr, dst_paddr; -+ void * dst_vaddr; -+ void * src_vaddr; -+ struct page * dst_page; -+ int call_res,api_res; -+ call_res = 1337; -+ api_res = 1337; -+ -+ src_vaddr = svm->vmsa; -+ src_paddr = svm->vmcb->control.vmsa_pa; -+ -+ if( src_paddr % 16 != 0) { -+ printk("decrypt_vmsa: src_paddr was not 16b aligned"); -+ } -+ -+ if( sizeof( struct vmcb_save_area) % 16 != 0 ) { -+ printk("decrypt_vmsa: size of vmcb_save_area is not 16 b aligned\n"); -+ } -+ -+ dst_page = alloc_page(GFP_KERNEL); -+ dst_vaddr = vmap(&dst_page, 1, 0, PAGE_KERNEL); -+ dst_paddr = page_to_pfn(dst_page) << PAGE_SHIFT; -+ memset(dst_vaddr,0,PAGE_SIZE); -+ -+ -+ -+ if( dst_paddr % 16 != 0 ) { -+ printk("decrypt_vmsa: dst_paddr was not 16 byte aligned"); -+ } -+ -+ //printk("src_paddr = 0x%llx dst_paddr = 0x%llx\n", __sme_clr(src_paddr), __sme_clr(dst_paddr)); -+ //printk("Sizeof vmcb_save_area is: 0x%lx\n", sizeof( struct vmcb_save_area) ); -+ -+ -+ call_res = __my_sev_issue_dbg_cmd(svm->vcpu.kvm, __sme_set(src_paddr), __sme_set(dst_paddr), sizeof(struct vmcb_save_area), &api_res); -+ -+ -+ //printk("decrypt_vmsa: result of call was %d, result of api command was %d\n",call_res, api_res); -+ -+ //todo error handling -+ if( api_res != 0 ) { -+ __free_page(dst_page); -+ return -1; -+ } -+ -+ memcpy(save_area, dst_vaddr, sizeof( struct vmcb_save_area) ); -+ -+ -+ __free_page(dst_page); -+ -+ return 0; -+ -+ -+} -+ -+ -+// -+// Contains a switch to work SEV and SEV-ES -+ // -+uint64_t sev_step_get_rip(struct vcpu_svm* svm) { -+ struct vmcb_save_area* save_area; -+ struct kvm * kvm; -+ struct 
kvm_sev_info *sev; -+ uint64_t rip; -+ -+ -+ kvm = svm->vcpu.kvm; -+ sev = &to_kvm_svm(kvm)->sev_info; -+ -+ //for sev-es we need to use the debug api, to decrypt the vmsa -+ if( sev->active && sev->es_active) { -+ int res; -+ save_area = vmalloc(sizeof(struct vmcb_save_area) ); -+ memset(save_area,0, sizeof(struct vmcb_save_area)); -+ -+ res = decrypt_vmsa(svm, save_area); -+ if( res != 0) { -+ printk("sev_step_get_rip failed to decrypt\n"); -+ return 0; -+ } -+ -+ rip = save_area->rip; -+ -+ vfree(save_area); -+ } else { //otherwise we can just access as plaintexts -+ rip = svm->vmcb->save.rip; -+ } -+ return rip; -+ -+} -+EXPORT_SYMBOL(sev_step_get_rip); -+*/ -+ -+int sev_step_get_rip_kvm_vcpu(struct kvm_vcpu* vcpu,uint64_t *rip) { -+ /* -+ struct vcpu_svm *svm = container_of(vcpu, struct vcpu_svm, vcpu); -+ if( svm == NULL ) { -+ return 1; -+ } -+ (*rip) = sev_step_get_rip(svm); -+ */ -+ return 0; -+} ++++ b/arch/x86/kvm/sevstep +@@ -0,0 +1 @@ ++/home/louis/kvm-prime-count/sevstep \ No newline at end of file diff --git a/arch/x86/kvm/svm/cachepc b/arch/x86/kvm/svm/cachepc new file mode 120000 -index 000000000000..7bef8c5db46c +index 000000000000..9119e44af1f0 --- /dev/null +++ b/arch/x86/kvm/svm/cachepc @@ -0,0 +1 @@ -+/home/louis/kvm-prime-count/kmod ++/home/louis/kvm-prime-count/cachepc \ No newline at end of file diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index cf0bf456d520..4dbb8041541f 100644 @@ -894,9 +255,7 @@ index cf0bf456d520..4dbb8041541f 100644 guest_state_exit_irqoff(); diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S -old mode 100644 -new mode 100755 -index dfaeb47fcf2a..0626f3fdddfd +index dfaeb47fcf2a..0626f3fdddfd 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -29,12 +29,59 @@ @@ -1027,677 +386,21 @@ index dfaeb47fcf2a..0626f3fdddfd 2: cli -diff --git a/arch/x86/kvm/userspace_page_track_signals.c b/arch/x86/kvm/userspace_page_track_signals.c -new file mode 100755 -index 000000000000..7f37c9c7e4cd ---- /dev/null -+++ b/arch/x86/kvm/userspace_page_track_signals.c -@@ -0,0 +1,445 @@ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+ -+ -+//crude sync mechanism. don't know a good way to act on errors yet. 
-+uint64_t last_sent_event_id = 1; -+uint64_t last_acked_event_id = 1; -+DEFINE_RWLOCK(event_lock); -+ -+page_fault_event_t sent_event; -+static int have_event = 0; -+ -+static bool get_rip = true; -+ -+static int inited = 0; -+ -+ -+ -+ -+ -+void uspt_clear(void) { -+ write_lock(&event_lock); -+ inited = 0; -+ last_sent_event_id = 1; -+ last_acked_event_id = 1; -+ have_event = 0; -+ get_rip = false; -+ write_unlock(&event_lock); -+} -+ -+int uspt_initialize(int pid,bool should_get_rip) { -+ write_lock(&event_lock); -+ -+ inited = 1; -+ last_sent_event_id = 1; -+ last_acked_event_id = 1; -+ have_event = 0; -+ get_rip = should_get_rip; -+ write_unlock(&event_lock); -+ return 0; -+} -+ -+int uspt_is_initialiized() { -+ return inited; -+} -+ -+bool uspt_should_get_rip() { -+ bool tmp; -+ read_lock(&event_lock); -+ tmp = get_rip; -+ read_unlock(&event_lock); -+ return tmp; -+} -+ -+int uspt_send_and_block(uint64_t faulted_gpa, uint32_t error_code,bool have_rip,uint64_t rip) { -+ ktime_t abort_after; -+ page_fault_event_t message_for_user; -+ -+ read_lock(&event_lock); -+ if( !uspt_is_initialiized() ) { -+ printk("userspace_page_track_signals: uspt_send_and_block : ctx not initialized!\n"); -+ read_unlock(&event_lock); -+ return 1; -+ } -+ read_unlock(&event_lock); -+ -+ write_lock(&event_lock); -+ if( last_sent_event_id != last_acked_event_id ) { -+ printk("event id_s out of sync, aborting. Fix this later\n"); -+ write_unlock(&event_lock); -+ return 1; -+ } else { -+ //TODO: handle overflow -+ last_sent_event_id++; -+ } -+ message_for_user.id = last_sent_event_id; -+ message_for_user.faulted_gpa = faulted_gpa; -+ message_for_user.error_code = error_code; -+ message_for_user.have_rip_info = have_rip; -+ message_for_user.rip = rip; -+ message_for_user.ns_timestamp = ktime_get_real_ns(); -+ message_for_user.have_retired_instructions = false; -+ -+ //for poll based system; -+ have_event = 1; -+ sent_event = message_for_user; -+ //printk("uspt_send_and_block sending event %llu\n",sent_event.id); -+ -+ write_unlock(&event_lock); -+ -+ -+ //wait for ack, but with tiemout. 
Otherwise small bugs in userland easily lead -+ //to a kernel hang -+ abort_after = ktime_get() + 1000000000ULL; //1 sec in nanosecond -+ while( !uspt_is_event_done(sent_event.id) ) { -+ if( ktime_get() > abort_after ) { -+ printk("Waiting for ack of event %llu timed out, continuing\n",sent_event.id); -+ return 3; -+ } -+ } -+ return 0; -+} -+ -+int uspt_is_event_done(uint64_t id) { -+ int res; -+ read_lock(&event_lock); -+ res = last_acked_event_id >= id; -+ read_unlock(&event_lock); -+ return res; -+ -+} -+ -+int uspt_handle_poll_event(page_fault_event_t* userpace_mem) { -+ int err; -+ -+ //most of the time we won't have an event -+ read_lock(&event_lock); -+ if( !have_event) { -+ read_unlock(&event_lock); -+ return KVM_USPT_POLL_EVENT_NO_EVENT; -+ } -+ read_unlock(&event_lock); -+ -+ write_lock(&event_lock); -+ if( have_event) { -+ err = copy_to_user(userpace_mem, &sent_event, sizeof(page_fault_event_t)); -+ have_event = 0; -+ } else { -+ err = KVM_USPT_POLL_EVENT_NO_EVENT; -+ } -+ write_unlock(&event_lock); -+ return err; -+ -+} -+ -+static int _uspt_handle_ack_event(uint64_t id) { -+ int err = 0; -+ write_lock(&event_lock); -+ if( id == last_sent_event_id) { -+ last_acked_event_id = last_sent_event_id; -+ //printk("successfull ack\n"); -+ } else { -+ err = 1; -+ printk("last sent event id is %llu but received ack for %llu\n",last_sent_event_id,id); -+ } -+ write_unlock(&event_lock); -+ return err; -+ -+ -+} -+ -+int uspt_handle_ack_event_ioctl(ack_event_t event) { -+ return _uspt_handle_ack_event(event.id); -+} -+ -+ -+ -+typedef struct { -+ bool is_active; -+ int tracking_type; -+ bool retrack; -+ -+ int perf_cpu; -+ -+ uint64_t gfn_retrack_backlog[10]; -+ int gfn_retrack_backlog_next_idx; -+ -+ page_fault_event_t * events; -+ uint64_t event_next_idx; -+ uint64_t events_size; -+ -+ bool error_occured; -+ -+ -+} batch_track_state_t; -+ -+DEFINE_SPINLOCK(batch_track_state_lock); -+static batch_track_state_t batch_track_state; -+ -+typedef struct { -+ uint64_t idx_for_last_perf_reading; -+ uint64_t last_perf_reading; -+ uint64_t delta_valid_idx; -+ uint64_t delta; -+} perf_state_t; -+ -+perf_state_t perf_state; -+ -+//setup perf_state and program retired instruction performance counter -+void _perf_state_setup_retired_instructions(void) { -+ perf_ctl_config_t retired_instructions_perf_config; -+ retired_instructions_perf_config.HostGuestOnly = 0x1; //0x1 means: count only guest -+ retired_instructions_perf_config.CntMask = 0x0; -+ retired_instructions_perf_config.Inv = 0x0; -+ retired_instructions_perf_config.Int = 0x0; -+ retired_instructions_perf_config.Edge = 0x0; -+ retired_instructions_perf_config.OsUserMode = 0x3; //0x3 means: count kern and user events -+ retired_instructions_perf_config.EventSelect = 0x0c0; -+ retired_instructions_perf_config.UintMask = 0x0; -+ retired_instructions_perf_config.En = 0x1; -+ write_ctl(&retired_instructions_perf_config,batch_track_state.perf_cpu, CTL_MSR_0); -+} -+ -+ -+//get retired instructions between current_event_idx-1 and current_event_idx -+//value is cached for multiple calls to the same current_event_idx -+uint64_t _perf_state_update_and_get_delta(uint64_t current_event_idx) { -+ uint64_t current_value; -+ -+ //check if value is "cached" -+ if( perf_state.delta_valid_idx == current_event_idx) { -+ if( current_event_idx == 0) { -+ read_ctr(CTR_MSR_0, batch_track_state.perf_cpu, ¤t_value); -+ perf_state.idx_for_last_perf_reading = current_event_idx; -+ perf_state.last_perf_reading = current_event_idx; -+ } -+ return perf_state.delta; -+ } -+ 
-+ //otherwise update, but logic is only valid for two consecutive events -+ if (current_event_idx != perf_state.idx_for_last_perf_reading+1) { -+ printk_ratelimited(KERN_CRIT "_perf_state_update_and_get_delta: last reading was for idx %llu but was queried for %llu\n",perf_state.idx_for_last_perf_reading,current_event_idx); -+ } -+ -+ read_ctr(CTR_MSR_0, batch_track_state.perf_cpu, ¤t_value); -+ perf_state.delta = (current_value - perf_state.last_perf_reading); -+ perf_state.delta_valid_idx = current_event_idx; -+ -+ perf_state.idx_for_last_perf_reading = current_event_idx; -+ perf_state.last_perf_reading = current_value; -+ -+ return perf_state.delta; -+} -+ -+void uspt_batch_tracking_inc_event_idx(void) { -+ spin_lock(&batch_track_state_lock); -+ batch_track_state.event_next_idx++; -+ spin_unlock(&batch_track_state_lock); -+} -+ -+int uspt_batch_tracking_start(int tracking_type,uint64_t expected_events, int perf_cpu,bool retrack) { -+ page_fault_event_t* events; -+ uint64_t buffer_size; -+ uint64_t idx = 0; -+ spin_lock(&batch_track_state_lock); -+ if( batch_track_state.is_active ) { -+ printk("userspace_page_track_signals: overwriting active batch track config!\n"); -+ if( batch_track_state.events != NULL ) { -+ vfree(batch_track_state.events); -+ } -+ } -+ batch_track_state.is_active = false; -+ spin_unlock(&batch_track_state_lock); -+ -+ buffer_size = expected_events*sizeof(page_fault_event_t); -+ printk("uspt_batch_tracking_start trying to alloc %llu bytes buffer for events\n",buffer_size); -+ events = vmalloc(buffer_size); -+ if( events == NULL) { -+ printk("userspace_page_track_signals: faperf_cpuiled to alloc %llu bytes for event buffer\n",buffer_size); -+ return 1; //note: lock not held here -+ } -+ -+ //access each element once to force them into memory, improving performance -+ //during tracking -+ for( idx = 0; idx < expected_events*sizeof(page_fault_event_t);idx++) { -+ ((volatile uint8_t*)events)[idx] = 0; -+ } -+ -+ perf_state.idx_for_last_perf_reading = 0; -+ perf_state.last_perf_reading = 0; -+ perf_state.delta_valid_idx = 0; -+ perf_state.delta = 0; -+ _perf_state_setup_retired_instructions(); -+ -+ -+ spin_lock(&batch_track_state_lock); -+ -+ batch_track_state.perf_cpu = perf_cpu; -+ batch_track_state.retrack = retrack; -+ -+ batch_track_state.events = events; -+ batch_track_state.event_next_idx = 0; -+ batch_track_state.events_size = expected_events; -+ -+ batch_track_state.gfn_retrack_backlog_next_idx = 0; -+ batch_track_state.tracking_type = tracking_type; -+ batch_track_state.error_occured = false; -+ -+ batch_track_state.is_active = true; -+ -+ spin_unlock(&batch_track_state_lock); -+ -+ return 0; -+ -+ -+} -+ -+void uspt_batch_tracking_handle_retrack(struct kvm_vcpu* vcpu, uint64_t current_fault_gfn) { -+ int i; -+ uint64_t ret_instr_delta; -+ -+ spin_lock(&batch_track_state_lock); -+ -+ if( !batch_track_state.retrack ) { -+ spin_unlock(&batch_track_state_lock); -+ return; -+ } -+ -+ if( smp_processor_id() != batch_track_state.perf_cpu) { -+ printk("uspt_batch_tracking_handle_retrack: perf was programmed on logical cpu %d but handler was called on %d. Did you forget to pin the vcpu thread?\n",batch_track_state.perf_cpu,smp_processor_id()); -+ } -+ ret_instr_delta = _perf_state_update_and_get_delta(batch_track_state.event_next_idx); -+ -+ -+ //faulting instructions is probably the same as on last fault -+ //try to add current fault to retrack log and return -+ //for first event idx we do not have a valid ret_instr_delta. 
Retracking for the frist time is fine, if we loop, we end up here again but with a valid delta on one of the next event -+ if( (ret_instr_delta < 2) && ( batch_track_state.event_next_idx != 0) ) { -+ int next_idx = batch_track_state.gfn_retrack_backlog_next_idx; -+ if( next_idx >= sizeof(batch_track_state.gfn_retrack_backlog)/sizeof(batch_track_state.gfn_retrack_backlog[0])) { -+ printk("uspt_batch_tracking_handle_retrack: retrack backlog full, dropping retrack for fault at 0x%llx\n",current_fault_gfn); -+ } else { -+ batch_track_state.gfn_retrack_backlog[next_idx] = current_fault_gfn; -+ batch_track_state.gfn_retrack_backlog_next_idx++; -+ } -+ -+ spin_unlock(&batch_track_state_lock); -+ return; -+ } -+ -+ //made progress, retrack everything in backlog and reset idx -+ for( i = 0; i < batch_track_state.gfn_retrack_backlog_next_idx;i++) { -+ __track_single_page(vcpu,batch_track_state.gfn_retrack_backlog[i],batch_track_state.tracking_type); -+ } -+ -+ //add current fault to list -+ batch_track_state.gfn_retrack_backlog[0] = current_fault_gfn; -+ batch_track_state.gfn_retrack_backlog_next_idx = 1; -+ -+ spin_unlock(&batch_track_state_lock); -+ -+} -+ -+int uspt_batch_tracking_save(uint64_t faulted_gpa, uint32_t error_code, bool have_rip,uint64_t rip) { -+ uint64_t ret_instr_delta; -+ page_fault_event_t* event; -+ -+ spin_lock(&batch_track_state_lock); -+ -+ if( !batch_track_state.is_active ) { -+ printk_ratelimited("userspace_page_track_signals: got save but batch tracking is not active!\n"); -+ batch_track_state.error_occured = true; -+ spin_unlock(&batch_track_state_lock); -+ return 1; -+ } -+ -+ -+ if( batch_track_state.event_next_idx >= batch_track_state.events_size) { -+ printk_ratelimited("userspace_page_track_signals: events buffer is full!\n"); -+ batch_track_state.error_occured = true; -+ spin_unlock(&batch_track_state_lock); -+ return 1; -+ } -+ -+ if( smp_processor_id() != batch_track_state.perf_cpu) { -+ printk("uspt_batch_tracking_handle_retrack: perf was programmed on logical cpu %d but handler was called on %d. Did you forget to pin the vcpu thread?\n",batch_track_state.perf_cpu,smp_processor_id()); -+ } -+ ret_instr_delta = _perf_state_update_and_get_delta(batch_track_state.event_next_idx); -+ -+ -+ if( batch_track_state.events == NULL ) { -+ printk(KERN_CRIT "userspace_page_track_signals: events buf was NULL but \"is_active\" was set! 
This should never happen!!!\n"); -+ spin_unlock(&batch_track_state_lock); -+ return 1; -+ } -+ -+ event = &batch_track_state.events[batch_track_state.event_next_idx]; -+ event->id = batch_track_state.event_next_idx; -+ event->faulted_gpa = faulted_gpa; -+ event->error_code = error_code; -+ event->have_rip_info = have_rip; -+ event->rip = rip; -+ event->ns_timestamp = ktime_get_real_ns(); -+ event->have_retired_instructions = true; -+ event->retired_instructions = ret_instr_delta; -+ -+//old inc was here -+ -+ if(batch_track_state.gfn_retrack_backlog_next_idx > (sizeof(batch_track_state.gfn_retrack_backlog)/sizeof(batch_track_state.gfn_retrack_backlog[0])) ) { -+ printk_ratelimited("userspace_page_track_signals: gfn retrack backlog overflow!\n"); -+ batch_track_state.error_occured = true; -+ spin_unlock(&batch_track_state_lock); -+ return 1; -+ } -+ -+ spin_unlock(&batch_track_state_lock); -+ return 0; -+} -+ -+int uspt_batch_tracking_stop(page_fault_event_t* results, uint64_t len, bool* error_occured) { -+ spin_lock(&batch_track_state_lock); -+ if( !batch_track_state.is_active ) { -+ printk("userspace_page_track_signals: batch tracking not active\n"); -+ spin_unlock(&batch_track_state_lock); -+ return 1; -+ -+ } -+ batch_track_state.is_active = false; -+ -+ if( len > batch_track_state.event_next_idx) { -+ printk("userspace_page_track_signals: requested %llu events but got only %llu\n",len,batch_track_state.event_next_idx ); -+ spin_unlock(&batch_track_state_lock); -+ return 1; -+ } -+ -+ memcpy(results,batch_track_state.events, len*sizeof(page_fault_event_t)); -+ vfree(batch_track_state.events); -+ -+ (*error_occured) = batch_track_state.error_occured; -+ -+ spin_unlock(&batch_track_state_lock); -+ -+ return 0; -+} -+ -+uint64_t uspt_batch_tracking_get_events_count() { -+ uint64_t buf; -+ spin_lock(&batch_track_state_lock); -+ buf = batch_track_state.event_next_idx; -+ spin_unlock(&batch_track_state_lock); -+ -+ return buf; -+} -+ -+bool uspt_batch_tracking_in_progress() { -+ return batch_track_state.is_active; -+} -\ No newline at end of file diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c -old mode 100644 -new mode 100755 -index d9adf79124f9..0003b96f8565 +index d9adf79124f9..1809b79cb6cd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c -@@ -82,6 +82,9 @@ +@@ -82,6 +82,8 @@ #include #include -+#include -+#include "mmu/mmu_internal.h" ++#include "sevstep/kvm.h" + #define CREATE_TRACE_POINTS #include "trace.h" -@@ -13083,6 +13086,198 @@ int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size, - : kvm_sev_es_outs(vcpu, size, port); - } - EXPORT_SYMBOL_GPL(kvm_sev_es_string_io); -+bool __untrack_single_page(struct kvm_vcpu *vcpu, gfn_t gfn, -+ enum kvm_page_track_mode mode) { -+ int idx; -+ bool ret; -+ struct kvm_memory_slot *slot; -+ -+ ret = false; -+ idx = srcu_read_lock(&vcpu->kvm->srcu); -+ if (mode == KVM_PAGE_TRACK_ACCESS) { -+ //printk("Removing gfn: %016llx from acess page track pool\n", gfn); -+ } -+ if (mode == KVM_PAGE_TRACK_WRITE) { -+ //printk("Removing gfn: %016llx from write page track pool\n", gfn); -+ } -+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); -+ -+ if (slot != NULL && kvm_slot_page_track_is_active(vcpu->kvm, slot, gfn, mode)) { -+ -+ write_lock(&vcpu->kvm->mmu_lock); -+ kvm_slot_page_track_remove_page(vcpu->kvm, slot, gfn, mode); -+ write_unlock(&vcpu->kvm->mmu_lock); -+ ret = true; -+ -+ } else { -+ -+ printk("Failed to untrack %016llx because ", gfn); -+ if (slot == NULL) { -+ printk(KERN_CONT "slot was null"); -+ } else if 
(!kvm_slot_page_track_is_active(vcpu->kvm, slot, gfn, mode)) { -+ printk(KERN_CONT "page track was not active"); -+ } -+ printk(KERN_CONT "\n"); -+ } -+ srcu_read_unlock(&vcpu->kvm->srcu, idx); -+ return ret; -+} -+EXPORT_SYMBOL(__untrack_single_page); -+ -+bool __reset_accessed_on_page(struct kvm_vcpu *vcpu, gfn_t gfn) { -+ int idx; -+ bool ret; -+ struct kvm_memory_slot *slot; -+ -+ ret = false; -+ idx = srcu_read_lock(&vcpu->kvm->srcu); -+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); -+ if( slot != NULL ) { -+ write_lock(&vcpu->kvm->mmu_lock); -+ //Vincent: The kvm mmu function now requires min_level -+ //We want all pages to protected so we do PG_LEVEL_4K -+ //https://patchwork.kernel.org/project/kvm/patch/20210416082511.2856-2-zhukeqian1@huawei.com/ -+ kvm_mmu_slot_gfn_protect(vcpu->kvm,slot,gfn,PG_LEVEL_4K,KVM_PAGE_TRACK_RESET_ACCESSED); -+ write_unlock(&vcpu->kvm->mmu_lock); -+ ret = true; -+ } -+ srcu_read_unlock(&vcpu->kvm->srcu, idx); -+ return ret; -+} -+EXPORT_SYMBOL(__reset_accessed_on_page); -+ -+bool __clear_nx_on_page(struct kvm_vcpu *vcpu, gfn_t gfn) { -+ int idx; -+ bool ret; -+ struct kvm_memory_slot *slot; -+ -+ ret = false; -+ idx = srcu_read_lock(&vcpu->kvm->srcu); -+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); -+ if( slot != NULL ) { -+ write_lock(&vcpu->kvm->mmu_lock); -+ //Vincent: The kvm mmu function now requires min_level -+ //We want all pages to protected so we do PG_LEVEL_4K -+ //https://patchwork.kernel.org/project/kvm/patch/20210416082511.2856-2-zhukeqian1@huawei.com/ -+ kvm_mmu_slot_gfn_protect(vcpu->kvm,slot,gfn,PG_LEVEL_4K,KVM_PAGE_TRACK_RESET_EXEC); -+ write_unlock(&vcpu->kvm->mmu_lock); -+ ret = true; -+ } -+ srcu_read_unlock(&vcpu->kvm->srcu, idx); -+ return ret; -+} -+EXPORT_SYMBOL(__clear_nx_on_page); -+ -+bool __track_single_page(struct kvm_vcpu *vcpu, gfn_t gfn, -+ enum kvm_page_track_mode mode) { -+ int idx; -+ bool ret; -+ struct kvm_memory_slot *slot; -+ -+ ret = false; -+ idx = srcu_read_lock(&vcpu->kvm->srcu); -+ if (mode == KVM_PAGE_TRACK_ACCESS) { -+ //printk_ratelimited("Adding gfn: %016llx to acess page track pool\n", gfn); -+ //printk("Adding gfn: %016llx to acess page track pool\n", gfn); -+ } -+ if (mode == KVM_PAGE_TRACK_WRITE) { -+ //printk_ratelimited("Adding gfn: %016llx to write page track pool\n", gfn); -+ } -+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); -+ if (slot != NULL && !kvm_slot_page_track_is_active(vcpu->kvm,slot, gfn, mode)) { -+ -+ write_lock(&vcpu->kvm->mmu_lock); -+ kvm_slot_page_track_add_page(vcpu->kvm, slot, gfn, mode); -+ write_unlock(&vcpu->kvm->mmu_lock); -+ ret = true; -+ -+ } else { -+ -+ printk("Failed to track %016llx because ", gfn); -+ if (slot == NULL) { -+ printk(KERN_CONT "slot was null"); -+ } -+ if (kvm_slot_page_track_is_active(vcpu->kvm, slot, gfn, mode)) { -+ printk(KERN_CONT "page is already tracked"); -+ } -+ printk(KERN_CONT "\n"); -+ } -+ srcu_read_unlock(&vcpu->kvm->srcu, idx); -+ return ret; -+} -+EXPORT_SYMBOL(__track_single_page); -+ -+//track all pages; taken from severed repo -+long kvm_start_tracking(struct kvm_vcpu *vcpu,enum kvm_page_track_mode mode ) { -+ long count = 0; -+ u64 iterator, iterat_max; -+ struct kvm_memory_slot *slot; -+ int idx; -+ -+ //Vincent: Memslots interface changed into a rb tree, see -+ //here: https://lwn.net/Articles/856392/ -+ //and here: https://lore.kernel.org/all/cover.1632171478.git.maciej.szmigiero@oracle.com/T/#u -+ //Thus we use instead of -+ //iterat_max = vcpu->kvm->memslots[0]->memslots[0].base_gfn -+ // + vcpu->kvm->memslots[0]->memslots[0].npages; -+ 
struct rb_node *node; -+ struct kvm_memory_slot *first_memslot; -+ node = rb_last(&(vcpu->kvm->memslots[0]->gfn_tree)); -+ first_memslot = container_of(node, struct kvm_memory_slot, gfn_node[0]); -+ iterat_max = first_memslot->base_gfn + first_memslot->npages; -+ for (iterator=0; iterator < iterat_max; iterator++) -+ { -+ idx = srcu_read_lock(&vcpu->kvm->srcu); -+ slot = kvm_vcpu_gfn_to_memslot(vcpu, iterator); -+ if ( slot != NULL && !kvm_slot_page_track_is_active(vcpu->kvm, slot, iterator, mode)) { -+ write_lock(&vcpu->kvm->mmu_lock); -+ kvm_slot_page_track_add_page(vcpu->kvm, slot, iterator, mode); -+ write_unlock(&vcpu->kvm->mmu_lock); -+ count++; -+ } -+ srcu_read_unlock(&vcpu->kvm->srcu, idx); -+ } -+ -+ return count; -+} -+EXPORT_SYMBOL(kvm_start_tracking); -+ -+//track all pages; taken from severed repo -+long kvm_stop_tracking(struct kvm_vcpu *vcpu,enum kvm_page_track_mode mode ) { -+ long count = 0; -+ u64 iterator, iterat_max; -+ struct kvm_memory_slot *slot; -+ int idx; -+ -+ -+ //Vincent: Memslots interface changed into a rb tree, see -+ //here: https://lwn.net/Articles/856392/ -+ //and here: https://lore.kernel.org/all/cover.1632171478.git.maciej.szmigiero@oracle.com/T/#u -+ //Thus we use instead of -+ //iterat_max = vcpu->kvm->memslots[0]->memslots[0].base_gfn -+ // + vcpu->kvm->memslots[0]->memslots[0].npages; -+ struct rb_node *node; -+ struct kvm_memory_slot *first_memslot; -+ node = rb_last(&(vcpu->kvm->memslots[0]->gfn_tree)); -+ first_memslot = container_of(node, struct kvm_memory_slot, gfn_node[0]); -+ iterat_max = first_memslot->base_gfn + first_memslot->npages; -+ for (iterator=0; iterator < iterat_max; iterator++) -+ { -+ idx = srcu_read_lock(&vcpu->kvm->srcu); -+ slot = kvm_vcpu_gfn_to_memslot(vcpu, iterator); -+ //Vincent: I think see here https://patchwork.kernel.org/project/kvm/patch/20210924163152.289027-22-pbonzini@redhat.com/ -+ if ( slot != NULL && kvm_slot_page_track_is_active(vcpu->kvm, slot, iterator, mode)) { -+ write_lock(&vcpu->kvm->mmu_lock); -+ kvm_slot_page_track_remove_page(vcpu->kvm, slot, iterator, mode); -+ write_unlock(&vcpu->kvm->mmu_lock); -+ count++; -+ } -+ srcu_read_unlock(&vcpu->kvm->srcu, idx); -+ } -+ -+ return count; -+} -+EXPORT_SYMBOL(kvm_stop_tracking); - - EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_entry); - EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c -old mode 100644 -new mode 100755 -index e089fbf9017f..7899e1efe852 +index e089fbf9017f..7899e1efe852 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -87,7 +87,7 @@ static void *sev_init_ex_buffer; @@ -1726,391 +429,22 @@ index e089fbf9017f..7899e1efe852 static int __sev_init_locked(int *error) { -diff --git a/include/linux/sev-step.h b/include/linux/sev-step.h -new file mode 100755 -index 000000000000..ec49e5526edd ---- /dev/null -+++ b/include/linux/sev-step.h -@@ -0,0 +1,68 @@ -+#ifndef SEV_STEP_H -+#define SEV_STEP_H -+ -+#include -+#include -+#include -+#include -+#include -+ -+#include //struct kvm -+#include -+#include -+ -+ -+ -+ -+ -+#define CTL_MSR_0 0xc0010200ULL -+#define CTL_MSR_1 0xc0010202ULL -+#define CTL_MSR_2 0xc0010204ULL -+#define CTL_MSR_3 0xc0010206ULL -+#define CTL_MSR_4 0xc0010208ULL -+#define CTL_MSR_5 0xc001020aULL -+ -+#define CTR_MSR_0 0xc0010201ULL -+#define CTR_MSR_1 0xc0010203ULL -+#define CTR_MSR_2 0xc0010205ULL -+#define CTR_MSR_3 0xc0010207ULL -+#define CTR_MSR_4 0xc0010209ULL -+#define CTR_MSR_5 0xc001020bULL -+ -+typedef struct { -+ uint64_t HostGuestOnly; -+ 
uint64_t CntMask; -+ uint64_t Inv; -+ uint64_t En; -+ uint64_t Int; -+ uint64_t Edge; -+ uint64_t OsUserMode; -+ uint64_t UintMask; -+ uint64_t EventSelect; //12 bits in total split in [11:8] and [7:0] -+ -+} perf_ctl_config_t; -+ -+ -+extern struct kvm* main_vm; -+ -+ -+bool __untrack_single_page(struct kvm_vcpu *vcpu, gfn_t gfn, -+ enum kvm_page_track_mode mode);//defined in x86.c -+ -+bool __track_single_page(struct kvm_vcpu *vcpu, gfn_t gfn, -+ enum kvm_page_track_mode mode); //defined in x86.c -+bool __reset_accessed_on_page(struct kvm_vcpu *vcpu, gfn_t gfn); //defined in x86.c -+bool __clear_nx_on_page(struct kvm_vcpu *vcpu, gfn_t gfn); //defined in x86.c -+long kvm_start_tracking(struct kvm_vcpu *vcpu,enum kvm_page_track_mode mode ); -+long kvm_stop_tracking(struct kvm_vcpu *vcpu,enum kvm_page_track_mode mode ); -+void sev_step_handle_callback(void); -+ -+uint64_t perf_ctl_to_u64(perf_ctl_config_t * config); -+void write_ctl(perf_ctl_config_t * config, int cpu, uint64_t ctl_msr); -+void read_ctr(uint64_t ctr_msr, int cpu, uint64_t* result); -+void setup_perfs(void); -+ -+ -+int sev_step_get_rip_kvm_vcpu(struct kvm_vcpu* vcpu,uint64_t *rip); -+ -+#endif -diff --git a/include/linux/userspace_page_track_signals.h b/include/linux/userspace_page_track_signals.h -new file mode 100755 -index 000000000000..dc3fea4a9af7 ---- /dev/null -+++ b/include/linux/userspace_page_track_signals.h -@@ -0,0 +1,59 @@ -+#ifndef USERSPACE_PAGE_TRACK_SIGNALS -+#define USERSPACE_PAGE_TRACK_SIGNALS -+ -+#include -+#include -+#include -+ -+ -+// -+// User space signaling -+// -+ -+int uspt_initialize(int pid,bool should_get_rip); -+int uspt_is_initialiized(void); -+void uspt_clear(void); -+ -+bool uspt_should_get_rip(void); -+ -+ -+int uspt_send_and_block(uint64_t faulted_gpa, uint32_t error_code, bool have_rip,uint64_t rip); -+ -+int uspt_is_event_done(uint64_t id); -+ -+//prepare next event based on faulted_gpa and error_code. Notify process behind pid_number. Event must be polled -+//id is result param with the id used for the event. Can be used to call uspt_is_event_done -+int uspt_send_notification(int pid_number, uint64_t faulted_gpa, uint32_t error_code,uint64_t* id); -+ -+//copy next event to userpace_mem -+int uspt_handle_poll_event(page_fault_event_t* userpace_mem); -+ -+//acknowledge receival of event to event handling logic -+int uspt_handle_ack_event_ioctl(ack_event_t event); -+ -+// -+// Batch Tracking -+// -+ -+//should be called after "uspt_batch_tracking_save", "uspt_batch_tracking_handle_retrack" and any future custom logic -+//for an event is processed -+void uspt_batch_tracking_inc_event_idx(void); -+ -+int uspt_batch_tracking_start(int tracking_type,uint64_t expected_events, int perf_cpu,bool retrack); -+ -+int uspt_batch_tracking_save(uint64_t faulted_gpa, uint32_t error_code, bool have_rip,uint64_t rip); -+ -+uint64_t uspt_batch_tracking_get_events_count(void); -+ -+//Stops batch tracking on copies the first @len events into @result. 
If an error occured at some point -+//during the batch tracking, error_occured is set(there should also be a dmesg, but this allows programatic access); -+//Caller can use uspt_batch_tracking_get_events_count() to determine the amount of memory they should allocate for -+//@results -+int uspt_batch_tracking_stop(page_fault_event_t* results, uint64_t len,bool* error_occured); -+ -+void uspt_batch_tracking_handle_retrack(struct kvm_vcpu* vcpu,uint64_t current_fault_gfn); -+ -+void uspt_batch_tracking_get_retrack_gfns(uint64_t** gfns, uint64_t* len,int * tracking_type); -+ -+bool uspt_batch_tracking_in_progress(void); -+#endif -diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h -old mode 100644 -new mode 100755 -index f288b421b603..81b232132f66 ---- a/include/uapi/linux/kvm.h -+++ b/include/uapi/linux/kvm.h -@@ -16,6 +16,78 @@ - - #define KVM_API_VERSION 12 - -+#define KVM_USPT_POLL_EVENT_NO_EVENT 1000 -+#define KVM_USPT_POLL_EVENT_GOT_EVENT 0 -+ -+ -+typedef struct { -+ uint64_t id; //filled automatically -+ uint64_t faulted_gpa; -+ uint32_t error_code; -+ bool have_rip_info; -+ uint64_t rip; -+ uint64_t ns_timestamp; -+ bool have_retired_instructions; -+ uint64_t retired_instructions; -+} page_fault_event_t; -+ -+typedef struct { -+ int tracking_type; -+ uint64_t expected_events; -+ int perf_cpu; -+ bool retrack; -+} batch_track_config_t; -+ -+typedef struct { -+ uint64_t event_count; -+} batch_track_event_count_t; -+ -+typedef struct { -+ page_fault_event_t* out_buf; -+ uint64_t len; -+ bool error_during_batch; -+} batch_track_stop_and_get_t; -+ -+typedef struct { -+ int cpu; //cpu on which we want to read the counter -+ uint64_t retired_instruction_count; //result param -+} retired_instr_perf_t; -+ -+typedef struct { -+ int cpu; //cpu on which counter should be programmed -+} retired_instr_perf_config_t; -+ -+typedef struct { -+ uint64_t gpa; -+ uint64_t len; -+ bool decrypt_with_host_key; -+ int wbinvd_cpu; //-1: do not flush; else logical cpu on which we flush -+ void* output_buffer; -+}read_guest_memory_t; -+ -+typedef struct { -+ int pid; -+ bool get_rip; -+} userspace_ctx_t; -+ -+ -+typedef struct { -+ uint64_t id; -+} ack_event_t; -+ -+ -+typedef struct { -+ uint64_t gpa; -+ int track_mode; -+} track_page_param_t; -+ -+ -+typedef struct { -+ int track_mode; -+} track_all_pages_t; -+ -+ -+ - /* *** Deprecated interfaces *** */ - - #define KVM_TRC_SHIFT 16 -@@ -921,6 +993,29 @@ struct kvm_ppc_resize_hpt { - #define KVM_GET_EMULATED_CPUID _IOWR(KVMIO, 0x09, struct kvm_cpuid2) - #define KVM_GET_MSR_FEATURE_INDEX_LIST _IOWR(KVMIO, 0x0a, struct kvm_msr_list) - -+ -+// -+// SNP ATTACK IOCTLS -+// -+ -+#define KVM_TRACK_PAGE _IOWR(KVMIO, 0x20, track_page_param_t) -+#define KVM_USPT_REGISTER_PID _IOWR(KVMIO, 0x21, userspace_ctx_t) -+#define KVM_USPT_WAIT_AND_SEND _IO(KVMIO, 0x22) -+#define KVM_USPT_POLL_EVENT _IOWR(KVMIO, 0x23, page_fault_event_t) -+#define KVM_USPT_ACK_EVENT _IOWR(KVMIO, 0x24, ack_event_t) -+#define KVM_READ_GUEST_MEMORY _IOWR(KVMIO, 0x25, read_guest_memory_t) -+#define KVM_USPT_RESET _IO(KVMIO, 0x26) -+#define KVM_USPT_TRACK_ALL _IOWR(KVMIO, 0x27, track_all_pages_t) -+#define KVM_USPT_UNTRACK_ALL _IOWR(KVMIO, 0x28, track_all_pages_t) -+#define KVM_USPT_SETUP_RETINSTR_PERF _IOWR(KVMIO, 0x30,retired_instr_perf_config_t) -+#define KVM_USPT_READ_RETINSTR_PERF _IOWR(KVMIO,0x31, retired_instr_perf_t) -+#define KVM_USPT_BATCH_TRACK_START _IOWR(KVMIO,0x32,batch_track_config_t) -+#define KVM_USPT_BATCH_TRACK_STOP _IOWR(KVMIO,0x33,batch_track_stop_and_get_t) -+#define 
KVM_USPT_BATCH_TRACK_EVENT_COUNT _IOWR(KVMIO,0x34,batch_track_event_count_t) -+ -+ -+ -+ - /* - * Extension capability list. - */ -diff --git a/my-make-ccp-modules.sh b/my-make-ccp-modules.sh -new file mode 100755 -index 000000000000..b5068c264ed0 ---- /dev/null -+++ b/my-make-ccp-modules.sh -@@ -0,0 +1,24 @@ -+#/bin/sh -+cores=$(nproc --all) -+#sudo -u luca make distclean && -+#./my-configure-sev.sh && -+EXTRAVERSION="" -+MODPATH="drivers/crypto/ccp" -+make clean M="$MODPATH" && -+make -j $cores scripts && -+make -j $cores prepare && -+make -j $cores modules_prepare && -+cp /usr/src/linux-headers-`uname -r`/Module.symvers "$MODPATH"/Module.symvers && -+cp /usr/src/linux-headers-`uname -r`/Module.symvers Module.symvers && -+chown luca:luca "$MODPATH"/Module.symvers -+cp "/boot/System.map-$(uname -r)" . -+cp "/boot/System.map-$(uname -r)" "$MODPATH" -+touch .scmversion && -+make -j $cores modules M="$MODPATH" LOCALVERSION= && -+make modules_install M="$MODPATH" LOCALVERSION= -+ -+exit -+ -+echo "Installing module file" -+cp ./drivers/crypto/ccp/ccp.ko "/lib/modules/$(uname -r)/kernel/drivers/crypto/ccp/ccp.ko" -+cp ./drivers/crypto/ccp/ccp-crypto.ko "/lib/modules/$(uname -r)/kernel/drivers/crypto/ccp/ccp-crypto.ko" -diff --git a/my-make-kernel.sh b/my-make-kernel.sh -new file mode 100755 -index 000000000000..0418f607cb43 ---- /dev/null -+++ b/my-make-kernel.sh -@@ -0,0 +1,38 @@ -+#!/bin/bash -+ -+run_cmd() -+{ -+ echo "$*" -+ -+ eval "$*" || { -+ echo "ERROR: $*" -+ exit 1 -+ } -+} -+ -+ -+[ -d linux-patches ] && { -+ -+ for P in linux-patches/*.patch; do -+ run_cmd patch -p1 -d linux < $P -+ done -+} -+ -+MAKE="make -j $(getconf _NPROCESSORS_ONLN) LOCALVERSION=" -+ -+run_cmd $MAKE distclean -+ -+ run_cmd cp /boot/config-$(uname -r) .config -+ run_cmd ./scripts/config --set-str LOCALVERSION "-sev-step-snp" -+ run_cmd ./scripts/config --disable LOCALVERSION_AUTO -+ run_cmd ./scripts/config --disable CONFIG_DEBUG_INFO -+# run_cmd ./scripts/config --undefine CONFIG_SYSTEM_TRUSTED_KEYS -+# run_cmd ./scripts/config --undefine CONFIG_MODULE_SIG_KEY -+ -+run_cmd $MAKE olddefconfig -+ -+# Build -+run_cmd $MAKE >/dev/null -+ -+run_cmd $MAKE bindeb-pkg -+ -diff --git a/my-make-kvm-modules.sh b/my-make-kvm-modules.sh -new file mode 100755 -index 000000000000..22f1f95b063f ---- /dev/null -+++ b/my-make-kvm-modules.sh -@@ -0,0 +1,29 @@ -+#/bin/sh -+cores=$(nproc --all) -+#sudo -u luca make distclean && -+#./my-configure-sev.sh && -+EXTRAVERSION="" -+make clean M=arch/x86/kvm/ && -+make -j $cores scripts && -+make -j $cores prepare && -+make -j $cores modules_prepare && -+cp /usr/src/linux-headers-`uname -r`/Module.symvers arch/x86/kvm/Module.symvers && -+cp /usr/src/linux-headers-`uname -r`/Module.symvers Module.symvers && -+chown luca:luca arch/x86/kvm/Module.symvers -+cp "/boot/System.map-$(uname -r)" . 
-+cp "/boot/System.map-$(uname -r)" arch/x86/kvm/ -+touch .scmversion && -+make -j $cores modules M=arch/x86/kvm/ LOCALVERSION= && -+make modules_install M=arch/x86/kvm/ LOCALVERSION= && -+ -+echo "Unload old modules" -+modprobe -r kvm_amd kvm -+cp ./arch/x86/kvm/kvm.ko "/lib/modules/$(uname -r)/kernel/arch/x86/kvm/" -+cp ./arch/x86/kvm/kvm-amd.ko "/lib/modules/$(uname -r)/kernel/arch/x86/kvm/" -+echo "Load new modules" -+modprobe kvm -+modprobe kvm-amd sev-snp=1 sev=1 sev-es=1 -+#insmod "/lib/modules/$(uname -r)/kernel/virt/lib/irqbypass.ko" -+#insmod ./arch/x86/kvm/kvm.ko -+#insmod "/lib/modules/$(uname -r)/kernel/drivers/crypto/ccp/ccp.ko" -+#insmod ./arch/x86/kvm/kvm-amd.ko sev=1 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c -old mode 100644 -new mode 100755 -index f2a63cb2658b..ac5fc6c64b7e +index f2a63cb2658b..bfe4a57bcc10 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c -@@ -67,9 +67,14 @@ - - #include - -+#include -+#include -+ +@@ -70,6 +70,10 @@ /* Worst case buffer size needed for holding an integer. */ #define ITOA_MAX_LEN 12 +#include "../../arch/x86/kvm/svm/cachepc/kvm.h" ++#include "../../arch/x86/kvm/sevstep/sevstep.h" ++#include "../../arch/x86/kvm/sevstep/uspt.h" + MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); -@@ -5792,6 +5797,8 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, +@@ -5792,6 +5796,8 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, r = kvm_vfio_ops_init(); WARN_ON(r); @@ -2119,7 +453,7 @@ index f2a63cb2658b..ac5fc6c64b7e return 0; out_unreg: -@@ -5821,6 +5828,8 @@ void kvm_exit(void) +@@ -5821,6 +5827,8 @@ void kvm_exit(void) { int cpu; diff --git a/sevstep/kvm.c b/sevstep/kvm.c new file mode 100644 index 0000000..b6b0d49 --- /dev/null +++ b/sevstep/kvm.c @@ -0,0 +1,205 @@ +#include "kvm.h" + +#include + +bool +__untrack_single_page(struct kvm_vcpu *vcpu, gfn_t gfn, + enum kvm_page_track_mode mode) +{ + int idx; + bool ret; + struct kvm_memory_slot *slot; + + ret = false; + idx = srcu_read_lock(&vcpu->kvm->srcu); + if (mode == KVM_PAGE_TRACK_ACCESS) { + //printk("Removing gfn: %016llx from acess page track pool\n", gfn); + } + if (mode == KVM_PAGE_TRACK_WRITE) { + //printk("Removing gfn: %016llx from write page track pool\n", gfn); + } + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + + if (slot != NULL && kvm_slot_page_track_is_active(vcpu->kvm, slot, gfn, mode)) { + write_lock(&vcpu->kvm->mmu_lock); + kvm_slot_page_track_remove_page(vcpu->kvm, slot, gfn, mode); + write_unlock(&vcpu->kvm->mmu_lock); + ret = true; + } else { + printk("Failed to untrack %016llx because ", gfn); + if (slot == NULL) { + printk(KERN_CONT "slot was null"); + } else if (!kvm_slot_page_track_is_active(vcpu->kvm, slot, gfn, mode)) { + printk(KERN_CONT "page track was not active"); + } + printk(KERN_CONT "\n"); + } + srcu_read_unlock(&vcpu->kvm->srcu, idx); + return ret; +} +EXPORT_SYMBOL(__untrack_single_page); + +bool +__reset_accessed_on_page(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + int idx; + bool ret; + struct kvm_memory_slot *slot; + + ret = false; + idx = srcu_read_lock(&vcpu->kvm->srcu); + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + if( slot != NULL ) { + write_lock(&vcpu->kvm->mmu_lock); + //Vincent: The kvm mmu function now requires min_level + //We want all pages to protected so we do PG_LEVEL_4K + //https://patchwork.kernel.org/project/kvm/patch/20210416082511.2856-2-zhukeqian1@huawei.com/ + sevstep_kvm_mmu_slot_gfn_protect(vcpu->kvm,slot,gfn,PG_LEVEL_4K,KVM_PAGE_TRACK_RESET_ACCESSED); + 
write_unlock(&vcpu->kvm->mmu_lock); + ret = true; + } + srcu_read_unlock(&vcpu->kvm->srcu, idx); + return ret; +} +EXPORT_SYMBOL(__reset_accessed_on_page); + +bool +__clear_nx_on_page(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + int idx; + bool ret; + struct kvm_memory_slot *slot; + + ret = false; + idx = srcu_read_lock(&vcpu->kvm->srcu); + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + if( slot != NULL ) { + write_lock(&vcpu->kvm->mmu_lock); + //Vincent: The kvm mmu function now requires min_level + //We want all pages to protected so we do PG_LEVEL_4K + //https://patchwork.kernel.org/project/kvm/patch/20210416082511.2856-2-zhukeqian1@huawei.com/ + sevstep_kvm_mmu_slot_gfn_protect(vcpu->kvm, slot, gfn, + PG_LEVEL_4K, KVM_PAGE_TRACK_RESET_EXEC); + write_unlock(&vcpu->kvm->mmu_lock); + ret = true; + } + srcu_read_unlock(&vcpu->kvm->srcu, idx); + return ret; +} +EXPORT_SYMBOL(__clear_nx_on_page); + +bool +__track_single_page(struct kvm_vcpu *vcpu, gfn_t gfn, + enum kvm_page_track_mode mode) +{ + int idx; + bool ret; + struct kvm_memory_slot *slot; + + ret = false; + idx = srcu_read_lock(&vcpu->kvm->srcu); + if (mode == KVM_PAGE_TRACK_ACCESS) { + //printk_ratelimited("Adding gfn: %016llx to acess page track pool\n", gfn); + //printk("Adding gfn: %016llx to acess page track pool\n", gfn); + } + if (mode == KVM_PAGE_TRACK_WRITE) { + //printk_ratelimited("Adding gfn: %016llx to write page track pool\n", gfn); + } + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + if (slot != NULL && !kvm_slot_page_track_is_active(vcpu->kvm,slot, gfn, mode)) { + + write_lock(&vcpu->kvm->mmu_lock); + kvm_slot_page_track_add_page(vcpu->kvm, slot, gfn, mode); + write_unlock(&vcpu->kvm->mmu_lock); + ret = true; + + } else { + + printk("Failed to track %016llx because ", gfn); + if (slot == NULL) { + printk(KERN_CONT "slot was null"); + } + if (kvm_slot_page_track_is_active(vcpu->kvm, slot, gfn, mode)) { + printk(KERN_CONT "page is already tracked"); + } + printk(KERN_CONT "\n"); + } + srcu_read_unlock(&vcpu->kvm->srcu, idx); + return ret; +} +EXPORT_SYMBOL(__track_single_page); + +long +kvm_start_tracking(struct kvm_vcpu *vcpu,enum kvm_page_track_mode mode ) +{ + long count = 0; + u64 iterator, iterat_max; + struct kvm_memory_slot *slot; + int idx; + + //Vincent: Memslots interface changed into a rb tree, see + //here: https://lwn.net/Articles/856392/ + //and here: https://lore.kernel.org/all/cover.1632171478.git.maciej.szmigiero@oracle.com/T/#u + //Thus we use instead of + //iterat_max = vcpu->kvm->memslots[0]->memslots[0].base_gfn + // + vcpu->kvm->memslots[0]->memslots[0].npages; + struct rb_node *node; + struct kvm_memory_slot *first_memslot; + node = rb_last(&(vcpu->kvm->memslots[0]->gfn_tree)); + first_memslot = container_of(node, struct kvm_memory_slot, gfn_node[0]); + iterat_max = first_memslot->base_gfn + first_memslot->npages; + for (iterator=0; iterator < iterat_max; iterator++) + { + idx = srcu_read_lock(&vcpu->kvm->srcu); + slot = kvm_vcpu_gfn_to_memslot(vcpu, iterator); + if ( slot != NULL && !kvm_slot_page_track_is_active(vcpu->kvm, slot, iterator, mode)) { + write_lock(&vcpu->kvm->mmu_lock); + kvm_slot_page_track_add_page(vcpu->kvm, slot, iterator, mode); + write_unlock(&vcpu->kvm->mmu_lock); + count++; + } + srcu_read_unlock(&vcpu->kvm->srcu, idx); + } + + return count; +} +EXPORT_SYMBOL(kvm_start_tracking); + +long +kvm_stop_tracking(struct kvm_vcpu *vcpu,enum kvm_page_track_mode mode) +{ + long count = 0; + u64 iterator, iterat_max; + struct kvm_memory_slot *slot; + int idx; + + + //Vincent: Memslots interface 
changed into a rb tree, see + //here: https://lwn.net/Articles/856392/ + //and here: https://lore.kernel.org/all/cover.1632171478.git.maciej.szmigiero@oracle.com/T/#u + //Thus we use instead of + //iterat_max = vcpu->kvm->memslots[0]->memslots[0].base_gfn + // + vcpu->kvm->memslots[0]->memslots[0].npages; + struct rb_node *node; + struct kvm_memory_slot *first_memslot; + node = rb_last(&(vcpu->kvm->memslots[0]->gfn_tree)); + first_memslot = container_of(node, struct kvm_memory_slot, gfn_node[0]); + iterat_max = first_memslot->base_gfn + first_memslot->npages; + for (iterator=0; iterator < iterat_max; iterator++) + { + idx = srcu_read_lock(&vcpu->kvm->srcu); + slot = kvm_vcpu_gfn_to_memslot(vcpu, iterator); + //Vincent: I think see here https://patchwork.kernel.org/project/kvm/patch/20210924163152.289027-22-pbonzini@redhat.com/ + if ( slot != NULL && kvm_slot_page_track_is_active(vcpu->kvm, slot, iterator, mode)) { + write_lock(&vcpu->kvm->mmu_lock); + kvm_slot_page_track_remove_page(vcpu->kvm, slot, iterator, mode); + write_unlock(&vcpu->kvm->mmu_lock); + count++; + } + srcu_read_unlock(&vcpu->kvm->srcu, idx); + } + + return count; +} +EXPORT_SYMBOL(kvm_stop_tracking); + diff --git a/sevstep/kvm.h b/sevstep/kvm.h new file mode 100644 index 0000000..35cb4d5 --- /dev/null +++ b/sevstep/kvm.h @@ -0,0 +1,4 @@ +#pragma once + +#include "sev-step.h" +#include "uapi.h" diff --git a/sevstep/mmu.c b/sevstep/mmu.c new file mode 100644 index 0000000..4eefea2 --- /dev/null +++ b/sevstep/mmu.c @@ -0,0 +1,132 @@ +#include "../sevstep/sevstep.h" +#include "../sevstep/uspt.h" + +void +sevstep_uspt_page_fault_handle(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) +{ + const int modes[] = { + KVM_PAGE_TRACK_WRITE, + KVM_PAGE_TRACK_ACCESS, + KVM_PAGE_TRACK_EXEC + }; + uint64_t current_rip; + bool was_tracked; + int have_rip, i; + int send_err; + + was_tracked = false; + for (i = 0; i < sizeof(modes) / sizeof(modes[0]); i++) { + if (kvm_slot_page_track_is_active(vcpu->kvm, + fault->slot, fault->gfn, modes[i])) { + __untrack_single_page(vcpu, fault->gfn, modes[i]); + was_tracked = true; + } + } + + if (was_tracked) { + have_rip = false; + if (uspt_should_get_rip()) + have_rip = sev_step_get_rip_kvm_vcpu(vcpu,¤t_rip) == 0; + if (uspt_batch_tracking_in_progress()) { + send_err = uspt_batch_tracking_save(fault->gfn << PAGE_SHIFT, + fault->error_code, have_rip, current_rip); + if (send_err) { + printk_ratelimited( + "uspt_batch_tracking_save failed with %d\n" + "##########################\n", send_err); + } + uspt_batch_tracking_handle_retrack(vcpu, fault->gfn); + uspt_batch_tracking_inc_event_idx(); + } else { + send_err = uspt_send_and_block(fault->gfn << PAGE_SHIFT, + fault->error_code, have_rip, current_rip); + if (send_err) { + printk("uspt_send_and_block failed with %d\n" + "##########################\n", send_err); + } + } + } +} + +bool +sevstep_spte_protect(u64 *sptep, bool pt_protect, enum kvm_page_track_mode mode) +{ + u64 spte = *sptep; + bool shouldFlush = false; + + if (!is_writable_pte(spte) && !(pt_protect && is_mmu_writable_spte(spte))) + return false; + + rmap_printk("spte %p %llx\n", sptep, *sptep); + + if (pt_protect) + spte &= ~EPT_SPTE_MMU_WRITABLE; + + if (mode == KVM_PAGE_TRACK_WRITE) { + spte = spte & ~PT_WRITABLE_MASK; + shouldFlush = true; + } else if (mode == KVM_PAGE_TRACK_RESET_ACCESSED) { + spte = spte & ~PT_ACCESSED_MASK; + } else if (mode == KVM_PAGE_TRACK_ACCESS) { + spte = spte & ~PT_PRESENT_MASK; + spte = spte & ~PT_WRITABLE_MASK; + spte = spte & ~PT_USER_MASK; + spte = 
spte | (0x1ULL << PT64_NX_SHIFT); + shouldFlush = true; + } else if (mode == KVM_PAGE_TRACK_EXEC) { + spte = spte | (0x1ULL << PT64_NX_SHIFT); + shouldFlush = true; + } else if (mode == KVM_PAGE_TRACK_RESET_EXEC) { + spte = spte & ~(0x1ULL << PT64_NX_SHIFT); + shouldFlush = true; + } else { + printk(KERN_WARNING "spte_protect was called with invalid mode" + "parameter %d\n",mode); + } + shouldFlush |= mmu_spte_update(sptep, spte); + return shouldFlush; +} +EXPORT_SYMBOL(sevstep_spte_protect); + +bool sevstep_rmap_protect(struct kvm_rmap_head *rmap_head, + bool pt_protect, enum kvm_page_track_mode mode) +{ + u64 *sptep; + struct rmap_iterator iter; + bool flush = false; + + for_each_rmap_spte(rmap_head, &iter, sptep) { + flush |= sevstep_spte_protect(sptep, pt_protect, mode); + } + + return flush; +} +EXPORT_SYMBOL(sevstep_rmap_protect); + +bool +sevstep_kvm_mmu_slot_gfn_protect(struct kvm *kvm, struct kvm_memory_slot *slot, + uint64_t gfn, int min_level, enum kvm_page_track_mode mode) +{ + struct kvm_rmap_head *rmap_head; + bool protected; + int i; + + protected = false; + + if (kvm_memslots_have_rmaps(kvm)) { + for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { + rmap_head = gfn_to_rmap(gfn, i, slot); + protected |= sevstep_rmap_protect(rmap_head, true, mode); + } + } + + if (is_tdp_mmu_enabled(kvm)) { + protected |= kvm_tdp_mmu_write_protect_gfn(kvm, + slot, gfn, min_level); + } + + return protected; +} +EXPORT_SYMBOL(sevstep_kvm_mmu_slot_gfn_protect); + diff --git a/sevstep/sevstep.c b/sevstep/sevstep.c new file mode 100644 index 0000000..3345e04 --- /dev/null +++ b/sevstep/sevstep.c @@ -0,0 +1,129 @@ +#include "sevstep.h" + +#include "mmu/mmu_internal.h" +#include "mmu.h" + +#include "irq.h" +#include "ioapic.h" +#include "mmu.h" +#include "mmu/tdp_mmu.h" +#include "x86.h" +#include "kvm_cache_regs.h" +#include "kvm_emulate.h" +#include "cpuid.h" +#include "mmu/spte.h" + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kvm_cache_regs.h" +#include "svm/svm.h" + +struct kvm* main_vm; +EXPORT_SYMBOL(main_vm); + +// used to store performance counter values; 6 counters, 2 readings per counter +// TODO: static! 
+uint64_t perf_reads[6][2]; +perf_ctl_config_t perf_configs[6]; +int perf_cpu; + + +uint64_t +perf_ctl_to_u64(perf_ctl_config_t * config) +{ + uint64_t result; + + result = 0; + result |= config->EventSelect & 0xffULL; + result |= (config->UintMask & 0xffULL) << 8; + result |= (config->OsUserMode & 0x3ULL) << 16; + result |= (config->Edge & 0x1ULL ) << 18; + result |= (config->Int & 0x1ULL ) << 20; + result |= (config->En & 0x1ULL ) << 22; + result |= (config->Inv & 0x1ULL ) << 23; + result |= (config->CntMask & 0xffULL) << 24; + result |= ((config->EventSelect & 0xf00ULL) >> 8) << 32; + result |= (config->HostGuestOnly & 0x3ULL) << 40; + + return result; + +} + +void +write_ctl(perf_ctl_config_t * config, int cpu, uint64_t ctl_msr) +{ + wrmsrl_on_cpu(cpu, ctl_msr, perf_ctl_to_u64(config)); +} + +void +read_ctr(uint64_t ctr_msr, int cpu, uint64_t* result) +{ + uint64_t tmp; + + rdmsrl_on_cpu(cpu, ctr_msr, &tmp); + *result = tmp & ( (0x1ULL << 48) - 1); +} + +void +setup_perfs() +{ + int i; + + perf_cpu = smp_processor_id(); + + for (i = 0; i < 6; i++) { + perf_configs[i].HostGuestOnly = 0x1; /* count only guest */ + perf_configs[i].CntMask = 0x0; + perf_configs[i].Inv = 0x0; + perf_configs[i].En = 0x0; + perf_configs[i].Int = 0x0; + perf_configs[i].Edge = 0x0; + perf_configs[i].OsUserMode = 0x3; /* count userland and kernel events */ + } + + perf_configs[0].EventSelect = 0x0c0; + perf_configs[0].UintMask = 0x0; + perf_configs[0].En = 0x1; + write_ctl(&perf_configs[0],perf_cpu, CTL_MSR_0); + + /* + * programm l2d hit from data cache miss perf for + * cpu_probe_pointer_chasing_inplace without counting thread. + * N.B. that this time we count host events + */ + perf_configs[1].EventSelect = 0x064; + perf_configs[1].UintMask = 0x70; + perf_configs[1].En = 0x1; + perf_configs[1].HostGuestOnly = 0x2; /* count only host events */ + write_ctl(&perf_configs[1],perf_cpu,CTL_MSR_1); +} +EXPORT_SYMBOL(setup_perfs); + +int +sev_step_get_rip_kvm_vcpu(struct kvm_vcpu* vcpu,uint64_t *rip) +{ + return 0; +} diff --git a/sevstep/sevstep.h b/sevstep/sevstep.h new file mode 100644 index 0000000..86d25f7 --- /dev/null +++ b/sevstep/sevstep.h @@ -0,0 +1,67 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include + + +#define CTL_MSR_0 0xc0010200ULL +#define CTL_MSR_1 0xc0010202ULL +#define CTL_MSR_2 0xc0010204ULL +#define CTL_MSR_3 0xc0010206ULL +#define CTL_MSR_4 0xc0010208ULL +#define CTL_MSR_5 0xc001020aULL + +#define CTR_MSR_0 0xc0010201ULL +#define CTR_MSR_1 0xc0010203ULL +#define CTR_MSR_2 0xc0010205ULL +#define CTR_MSR_3 0xc0010207ULL +#define CTR_MSR_4 0xc0010209ULL +#define CTR_MSR_5 0xc001020bULL + +typedef struct { + uint64_t HostGuestOnly; + uint64_t CntMask; + uint64_t Inv; + uint64_t En; + uint64_t Int; + uint64_t Edge; + uint64_t OsUserMode; + uint64_t UintMask; + uint64_t EventSelect; //12 bits in total split in [11:8] and [7:0] + +} perf_ctl_config_t; + +extern struct kvm* main_vm; + +bool sevstep_spte_protect(u64 *sptep, + bool pt_protect, enum kvm_page_track_mode mode); +bool sevstep_rmap_protect(struct kvm_rmap_head *rmap_head, + bool pt_protect, enum kvm_page_track_mode mode); +bool sevstep_kvm_mmu_slot_gfn_protect(struct kvm *kvm, struct kvm_memory_slot *slot, + uint64_t gfn, int min_level, enum kvm_page_track_mode mode); + +bool __untrack_single_page(struct kvm_vcpu *vcpu, gfn_t gfn, + enum kvm_page_track_mode mode); +bool __track_single_page(struct kvm_vcpu *vcpu, gfn_t gfn, + enum kvm_page_track_mode mode); +bool 
__reset_accessed_on_page(struct kvm_vcpu *vcpu, gfn_t gfn); +bool __clear_nx_on_page(struct kvm_vcpu *vcpu, gfn_t gfn); + +long kvm_start_tracking(struct kvm_vcpu *vcpu, enum kvm_page_track_mode mode); +long kvm_stop_tracking(struct kvm_vcpu *vcpu, enum kvm_page_track_mode mode); +void sev_step_handle_callback(void); + +uint64_t perf_ctl_to_u64(perf_ctl_config_t *config); +void write_ctl(perf_ctl_config_t *config, int cpu, uint64_t ctl_msr); +void read_ctr(uint64_t ctr_msr, int cpu, uint64_t *result); + +void setup_perfs(void); + +int sev_step_get_rip_kvm_vcpu(struct kvm_vcpu *vcpu, uint64_t *rip); diff --git a/sevstep/uapi.h b/sevstep/uapi.h new file mode 100644 index 0000000..e41a036 --- /dev/null +++ b/sevstep/uapi.h @@ -0,0 +1,86 @@ +#pragma once + +#include +#include + +#define KVM_TRACK_PAGE _IOWR(KVMIO, 0x20, track_page_param_t) +#define KVM_USPT_REGISTER_PID _IOWR(KVMIO, 0x21, userspace_ctx_t) +#define KVM_USPT_WAIT_AND_SEND _IO(KVMIO, 0x22) +#define KVM_USPT_POLL_EVENT _IOWR(KVMIO, 0x23, page_fault_event_t) +#define KVM_USPT_ACK_EVENT _IOWR(KVMIO, 0x24, ack_event_t) +#define KVM_READ_GUEST_MEMORY _IOWR(KVMIO, 0x25, read_guest_memory_t) +#define KVM_USPT_RESET _IO(KVMIO, 0x26) +#define KVM_USPT_TRACK_ALL _IOWR(KVMIO, 0x27, track_all_pages_t) +#define KVM_USPT_UNTRACK_ALL _IOWR(KVMIO, 0x28, track_all_pages_t) +#define KVM_USPT_SETUP_RETINSTR_PERF _IOWR(KVMIO, 0x30,retired_instr_perf_config_t) +#define KVM_USPT_READ_RETINSTR_PERF _IOWR(KVMIO,0x31, retired_instr_perf_t) +#define KVM_USPT_BATCH_TRACK_START _IOWR(KVMIO,0x32,batch_track_config_t) +#define KVM_USPT_BATCH_TRACK_STOP _IOWR(KVMIO,0x33,batch_track_stop_and_get_t) +#define KVM_USPT_BATCH_TRACK_EVENT_COUNT _IOWR(KVMIO,0x34,batch_track_event_count_t) + +#define KVM_USPT_POLL_EVENT_NO_EVENT 1000 +#define KVM_USPT_POLL_EVENT_GOT_EVENT 0 + +typedef struct { + uint64_t id; // filled automatically + uint64_t faulted_gpa; + uint32_t error_code; + bool have_rip_info; + uint64_t rip; + uint64_t ns_timestamp; + bool have_retired_instructions; + uint64_t retired_instructions; +} page_fault_event_t; + +typedef struct { + int tracking_type; + uint64_t expected_events; + int perf_cpu; + bool retrack; +} batch_track_config_t; + +typedef struct { + uint64_t event_count; +} batch_track_event_count_t; + +typedef struct { + page_fault_event_t* out_buf; + uint64_t len; + bool error_during_batch; +} batch_track_stop_and_get_t; + +typedef struct { + int cpu; // cpu on which we want to read the counter + uint64_t retired_instruction_count; // result param +} retired_instr_perf_t; + +typedef struct { + int cpu; // cpu on which counter should be programmed +} retired_instr_perf_config_t; + +typedef struct { + uint64_t gpa; + uint64_t len; + bool decrypt_with_host_key; + int wbinvd_cpu; // -1: do not flush; else logical cpu on which we flush + void* output_buffer; +} read_guest_memory_t; + +typedef struct { + int pid; + bool get_rip; +} userspace_ctx_t; + +typedef struct { + uint64_t id; +} ack_event_t; + +typedef struct { + uint64_t gpa; + int track_mode; +} track_page_param_t; + +typedef struct { + int track_mode; +} track_all_pages_t; + diff --git a/sevstep/uspt.c b/sevstep/uspt.c new file mode 100644 index 0000000..f7b329d --- /dev/null +++ b/sevstep/uspt.c @@ -0,0 +1,503 @@ +#include "uspt.h" +#include "sevstep.h" + +#include +#include +#include +#include +#include +#include +#include + +#define ARRLEN(x) (sizeof(x)/sizeof((x)[0])) + +typedef struct { + bool is_active; + int tracking_type; + bool retrack; + + int perf_cpu; + + uint64_t 
gfn_retrack_backlog[10]; + int gfn_retrack_backlog_next_idx; + + page_fault_event_t * events; + uint64_t event_next_idx; + uint64_t events_size; + + bool error_occured; +} batch_track_state_t; + +// crude sync mechanism. don't know a good way to act on errors yet. +uint64_t last_sent_event_id = 1; +uint64_t last_acked_event_id = 1; +DEFINE_RWLOCK(event_lock); + +page_fault_event_t sent_event; +static int have_event = 0; + +static bool get_rip = true; + +static int inited = 0; + +DEFINE_SPINLOCK(batch_track_state_lock); +static batch_track_state_t batch_track_state; + +typedef struct { + uint64_t idx_for_last_perf_reading; + uint64_t last_perf_reading; + uint64_t delta_valid_idx; + uint64_t delta; +} perf_state_t; + +perf_state_t perf_state; + + +void +uspt_clear(void) +{ + write_lock(&event_lock); + inited = 0; + last_sent_event_id = 1; + last_acked_event_id = 1; + have_event = 0; + get_rip = false; + write_unlock(&event_lock); +} + +int +uspt_initialize(int pid,bool should_get_rip) +{ + write_lock(&event_lock); + inited = 1; + last_sent_event_id = 1; + last_acked_event_id = 1; + have_event = 0; + get_rip = should_get_rip; + write_unlock(&event_lock); + + return 0; +} + +int +uspt_is_initialiized() +{ + return inited; +} + +bool +uspt_should_get_rip() +{ + bool tmp; + + read_lock(&event_lock); + tmp = get_rip; + read_unlock(&event_lock); + + return tmp; +} + +int +uspt_send_and_block(uint64_t faulted_gpa, uint32_t error_code, + bool have_rip, uint64_t rip) +{ + ktime_t abort_after; + page_fault_event_t message_for_user; + + read_lock(&event_lock); + if (!uspt_is_initialiized()) { + printk("userspace_page_track_signals: " + "uspt_send_and_block : ctx not initialized!\n"); + read_unlock(&event_lock); + return 1; + } + read_unlock(&event_lock); + + write_lock(&event_lock); + if (last_sent_event_id != last_acked_event_id) { + printk("event id_s out of sync, aborting. Fix this later\n"); + write_unlock(&event_lock); + return 1; + } else { + // TODO: handle overflow + last_sent_event_id++; + } + message_for_user.id = last_sent_event_id; + message_for_user.faulted_gpa = faulted_gpa; + message_for_user.error_code = error_code; + message_for_user.have_rip_info = have_rip; + message_for_user.rip = rip; + message_for_user.ns_timestamp = ktime_get_real_ns(); + message_for_user.have_retired_instructions = false; + + // for poll based system; + have_event = 1; + sent_event = message_for_user; + // printk("uspt_send_and_block sending event %llu\n",sent_event.id); + + write_unlock(&event_lock); + + // wait for ack, but with timeout. 
Otherwise small bugs in userland + // easily lead to a kernel hang + abort_after = ktime_get() + 1000000000ULL; // 1 sec in nanosecond + while (!uspt_is_event_done(sent_event.id)) { + if (ktime_get() > abort_after) { + printk("Waiting for ack of event %llu timed out, continuing\n",sent_event.id); + return 3; + } + } + + return 0; +} + +int +uspt_is_event_done(uint64_t id) +{ + int res; + + read_lock(&event_lock); + res = last_acked_event_id >= id; + read_unlock(&event_lock); + + return res; +} + +int +uspt_handle_poll_event(page_fault_event_t* userpace_mem) +{ + int err; + + // most of the time we won't have an event + read_lock(&event_lock); + if (!have_event) { + read_unlock(&event_lock); + return KVM_USPT_POLL_EVENT_NO_EVENT; + } + read_unlock(&event_lock); + + write_lock(&event_lock); + if (have_event) { + err = copy_to_user(userpace_mem, + &sent_event, sizeof(page_fault_event_t)); + have_event = 0; + } else { + err = KVM_USPT_POLL_EVENT_NO_EVENT; + } + write_unlock(&event_lock); + + return err; +} + +static int +_uspt_handle_ack_event(uint64_t id) +{ + int err = 0; + + write_lock(&event_lock); + if (id == last_sent_event_id) { + last_acked_event_id = last_sent_event_id; + } else { + err = 1; + printk("last sent event id is %llu but received ack for %llu\n",last_sent_event_id,id); + } + write_unlock(&event_lock); + + return err; +} + +int +uspt_handle_ack_event_ioctl(ack_event_t event) +{ + return _uspt_handle_ack_event(event.id); +} + +// setup perf_state and program retired instruction performance counter +void +_perf_state_setup_retired_instructions(void) +{ + perf_ctl_config_t retired_instructions_perf_config; + retired_instructions_perf_config.HostGuestOnly = 0x1; // 0x1 means: count only guest + retired_instructions_perf_config.CntMask = 0x0; + retired_instructions_perf_config.Inv = 0x0; + retired_instructions_perf_config.Int = 0x0; + retired_instructions_perf_config.Edge = 0x0; + retired_instructions_perf_config.OsUserMode = 0x3; // 0x3 means: count kern and user events + retired_instructions_perf_config.EventSelect = 0x0c0; + retired_instructions_perf_config.UintMask = 0x0; + retired_instructions_perf_config.En = 0x1; + write_ctl(&retired_instructions_perf_config,batch_track_state.perf_cpu, CTL_MSR_0); +} + + +// get retired instructions between current_event_idx-1 and current_event_idx +// value is cached for multiple calls to the same current_event_idx +uint64_t +_perf_state_update_and_get_delta(uint64_t current_event_idx) +{ + uint64_t current_value; + + // check if value is "cached" + if (perf_state.delta_valid_idx == current_event_idx) { + if (current_event_idx == 0) { + read_ctr(CTR_MSR_0, batch_track_state.perf_cpu, ¤t_value); + perf_state.idx_for_last_perf_reading = current_event_idx; + perf_state.last_perf_reading = current_event_idx; + } + return perf_state.delta; + } + + // otherwise update, but logic is only valid for two consecutive events + if (current_event_idx != perf_state.idx_for_last_perf_reading+1) { + printk_ratelimited(KERN_CRIT "_perf_state_update_and_get_delta: " + "last reading was for idx %llu but was queried for %llu\n", + perf_state.idx_for_last_perf_reading, current_event_idx); + } + + read_ctr(CTR_MSR_0, batch_track_state.perf_cpu, ¤t_value); + perf_state.delta = (current_value - perf_state.last_perf_reading); + perf_state.delta_valid_idx = current_event_idx; + + perf_state.idx_for_last_perf_reading = current_event_idx; + perf_state.last_perf_reading = current_value; + + return perf_state.delta; +} + +void +uspt_batch_tracking_inc_event_idx(void) 
+{ + spin_lock(&batch_track_state_lock); + batch_track_state.event_next_idx++; + spin_unlock(&batch_track_state_lock); +} + +int +uspt_batch_tracking_start(int tracking_type,uint64_t expected_events, + int perf_cpu, bool retrack) +{ + page_fault_event_t* events; + uint64_t buffer_size, i; + + spin_lock(&batch_track_state_lock); + if (batch_track_state.is_active) { + printk("userspace_page_track_signals: overwriting " + "active batch track config!\n"); + if (batch_track_state.events != NULL ) { + vfree(batch_track_state.events); + } + } + batch_track_state.is_active = false; + spin_unlock(&batch_track_state_lock); + + buffer_size = expected_events * sizeof(page_fault_event_t); + printk("uspt_batch_tracking_start trying to alloc %llu " + "bytes buffer for events\n", buffer_size); + events = vmalloc(buffer_size); + if (events == NULL) { + printk("userspace_page_track_signals: " + "faperf_cpuiled to alloc %llu bytes for event buffer\n", + buffer_size); + return 1; // note: lock not held here + } + + // access each element once to force them into memory, improving performance + // during tracking + for (i = 0; i < expected_events * sizeof(page_fault_event_t); i++) { + ((volatile uint8_t*)events)[i] = 0; + } + + perf_state.idx_for_last_perf_reading = 0; + perf_state.last_perf_reading = 0; + perf_state.delta_valid_idx = 0; + perf_state.delta = 0; + _perf_state_setup_retired_instructions(); + + spin_lock(&batch_track_state_lock); + + batch_track_state.perf_cpu = perf_cpu; + batch_track_state.retrack = retrack; + + batch_track_state.events = events; + batch_track_state.event_next_idx = 0; + batch_track_state.events_size = expected_events; + + batch_track_state.gfn_retrack_backlog_next_idx = 0; + batch_track_state.tracking_type = tracking_type; + batch_track_state.error_occured = false; + + batch_track_state.is_active = true; + + spin_unlock(&batch_track_state_lock); + + return 0; +} + +void +uspt_batch_tracking_handle_retrack(struct kvm_vcpu* vcpu, + uint64_t current_fault_gfn) +{ + uint64_t ret_instr_delta; + int i, next_idx; + + spin_lock(&batch_track_state_lock); + + if (!batch_track_state.retrack) { + spin_unlock(&batch_track_state_lock); + return; + } + + if (smp_processor_id() != batch_track_state.perf_cpu) { + printk("uspt_batch_tracking_handle_retrack: perf was " + "programmed on logical cpu %d but handler was called " + "on %d. Did you forget to pin the vcpu thread?\n", + batch_track_state.perf_cpu, smp_processor_id()); + } + ret_instr_delta = _perf_state_update_and_get_delta(batch_track_state.event_next_idx); + + + // faulting instructions is probably the same as on last fault + // try to add current fault to retrack log and return + // for first event idx we do not have a valid ret_instr_delta. 
+ // Retracking for the frist time is fine, if we loop, we end up here + // again but with a valid delta on one of the next event + if( (ret_instr_delta < 2) && ( batch_track_state.event_next_idx != 0) ) { + next_idx = batch_track_state.gfn_retrack_backlog_next_idx; + if (next_idx >= ARRLEN(batch_track_state.gfn_retrack_backlog)) { + printk("uspt_batch_tracking_handle_retrack: retrack " + "backlog full, dropping retrack for fault " + "at 0x%llx\n", current_fault_gfn); + } else { + batch_track_state.gfn_retrack_backlog[next_idx] = current_fault_gfn; + batch_track_state.gfn_retrack_backlog_next_idx++; + } + + spin_unlock(&batch_track_state_lock); + return; + } + + /* made progress, retrack everything in backlog and reset idx */ + for (i = 0; i < batch_track_state.gfn_retrack_backlog_next_idx; i++) { + __track_single_page(vcpu, + batch_track_state.gfn_retrack_backlog[i], + batch_track_state.tracking_type); + } + + /* add current fault to list */ + batch_track_state.gfn_retrack_backlog[0] = current_fault_gfn; + batch_track_state.gfn_retrack_backlog_next_idx = 1; + + spin_unlock(&batch_track_state_lock); + +} + +int +uspt_batch_tracking_save(uint64_t faulted_gpa, uint32_t error_code, + bool have_rip, uint64_t rip) +{ + uint64_t ret_instr_delta; + page_fault_event_t* event; + + spin_lock(&batch_track_state_lock); + + if (!batch_track_state.is_active) { + printk_ratelimited("userspace_page_track_signals: got save but batch tracking is not active!\n"); + batch_track_state.error_occured = true; + spin_unlock(&batch_track_state_lock); + return 1; + } + + + if (batch_track_state.event_next_idx >= batch_track_state.events_size) { + printk_ratelimited("userspace_page_track_signals: events buffer is full!\n"); + batch_track_state.error_occured = true; + spin_unlock(&batch_track_state_lock); + return 1; + } + + if (smp_processor_id() != batch_track_state.perf_cpu) { + printk("uspt_batch_tracking_handle_retrack: perf was " + "programmed on logical cpu %d but handler was called " + "on %d. Did you forget to pin the vcpu thread?\n", + batch_track_state.perf_cpu, smp_processor_id()); + } + ret_instr_delta = _perf_state_update_and_get_delta(batch_track_state.event_next_idx); + + + if (batch_track_state.events == NULL) { + printk(KERN_CRIT "userspace_page_track_signals: events buf was " + "NULL but \"is_active\" was set! 
This should never happen!!!\n"); + spin_unlock(&batch_track_state_lock); + return 1; + } + + event = &batch_track_state.events[batch_track_state.event_next_idx]; + event->id = batch_track_state.event_next_idx; + event->faulted_gpa = faulted_gpa; + event->error_code = error_code; + event->have_rip_info = have_rip; + event->rip = rip; + event->ns_timestamp = ktime_get_real_ns(); + event->have_retired_instructions = true; + event->retired_instructions = ret_instr_delta; + + // old inc was here + + if (batch_track_state.gfn_retrack_backlog_next_idx + > ARRLEN(batch_track_state.gfn_retrack_backlog)) { + printk_ratelimited("userspace_page_track_signals: " + "gfn retrack backlog overflow!\n"); + batch_track_state.error_occured = true; + spin_unlock(&batch_track_state_lock); + return 1; + } + + spin_unlock(&batch_track_state_lock); + return 0; +} + +int +uspt_batch_tracking_stop(page_fault_event_t* results, uint64_t len, bool* error_occured) +{ + spin_lock(&batch_track_state_lock); + if (!batch_track_state.is_active) { + printk("userspace_page_track_signals: batch tracking not active\n"); + spin_unlock(&batch_track_state_lock); + return 1; + + } + batch_track_state.is_active = false; + + if (len > batch_track_state.event_next_idx) { + printk("userspace_page_track_signals: requested %llu " + "events but got only %llu\n", + len, batch_track_state.event_next_idx); + spin_unlock(&batch_track_state_lock); + return 1; + } + + memcpy(results,batch_track_state.events, len*sizeof(page_fault_event_t)); + vfree(batch_track_state.events); + + *error_occured = batch_track_state.error_occured; + + spin_unlock(&batch_track_state_lock); + + return 0; +} + +uint64_t +uspt_batch_tracking_get_events_count() +{ + uint64_t buf; + spin_lock(&batch_track_state_lock); + buf = batch_track_state.event_next_idx; + spin_unlock(&batch_track_state_lock); + + return buf; +} + +bool +uspt_batch_tracking_in_progress() +{ + return batch_track_state.is_active; +} diff --git a/sevstep/uspt.h b/sevstep/uspt.h new file mode 100644 index 0000000..7c34996 --- /dev/null +++ b/sevstep/uspt.h @@ -0,0 +1,49 @@ +#pragma once + +#include "uapi.h" + +#include +#include +#include + + +int uspt_initialize(int pid,bool should_get_rip); +int uspt_is_initialiized(void); +void uspt_clear(void); + +bool uspt_should_get_rip(void); + +int uspt_send_and_block(uint64_t faulted_gpa, uint32_t error_code, + bool have_rip, uint64_t rip); + +int uspt_is_event_done(uint64_t id); + +/* prepare next event based on faulted_gpa and error_code. Notify process + * behind pid_number. Event must be polled id is result param with the id + * used for the event. Can be used to call uspt_is_event_done */ +int uspt_send_notification(int pid_number, uint64_t faulted_gpa, + uint32_t error_code, uint64_t *id); + +/* copy next event to userpace_mem */ +int uspt_handle_poll_event(page_fault_event_t* userpace_mem); + +/* acknowledge receival of event to event handling logic */ +int uspt_handle_ack_event_ioctl(ack_event_t event); + +/* should be called after "uspt_batch_tracking_save", + * "uspt_batch_tracking_handle_retrack" and any future custom logic + * for an event is processed */ +void uspt_batch_tracking_inc_event_idx(void); +int uspt_batch_tracking_start(int tracking_type, uint64_t expected_events, int perf_cpu, bool retrack); +int uspt_batch_tracking_save(uint64_t faulted_gpa, uint32_t error_code, bool have_rip, uint64_t rip); +uint64_t uspt_batch_tracking_get_events_count(void); + +/* Stops batch tracking on copies the first @len events into @result. 
+ * If an error occurred at some point during the batch tracking,
+ * error_occured is set (there should also be a dmesg, but this allows programmatic access);
+ * Caller can use uspt_batch_tracking_get_events_count() to determine the amount
+ * of memory they should allocate for @results */
+int uspt_batch_tracking_stop(page_fault_event_t *results, uint64_t len, bool *error_occured);
+void uspt_batch_tracking_handle_retrack(struct kvm_vcpu *vcpu, uint64_t current_fault_gfn);
+void uspt_batch_tracking_get_retrack_gfns(uint64_t **gfns, uint64_t *len, int *tracking_type);
+bool uspt_batch_tracking_in_progress(void);
diff --git a/test/access.c b/test/access.c
old mode 100755
new mode 100644
index 22e2fb8..1e38e1e
--- a/test/access.c
+++ b/test/access.c
@@ -1,4 +1,4 @@
-#include "cachepc_user.h"
+#include "cachepc/uapi.h"
 
 #include 
 #include 
diff --git a/test/eviction.c b/test/eviction.c
old mode 100755
new mode 100644
index e68132b..9fb57b5
--- a/test/eviction.c
+++ b/test/eviction.c
@@ -1,4 +1,4 @@
-#include "cachepc_user.h"
+#include "cachepc/uapi.h"
 
 #include 
 #include 
diff --git a/test/kvm.c b/test/kvm.c
old mode 100755
new mode 100644
index 42d7f5a..cd0dd4d
--- a/test/kvm.c
+++ b/test/kvm.c
@@ -1,7 +1,6 @@
-/* for CPU_ZERO macros.. */
 #define _GNU_SOURCE
 
-#include "cachepc_user.h"
+#include "cachepc/uapi.h"
 
 #include 
 #include 
diff --git a/test/sev-es.c b/test/sev-es.c
old mode 100755
new mode 100644
index 17cb72c..f2a6f5c
--- a/test/sev-es.c
+++ b/test/sev-es.c
@@ -1,7 +1,6 @@
-/* for CPU_ZERO macros.. */
 #define _GNU_SOURCE
 
-#include "cachepc_user.h"
+#include "cachepc/uapi.h"
 
 #include 
 #include 
diff --git a/test/sev.c b/test/sev.c
old mode 100755
new mode 100644
index e6da94c..73bb91f
--- a/test/sev.c
+++ b/test/sev.c
@@ -1,7 +1,6 @@
-/* for CPU_ZERO macros.. */
 #define _GNU_SOURCE
 
-#include "cachepc_user.h"
+#include "cachepc/uapi.h"
 
 #include 
 #include 
diff --git a/test/sevstep.c b/test/sevstep.c
new file mode 100644
index 0000000..3ca7f03
--- /dev/null
+++ b/test/sevstep.c
@@ -0,0 +1,32 @@
+#include "sevstep/uapi.h"
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+
+int
+main(int argc, const char **argv)
+{
+	track_all_pages_t tracking;
+	int ret, fd;
+
+	fd = open("/proc/cachepc", O_RDONLY);
+	if (fd < 0) err(1, "open");
+
+	tracking.track_mode = KVM_PAGE_TRACK_ACCESS;
+	ret = ioctl(fd, KVM_USPT_TRACK_ALL, &tracking);
+	if (ret == -1) err(1, "ioctl TRACK_ALL ACCESS");
+
+
+	tracking.track_mode = KVM_PAGE_TRACK_RESET_ACCESSED;
+	ret = ioctl(fd, KVM_USPT_TRACK_ALL, &tracking);
+	if (ret == -1) err(1, "ioctl TRACK_ALL RESET_ACCESSED");
+
+	ret = ioctl(fd, KVM_USPT_UNTRACK_ALL, &tracking);
+	if (ret == -1) err(1, "ioctl UNTRACK_ALL");
+
+	close(fd);
+}
-- 
cgit v1.2.3-71-gd317
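
A rough user-space sketch of the single-event path (an illustration, not a file added by this commit): assuming the /proc/cachepc node opened in test/sevstep.c also dispatches the uspt event ioctls from sevstep/uapi.h, and that KVM_USPT_POLL_EVENT hands back KVM_USPT_POLL_EVENT_NO_EVENT while no tracked fault is pending (both assumptions, since the ioctl dispatcher lives in cachepc/kvm.c and is not shown here), a poll/ack loop pairing with uspt_send_and_block() could look roughly as follows. The explicit <stdint.h>, <stdbool.h>, <linux/kvm.h> and <fcntl.h> includes are also assumptions, since the include targets in the headers above are elided.

#include <stdint.h>
#include <stdbool.h>
#include <linux/kvm.h>   /* assumed: provides KVMIO for the _IOWR macros in uapi.h */
#include "sevstep/uapi.h"

#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>

int
main(void)
{
	page_fault_event_t event;
	ack_event_t ack;
	int ret, fd;

	/* same proc node as test/sevstep.c (assumed to accept uspt ioctls) */
	fd = open("/proc/cachepc", O_RDONLY);
	if (fd < 0) err(1, "open");

	for (;;) {
		/* poll until the kernel reports a tracked page fault */
		ret = ioctl(fd, KVM_USPT_POLL_EVENT, &event);
		if (ret == KVM_USPT_POLL_EVENT_NO_EVENT)
			continue;
		if (ret) err(1, "ioctl POLL_EVENT");

		printf("fault: gpa %llx rip %llx\n",
			(unsigned long long) event.faulted_gpa,
			(unsigned long long) event.rip);

		/* ack the event so uspt_send_and_block() stops waiting */
		ack.id = event.id;
		ret = ioctl(fd, KVM_USPT_ACK_EVENT, &ack);
		if (ret) err(1, "ioctl ACK_EVENT");
		break;
	}

	close(fd);
	return 0;
}

The busy poll is deliberate: uspt_send_and_block() in sevstep/uspt.c gives userspace roughly one second to acknowledge an event before it times out, so the consumer has to ack promptly after each poll.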