From 58d8565f015f9e06e1e51a0fe4654b966b2c27c0 Mon Sep 17 00:00:00 2001 From: Louis Burda Date: Wed, 5 Oct 2022 15:05:19 +0200 Subject: Refactor sevstep kernel patch into repository --- sevstep/kvm.c | 205 ++++++++++++++++++++++ sevstep/kvm.h | 4 + sevstep/mmu.c | 132 ++++++++++++++ sevstep/sevstep.c | 129 ++++++++++++++ sevstep/sevstep.h | 67 ++++++++ sevstep/uapi.h | 86 ++++++++++ sevstep/uspt.c | 503 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ sevstep/uspt.h | 49 ++++++ 8 files changed, 1175 insertions(+) create mode 100644 sevstep/kvm.c create mode 100644 sevstep/kvm.h create mode 100644 sevstep/mmu.c create mode 100644 sevstep/sevstep.c create mode 100644 sevstep/sevstep.h create mode 100644 sevstep/uapi.h create mode 100644 sevstep/uspt.c create mode 100644 sevstep/uspt.h (limited to 'sevstep') diff --git a/sevstep/kvm.c b/sevstep/kvm.c new file mode 100644 index 0000000..b6b0d49 --- /dev/null +++ b/sevstep/kvm.c @@ -0,0 +1,205 @@ +#include "kvm.h" + +#include + +bool +__untrack_single_page(struct kvm_vcpu *vcpu, gfn_t gfn, + enum kvm_page_track_mode mode) +{ + int idx; + bool ret; + struct kvm_memory_slot *slot; + + ret = false; + idx = srcu_read_lock(&vcpu->kvm->srcu); + if (mode == KVM_PAGE_TRACK_ACCESS) { + //printk("Removing gfn: %016llx from acess page track pool\n", gfn); + } + if (mode == KVM_PAGE_TRACK_WRITE) { + //printk("Removing gfn: %016llx from write page track pool\n", gfn); + } + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + + if (slot != NULL && kvm_slot_page_track_is_active(vcpu->kvm, slot, gfn, mode)) { + write_lock(&vcpu->kvm->mmu_lock); + kvm_slot_page_track_remove_page(vcpu->kvm, slot, gfn, mode); + write_unlock(&vcpu->kvm->mmu_lock); + ret = true; + } else { + printk("Failed to untrack %016llx because ", gfn); + if (slot == NULL) { + printk(KERN_CONT "slot was null"); + } else if (!kvm_slot_page_track_is_active(vcpu->kvm, slot, gfn, mode)) { + printk(KERN_CONT "page track was not active"); + } + printk(KERN_CONT "\n"); + } + srcu_read_unlock(&vcpu->kvm->srcu, idx); + return ret; +} +EXPORT_SYMBOL(__untrack_single_page); + +bool +__reset_accessed_on_page(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + int idx; + bool ret; + struct kvm_memory_slot *slot; + + ret = false; + idx = srcu_read_lock(&vcpu->kvm->srcu); + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + if( slot != NULL ) { + write_lock(&vcpu->kvm->mmu_lock); + //Vincent: The kvm mmu function now requires min_level + //We want all pages to protected so we do PG_LEVEL_4K + //https://patchwork.kernel.org/project/kvm/patch/20210416082511.2856-2-zhukeqian1@huawei.com/ + sevstep_kvm_mmu_slot_gfn_protect(vcpu->kvm,slot,gfn,PG_LEVEL_4K,KVM_PAGE_TRACK_RESET_ACCESSED); + write_unlock(&vcpu->kvm->mmu_lock); + ret = true; + } + srcu_read_unlock(&vcpu->kvm->srcu, idx); + return ret; +} +EXPORT_SYMBOL(__reset_accessed_on_page); + +bool +__clear_nx_on_page(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + int idx; + bool ret; + struct kvm_memory_slot *slot; + + ret = false; + idx = srcu_read_lock(&vcpu->kvm->srcu); + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + if( slot != NULL ) { + write_lock(&vcpu->kvm->mmu_lock); + //Vincent: The kvm mmu function now requires min_level + //We want all pages to protected so we do PG_LEVEL_4K + //https://patchwork.kernel.org/project/kvm/patch/20210416082511.2856-2-zhukeqian1@huawei.com/ + sevstep_kvm_mmu_slot_gfn_protect(vcpu->kvm, slot, gfn, + PG_LEVEL_4K, KVM_PAGE_TRACK_RESET_EXEC); + write_unlock(&vcpu->kvm->mmu_lock); + ret = true; + } + srcu_read_unlock(&vcpu->kvm->srcu, idx); + return ret; 
+} +EXPORT_SYMBOL(__clear_nx_on_page); + +bool +__track_single_page(struct kvm_vcpu *vcpu, gfn_t gfn, + enum kvm_page_track_mode mode) +{ + int idx; + bool ret; + struct kvm_memory_slot *slot; + + ret = false; + idx = srcu_read_lock(&vcpu->kvm->srcu); + if (mode == KVM_PAGE_TRACK_ACCESS) { + //printk_ratelimited("Adding gfn: %016llx to acess page track pool\n", gfn); + //printk("Adding gfn: %016llx to acess page track pool\n", gfn); + } + if (mode == KVM_PAGE_TRACK_WRITE) { + //printk_ratelimited("Adding gfn: %016llx to write page track pool\n", gfn); + } + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + if (slot != NULL && !kvm_slot_page_track_is_active(vcpu->kvm,slot, gfn, mode)) { + + write_lock(&vcpu->kvm->mmu_lock); + kvm_slot_page_track_add_page(vcpu->kvm, slot, gfn, mode); + write_unlock(&vcpu->kvm->mmu_lock); + ret = true; + + } else { + + printk("Failed to track %016llx because ", gfn); + if (slot == NULL) { + printk(KERN_CONT "slot was null"); + } + if (kvm_slot_page_track_is_active(vcpu->kvm, slot, gfn, mode)) { + printk(KERN_CONT "page is already tracked"); + } + printk(KERN_CONT "\n"); + } + srcu_read_unlock(&vcpu->kvm->srcu, idx); + return ret; +} +EXPORT_SYMBOL(__track_single_page); + +long +kvm_start_tracking(struct kvm_vcpu *vcpu,enum kvm_page_track_mode mode ) +{ + long count = 0; + u64 iterator, iterat_max; + struct kvm_memory_slot *slot; + int idx; + + //Vincent: Memslots interface changed into a rb tree, see + //here: https://lwn.net/Articles/856392/ + //and here: https://lore.kernel.org/all/cover.1632171478.git.maciej.szmigiero@oracle.com/T/#u + //Thus we use instead of + //iterat_max = vcpu->kvm->memslots[0]->memslots[0].base_gfn + // + vcpu->kvm->memslots[0]->memslots[0].npages; + struct rb_node *node; + struct kvm_memory_slot *first_memslot; + node = rb_last(&(vcpu->kvm->memslots[0]->gfn_tree)); + first_memslot = container_of(node, struct kvm_memory_slot, gfn_node[0]); + iterat_max = first_memslot->base_gfn + first_memslot->npages; + for (iterator=0; iterator < iterat_max; iterator++) + { + idx = srcu_read_lock(&vcpu->kvm->srcu); + slot = kvm_vcpu_gfn_to_memslot(vcpu, iterator); + if ( slot != NULL && !kvm_slot_page_track_is_active(vcpu->kvm, slot, iterator, mode)) { + write_lock(&vcpu->kvm->mmu_lock); + kvm_slot_page_track_add_page(vcpu->kvm, slot, iterator, mode); + write_unlock(&vcpu->kvm->mmu_lock); + count++; + } + srcu_read_unlock(&vcpu->kvm->srcu, idx); + } + + return count; +} +EXPORT_SYMBOL(kvm_start_tracking); + +long +kvm_stop_tracking(struct kvm_vcpu *vcpu,enum kvm_page_track_mode mode) +{ + long count = 0; + u64 iterator, iterat_max; + struct kvm_memory_slot *slot; + int idx; + + + //Vincent: Memslots interface changed into a rb tree, see + //here: https://lwn.net/Articles/856392/ + //and here: https://lore.kernel.org/all/cover.1632171478.git.maciej.szmigiero@oracle.com/T/#u + //Thus we use instead of + //iterat_max = vcpu->kvm->memslots[0]->memslots[0].base_gfn + // + vcpu->kvm->memslots[0]->memslots[0].npages; + struct rb_node *node; + struct kvm_memory_slot *first_memslot; + node = rb_last(&(vcpu->kvm->memslots[0]->gfn_tree)); + first_memslot = container_of(node, struct kvm_memory_slot, gfn_node[0]); + iterat_max = first_memslot->base_gfn + first_memslot->npages; + for (iterator=0; iterator < iterat_max; iterator++) + { + idx = srcu_read_lock(&vcpu->kvm->srcu); + slot = kvm_vcpu_gfn_to_memslot(vcpu, iterator); + //Vincent: I think see here https://patchwork.kernel.org/project/kvm/patch/20210924163152.289027-22-pbonzini@redhat.com/ + if ( slot != NULL 
&& kvm_slot_page_track_is_active(vcpu->kvm, slot, iterator, mode)) { + write_lock(&vcpu->kvm->mmu_lock); + kvm_slot_page_track_remove_page(vcpu->kvm, slot, iterator, mode); + write_unlock(&vcpu->kvm->mmu_lock); + count++; + } + srcu_read_unlock(&vcpu->kvm->srcu, idx); + } + + return count; +} +EXPORT_SYMBOL(kvm_stop_tracking); + diff --git a/sevstep/kvm.h b/sevstep/kvm.h new file mode 100644 index 0000000..35cb4d5 --- /dev/null +++ b/sevstep/kvm.h @@ -0,0 +1,4 @@ +#pragma once + +#include "sev-step.h" +#include "uapi.h" diff --git a/sevstep/mmu.c b/sevstep/mmu.c new file mode 100644 index 0000000..4eefea2 --- /dev/null +++ b/sevstep/mmu.c @@ -0,0 +1,132 @@ +#include "../sevstep/sevstep.h" +#include "../sevstep/uspt.h" + +void +sevstep_uspt_page_fault_handle(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) +{ + const int modes[] = { + KVM_PAGE_TRACK_WRITE, + KVM_PAGE_TRACK_ACCESS, + KVM_PAGE_TRACK_EXEC + }; + uint64_t current_rip; + bool was_tracked; + int have_rip, i; + int send_err; + + was_tracked = false; + for (i = 0; i < sizeof(modes) / sizeof(modes[0]); i++) { + if (kvm_slot_page_track_is_active(vcpu->kvm, + fault->slot, fault->gfn, modes[i])) { + __untrack_single_page(vcpu, fault->gfn, modes[i]); + was_tracked = true; + } + } + + if (was_tracked) { + have_rip = false; + if (uspt_should_get_rip()) + have_rip = sev_step_get_rip_kvm_vcpu(vcpu,¤t_rip) == 0; + if (uspt_batch_tracking_in_progress()) { + send_err = uspt_batch_tracking_save(fault->gfn << PAGE_SHIFT, + fault->error_code, have_rip, current_rip); + if (send_err) { + printk_ratelimited( + "uspt_batch_tracking_save failed with %d\n" + "##########################\n", send_err); + } + uspt_batch_tracking_handle_retrack(vcpu, fault->gfn); + uspt_batch_tracking_inc_event_idx(); + } else { + send_err = uspt_send_and_block(fault->gfn << PAGE_SHIFT, + fault->error_code, have_rip, current_rip); + if (send_err) { + printk("uspt_send_and_block failed with %d\n" + "##########################\n", send_err); + } + } + } +} + +bool +sevstep_spte_protect(u64 *sptep, bool pt_protect, enum kvm_page_track_mode mode) +{ + u64 spte = *sptep; + bool shouldFlush = false; + + if (!is_writable_pte(spte) && !(pt_protect && is_mmu_writable_spte(spte))) + return false; + + rmap_printk("spte %p %llx\n", sptep, *sptep); + + if (pt_protect) + spte &= ~EPT_SPTE_MMU_WRITABLE; + + if (mode == KVM_PAGE_TRACK_WRITE) { + spte = spte & ~PT_WRITABLE_MASK; + shouldFlush = true; + } else if (mode == KVM_PAGE_TRACK_RESET_ACCESSED) { + spte = spte & ~PT_ACCESSED_MASK; + } else if (mode == KVM_PAGE_TRACK_ACCESS) { + spte = spte & ~PT_PRESENT_MASK; + spte = spte & ~PT_WRITABLE_MASK; + spte = spte & ~PT_USER_MASK; + spte = spte | (0x1ULL << PT64_NX_SHIFT); + shouldFlush = true; + } else if (mode == KVM_PAGE_TRACK_EXEC) { + spte = spte | (0x1ULL << PT64_NX_SHIFT); + shouldFlush = true; + } else if (mode == KVM_PAGE_TRACK_RESET_EXEC) { + spte = spte & ~(0x1ULL << PT64_NX_SHIFT); + shouldFlush = true; + } else { + printk(KERN_WARNING "spte_protect was called with invalid mode" + "parameter %d\n",mode); + } + shouldFlush |= mmu_spte_update(sptep, spte); + return shouldFlush; +} +EXPORT_SYMBOL(sevstep_spte_protect); + +bool sevstep_rmap_protect(struct kvm_rmap_head *rmap_head, + bool pt_protect, enum kvm_page_track_mode mode) +{ + u64 *sptep; + struct rmap_iterator iter; + bool flush = false; + + for_each_rmap_spte(rmap_head, &iter, sptep) { + flush |= sevstep_spte_protect(sptep, pt_protect, mode); + } + + return flush; +} +EXPORT_SYMBOL(sevstep_rmap_protect); + +bool 
+sevstep_kvm_mmu_slot_gfn_protect(struct kvm *kvm, struct kvm_memory_slot *slot, + uint64_t gfn, int min_level, enum kvm_page_track_mode mode) +{ + struct kvm_rmap_head *rmap_head; + bool protected; + int i; + + protected = false; + + if (kvm_memslots_have_rmaps(kvm)) { + for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { + rmap_head = gfn_to_rmap(gfn, i, slot); + protected |= sevstep_rmap_protect(rmap_head, true, mode); + } + } + + if (is_tdp_mmu_enabled(kvm)) { + protected |= kvm_tdp_mmu_write_protect_gfn(kvm, + slot, gfn, min_level); + } + + return protected; +} +EXPORT_SYMBOL(sevstep_kvm_mmu_slot_gfn_protect); + diff --git a/sevstep/sevstep.c b/sevstep/sevstep.c new file mode 100644 index 0000000..3345e04 --- /dev/null +++ b/sevstep/sevstep.c @@ -0,0 +1,129 @@ +#include "sevstep.h" + +#include "mmu/mmu_internal.h" +#include "mmu.h" + +#include "irq.h" +#include "ioapic.h" +#include "mmu.h" +#include "mmu/tdp_mmu.h" +#include "x86.h" +#include "kvm_cache_regs.h" +#include "kvm_emulate.h" +#include "cpuid.h" +#include "mmu/spte.h" + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kvm_cache_regs.h" +#include "svm/svm.h" + +struct kvm* main_vm; +EXPORT_SYMBOL(main_vm); + +// used to store performance counter values; 6 counters, 2 readings per counter +// TODO: static! +uint64_t perf_reads[6][2]; +perf_ctl_config_t perf_configs[6]; +int perf_cpu; + + +uint64_t +perf_ctl_to_u64(perf_ctl_config_t * config) +{ + uint64_t result; + + result = 0; + result |= config->EventSelect & 0xffULL; + result |= (config->UintMask & 0xffULL) << 8; + result |= (config->OsUserMode & 0x3ULL) << 16; + result |= (config->Edge & 0x1ULL ) << 18; + result |= (config->Int & 0x1ULL ) << 20; + result |= (config->En & 0x1ULL ) << 22; + result |= (config->Inv & 0x1ULL ) << 23; + result |= (config->CntMask & 0xffULL) << 24; + result |= ((config->EventSelect & 0xf00ULL) >> 8) << 32; + result |= (config->HostGuestOnly & 0x3ULL) << 40; + + return result; + +} + +void +write_ctl(perf_ctl_config_t * config, int cpu, uint64_t ctl_msr) +{ + wrmsrl_on_cpu(cpu, ctl_msr, perf_ctl_to_u64(config)); +} + +void +read_ctr(uint64_t ctr_msr, int cpu, uint64_t* result) +{ + uint64_t tmp; + + rdmsrl_on_cpu(cpu, ctr_msr, &tmp); + *result = tmp & ( (0x1ULL << 48) - 1); +} + +void +setup_perfs() +{ + int i; + + perf_cpu = smp_processor_id(); + + for (i = 0; i < 6; i++) { + perf_configs[i].HostGuestOnly = 0x1; /* count only guest */ + perf_configs[i].CntMask = 0x0; + perf_configs[i].Inv = 0x0; + perf_configs[i].En = 0x0; + perf_configs[i].Int = 0x0; + perf_configs[i].Edge = 0x0; + perf_configs[i].OsUserMode = 0x3; /* count userland and kernel events */ + } + + perf_configs[0].EventSelect = 0x0c0; + perf_configs[0].UintMask = 0x0; + perf_configs[0].En = 0x1; + write_ctl(&perf_configs[0],perf_cpu, CTL_MSR_0); + + /* + * programm l2d hit from data cache miss perf for + * cpu_probe_pointer_chasing_inplace without counting thread. + * N.B. 
that this time we count host events + */ + perf_configs[1].EventSelect = 0x064; + perf_configs[1].UintMask = 0x70; + perf_configs[1].En = 0x1; + perf_configs[1].HostGuestOnly = 0x2; /* count only host events */ + write_ctl(&perf_configs[1],perf_cpu,CTL_MSR_1); +} +EXPORT_SYMBOL(setup_perfs); + +int +sev_step_get_rip_kvm_vcpu(struct kvm_vcpu* vcpu,uint64_t *rip) +{ + return 0; +} diff --git a/sevstep/sevstep.h b/sevstep/sevstep.h new file mode 100644 index 0000000..86d25f7 --- /dev/null +++ b/sevstep/sevstep.h @@ -0,0 +1,67 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include + + +#define CTL_MSR_0 0xc0010200ULL +#define CTL_MSR_1 0xc0010202ULL +#define CTL_MSR_2 0xc0010204ULL +#define CTL_MSR_3 0xc0010206ULL +#define CTL_MSR_4 0xc0010208ULL +#define CTL_MSR_5 0xc001020aULL + +#define CTR_MSR_0 0xc0010201ULL +#define CTR_MSR_1 0xc0010203ULL +#define CTR_MSR_2 0xc0010205ULL +#define CTR_MSR_3 0xc0010207ULL +#define CTR_MSR_4 0xc0010209ULL +#define CTR_MSR_5 0xc001020bULL + +typedef struct { + uint64_t HostGuestOnly; + uint64_t CntMask; + uint64_t Inv; + uint64_t En; + uint64_t Int; + uint64_t Edge; + uint64_t OsUserMode; + uint64_t UintMask; + uint64_t EventSelect; //12 bits in total split in [11:8] and [7:0] + +} perf_ctl_config_t; + +extern struct kvm* main_vm; + +bool sevstep_spte_protect(u64 *sptep, + bool pt_protect, enum kvm_page_track_mode mode); +bool sevstep_rmap_protect(struct kvm_rmap_head *rmap_head, + bool pt_protect, enum kvm_page_track_mode mode); +bool sevstep_kvm_mmu_slot_gfn_protect(struct kvm *kvm, struct kvm_memory_slot *slot, + uint64_t gfn, int min_level, enum kvm_page_track_mode mode); + +bool __untrack_single_page(struct kvm_vcpu *vcpu, gfn_t gfn, + enum kvm_page_track_mode mode); +bool __track_single_page(struct kvm_vcpu *vcpu, gfn_t gfn, + enum kvm_page_track_mode mode); +bool __reset_accessed_on_page(struct kvm_vcpu *vcpu, gfn_t gfn); +bool __clear_nx_on_page(struct kvm_vcpu *vcpu, gfn_t gfn); + +long kvm_start_tracking(struct kvm_vcpu *vcpu, enum kvm_page_track_mode mode); +long kvm_stop_tracking(struct kvm_vcpu *vcpu, enum kvm_page_track_mode mode); +void sev_step_handle_callback(void); + +uint64_t perf_ctl_to_u64(perf_ctl_config_t *config); +void write_ctl(perf_ctl_config_t *config, int cpu, uint64_t ctl_msr); +void read_ctr(uint64_t ctr_msr, int cpu, uint64_t *result); + +void setup_perfs(void); + +int sev_step_get_rip_kvm_vcpu(struct kvm_vcpu *vcpu, uint64_t *rip); diff --git a/sevstep/uapi.h b/sevstep/uapi.h new file mode 100644 index 0000000..e41a036 --- /dev/null +++ b/sevstep/uapi.h @@ -0,0 +1,86 @@ +#pragma once + +#include +#include + +#define KVM_TRACK_PAGE _IOWR(KVMIO, 0x20, track_page_param_t) +#define KVM_USPT_REGISTER_PID _IOWR(KVMIO, 0x21, userspace_ctx_t) +#define KVM_USPT_WAIT_AND_SEND _IO(KVMIO, 0x22) +#define KVM_USPT_POLL_EVENT _IOWR(KVMIO, 0x23, page_fault_event_t) +#define KVM_USPT_ACK_EVENT _IOWR(KVMIO, 0x24, ack_event_t) +#define KVM_READ_GUEST_MEMORY _IOWR(KVMIO, 0x25, read_guest_memory_t) +#define KVM_USPT_RESET _IO(KVMIO, 0x26) +#define KVM_USPT_TRACK_ALL _IOWR(KVMIO, 0x27, track_all_pages_t) +#define KVM_USPT_UNTRACK_ALL _IOWR(KVMIO, 0x28, track_all_pages_t) +#define KVM_USPT_SETUP_RETINSTR_PERF _IOWR(KVMIO, 0x30,retired_instr_perf_config_t) +#define KVM_USPT_READ_RETINSTR_PERF _IOWR(KVMIO,0x31, retired_instr_perf_t) +#define KVM_USPT_BATCH_TRACK_START _IOWR(KVMIO,0x32,batch_track_config_t) +#define KVM_USPT_BATCH_TRACK_STOP _IOWR(KVMIO,0x33,batch_track_stop_and_get_t) +#define 
KVM_USPT_BATCH_TRACK_EVENT_COUNT _IOWR(KVMIO,0x34,batch_track_event_count_t) + +#define KVM_USPT_POLL_EVENT_NO_EVENT 1000 +#define KVM_USPT_POLL_EVENT_GOT_EVENT 0 + +typedef struct { + uint64_t id; // filled automatically + uint64_t faulted_gpa; + uint32_t error_code; + bool have_rip_info; + uint64_t rip; + uint64_t ns_timestamp; + bool have_retired_instructions; + uint64_t retired_instructions; +} page_fault_event_t; + +typedef struct { + int tracking_type; + uint64_t expected_events; + int perf_cpu; + bool retrack; +} batch_track_config_t; + +typedef struct { + uint64_t event_count; +} batch_track_event_count_t; + +typedef struct { + page_fault_event_t* out_buf; + uint64_t len; + bool error_during_batch; +} batch_track_stop_and_get_t; + +typedef struct { + int cpu; // cpu on which we want to read the counter + uint64_t retired_instruction_count; // result param +} retired_instr_perf_t; + +typedef struct { + int cpu; // cpu on which counter should be programmed +} retired_instr_perf_config_t; + +typedef struct { + uint64_t gpa; + uint64_t len; + bool decrypt_with_host_key; + int wbinvd_cpu; // -1: do not flush; else logical cpu on which we flush + void* output_buffer; +} read_guest_memory_t; + +typedef struct { + int pid; + bool get_rip; +} userspace_ctx_t; + +typedef struct { + uint64_t id; +} ack_event_t; + +typedef struct { + uint64_t gpa; + int track_mode; +} track_page_param_t; + +typedef struct { + int track_mode; +} track_all_pages_t; + diff --git a/sevstep/uspt.c b/sevstep/uspt.c new file mode 100644 index 0000000..f7b329d --- /dev/null +++ b/sevstep/uspt.c @@ -0,0 +1,503 @@ +#include "uspt.h" +#include "sevstep.h" + +#include +#include +#include +#include +#include +#include +#include + +#define ARRLEN(x) (sizeof(x)/sizeof((x)[0])) + +typedef struct { + bool is_active; + int tracking_type; + bool retrack; + + int perf_cpu; + + uint64_t gfn_retrack_backlog[10]; + int gfn_retrack_backlog_next_idx; + + page_fault_event_t * events; + uint64_t event_next_idx; + uint64_t events_size; + + bool error_occured; +} batch_track_state_t; + +// crude sync mechanism. don't know a good way to act on errors yet. 
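+//
+// Handshake for the single-event (non-batch) path, all guarded by event_lock:
+//  1. uspt_send_and_block() (vCPU context) requires the previous event to be
+//     acked (last_sent_event_id == last_acked_event_id), bumps
+//     last_sent_event_id, publishes the event in sent_event/have_event and
+//     busy-waits (with a ~1s timeout) until the new id has been acked.
+//  2. Userspace fetches the event (KVM_USPT_POLL_EVENT path);
+//     uspt_handle_poll_event() copies sent_event out and clears have_event.
+//  3. Userspace acks the id (KVM_USPT_ACK_EVENT path); _uspt_handle_ack_event()
+//     updates last_acked_event_id, which lets uspt_send_and_block() return.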
+uint64_t last_sent_event_id = 1; +uint64_t last_acked_event_id = 1; +DEFINE_RWLOCK(event_lock); + +page_fault_event_t sent_event; +static int have_event = 0; + +static bool get_rip = true; + +static int inited = 0; + +DEFINE_SPINLOCK(batch_track_state_lock); +static batch_track_state_t batch_track_state; + +typedef struct { + uint64_t idx_for_last_perf_reading; + uint64_t last_perf_reading; + uint64_t delta_valid_idx; + uint64_t delta; +} perf_state_t; + +perf_state_t perf_state; + + +void +uspt_clear(void) +{ + write_lock(&event_lock); + inited = 0; + last_sent_event_id = 1; + last_acked_event_id = 1; + have_event = 0; + get_rip = false; + write_unlock(&event_lock); +} + +int +uspt_initialize(int pid,bool should_get_rip) +{ + write_lock(&event_lock); + inited = 1; + last_sent_event_id = 1; + last_acked_event_id = 1; + have_event = 0; + get_rip = should_get_rip; + write_unlock(&event_lock); + + return 0; +} + +int +uspt_is_initialiized() +{ + return inited; +} + +bool +uspt_should_get_rip() +{ + bool tmp; + + read_lock(&event_lock); + tmp = get_rip; + read_unlock(&event_lock); + + return tmp; +} + +int +uspt_send_and_block(uint64_t faulted_gpa, uint32_t error_code, + bool have_rip, uint64_t rip) +{ + ktime_t abort_after; + page_fault_event_t message_for_user; + + read_lock(&event_lock); + if (!uspt_is_initialiized()) { + printk("userspace_page_track_signals: " + "uspt_send_and_block : ctx not initialized!\n"); + read_unlock(&event_lock); + return 1; + } + read_unlock(&event_lock); + + write_lock(&event_lock); + if (last_sent_event_id != last_acked_event_id) { + printk("event id_s out of sync, aborting. Fix this later\n"); + write_unlock(&event_lock); + return 1; + } else { + // TODO: handle overflow + last_sent_event_id++; + } + message_for_user.id = last_sent_event_id; + message_for_user.faulted_gpa = faulted_gpa; + message_for_user.error_code = error_code; + message_for_user.have_rip_info = have_rip; + message_for_user.rip = rip; + message_for_user.ns_timestamp = ktime_get_real_ns(); + message_for_user.have_retired_instructions = false; + + // for poll based system; + have_event = 1; + sent_event = message_for_user; + // printk("uspt_send_and_block sending event %llu\n",sent_event.id); + + write_unlock(&event_lock); + + // wait for ack, but with timeout. 
Otherwise small bugs in userland + // easily lead to a kernel hang + abort_after = ktime_get() + 1000000000ULL; // 1 sec in nanosecond + while (!uspt_is_event_done(sent_event.id)) { + if (ktime_get() > abort_after) { + printk("Waiting for ack of event %llu timed out, continuing\n",sent_event.id); + return 3; + } + } + + return 0; +} + +int +uspt_is_event_done(uint64_t id) +{ + int res; + + read_lock(&event_lock); + res = last_acked_event_id >= id; + read_unlock(&event_lock); + + return res; +} + +int +uspt_handle_poll_event(page_fault_event_t* userpace_mem) +{ + int err; + + // most of the time we won't have an event + read_lock(&event_lock); + if (!have_event) { + read_unlock(&event_lock); + return KVM_USPT_POLL_EVENT_NO_EVENT; + } + read_unlock(&event_lock); + + write_lock(&event_lock); + if (have_event) { + err = copy_to_user(userpace_mem, + &sent_event, sizeof(page_fault_event_t)); + have_event = 0; + } else { + err = KVM_USPT_POLL_EVENT_NO_EVENT; + } + write_unlock(&event_lock); + + return err; +} + +static int +_uspt_handle_ack_event(uint64_t id) +{ + int err = 0; + + write_lock(&event_lock); + if (id == last_sent_event_id) { + last_acked_event_id = last_sent_event_id; + } else { + err = 1; + printk("last sent event id is %llu but received ack for %llu\n",last_sent_event_id,id); + } + write_unlock(&event_lock); + + return err; +} + +int +uspt_handle_ack_event_ioctl(ack_event_t event) +{ + return _uspt_handle_ack_event(event.id); +} + +// setup perf_state and program retired instruction performance counter +void +_perf_state_setup_retired_instructions(void) +{ + perf_ctl_config_t retired_instructions_perf_config; + retired_instructions_perf_config.HostGuestOnly = 0x1; // 0x1 means: count only guest + retired_instructions_perf_config.CntMask = 0x0; + retired_instructions_perf_config.Inv = 0x0; + retired_instructions_perf_config.Int = 0x0; + retired_instructions_perf_config.Edge = 0x0; + retired_instructions_perf_config.OsUserMode = 0x3; // 0x3 means: count kern and user events + retired_instructions_perf_config.EventSelect = 0x0c0; + retired_instructions_perf_config.UintMask = 0x0; + retired_instructions_perf_config.En = 0x1; + write_ctl(&retired_instructions_perf_config,batch_track_state.perf_cpu, CTL_MSR_0); +} + + +// get retired instructions between current_event_idx-1 and current_event_idx +// value is cached for multiple calls to the same current_event_idx +uint64_t +_perf_state_update_and_get_delta(uint64_t current_event_idx) +{ + uint64_t current_value; + + // check if value is "cached" + if (perf_state.delta_valid_idx == current_event_idx) { + if (current_event_idx == 0) { + read_ctr(CTR_MSR_0, batch_track_state.perf_cpu, ¤t_value); + perf_state.idx_for_last_perf_reading = current_event_idx; + perf_state.last_perf_reading = current_event_idx; + } + return perf_state.delta; + } + + // otherwise update, but logic is only valid for two consecutive events + if (current_event_idx != perf_state.idx_for_last_perf_reading+1) { + printk_ratelimited(KERN_CRIT "_perf_state_update_and_get_delta: " + "last reading was for idx %llu but was queried for %llu\n", + perf_state.idx_for_last_perf_reading, current_event_idx); + } + + read_ctr(CTR_MSR_0, batch_track_state.perf_cpu, ¤t_value); + perf_state.delta = (current_value - perf_state.last_perf_reading); + perf_state.delta_valid_idx = current_event_idx; + + perf_state.idx_for_last_perf_reading = current_event_idx; + perf_state.last_perf_reading = current_value; + + return perf_state.delta; +} + +void +uspt_batch_tracking_inc_event_idx(void) 
+{ + spin_lock(&batch_track_state_lock); + batch_track_state.event_next_idx++; + spin_unlock(&batch_track_state_lock); +} + +int +uspt_batch_tracking_start(int tracking_type,uint64_t expected_events, + int perf_cpu, bool retrack) +{ + page_fault_event_t* events; + uint64_t buffer_size, i; + + spin_lock(&batch_track_state_lock); + if (batch_track_state.is_active) { + printk("userspace_page_track_signals: overwriting " + "active batch track config!\n"); + if (batch_track_state.events != NULL ) { + vfree(batch_track_state.events); + } + } + batch_track_state.is_active = false; + spin_unlock(&batch_track_state_lock); + + buffer_size = expected_events * sizeof(page_fault_event_t); + printk("uspt_batch_tracking_start trying to alloc %llu " + "bytes buffer for events\n", buffer_size); + events = vmalloc(buffer_size); + if (events == NULL) { + printk("userspace_page_track_signals: " + "faperf_cpuiled to alloc %llu bytes for event buffer\n", + buffer_size); + return 1; // note: lock not held here + } + + // access each element once to force them into memory, improving performance + // during tracking + for (i = 0; i < expected_events * sizeof(page_fault_event_t); i++) { + ((volatile uint8_t*)events)[i] = 0; + } + + perf_state.idx_for_last_perf_reading = 0; + perf_state.last_perf_reading = 0; + perf_state.delta_valid_idx = 0; + perf_state.delta = 0; + _perf_state_setup_retired_instructions(); + + spin_lock(&batch_track_state_lock); + + batch_track_state.perf_cpu = perf_cpu; + batch_track_state.retrack = retrack; + + batch_track_state.events = events; + batch_track_state.event_next_idx = 0; + batch_track_state.events_size = expected_events; + + batch_track_state.gfn_retrack_backlog_next_idx = 0; + batch_track_state.tracking_type = tracking_type; + batch_track_state.error_occured = false; + + batch_track_state.is_active = true; + + spin_unlock(&batch_track_state_lock); + + return 0; +} + +void +uspt_batch_tracking_handle_retrack(struct kvm_vcpu* vcpu, + uint64_t current_fault_gfn) +{ + uint64_t ret_instr_delta; + int i, next_idx; + + spin_lock(&batch_track_state_lock); + + if (!batch_track_state.retrack) { + spin_unlock(&batch_track_state_lock); + return; + } + + if (smp_processor_id() != batch_track_state.perf_cpu) { + printk("uspt_batch_tracking_handle_retrack: perf was " + "programmed on logical cpu %d but handler was called " + "on %d. Did you forget to pin the vcpu thread?\n", + batch_track_state.perf_cpu, smp_processor_id()); + } + ret_instr_delta = _perf_state_update_and_get_delta(batch_track_state.event_next_idx); + + + // faulting instructions is probably the same as on last fault + // try to add current fault to retrack log and return + // for first event idx we do not have a valid ret_instr_delta. 
+ // Retracking for the frist time is fine, if we loop, we end up here + // again but with a valid delta on one of the next event + if( (ret_instr_delta < 2) && ( batch_track_state.event_next_idx != 0) ) { + next_idx = batch_track_state.gfn_retrack_backlog_next_idx; + if (next_idx >= ARRLEN(batch_track_state.gfn_retrack_backlog)) { + printk("uspt_batch_tracking_handle_retrack: retrack " + "backlog full, dropping retrack for fault " + "at 0x%llx\n", current_fault_gfn); + } else { + batch_track_state.gfn_retrack_backlog[next_idx] = current_fault_gfn; + batch_track_state.gfn_retrack_backlog_next_idx++; + } + + spin_unlock(&batch_track_state_lock); + return; + } + + /* made progress, retrack everything in backlog and reset idx */ + for (i = 0; i < batch_track_state.gfn_retrack_backlog_next_idx; i++) { + __track_single_page(vcpu, + batch_track_state.gfn_retrack_backlog[i], + batch_track_state.tracking_type); + } + + /* add current fault to list */ + batch_track_state.gfn_retrack_backlog[0] = current_fault_gfn; + batch_track_state.gfn_retrack_backlog_next_idx = 1; + + spin_unlock(&batch_track_state_lock); + +} + +int +uspt_batch_tracking_save(uint64_t faulted_gpa, uint32_t error_code, + bool have_rip, uint64_t rip) +{ + uint64_t ret_instr_delta; + page_fault_event_t* event; + + spin_lock(&batch_track_state_lock); + + if (!batch_track_state.is_active) { + printk_ratelimited("userspace_page_track_signals: got save but batch tracking is not active!\n"); + batch_track_state.error_occured = true; + spin_unlock(&batch_track_state_lock); + return 1; + } + + + if (batch_track_state.event_next_idx >= batch_track_state.events_size) { + printk_ratelimited("userspace_page_track_signals: events buffer is full!\n"); + batch_track_state.error_occured = true; + spin_unlock(&batch_track_state_lock); + return 1; + } + + if (smp_processor_id() != batch_track_state.perf_cpu) { + printk("uspt_batch_tracking_handle_retrack: perf was " + "programmed on logical cpu %d but handler was called " + "on %d. Did you forget to pin the vcpu thread?\n", + batch_track_state.perf_cpu, smp_processor_id()); + } + ret_instr_delta = _perf_state_update_and_get_delta(batch_track_state.event_next_idx); + + + if (batch_track_state.events == NULL) { + printk(KERN_CRIT "userspace_page_track_signals: events buf was " + "NULL but \"is_active\" was set! 
This should never happen!!!\n"); + spin_unlock(&batch_track_state_lock); + return 1; + } + + event = &batch_track_state.events[batch_track_state.event_next_idx]; + event->id = batch_track_state.event_next_idx; + event->faulted_gpa = faulted_gpa; + event->error_code = error_code; + event->have_rip_info = have_rip; + event->rip = rip; + event->ns_timestamp = ktime_get_real_ns(); + event->have_retired_instructions = true; + event->retired_instructions = ret_instr_delta; + + // old inc was here + + if (batch_track_state.gfn_retrack_backlog_next_idx + > ARRLEN(batch_track_state.gfn_retrack_backlog)) { + printk_ratelimited("userspace_page_track_signals: " + "gfn retrack backlog overflow!\n"); + batch_track_state.error_occured = true; + spin_unlock(&batch_track_state_lock); + return 1; + } + + spin_unlock(&batch_track_state_lock); + return 0; +} + +int +uspt_batch_tracking_stop(page_fault_event_t* results, uint64_t len, bool* error_occured) +{ + spin_lock(&batch_track_state_lock); + if (!batch_track_state.is_active) { + printk("userspace_page_track_signals: batch tracking not active\n"); + spin_unlock(&batch_track_state_lock); + return 1; + + } + batch_track_state.is_active = false; + + if (len > batch_track_state.event_next_idx) { + printk("userspace_page_track_signals: requested %llu " + "events but got only %llu\n", + len, batch_track_state.event_next_idx); + spin_unlock(&batch_track_state_lock); + return 1; + } + + memcpy(results,batch_track_state.events, len*sizeof(page_fault_event_t)); + vfree(batch_track_state.events); + + *error_occured = batch_track_state.error_occured; + + spin_unlock(&batch_track_state_lock); + + return 0; +} + +uint64_t +uspt_batch_tracking_get_events_count() +{ + uint64_t buf; + spin_lock(&batch_track_state_lock); + buf = batch_track_state.event_next_idx; + spin_unlock(&batch_track_state_lock); + + return buf; +} + +bool +uspt_batch_tracking_in_progress() +{ + return batch_track_state.is_active; +} diff --git a/sevstep/uspt.h b/sevstep/uspt.h new file mode 100644 index 0000000..7c34996 --- /dev/null +++ b/sevstep/uspt.h @@ -0,0 +1,49 @@ +#pragma once + +#include "uapi.h" + +#include +#include +#include + + +int uspt_initialize(int pid,bool should_get_rip); +int uspt_is_initialiized(void); +void uspt_clear(void); + +bool uspt_should_get_rip(void); + +int uspt_send_and_block(uint64_t faulted_gpa, uint32_t error_code, + bool have_rip, uint64_t rip); + +int uspt_is_event_done(uint64_t id); + +/* prepare next event based on faulted_gpa and error_code. Notify process + * behind pid_number. Event must be polled id is result param with the id + * used for the event. Can be used to call uspt_is_event_done */ +int uspt_send_notification(int pid_number, uint64_t faulted_gpa, + uint32_t error_code, uint64_t *id); + +/* copy next event to userpace_mem */ +int uspt_handle_poll_event(page_fault_event_t* userpace_mem); + +/* acknowledge receival of event to event handling logic */ +int uspt_handle_ack_event_ioctl(ack_event_t event); + +/* should be called after "uspt_batch_tracking_save", + * "uspt_batch_tracking_handle_retrack" and any future custom logic + * for an event is processed */ +void uspt_batch_tracking_inc_event_idx(void); +int uspt_batch_tracking_start(int tracking_type, uint64_t expected_events, int perf_cpu, bool retrack); +int uspt_batch_tracking_save(uint64_t faulted_gpa, uint32_t error_code, bool have_rip, uint64_t rip); +uint64_t uspt_batch_tracking_get_events_count(void); + +/* Stops batch tracking on copies the first @len events into @result. 
+ * If an error occurred at some point during the batch tracking,
+ * error_occured is set (there should also be a dmesg entry, but this allows programmatic access).
+ * The caller can use uspt_batch_tracking_get_events_count() to determine the amount
+ * of memory they should allocate for @results */
+int uspt_batch_tracking_stop(page_fault_event_t *results, uint64_t len, bool *error_occured);
+void uspt_batch_tracking_handle_retrack(struct kvm_vcpu *vcpu, uint64_t current_fault_gfn);
+void uspt_batch_tracking_get_retrack_gfns(uint64_t **gfns, uint64_t *len, int *tracking_type);
+bool uspt_batch_tracking_in_progress(void);
-- cgit v1.2.3-71-gd317
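
For reference, a minimal userspace sketch of how the single-event interface exported through uapi.h might be driven. This is an illustration only: it assumes the ioctls are issued on the main /dev/kvm file descriptor (suggested by the KVMIO ioctl numbers), that the handler return codes (KVM_USPT_POLL_EVENT_NO_EVENT / 0) are passed straight back to userspace, that uapi.h compiles in userspace together with <stdbool.h>/<stdint.h>, and that the numeric track_mode below is a placeholder for the desired kvm_page_track_mode value.

#include <fcntl.h>
#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>

#include "uapi.h" /* ioctl numbers, page_fault_event_t, ack_event_t, ... */

int main(void)
{
	int kvm_fd; /* assumption: the ioctls are served by /dev/kvm */
	userspace_ctx_t ctx = { .pid = getpid(), .get_rip = true };
	track_all_pages_t track = { .track_mode = 0 /* placeholder kvm_page_track_mode */ };
	page_fault_event_t event;
	ack_event_t ack;

	kvm_fd = open("/dev/kvm", O_RDWR);
	if (kvm_fd < 0)
		return 1;

	/* register this process as the event consumer, then track all guest pages */
	if (ioctl(kvm_fd, KVM_USPT_REGISTER_PID, &ctx) < 0)
		return 1;
	if (ioctl(kvm_fd, KVM_USPT_TRACK_ALL, &track) < 0)
		return 1;

	for (;;) {
		/* poll until the vCPU side publishes an event via uspt_send_and_block() */
		if (ioctl(kvm_fd, KVM_USPT_POLL_EVENT, &event) ==
		    KVM_USPT_POLL_EVENT_NO_EVENT) {
			usleep(100);
			continue;
		}

		printf("fault: gpa=0x%" PRIx64 " err=0x%" PRIx32 " rip=0x%" PRIx64 "\n",
		       event.faulted_gpa, event.error_code,
		       event.have_rip_info ? event.rip : 0);

		/* ack the event so the vCPU thread stops spinning in uspt_send_and_block() */
		ack.id = event.id;
		ioctl(kvm_fd, KVM_USPT_ACK_EVENT, &ack);
	}
}

The batch interface (KVM_USPT_BATCH_TRACK_START/_STOP) avoids this per-fault round trip: events are buffered kernel-side in the vmalloc'd array and copied out in one go when tracking is stopped.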