#pragma once

#include "asm.h"
#include "cache_types.h"
#include "util.h"
#include "cachepc_user.h"

void cachepc_init_pmc(uint8_t index, uint8_t event_no, uint8_t event_mask);

cache_ctx *cachepc_get_ctx(cache_level cl);
void cachepc_release_ctx(cache_ctx *ctx);

cacheline *cachepc_prepare_ds(cache_ctx *ctx);
void cachepc_release_ds(cache_ctx *ctx, cacheline *ds);

cacheline *cachepc_prepare_victim(cache_ctx *ctx, uint32_t set);
void cachepc_release_victim(cache_ctx *ctx, cacheline *ptr);

void cachepc_save_msrmts(cacheline *head);
void cachepc_print_msrmts(cacheline *head);

__attribute__((always_inline))
static inline cacheline *cachepc_prime(cacheline *head);

__attribute__((always_inline))
static inline cacheline *cachepc_prime_rev(cacheline *head);

__attribute__((always_inline))
static inline cacheline *cachepc_probe(cacheline *head);

__attribute__((always_inline))
static inline void cachepc_victim(void *p);

__attribute__((always_inline))
static inline uint64_t cachepc_read_pmc(uint64_t event);

extern uint16_t *cachepc_msrmts;
extern size_t cachepc_msrmts_count;

extern cache_ctx *cachepc_ctx;
extern cacheline *cachepc_ds;

/*
 * Prime phase: fill the target cache (encoded in the size of the data
 * structure) with the prepared data structure, i.e. with attacker data.
 */
cacheline *
cachepc_prime(cacheline *head)
{
	cacheline *curr_cl;

	cachepc_cpuid();

	curr_cl = head;
	do {
		curr_cl = curr_cl->next;
		cachepc_mfence();
	} while (curr_cl != head);

	cachepc_cpuid();

	return curr_cl->prev;
}

/*
 * Same as prime, but in the reverse direction, i.e. the same direction that
 * probe uses. This is beneficial for the following scenarios:
 * - L1:
 *   - Trigger a collision chain-reaction to amplify an evicted set (but this
 *     has the downside of noisier measurements).
 * - L2:
 *   - Always use this for L2, otherwise the first cache sets will still
 *     reside in L1 unless the victim filled L1 completely. In that case an
 *     eviction randomly has one of the following effects, depending on where
 *     the cache set is placed in the randomised data structure:
 *     A) An evicted set is L2_ACCESS_TIME - L1_ACCESS_TIME slower
 *     B) An evicted set is L3_ACCESS_TIME - L2_ACCESS_TIME slower
 */
cacheline *
cachepc_prime_rev(cacheline *head)
{
	cacheline *curr_cl;

	cachepc_cpuid();

	curr_cl = head;
	do {
		curr_cl = curr_cl->prev;
		cachepc_mfence();
	} while (curr_cl != head);

	cachepc_cpuid();

	return curr_cl->prev;
}

/*
 * Probe phase: chase pointers through the prepared data structure eight
 * cachelines at a time and store the PMC delta observed for each group
 * in curr_cl->count.
 */
cacheline *
cachepc_probe(cacheline *start_cl)
{
	uint64_t pre, post;
	cacheline *next_cl;
	cacheline *curr_cl;
	volatile register uint64_t i asm("r12"); /* pinned to r12, not otherwise used */

	curr_cl = start_cl;

	do {
		pre = cachepc_read_pmc(0);
		pre += cachepc_read_pmc(1);

		cachepc_mfence();
		cachepc_cpuid();

		asm volatile(
			"mov 8(%[curr_cl]), %%rax \n\t"              // +8
			"mov 8(%%rax), %%rcx \n\t"                   // +16
			"mov 8(%%rcx), %%rax \n\t"                   // +24
			"mov 8(%%rax), %%rcx \n\t"                   // +32
			"mov 8(%%rcx), %%rax \n\t"                   // +40
			"mov 8(%%rax), %%rcx \n\t"                   // +48
			"mov 8(%%rcx), %[curr_cl_out] \n\t"          // +56
			"mov 8(%[curr_cl_out]), %[next_cl_out] \n\t" // +64
			: [next_cl_out] "=r" (next_cl),
			  [curr_cl_out] "=r" (curr_cl)
			: [curr_cl] "r" (curr_cl)
			: "rax", "rcx"
		);

		cachepc_mfence();
		cachepc_cpuid();

		post = cachepc_read_pmc(0);
		post += cachepc_read_pmc(1);

		cachepc_mfence();
		cachepc_cpuid();

		/* works across size boundary */
		curr_cl->count = post - pre;

		curr_cl = next_cl;
	} while (__builtin_expect(curr_cl != start_cl, 1));

	return curr_cl->next;
}

void
cachepc_victim(void *p)
{
	cachepc_cpuid();
	cachepc_mfence();

	cachepc_readq(p);
}
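/*
 * Example (illustrative sketch, not part of the original interface): one
 * prime+probe round over the prepared eviction data structure. Assumes
 * cachepc_ctx and cachepc_ds were set up via cachepc_get_ctx() and
 * cachepc_prepare_ds(). The set index (15) and the victim access via
 * cachepc_prepare_victim()/cachepc_victim() are placeholder choices; any
 * access that touches a monitored set would do.
 */
static inline void
cachepc_example_round(void)
{
	cacheline *head;
	cacheline *victim;

	victim = cachepc_prepare_victim(cachepc_ctx, 15);

	head = cachepc_prime(cachepc_ds); /* fill cache with attacker lines */
	cachepc_victim(victim);           /* victim access evicts one of them */
	head = cachepc_probe(head);       /* per-group PMC deltas -> count */

	cachepc_save_msrmts(head);        /* keep the counts for readout */
	cachepc_release_victim(cachepc_ctx, victim);
}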
uint64_t
cachepc_read_pmc(uint64_t event)
{
	uint32_t lo, hi;

	/* core performance counter MSRs on AMD (PERF_CTR):
	 * 0xC0010201 + 2 * index */
	event = 0xC0010201 + 2 * event;

	asm volatile (
		"rdmsr"
		: "=a" (lo), "=d" (hi)
		: "c" (event)
	);

	return ((uint64_t) hi << 32) | (uint64_t) lo;
}
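/*
 * Sketch (assumption, not the original cachepc_init_pmc, whose definition
 * lives elsewhere): programming the PERF_CTL register that pairs with one of
 * the counters read above. The PERF_CTL MSRs sit at 0xC0010200 + 2 * index,
 * next to the PERF_CTR MSRs used by cachepc_read_pmc(). Which filter bits
 * (user/OS, host/guest) the real implementation sets is not shown here.
 */
static inline void
cachepc_example_init_pmc(uint8_t index, uint8_t event_no, uint8_t event_mask)
{
	uint64_t ctl;
	uint32_t reg;

	reg = 0xC0010200 + 2 * index;

	ctl = (uint64_t) event_no           /* event select, bits 7:0 */
		| ((uint64_t) event_mask << 8)  /* unit mask, bits 15:8 */
		| (1ULL << 17)                  /* count in OS mode (illustrative) */
		| (1ULL << 22);                 /* enable the counter */

	asm volatile (
		"wrmsr"
		: : "c" (reg), "a" ((uint32_t) ctl), "d" ((uint32_t) (ctl >> 32))
	);
}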