cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

mperf_monitor.c (10488B)


// SPDX-License-Identifier: GPL-2.0-only
/*
 *  (C) 2010,2011       Thomas Renninger <trenn@suse.de>, Novell Inc.
 */

#if defined(__i386__) || defined(__x86_64__)

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>

#include <cpufreq.h>

#include "helpers/helpers.h"
#include "idle_monitor/cpupower-monitor.h"

#define MSR_APERF	0xE8
#define MSR_MPERF	0xE7

#define RDPRU ".byte 0x0f, 0x01, 0xfd"
#define RDPRU_ECX_MPERF	0
#define RDPRU_ECX_APERF	1
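
/*
 * Background on the defines above: RDPRU ("read processor register at
 * user level") is an AMD instruction, available from Zen 2 on, encoded
 * as the bytes 0f 01 fd. It returns the register selected by ECX in
 * EDX:EAX without requiring ring 0; selector 0 reads MPERF and
 * selector 1 reads APERF, matching the two RDPRU_ECX_* defines.
 */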

#define MSR_TSC	0x10

#define MSR_AMD_HWCR 0xc0010015

enum mperf_id { C0 = 0, Cx, AVG_FREQ, MPERF_CSTATE_COUNT };

static int mperf_get_count_percent(unsigned int self_id, double *percent,
				   unsigned int cpu);
static int mperf_get_count_freq(unsigned int id, unsigned long long *count,
				unsigned int cpu);
static struct timespec time_start, time_end;

static cstate_t mperf_cstates[MPERF_CSTATE_COUNT] = {
	{
		.name			= "C0",
		.desc			= N_("Processor Core not idle"),
		.id			= C0,
		.range			= RANGE_THREAD,
		.get_count_percent	= mperf_get_count_percent,
	},
	{
		.name			= "Cx",
		.desc			= N_("Processor Core in an idle state"),
		.id			= Cx,
		.range			= RANGE_THREAD,
		.get_count_percent	= mperf_get_count_percent,
	},

	{
		.name			= "Freq",
		.desc			= N_("Average Frequency (including boost) in MHz"),
		.id			= AVG_FREQ,
		.range			= RANGE_THREAD,
		.get_count		= mperf_get_count_freq,
	},
};

enum MAX_FREQ_MODE { MAX_FREQ_SYSFS, MAX_FREQ_TSC_REF };
static int max_freq_mode;
/*
 * The max frequency MPERF is ticking at (in C0), retrieved either via:
 *   1) calculation after the measurements, if we know the TSC ticks at
 *      the MPERF/P0 frequency
 *   2) cpufreq's /sys/devices/.../cpu0/cpufreq/cpuinfo_max_freq at init time
 * 1) is preferred, as it also works without the cpufreq subsystem (e.g. on Xen)
 */
static unsigned long max_frequency;

static unsigned long long tsc_at_measure_start;
static unsigned long long tsc_at_measure_end;
static unsigned long long *mperf_previous_count;
static unsigned long long *aperf_previous_count;
static unsigned long long *mperf_current_count;
static unsigned long long *aperf_current_count;

/* valid flag for all CPUs. If an MSR read failed it will be zero */
static int *is_valid;

static int mperf_get_tsc(unsigned long long *tsc)
{
	int ret;

	ret = read_msr(base_cpu, MSR_TSC, tsc);
	if (ret)
		dprint("Reading TSC MSR failed, returning %llu\n", *tsc);
	return ret;
}

static int get_aperf_mperf(int cpu, unsigned long long *aval,
				    unsigned long long *mval)
{
	unsigned long low_a, high_a;
	unsigned long low_m, high_m;
	int ret;

	/*
	 * Running on the cpu from which we read the registers will
	 * prevent APERF/MPERF from going out of sync because of IPI
	 * latency introduced by read_msr()s.
	 */
	if (mperf_monitor.flags.per_cpu_schedule) {
		if (bind_cpu(cpu))
			return 1;
	}

	if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_RDPRU) {
		/* RDPRU returns the register selected by ECX in EDX:EAX */
		asm volatile(RDPRU
			     : "=a" (low_a), "=d" (high_a)
			     : "c" (RDPRU_ECX_APERF));
		asm volatile(RDPRU
			     : "=a" (low_m), "=d" (high_m)
			     : "c" (RDPRU_ECX_MPERF));

		/* combine the EDX:EAX halves into 64-bit counter values */
		*aval = ((low_a) | (high_a) << 32);
		*mval = ((low_m) | (high_m) << 32);

		return 0;
	}

	ret  = read_msr(cpu, MSR_APERF, aval);
	ret |= read_msr(cpu, MSR_MPERF, mval);

	return ret;
}
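
/*
 * read_msr() in the fallback path above comes from helpers/helpers.h.
 * A minimal sketch of such a read, assuming the kernel msr driver's
 * /dev/cpu/<cpu>/msr interface (illustrative only, not the actual
 * helper implementation):
 *
 *	char path[32];
 *	int fd;
 *
 *	snprintf(path, sizeof(path), "/dev/cpu/%d/msr", cpu);
 *	fd = open(path, O_RDONLY);
 *	if (fd < 0)
 *		return -1;
 *	if (pread(fd, val, sizeof(*val), reg) != sizeof(*val)) {
 *		close(fd);
 *		return -1;
 *	}
 *	close(fd);
 *	return 0;
 */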

static int mperf_init_stats(unsigned int cpu)
{
	unsigned long long aval, mval;
	int ret;

	ret = get_aperf_mperf(cpu, &aval, &mval);
	aperf_previous_count[cpu] = aval;
	mperf_previous_count[cpu] = mval;
	is_valid[cpu] = !ret;

	return 0;
}

static int mperf_measure_stats(unsigned int cpu)
{
	unsigned long long aval, mval;
	int ret;

	ret = get_aperf_mperf(cpu, &aval, &mval);
	aperf_current_count[cpu] = aval;
	mperf_current_count[cpu] = mval;
	is_valid[cpu] = !ret;

	return 0;
}

static int mperf_get_count_percent(unsigned int id, double *percent,
				   unsigned int cpu)
{
	unsigned long long aperf_diff, mperf_diff, tsc_diff;
	unsigned long long timediff;

	if (!is_valid[cpu])
		return -1;

	if (id != C0 && id != Cx)
		return -1;

	mperf_diff = mperf_current_count[cpu] - mperf_previous_count[cpu];
	aperf_diff = aperf_current_count[cpu] - aperf_previous_count[cpu];

	if (max_freq_mode == MAX_FREQ_TSC_REF) {
		tsc_diff = tsc_at_measure_end - tsc_at_measure_start;
		*percent = 100.0 * mperf_diff / tsc_diff;
		dprint("%s: TSC Ref - mperf_diff: %llu, tsc_diff: %llu\n",
		       mperf_cstates[id].name, mperf_diff, tsc_diff);
	} else if (max_freq_mode == MAX_FREQ_SYSFS) {
		timediff = max_frequency * timespec_diff_us(time_start, time_end);
		*percent = 100.0 * mperf_diff / timediff;
		dprint("%s: MAXFREQ - mperf_diff: %llu, time_diff: %llu\n",
		       mperf_cstates[id].name, mperf_diff, timediff);
	} else
		return -1;

	if (id == Cx)
		*percent = 100.0 - *percent;

	dprint("%s: mperf_diff: %llu - aperf_diff: %llu - (cpu %u)\n",
		mperf_cstates[id].name, mperf_diff, aperf_diff, cpu);
	dprint("%s: %f\n", mperf_cstates[id].name, *percent);
	return 0;
}
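
/*
 * Worked example for the percentages above (illustrative numbers): over
 * a 1 s interval against a 2000 MHz reference, tsc_diff is roughly
 * 2000M ticks. If mperf_diff is 600M ticks, the core was un-halted for
 * C0 = 100.0 * 600M / 2000M = 30% of the interval, and Cx = 70%.
 */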

static int mperf_get_count_freq(unsigned int id, unsigned long long *count,
				unsigned int cpu)
{
	unsigned long long aperf_diff, mperf_diff, time_diff, tsc_diff;

	if (id != AVG_FREQ)
		return 1;

	if (!is_valid[cpu])
		return -1;

	mperf_diff = mperf_current_count[cpu] - mperf_previous_count[cpu];
	aperf_diff = aperf_current_count[cpu] - aperf_previous_count[cpu];

	if (max_freq_mode == MAX_FREQ_TSC_REF) {
		/* Calculate max_freq from TSC count */
		tsc_diff = tsc_at_measure_end - tsc_at_measure_start;
		time_diff = timespec_diff_us(time_start, time_end);
		max_frequency = tsc_diff / time_diff;
	}

	*count = max_frequency * ((double)aperf_diff / mperf_diff);
	dprint("%s: Average freq based on %s maximum frequency:\n",
	       mperf_cstates[id].name,
	       (max_freq_mode == MAX_FREQ_TSC_REF) ? "TSC calculated" : "sysfs read");
	dprint("max_frequency: %lu\n", max_frequency);
	dprint("aperf_diff: %llu\n", aperf_diff);
	dprint("mperf_diff: %llu\n", mperf_diff);
	dprint("avg freq:   %llu\n", *count);
	return 0;
}
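
/*
 * Worked example for the average frequency above (illustrative numbers):
 * with max_frequency = 2400 MHz and aperf_diff / mperf_diff = 1.25 (the
 * core boosted above P0 while not idle), the reported average is
 * 2400 * 1.25 = 3000 MHz.
 */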

static int mperf_start(void)
{
	int cpu;
	unsigned long long dbg;

	clock_gettime(CLOCK_REALTIME, &time_start);
	mperf_get_tsc(&tsc_at_measure_start);

	for (cpu = 0; cpu < cpu_count; cpu++)
		mperf_init_stats(cpu);

	mperf_get_tsc(&dbg);
	dprint("TSC diff: %llu\n", dbg - tsc_at_measure_start);
	return 0;
}

static int mperf_stop(void)
{
	unsigned long long dbg;
	int cpu;

	for (cpu = 0; cpu < cpu_count; cpu++)
		mperf_measure_stats(cpu);

	mperf_get_tsc(&tsc_at_measure_end);
	clock_gettime(CLOCK_REALTIME, &time_end);

	mperf_get_tsc(&dbg);
	dprint("TSC diff: %llu\n", dbg - tsc_at_measure_end);

	return 0;
}

/*
 * The MPERF register is defined to tick at the P0 (maximum) frequency.
 *
 * Instead of reading out P0, which can be tricky to obtain from HW,
 * we use the TSC counter if it reliably ticks at the P0/MPERF frequency.
 *
 * Still try to fall back to:
 * /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq
 * on older Intel HW without the invariant TSC feature,
 * or on AMD machines where the TSC does not tick at P0 (these do not
 * exist yet, but it is still double-checked via MSR_AMD_HWCR).
 *
 * On these machines the user would still get useful mperf
 * stats when the acpi-cpufreq driver is loaded.
 */
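/*
 * Note on the HWCR check below: bit 24 of MSR_AMD_HWCR is the TscFreqSel
 * bit; when set, the TSC increments at the P0 frequency and can therefore
 * serve as the MPERF reference.
 */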
static int init_maxfreq_mode(void)
{
	int ret;
	unsigned long long hwcr;
	unsigned long min;

	if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_INV_TSC))
		goto use_sysfs;

	if (cpupower_cpu_info.vendor == X86_VENDOR_AMD ||
	    cpupower_cpu_info.vendor == X86_VENDOR_HYGON) {
		/* MSR_AMD_HWCR tells us whether TSC runs at P0/mperf
		 * freq.
		 * A test whether hwcr is accessible/available would be:
		 * (cpupower_cpu_info.family > 0x10 ||
		 *   (cpupower_cpu_info.family == 0x10 &&
		 *    cpupower_cpu_info.model >= 0x2))
		 * This should be the case for all aperf/mperf
		 * capable AMD machines and is therefore safe to test here.
		 * Compare with Linux kernel git commit: acf01734b1747b1ec4
		 */
		ret = read_msr(0, MSR_AMD_HWCR, &hwcr);
		/*
		 * If the MSR read failed, assume a Xen system that did
		 * not explicitly provide access to it, and assume TSC works
		 */
		if (ret != 0) {
			dprint("HWCR read 0x%x failed - assume TSC working\n",
			       MSR_AMD_HWCR);
			return 0;
		} else if (1 & (hwcr >> 24)) {
			max_freq_mode = MAX_FREQ_TSC_REF;
			return 0;
		} else { /* Use sysfs max frequency if available */ }
	} else if (cpupower_cpu_info.vendor == X86_VENDOR_INTEL) {
		/*
		 * On Intel we assume mperf (in C0) is ticking at the same
		 * rate as the TSC
		 */
		max_freq_mode = MAX_FREQ_TSC_REF;
		return 0;
	}
use_sysfs:
	if (cpufreq_get_hardware_limits(0, &min, &max_frequency)) {
		dprint("Cannot retrieve max freq from cpufreq kernel "
		       "subsystem\n");
		return -1;
	}
	max_freq_mode = MAX_FREQ_SYSFS;
	max_frequency /= 1000; /* Default automatically to MHz value */
	return 0;
}

/*
 * This monitor provides:
 *
 * 1) The average frequency a CPU ran at
 *    This always works if the CPU has aperf/mperf capabilities.
 *
 * 2) The C0 and Cx (any sleep state) time a CPU resided in
 *    Works if the mperf timer stops ticking in sleep states, which
 *    seems to be the case on all current HW.
 *
 * Both are retrieved directly from HW registers and are independent
 * of kernel statistics.
 */
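/*
 * Example invocation once this monitor is registered (it is selected by
 * the name set in mperf_monitor below):
 *
 *	cpupower monitor -m Mperf
 *
 * which reports C0/Cx residency in percent and the average frequency
 * per CPU over the measured interval.
 */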
struct cpuidle_monitor mperf_monitor;
struct cpuidle_monitor *mperf_register(void)
{
	if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_APERF))
		return NULL;

	if (init_maxfreq_mode())
		return NULL;

	if (cpupower_cpu_info.vendor == X86_VENDOR_AMD)
		mperf_monitor.flags.per_cpu_schedule = 1;

	/* Freed in mperf_unregister() at program termination */
	is_valid = calloc(cpu_count, sizeof(int));
	mperf_previous_count = calloc(cpu_count, sizeof(unsigned long long));
	aperf_previous_count = calloc(cpu_count, sizeof(unsigned long long));
	mperf_current_count = calloc(cpu_count, sizeof(unsigned long long));
	aperf_current_count = calloc(cpu_count, sizeof(unsigned long long));

	mperf_monitor.name_len = strlen(mperf_monitor.name);
	return &mperf_monitor;
}

void mperf_unregister(void)
{
	free(mperf_previous_count);
	free(aperf_previous_count);
	free(mperf_current_count);
	free(aperf_current_count);
	free(is_valid);
}

struct cpuidle_monitor mperf_monitor = {
	.name			= "Mperf",
	.hw_states_num		= MPERF_CSTATE_COUNT,
	.hw_states		= mperf_cstates,
	.start			= mperf_start,
	.stop			= mperf_stop,
	.do_register		= mperf_register,
	.unregister		= mperf_unregister,
	.flags.needs_root	= 1,
	.overflow_s		= 922000000 /* 922337203 seconds TSC overflow
					       at 20GHz */
};
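
/*
 * The overflow_s value above is the time until a 64-bit counter wraps
 * at an assumed worst-case tick rate of 20 GHz:
 * 2^64 / 20e9 Hz = ~922337203 s, rounded down to 922000000.
 */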
#endif /* #if defined(__i386__) || defined(__x86_64__) */