cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

rapl.c (23937B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * Support Intel/AMD RAPL energy consumption counters
      4 * Copyright (C) 2013 Google, Inc., Stephane Eranian
      5 *
      6 * Intel RAPL interface is specified in the IA-32 Manual Vol3b
      7 * section 14.7.1 (September 2013)
      8 *
      9 * AMD RAPL interface for Fam17h is described in the public PPR:
     10 * https://bugzilla.kernel.org/show_bug.cgi?id=206537
     11 *
     12 * RAPL provides more controls than just reporting energy consumption
     13 * however here we only expose the energy consumption free running
     14 * counters (pp0, pkg, dram, gpu and psys).
     15 *
     16 * Each of those counters increments in a power unit defined by the
     17 * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
     18 * but it can vary.
     19 *
     20 * Counter to rapl events mappings:
     21 *
     22 *  pp0 counter: consumption of all physical cores (power plane 0)
     23 * 	  event: rapl_energy_cores
     24 *    perf code: 0x1
     25 *
     26 *  pkg counter: consumption of the whole processor package
     27 *	  event: rapl_energy_pkg
     28 *    perf code: 0x2
     29 *
     30 * dram counter: consumption of the dram domain (servers only)
     31 *	  event: rapl_energy_dram
     32 *    perf code: 0x3
     33 *
     34 * gpu counter: consumption of the builtin-gpu domain (client only)
     35 *	  event: rapl_energy_gpu
     36 *    perf code: 0x4
     37 *
     38 *  psys counter: consumption of the builtin-psys domain (client only)
     39 *	  event: rapl_energy_psys
     40 *    perf code: 0x5
     41 *
     42 * We manage those counters as free running (read-only). They may be
     43 * used simultaneously by other tools, such as turbostat.
     44 *
     45 * The events only support system-wide mode counting. There is no
     46 * sampling support because it does not make sense and is not
     47 * supported by the RAPL hardware.
     48 *
     49 * Because we want to avoid floating-point operations in the kernel,
     50 * the events are all reported in fixed point arithmetic (32.32).
     51 * Tools must adjust the counts to convert them to Watts using
     52 * the duration of the measurement. Tools may use a function such as
     53 * ldexp(raw_count, -32);
     54 */
     55
     56#define pr_fmt(fmt) "RAPL PMU: " fmt
     57
     58#include <linux/module.h>
     59#include <linux/slab.h>
     60#include <linux/perf_event.h>
     61#include <linux/nospec.h>
     62#include <asm/cpu_device_id.h>
     63#include <asm/intel-family.h>
     64#include "perf_event.h"
     65#include "probe.h"
     66
     67MODULE_LICENSE("GPL");
     68
     69/*
     70 * RAPL energy status counters
     71 */
enum perf_rapl_events {
	PERF_RAPL_PP0 = 0,		/* all cores */
	PERF_RAPL_PKG,			/* entire package */
	PERF_RAPL_RAM,			/* DRAM */
	PERF_RAPL_PP1,			/* gpu */
	PERF_RAPL_PSYS,			/* psys */

	PERF_RAPL_MAX,
	NR_RAPL_DOMAINS = PERF_RAPL_MAX,
};

/* Human-readable domain names, indexed by enum perf_rapl_events (init-only). */
static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
	"pp0-core",
	"package",
	"dram",
	"pp1-gpu",
	"psys",
};
     90
     91/*
     92 * event code: LSB 8 bits, passed in attr->config
     93 * any other bit is reserved
     94 */
#define RAPL_EVENT_MASK	0xFFULL
#define RAPL_CNTR_WIDTH 32

/* Declare a read-only sysfs event attribute carrying a fixed string. */
#define RAPL_EVENT_ATTR_STR(_name, v, str)					\
static struct perf_pmu_events_attr event_attr_##v = {				\
	.attr		= __ATTR(_name, 0444, perf_event_sysfs_show, NULL),	\
	.id		= 0,							\
	.event_str	= str,							\
};

/* Per-die PMU instance: one designated CPU reads the die's energy MSRs. */
struct rapl_pmu {
	raw_spinlock_t		lock;		/* protects n_active and active_list */
	int			n_active;	/* number of started events */
	int			cpu;		/* CPU that reads the MSRs for this die */
	struct list_head	active_list;	/* started events, walked by the hrtimer */
	struct pmu		*pmu;		/* back-pointer to rapl_pmus->pmu */
	ktime_t			timer_interval;	/* polling period to avoid counter wrap */
	struct hrtimer		hrtimer;	/* periodic update timer */
};

/* Global PMU plus per-die instances, sized at init for the max die count. */
struct rapl_pmus {
	struct pmu		pmu;
	unsigned int		maxdie;
	struct rapl_pmu		*pmus[];
};
    120
enum rapl_unit_quirk {
	RAPL_UNIT_QUIRK_NONE,
	RAPL_UNIT_QUIRK_INTEL_HSW,	/* fixed DRAM energy unit (see rapl_check_hw_unit) */
	RAPL_UNIT_QUIRK_INTEL_SPR,	/* fixed DRAM and Psys energy units */
};

/* Per-CPU-model description of available RAPL domains and their MSRs. */
struct rapl_model {
	struct perf_msr *rapl_msrs;	/* MSR table, indexed by perf_rapl_events */
	unsigned long	events;		/* bitmask of supported PERF_RAPL_* domains */
	unsigned int	msr_power_unit;	/* MSR holding the energy unit bits */
	enum rapl_unit_quirk	unit_quirk;
};

 /* 1/2^hw_unit Joule */
static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
static struct rapl_pmus *rapl_pmus;	/* global + per-die PMU state */
static cpumask_t rapl_cpu_mask;		/* one reader CPU per die */
static unsigned int rapl_cntr_mask;	/* bitmask of successfully probed domains */
static u64 rapl_timer_ms;		/* hrtimer period in milliseconds */
static struct perf_msr *rapl_msrs;	/* MSR table of the matched model */
    141
    142static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
    143{
    144	unsigned int dieid = topology_logical_die_id(cpu);
    145
    146	/*
    147	 * The unsigned check also catches the '-1' return value for non
    148	 * existent mappings in the topology map.
    149	 */
    150	return dieid < rapl_pmus->maxdie ? rapl_pmus->pmus[dieid] : NULL;
    151}
    152
    153static inline u64 rapl_read_counter(struct perf_event *event)
    154{
    155	u64 raw;
    156	rdmsrl(event->hw.event_base, raw);
    157	return raw;
    158}
    159
    160static inline u64 rapl_scale(u64 v, int cfg)
    161{
    162	if (cfg > NR_RAPL_DOMAINS) {
    163		pr_warn("Invalid domain %d, failed to scale data\n", cfg);
    164		return v;
    165	}
    166	/*
    167	 * scale delta to smallest unit (1/2^32)
    168	 * users must then scale back: count * 1/(1e9*2^32) to get Joules
    169	 * or use ldexp(count, -32).
    170	 * Watts = Joules/Time delta
    171	 */
    172	return v << (32 - rapl_hw_unit[cfg - 1]);
    173}
    174
/* Fold the MSR delta since the last read into event->count. */
static u64 rapl_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	u64 prev_raw_count, new_raw_count;
	s64 delta, sdelta;
	int shift = RAPL_CNTR_WIDTH;

	/*
	 * Publish the fresh MSR value as the new prev_count; the cmpxchg
	 * loop retries if another updater (e.g. the hrtimer callback)
	 * raced in between the read and the exchange.
	 */
again:
	prev_raw_count = local64_read(&hwc->prev_count);
	rdmsrl(event->hw.event_base, new_raw_count);

	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
			    new_raw_count) != prev_raw_count) {
		cpu_relax();
		goto again;
	}

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	/* Convert from hardware units to the common 2^-32 Joule unit. */
	sdelta = rapl_scale(delta, event->hw.config);

	local64_add(sdelta, &event->count);

	return new_raw_count;
}
    209
    210static void rapl_start_hrtimer(struct rapl_pmu *pmu)
    211{
    212       hrtimer_start(&pmu->hrtimer, pmu->timer_interval,
    213		     HRTIMER_MODE_REL_PINNED);
    214}
    215
/* Periodic callback: refresh all active events so counters never wrap unnoticed. */
static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
{
	struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
	struct perf_event *event;
	unsigned long flags;

	/* No active events left: let the timer die. */
	if (!pmu->n_active)
		return HRTIMER_NORESTART;

	raw_spin_lock_irqsave(&pmu->lock, flags);

	list_for_each_entry(event, &pmu->active_list, active_entry)
		rapl_event_update(event);

	raw_spin_unlock_irqrestore(&pmu->lock, flags);

	hrtimer_forward_now(hrtimer, pmu->timer_interval);

	return HRTIMER_RESTART;
}
    236
    237static void rapl_hrtimer_init(struct rapl_pmu *pmu)
    238{
    239	struct hrtimer *hr = &pmu->hrtimer;
    240
    241	hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
    242	hr->function = rapl_hrtimer_handle;
    243}
    244
/* Start @event on @pmu. Caller must hold pmu->lock. */
static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
				   struct perf_event *event)
{
	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
		return;

	event->hw.state = 0;

	list_add_tail(&event->active_entry, &pmu->active_list);

	/* Baseline the counter so the next update yields a clean delta. */
	local64_set(&event->hw.prev_count, rapl_read_counter(event));

	pmu->n_active++;
	/* First active event arms the overflow-avoidance timer. */
	if (pmu->n_active == 1)
		rapl_start_hrtimer(pmu);
}
    261
    262static void rapl_pmu_event_start(struct perf_event *event, int mode)
    263{
    264	struct rapl_pmu *pmu = event->pmu_private;
    265	unsigned long flags;
    266
    267	raw_spin_lock_irqsave(&pmu->lock, flags);
    268	__rapl_pmu_event_start(pmu, event);
    269	raw_spin_unlock_irqrestore(&pmu->lock, flags);
    270}
    271
/* pmu::stop callback: deactivate @event; PERF_EF_UPDATE drains the final delta. */
static void rapl_pmu_event_stop(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = event->pmu_private;
	struct hw_perf_event *hwc = &event->hw;
	unsigned long flags;

	raw_spin_lock_irqsave(&pmu->lock, flags);

	/* mark event as deactivated and stopped */
	if (!(hwc->state & PERF_HES_STOPPED)) {
		WARN_ON_ONCE(pmu->n_active <= 0);
		pmu->n_active--;
		/* Last active event: stop polling the MSRs. */
		if (pmu->n_active == 0)
			hrtimer_cancel(&pmu->hrtimer);

		list_del(&event->active_entry);

		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
		hwc->state |= PERF_HES_STOPPED;
	}

	/* check if update of sw counter is necessary */
	if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		/*
		 * Drain the remaining delta count out of a event
		 * that we are disabling:
		 */
		rapl_event_update(event);
		hwc->state |= PERF_HES_UPTODATE;
	}

	raw_spin_unlock_irqrestore(&pmu->lock, flags);
}
    305
    306static int rapl_pmu_event_add(struct perf_event *event, int mode)
    307{
    308	struct rapl_pmu *pmu = event->pmu_private;
    309	struct hw_perf_event *hwc = &event->hw;
    310	unsigned long flags;
    311
    312	raw_spin_lock_irqsave(&pmu->lock, flags);
    313
    314	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
    315
    316	if (mode & PERF_EF_START)
    317		__rapl_pmu_event_start(pmu, event);
    318
    319	raw_spin_unlock_irqrestore(&pmu->lock, flags);
    320
    321	return 0;
    322}
    323
    324static void rapl_pmu_event_del(struct perf_event *event, int flags)
    325{
    326	rapl_pmu_event_stop(event, PERF_EF_UPDATE);
    327}
    328
/* pmu::event_init callback: validate attr.config and bind the event to its die. */
static int rapl_pmu_event_init(struct perf_event *event)
{
	u64 cfg = event->attr.config & RAPL_EVENT_MASK;
	int bit, ret = 0;
	struct rapl_pmu *pmu;

	/* only look at RAPL events */
	if (event->attr.type != rapl_pmus->pmu.type)
		return -ENOENT;

	/* check only supported bits are set */
	if (event->attr.config & ~RAPL_EVENT_MASK)
		return -EINVAL;

	/* System-wide counting only: a target CPU is mandatory. */
	if (event->cpu < 0)
		return -EINVAL;

	event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;

	/* Event codes are 1-based: 1..NR_RAPL_DOMAINS. */
	if (!cfg || cfg >= NR_RAPL_DOMAINS + 1)
		return -EINVAL;

	/* Clamp under speculation before cfg is used as an array index. */
	cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1);
	bit = cfg - 1;

	/* check event supported */
	if (!(rapl_cntr_mask & (1 << bit)))
		return -EINVAL;

	/* unsupported modes and filters */
	if (event->attr.sample_period) /* no sampling */
		return -EINVAL;

	/* must be done before validate_group */
	pmu = cpu_to_rapl_pmu(event->cpu);
	if (!pmu)
		return -EINVAL;
	/* Redirect the event to the die's designated reader CPU. */
	event->cpu = pmu->cpu;
	event->pmu_private = pmu;
	event->hw.event_base = rapl_msrs[bit].msr;
	event->hw.config = cfg;
	event->hw.idx = bit;

	return ret;
}
    374
    375static void rapl_pmu_event_read(struct perf_event *event)
    376{
    377	rapl_event_update(event);
    378}
    379
    380static ssize_t rapl_get_attr_cpumask(struct device *dev,
    381				struct device_attribute *attr, char *buf)
    382{
    383	return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
    384}
    385
static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);

static struct attribute *rapl_pmu_attrs[] = {
	&dev_attr_cpumask.attr,
	NULL,
};

static struct attribute_group rapl_pmu_attr_group = {
	.attrs = rapl_pmu_attrs,
};

/* Event name -> config code mappings, exposed under events/ in sysfs. */
RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
RAPL_EVENT_ATTR_STR(energy-pkg  ,   rapl_pkg, "event=0x02");
RAPL_EVENT_ATTR_STR(energy-ram  ,   rapl_ram, "event=0x03");
RAPL_EVENT_ATTR_STR(energy-gpu  ,   rapl_gpu, "event=0x04");
RAPL_EVENT_ATTR_STR(energy-psys,   rapl_psys, "event=0x05");

RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-pkg.unit  ,   rapl_pkg_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-ram.unit  ,   rapl_ram_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-gpu.unit  ,   rapl_gpu_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-psys.unit,   rapl_psys_unit, "Joules");

/*
 * we compute in 0.23 nJ increments regardless of MSR
 */
RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-pkg.scale,     rapl_pkg_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-ram.scale,     rapl_ram_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-gpu.scale,     rapl_gpu_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-psys.scale,   rapl_psys_scale, "2.3283064365386962890625e-10");

/*
 * There are no default events, but we need to create
 * "events" group (with empty attrs) before updating
 * it with detected events.
 */
static struct attribute *attrs_empty[] = {
	NULL,
};

static struct attribute_group rapl_pmu_events_group = {
	.name = "events",
	.attrs = attrs_empty,
};

/* attr.config bits 0-7 carry the RAPL event code (RAPL_EVENT_MASK). */
PMU_FORMAT_ATTR(event, "config:0-7");
static struct attribute *rapl_formats_attr[] = {
	&format_attr_event.attr,
	NULL,
};

static struct attribute_group rapl_pmu_format_group = {
	.name = "format",
	.attrs = rapl_formats_attr,
};

static const struct attribute_group *rapl_attr_groups[] = {
	&rapl_pmu_attr_group,
	&rapl_pmu_format_group,
	&rapl_pmu_events_group,
	NULL,
};
    449
/*
 * Per-domain "events" attribute groups; the MSR tables below reference
 * them so only the probed domains become visible in sysfs.
 */
static struct attribute *rapl_events_cores[] = {
	EVENT_PTR(rapl_cores),
	EVENT_PTR(rapl_cores_unit),
	EVENT_PTR(rapl_cores_scale),
	NULL,
};

static struct attribute_group rapl_events_cores_group = {
	.name  = "events",
	.attrs = rapl_events_cores,
};

static struct attribute *rapl_events_pkg[] = {
	EVENT_PTR(rapl_pkg),
	EVENT_PTR(rapl_pkg_unit),
	EVENT_PTR(rapl_pkg_scale),
	NULL,
};

static struct attribute_group rapl_events_pkg_group = {
	.name  = "events",
	.attrs = rapl_events_pkg,
};

static struct attribute *rapl_events_ram[] = {
	EVENT_PTR(rapl_ram),
	EVENT_PTR(rapl_ram_unit),
	EVENT_PTR(rapl_ram_scale),
	NULL,
};

static struct attribute_group rapl_events_ram_group = {
	.name  = "events",
	.attrs = rapl_events_ram,
};

static struct attribute *rapl_events_gpu[] = {
	EVENT_PTR(rapl_gpu),
	EVENT_PTR(rapl_gpu_unit),
	EVENT_PTR(rapl_gpu_scale),
	NULL,
};

static struct attribute_group rapl_events_gpu_group = {
	.name  = "events",
	.attrs = rapl_events_gpu,
};

static struct attribute *rapl_events_psys[] = {
	EVENT_PTR(rapl_psys),
	EVENT_PTR(rapl_psys_unit),
	EVENT_PTR(rapl_psys_scale),
	NULL,
};

static struct attribute_group rapl_events_psys_group = {
	.name  = "events",
	.attrs = rapl_events_psys,
};
    509
    510static bool test_msr(int idx, void *data)
    511{
    512	return test_bit(idx, (unsigned long *) data);
    513}
    514
/* Only lower 32bits of the MSR represents the energy counter */
#define RAPL_MSR_MASK 0xFFFFFFFF

static struct perf_msr intel_rapl_msrs[] = {
	[PERF_RAPL_PP0]  = { MSR_PP0_ENERGY_STATUS,      &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PKG]  = { MSR_PKG_ENERGY_STATUS,      &rapl_events_pkg_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_RAM]  = { MSR_DRAM_ENERGY_STATUS,     &rapl_events_ram_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PP1]  = { MSR_PP1_ENERGY_STATUS,      &rapl_events_gpu_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group,  test_msr, false, RAPL_MSR_MASK },
};

/*
 * Same as intel_rapl_msrs except for the psys entry, whose fourth field
 * is true — presumably skipping the probe check for psys on SPR;
 * NOTE(review): confirm against perf_msr_probe() semantics in probe.h.
 */
static struct perf_msr intel_rapl_spr_msrs[] = {
	[PERF_RAPL_PP0]  = { MSR_PP0_ENERGY_STATUS,      &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PKG]  = { MSR_PKG_ENERGY_STATUS,      &rapl_events_pkg_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_RAM]  = { MSR_DRAM_ENERGY_STATUS,     &rapl_events_ram_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PP1]  = { MSR_PP1_ENERGY_STATUS,      &rapl_events_gpu_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group,  test_msr, true, RAPL_MSR_MASK },
};

/*
 * Force to PERF_RAPL_MAX size due to:
 * - perf_msr_probe(PERF_RAPL_MAX)
 * - want to use same event codes across both architectures
 */
static struct perf_msr amd_rapl_msrs[] = {
	[PERF_RAPL_PP0]  = { 0, &rapl_events_cores_group, 0, false, 0 },
	[PERF_RAPL_PKG]  = { MSR_AMD_PKG_ENERGY_STATUS,  &rapl_events_pkg_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_RAM]  = { 0, &rapl_events_ram_group,   0, false, 0 },
	[PERF_RAPL_PP1]  = { 0, &rapl_events_gpu_group,   0, false, 0 },
	[PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group,  0, false, 0 },
};
    546
/* CPU hotplug callback: hand the die's reader role to another online CPU. */
static int rapl_cpu_offline(unsigned int cpu)
{
	struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
	int target;

	/* Check if exiting cpu is used for collecting rapl events */
	if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask))
		return 0;

	/* Invalidate until a replacement reader is found. */
	pmu->cpu = -1;
	/* Find a new cpu to collect rapl events */
	target = cpumask_any_but(topology_die_cpumask(cpu), cpu);

	/* Migrate rapl events to the new target */
	if (target < nr_cpu_ids) {
		cpumask_set_cpu(target, &rapl_cpu_mask);
		pmu->cpu = target;
		perf_pmu_migrate_context(pmu->pmu, cpu, target);
	}
	return 0;
}
    568
/* CPU hotplug callback: lazily allocate die state and pick a reader CPU. */
static int rapl_cpu_online(unsigned int cpu)
{
	struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
	int target;

	/* First CPU of a die to come online allocates the die's PMU state. */
	if (!pmu) {
		pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
		if (!pmu)
			return -ENOMEM;

		raw_spin_lock_init(&pmu->lock);
		INIT_LIST_HEAD(&pmu->active_list);
		pmu->pmu = &rapl_pmus->pmu;
		pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
		rapl_hrtimer_init(pmu);

		rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu;
	}

	/*
	 * Check if there is an online cpu in the package which collects rapl
	 * events already.
	 */
	target = cpumask_any_and(&rapl_cpu_mask, topology_die_cpumask(cpu));
	if (target < nr_cpu_ids)
		return 0;

	/* Otherwise this CPU becomes the die's designated reader. */
	cpumask_set_cpu(cpu, &rapl_cpu_mask);
	pmu->cpu = cpu;
	return 0;
}
    600
/* Read the hardware energy units, apply model quirks, derive the timer rate. */
static int rapl_check_hw_unit(struct rapl_model *rm)
{
	u64 msr_rapl_power_unit_bits;
	int i;

	/* protect rdmsrl() to handle virtualization */
	if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits))
		return -1;
	/* Energy status units: bits 12:8 of the power unit MSR. */
	for (i = 0; i < NR_RAPL_DOMAINS; i++)
		rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;

	switch (rm->unit_quirk) {
	/*
	 * DRAM domain on HSW server and KNL has fixed energy unit which can be
	 * different than the unit from power unit MSR. See
	 * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2
	 * of 2. Datasheet, September 2014, Reference Number: 330784-001 "
	 */
	case RAPL_UNIT_QUIRK_INTEL_HSW:
		rapl_hw_unit[PERF_RAPL_RAM] = 16;
		break;
	/*
	 * SPR shares the same DRAM domain energy unit as HSW, plus it
	 * also has a fixed energy unit for Psys domain.
	 */
	case RAPL_UNIT_QUIRK_INTEL_SPR:
		rapl_hw_unit[PERF_RAPL_RAM] = 16;
		rapl_hw_unit[PERF_RAPL_PSYS] = 0;
		break;
	default:
		break;
	}


	/*
	 * Calculate the timer rate:
	 * Use reference of 200W for scaling the timeout to avoid counter
	 * overflows. 200W = 200 Joules/sec
	 * Divide interval by 2 to avoid lockstep (2 * 100)
	 * if hw unit is 32, then we use 2 ms 1/200/2
	 */
	rapl_timer_ms = 2;
	if (rapl_hw_unit[0] < 32) {
		rapl_timer_ms = (1000 / (2 * 100));
		rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1));
	}
	return 0;
}
    649
    650static void __init rapl_advertise(void)
    651{
    652	int i;
    653
    654	pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
    655		hweight32(rapl_cntr_mask), rapl_timer_ms);
    656
    657	for (i = 0; i < NR_RAPL_DOMAINS; i++) {
    658		if (rapl_cntr_mask & (1 << i)) {
    659			pr_info("hw unit of domain %s 2^-%d Joules\n",
    660				rapl_domain_names[i], rapl_hw_unit[i]);
    661		}
    662	}
    663}
    664
    665static void cleanup_rapl_pmus(void)
    666{
    667	int i;
    668
    669	for (i = 0; i < rapl_pmus->maxdie; i++)
    670		kfree(rapl_pmus->pmus[i]);
    671	kfree(rapl_pmus);
    672}
    673
/* Per-domain event groups registered via pmu.attr_update at init. */
static const struct attribute_group *rapl_attr_update[] = {
	&rapl_events_cores_group,
	&rapl_events_pkg_group,
	&rapl_events_ram_group,
	&rapl_events_gpu_group,
	&rapl_events_psys_group,
	NULL,
};
    682
/* Allocate the global PMU container, sized for the maximum die count. */
static int __init init_rapl_pmus(void)
{
	int maxdie = topology_max_packages() * topology_max_die_per_package();
	size_t size;

	size = sizeof(*rapl_pmus) + maxdie * sizeof(struct rapl_pmu *);
	rapl_pmus = kzalloc(size, GFP_KERNEL);
	if (!rapl_pmus)
		return -ENOMEM;

	rapl_pmus->maxdie		= maxdie;
	rapl_pmus->pmu.attr_groups	= rapl_attr_groups;
	rapl_pmus->pmu.attr_update	= rapl_attr_update;
	/* Counting only: no per-task context. */
	rapl_pmus->pmu.task_ctx_nr	= perf_invalid_context;
	rapl_pmus->pmu.event_init	= rapl_pmu_event_init;
	rapl_pmus->pmu.add		= rapl_pmu_event_add;
	rapl_pmus->pmu.del		= rapl_pmu_event_del;
	rapl_pmus->pmu.start		= rapl_pmu_event_start;
	rapl_pmus->pmu.stop		= rapl_pmu_event_stop;
	rapl_pmus->pmu.read		= rapl_pmu_event_read;
	rapl_pmus->pmu.module		= THIS_MODULE;
	rapl_pmus->pmu.capabilities	= PERF_PMU_CAP_NO_EXCLUDE;
	return 0;
}
    707
/* Per-generation domain sets; see struct rapl_model for field meanings. */
static struct rapl_model model_snb = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_PP1),
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_snbep = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM),
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_hsw = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM) |
			  BIT(PERF_RAPL_PP1),
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

/* Server parts use the fixed-DRAM-unit quirk. */
static struct rapl_model model_hsx = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM),
	.unit_quirk	= RAPL_UNIT_QUIRK_INTEL_HSW,
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_knl = {
	.events		= BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM),
	.unit_quirk	= RAPL_UNIT_QUIRK_INTEL_HSW,
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_skl = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM) |
			  BIT(PERF_RAPL_PP1) |
			  BIT(PERF_RAPL_PSYS),
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_spr = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM) |
			  BIT(PERF_RAPL_PSYS),
	.unit_quirk	= RAPL_UNIT_QUIRK_INTEL_SPR,
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_spr_msrs,
};

/* AMD Fam17h+/Hygon expose only the package domain. */
static struct rapl_model model_amd_hygon = {
	.events		= BIT(PERF_RAPL_PKG),
	.msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
	.rapl_msrs      = amd_rapl_msrs,
};
    775
/*
 * CPU -> rapl_model mapping. AMD/Hygon parts match on the RAPL feature
 * flag; Intel parts match per family-6 model number.
 */
static const struct x86_cpu_id rapl_model_match[] __initconst = {
	X86_MATCH_FEATURE(X86_FEATURE_RAPL,		&model_amd_hygon),
	X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE,		&model_snb),
	X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X,	&model_snbep),
	X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE,		&model_snb),
	X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X,		&model_snbep),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL,		&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X,		&model_hsx),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L,		&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G,		&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL,		&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G,		&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X,		&model_hsx),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D,		&model_hsx),
	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL,	&model_knl),
	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM,	&model_knl),
	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X,		&model_hsx),
	X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(CANNONLAKE_L,	&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT,	&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D,	&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS,	&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D,		&model_hsx),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X,		&model_hsx),
	X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X,	&model_spr),
	{},
};
MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);
    813
/* Module init: match the CPU, probe domains, register the "power" PMU. */
static int __init rapl_pmu_init(void)
{
	const struct x86_cpu_id *id;
	struct rapl_model *rm;
	int ret;

	/* Bail out on CPUs without a known RAPL interface. */
	id = x86_match_cpu(rapl_model_match);
	if (!id)
		return -ENODEV;

	rm = (struct rapl_model *) id->driver_data;

	rapl_msrs = rm->rapl_msrs;

	/* Probe which of the model's advertised domains actually respond. */
	rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX,
					false, (void *) &rm->events);

	ret = rapl_check_hw_unit(rm);
	if (ret)
		return ret;

	ret = init_rapl_pmus();
	if (ret)
		return ret;

	/*
	 * Install callbacks. Core will call them for each online cpu.
	 */
	ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE,
				"perf/x86/rapl:online",
				rapl_cpu_online, rapl_cpu_offline);
	if (ret)
		goto out;

	ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
	if (ret)
		goto out1;

	rapl_advertise();
	return 0;

out1:
	cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE);
out:
	pr_warn("Initialization failed (%d), disabled\n", ret);
	cleanup_rapl_pmus();
	return ret;
}
module_init(rapl_pmu_init);
    863
/* Module exit: tear down in reverse init order. */
static void __exit intel_rapl_exit(void)
{
	cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE);
	perf_pmu_unregister(&rapl_pmus->pmu);
	cleanup_rapl_pmus();
}
module_exit(intel_rapl_exit);