cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

ibs.c (29636B)


      1/*
      2 * Performance events - AMD IBS
      3 *
      4 *  Copyright (C) 2011 Advanced Micro Devices, Inc., Robert Richter
      5 *
      6 *  For licencing details see kernel-base/COPYING
      7 */
      8
      9#include <linux/perf_event.h>
     10#include <linux/init.h>
     11#include <linux/export.h>
     12#include <linux/pci.h>
     13#include <linux/ptrace.h>
     14#include <linux/syscore_ops.h>
     15#include <linux/sched/clock.h>
     16
     17#include <asm/apic.h>
     18
     19#include "../perf_event.h"
     20
     21static u32 ibs_caps;
     22
     23#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
     24
     25#include <linux/kprobes.h>
     26#include <linux/hardirq.h>
     27
     28#include <asm/nmi.h>
     29#include <asm/amd-ibs.h>
     30
     31#define IBS_FETCH_CONFIG_MASK	(IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
     32#define IBS_OP_CONFIG_MASK	IBS_OP_MAX_CNT
     33
     34
     35/*
     36 * IBS states:
     37 *
     38 * ENABLED; tracks the pmu::add(), pmu::del() state; when set, the counter is taken
     39 * and any further add()s must fail.
     40 *
     41 * STARTED/STOPPING/STOPPED; deal with pmu::start(), pmu::stop() state but are
     42 * complicated by the fact that the IBS hardware can send late NMIs (ie. after
     43 * we've cleared the EN bit).
     44 *
     45 * In order to consume these late NMIs we have the STOPPED state; any NMI that
     46 * happens after we've cleared the EN state will clear this bit and report the
     47 * NMI handled (this is fundamentally racy in the face of multiple NMI sources;
     48 * someone else can consume our BIT and our NMI will go unhandled).
     49 *
     50 * And since we cannot set/clear this separate bit together with the EN bit,
     51 * there are races; if we cleared STARTED early, an NMI could land in
     52 * between clearing STARTED and clearing the EN bit (in fact multiple NMIs
     53 * could happen if the period is small enough), and consume our STOPPED bit
     54 * and trigger streams of unhandled NMIs.
     55 *
     56 * If, however, we clear STARTED late, an NMI can hit between clearing the
     57 * EN bit and clearing STARTED, still see STARTED set and process the event.
     58 * If this event has the VALID bit clear, we bail properly, but this
     59 * is not a given. With VALID set we can end up calling pmu::stop() again
     60 * (the throttle logic) and trigger the WARNs in there.
     61 *
     62 * So what we do is set STOPPING before clearing EN to avoid the pmu::stop()
     63 * nesting, and clear STARTED late, so that we have a well defined state over
     64 * the clearing of the EN bit.
     65 *
     66 * XXX: we could probably be using !atomic bitops for all this.
     67 */
     68
     69enum ibs_states {
     70	IBS_ENABLED	= 0,
     71	IBS_STARTED	= 1,
     72	IBS_STOPPING	= 2,
     73	IBS_STOPPED	= 3,
     74
     75	IBS_MAX_STATES,
     76};
     77
     78struct cpu_perf_ibs {
     79	struct perf_event	*event;
     80	unsigned long		state[BITS_TO_LONGS(IBS_MAX_STATES)];
     81};
     82
     83struct perf_ibs {
     84	struct pmu			pmu;
     85	unsigned int			msr;
     86	u64				config_mask;
     87	u64				cnt_mask;
     88	u64				enable_mask;
     89	u64				valid_mask;
     90	u64				max_period;
     91	unsigned long			offset_mask[1];
     92	int				offset_max;
     93	unsigned int			fetch_count_reset_broken : 1;
     94	unsigned int			fetch_ignore_if_zero_rip : 1;
     95	struct cpu_perf_ibs __percpu	*pcpu;
     96
     97	u64				(*get_count)(u64 config);
     98};
     99
    100static int
    101perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period)
    102{
    103	s64 left = local64_read(&hwc->period_left);
    104	s64 period = hwc->sample_period;
    105	int overflow = 0;
    106
    107	/*
    108	 * If we are way outside a reasonable range then just skip forward:
    109	 */
    110	if (unlikely(left <= -period)) {
    111		left = period;
    112		local64_set(&hwc->period_left, left);
    113		hwc->last_period = period;
    114		overflow = 1;
    115	}
    116
    117	if (unlikely(left < (s64)min)) {
    118		left += period;
    119		local64_set(&hwc->period_left, left);
    120		hwc->last_period = period;
    121		overflow = 1;
    122	}
    123
    124	/*
    125	 * If the hw period that triggers the sw overflow is too short
    126	 * we might hit the irq handler. This biases the results.
    127	 * Thus we shorten the next-to-last period and set the last
    128	 * period to the max period.
    129	 */
    130	if (left > max) {
    131		left -= max;
    132		if (left > max)
    133			left = max;
    134		else if (left < min)
    135			left = min;
    136	}
    137
    138	*hw_period = (u64)left;
    139
    140	return overflow;
    141}
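
/*
 * Worked example for the clamping above (illustrative, not part of the
 * original file): with left = 0x30000 and max = 0x20000, the first pass
 * programs 0x30000 - 0x20000 = 0x10000 as the hw period, so the shortened
 * next-to-last period runs first and the final period can then use the
 * full max period before the sw overflow fires.
 */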
    142
    143static int
    144perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width)
    145{
    146	struct hw_perf_event *hwc = &event->hw;
    147	int shift = 64 - width;
    148	u64 prev_raw_count;
    149	u64 delta;
    150
    151	/*
    152	 * Careful: an NMI might modify the previous event value.
    153	 *
    154	 * Our tactic to handle this is to first atomically read and
    155	 * exchange a new raw count - then add that new-prev delta
    156	 * count to the generic event atomically:
    157	 */
    158	prev_raw_count = local64_read(&hwc->prev_count);
    159	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
    160					new_raw_count) != prev_raw_count)
    161		return 0;
    162
    163	/*
    164	 * Now we have the new raw value and have updated the prev
    165	 * timestamp already. We can now calculate the elapsed delta
    166	 * (event-)time and add that to the generic event.
    167	 *
    168	 * Careful, not all hw sign-extends above the physical width
    169	 * of the count.
    170	 */
    171	delta = (new_raw_count << shift) - (prev_raw_count << shift);
    172	delta >>= shift;
    173
    174	local64_add(delta, &event->count);
    175	local64_sub(delta, &hwc->period_left);
    176
    177	return 1;
    178}
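
/*
 * Illustrative sketch (not part of the upstream driver): the shift trick
 * used above, isolated. For a counter that is 'width' bits wide, shifting
 * both values up by 64 - width and the difference back down yields the
 * wrapped delta as an unsigned value; e.g. a 27-bit counter wrapping from
 * 0x7fffff0 to 0x0000010 still gives a delta of 0x20.
 */
static inline u64 ibs_example_narrow_delta(u64 prev, u64 cur, int width)
{
	int shift = 64 - width;

	/* bits above 'width' are discarded by the shift up and back down */
	return ((cur << shift) - (prev << shift)) >> shift;
}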
    179
    180static struct perf_ibs perf_ibs_fetch;
    181static struct perf_ibs perf_ibs_op;
    182
    183static struct perf_ibs *get_ibs_pmu(int type)
    184{
    185	if (perf_ibs_fetch.pmu.type == type)
    186		return &perf_ibs_fetch;
    187	if (perf_ibs_op.pmu.type == type)
    188		return &perf_ibs_op;
    189	return NULL;
    190}
    191
    192/*
    193 * Use IBS for precise event sampling:
    194 *
    195 *  perf record -a -e cpu-cycles:p ...    # use ibs op counting cycle count
    196 *  perf record -a -e r076:p ...          # same as -e cpu-cycles:p
    197 *  perf record -a -e r0C1:p ...          # use ibs op counting micro-ops
    198 *
    199 * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl,
    200 * MSRC001_1033) is used to select either cycle or micro-ops counting
    201 * mode.
    202 *
    203 * The rip of IBS samples has skid 0. Thus, IBS supports precise
    204 * levels 1 and 2 and PERF_EFLAGS_EXACT is set. In rare cases the
    205 * rip is invalid because IBS was not able to record it correctly;
    206 * we then clear PERF_EFLAGS_EXACT and take the rip from pt_regs.
    207 *
    208 */
    209static int perf_ibs_precise_event(struct perf_event *event, u64 *config)
    210{
    211	switch (event->attr.precise_ip) {
    212	case 0:
    213		return -ENOENT;
    214	case 1:
    215	case 2:
    216		break;
    217	default:
    218		return -EOPNOTSUPP;
    219	}
    220
    221	switch (event->attr.type) {
    222	case PERF_TYPE_HARDWARE:
    223		switch (event->attr.config) {
    224		case PERF_COUNT_HW_CPU_CYCLES:
    225			*config = 0;
    226			return 0;
    227		}
    228		break;
    229	case PERF_TYPE_RAW:
    230		switch (event->attr.config) {
    231		case 0x0076:
    232			*config = 0;
    233			return 0;
    234		case 0x00C1:
    235			*config = IBS_OP_CNT_CTL;
    236			return 0;
    237		}
    238		break;
    239	default:
    240		return -ENOENT;
    241	}
    242
    243	return -EOPNOTSUPP;
    244}
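
#if 0
/*
 * Minimal user-space sketch of the mapping above (illustrative only; the
 * helper name and the chosen period are assumptions, not part of this
 * file): a precise cpu-cycles event opened like this is routed to the
 * ibs_op PMU by perf_ibs_precise_event() above and perf_ibs_init() below.
 */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>

static int open_ibs_cycles_example(pid_t pid, int cpu)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_HARDWARE,
		.config		= PERF_COUNT_HW_CPU_CYCLES,
		.size		= sizeof(attr),
		.sample_period	= 0x10000,	/* low 4 bits must be zero, see perf_ibs_init() */
		.sample_type	= PERF_SAMPLE_IP,
		.precise_ip	= 2,		/* request skid-0 samples, i.e. IBS */
	};

	return syscall(__NR_perf_event_open, &attr, pid, cpu, -1, 0);
}
#endif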
    245
    246static int perf_ibs_init(struct perf_event *event)
    247{
    248	struct hw_perf_event *hwc = &event->hw;
    249	struct perf_ibs *perf_ibs;
    250	u64 max_cnt, config;
    251	int ret;
    252
    253	perf_ibs = get_ibs_pmu(event->attr.type);
    254	if (perf_ibs) {
    255		config = event->attr.config;
    256	} else {
    257		perf_ibs = &perf_ibs_op;
    258		ret = perf_ibs_precise_event(event, &config);
    259		if (ret)
    260			return ret;
    261	}
    262
    263	if (event->pmu != &perf_ibs->pmu)
    264		return -ENOENT;
    265
    266	if (config & ~perf_ibs->config_mask)
    267		return -EINVAL;
    268
    269	if (hwc->sample_period) {
    270		if (config & perf_ibs->cnt_mask)
    271			/* raw max_cnt may not be set */
    272			return -EINVAL;
    273		if (!event->attr.sample_freq && hwc->sample_period & 0x0f)
    274			/*
    275			 * lower 4 bits cannot be set in ibs max cnt,
    276			 * but we allow it in case we adjust the
    277			 * sample period to set a frequency.
    278			 */
    279			return -EINVAL;
    280		hwc->sample_period &= ~0x0FULL;
    281		if (!hwc->sample_period)
    282			hwc->sample_period = 0x10;
    283	} else {
    284		max_cnt = config & perf_ibs->cnt_mask;
    285		config &= ~perf_ibs->cnt_mask;
    286		event->attr.sample_period = max_cnt << 4;
    287		hwc->sample_period = event->attr.sample_period;
    288	}
    289
    290	if (!hwc->sample_period)
    291		return -EINVAL;
    292
    293	/*
    294	 * If we modify hwc->sample_period, we also need to update
    295	 * hwc->last_period and hwc->period_left.
    296	 */
    297	hwc->last_period = hwc->sample_period;
    298	local64_set(&hwc->period_left, hwc->sample_period);
    299
    300	hwc->config_base = perf_ibs->msr;
    301	hwc->config = config;
    302
    303	/*
    304	 * rip recorded by IbsOpRip will not be consistent with rsp and rbp
    305	 * recorded as part of interrupt regs. Thus we need to use rip from
    306	 * interrupt regs while unwinding call stack. Setting _EARLY flag
    307	 * makes sure we unwind call-stack before perf sample rip is set to
    308	 * IbsOpRip.
    309	 */
    310	if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
    311		event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY;
    312
    313	return 0;
    314}
    315
    316static int perf_ibs_set_period(struct perf_ibs *perf_ibs,
    317			       struct hw_perf_event *hwc, u64 *period)
    318{
    319	int overflow;
    320
    321	/* ignore lower 4 bits in min count: */
    322	overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period);
    323	local64_set(&hwc->prev_count, 0);
    324
    325	return overflow;
    326}
    327
    328static u64 get_ibs_fetch_count(u64 config)
    329{
    330	union ibs_fetch_ctl fetch_ctl = (union ibs_fetch_ctl)config;
    331
    332	return fetch_ctl.fetch_cnt << 4;
    333}
    334
    335static u64 get_ibs_op_count(u64 config)
    336{
    337	union ibs_op_ctl op_ctl = (union ibs_op_ctl)config;
    338	u64 count = 0;
    339
    340	/*
    341	 * If the internal 27-bit counter rolled over, the count is MaxCnt
    342	 * and the lower 7 bits of CurCnt are randomized.
    343	 * Otherwise CurCnt has the full 27-bit current counter value.
    344	 */
    345	if (op_ctl.op_val) {
    346		count = op_ctl.opmaxcnt << 4;
    347		if (ibs_caps & IBS_CAPS_OPCNTEXT)
    348			count += op_ctl.opmaxcnt_ext << 20;
    349	} else if (ibs_caps & IBS_CAPS_RDWROPCNT) {
    350		count = op_ctl.opcurcnt;
    351	}
    352
    353	return count;
    354}
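
/*
 * Worked example (illustrative): with op_val set and opmaxcnt = 0x1000 the
 * reported count is 0x1000 << 4 = 0x10000 ops; if IBS_CAPS_OPCNTEXT is set
 * and opmaxcnt_ext = 0x3, a further 0x3 << 20 ops are added on top.
 */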
    355
    356static void
    357perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event,
    358		      u64 *config)
    359{
    360	u64 count = perf_ibs->get_count(*config);
    361
    362	/*
    363	 * Set width to 64 since we do not overflow on max width but
    364	 * instead on max count. In perf_ibs_set_period() we clear
    365	 * prev count manually on overflow.
    366	 */
    367	while (!perf_event_try_update(event, count, 64)) {
    368		rdmsrl(event->hw.config_base, *config);
    369		count = perf_ibs->get_count(*config);
    370	}
    371}
    372
    373static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs,
    374					 struct hw_perf_event *hwc, u64 config)
    375{
    376	u64 tmp = hwc->config | config;
    377
    378	if (perf_ibs->fetch_count_reset_broken)
    379		wrmsrl(hwc->config_base, tmp & ~perf_ibs->enable_mask);
    380
    381	wrmsrl(hwc->config_base, tmp | perf_ibs->enable_mask);
    382}
    383
    384/*
    385 * Erratum #420 Instruction-Based Sampling Engine May Generate
    386 * Interrupt that Cannot Be Cleared:
    387 *
    388 * Must clear counter mask first, then clear the enable bit. See
    389 * Revision Guide for AMD Family 10h Processors, Publication #41322.
    390 */
    391static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs,
    392					  struct hw_perf_event *hwc, u64 config)
    393{
    394	config &= ~perf_ibs->cnt_mask;
    395	if (boot_cpu_data.x86 == 0x10)
    396		wrmsrl(hwc->config_base, config);
    397	config &= ~perf_ibs->enable_mask;
    398	wrmsrl(hwc->config_base, config);
    399}
    400
    401/*
    402 * We cannot restore the ibs pmu state, so we always need to update
    403 * the event while stopping it and then reset the state when starting
    404 * again. Thus, we ignore the PERF_EF_RELOAD and PERF_EF_UPDATE flags in
    405 * perf_ibs_start()/perf_ibs_stop() and instead always do it.
    406 */
    407static void perf_ibs_start(struct perf_event *event, int flags)
    408{
    409	struct hw_perf_event *hwc = &event->hw;
    410	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
    411	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
    412	u64 period, config = 0;
    413
    414	if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
    415		return;
    416
    417	WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
    418	hwc->state = 0;
    419
    420	perf_ibs_set_period(perf_ibs, hwc, &period);
    421	if (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_OPCNTEXT)) {
    422		config |= period & IBS_OP_MAX_CNT_EXT_MASK;
    423		period &= ~IBS_OP_MAX_CNT_EXT_MASK;
    424	}
    425	config |= period >> 4;
    426
    427	/*
    428	 * Set STARTED before enabling the hardware, such that a subsequent NMI
    429	 * must observe it.
    430	 */
    431	set_bit(IBS_STARTED,    pcpu->state);
    432	clear_bit(IBS_STOPPING, pcpu->state);
    433	perf_ibs_enable_event(perf_ibs, hwc, config);
    434
    435	perf_event_update_userpage(event);
    436}
    437
    438static void perf_ibs_stop(struct perf_event *event, int flags)
    439{
    440	struct hw_perf_event *hwc = &event->hw;
    441	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
    442	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
    443	u64 config;
    444	int stopping;
    445
    446	if (test_and_set_bit(IBS_STOPPING, pcpu->state))
    447		return;
    448
    449	stopping = test_bit(IBS_STARTED, pcpu->state);
    450
    451	if (!stopping && (hwc->state & PERF_HES_UPTODATE))
    452		return;
    453
    454	rdmsrl(hwc->config_base, config);
    455
    456	if (stopping) {
    457		/*
    458		 * Set STOPPED before disabling the hardware, such that it
    459		 * must be visible to NMIs the moment we clear the EN bit,
    460		 * at which point we can generate an !VALID sample which
    461		 * we need to consume.
    462		 */
    463		set_bit(IBS_STOPPED, pcpu->state);
    464		perf_ibs_disable_event(perf_ibs, hwc, config);
    465		/*
    466		 * Clear STARTED after disabling the hardware; if it were
    467		 * cleared before, an NMI hitting after the clear but before
    468		 * clearing the EN bit might think it a spurious NMI and not
    469		 * handle it.
    470		 *
    471		 * Clearing it after, however, creates the problem of the NMI
    472		 * handler seeing STARTED but not having a valid sample.
    473		 */
    474		clear_bit(IBS_STARTED, pcpu->state);
    475		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
    476		hwc->state |= PERF_HES_STOPPED;
    477	}
    478
    479	if (hwc->state & PERF_HES_UPTODATE)
    480		return;
    481
    482	/*
    483	 * Clear the valid bit so we do not count rollovers on update;
    484	 * rollovers are only updated in the irq handler.
    485	 */
    486	config &= ~perf_ibs->valid_mask;
    487
    488	perf_ibs_event_update(perf_ibs, event, &config);
    489	hwc->state |= PERF_HES_UPTODATE;
    490}
    491
    492static int perf_ibs_add(struct perf_event *event, int flags)
    493{
    494	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
    495	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
    496
    497	if (test_and_set_bit(IBS_ENABLED, pcpu->state))
    498		return -ENOSPC;
    499
    500	event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
    501
    502	pcpu->event = event;
    503
    504	if (flags & PERF_EF_START)
    505		perf_ibs_start(event, PERF_EF_RELOAD);
    506
    507	return 0;
    508}
    509
    510static void perf_ibs_del(struct perf_event *event, int flags)
    511{
    512	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
    513	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
    514
    515	if (!test_and_clear_bit(IBS_ENABLED, pcpu->state))
    516		return;
    517
    518	perf_ibs_stop(event, PERF_EF_UPDATE);
    519
    520	pcpu->event = NULL;
    521
    522	perf_event_update_userpage(event);
    523}
    524
    525static void perf_ibs_read(struct perf_event *event) { }
    526
    527/*
    528 * We need to initialize with an empty group if all attributes in the
    529 * group are dynamic.
    530 */
    531static struct attribute *attrs_empty[] = {
    532	NULL,
    533};
    534
    535static struct attribute_group empty_format_group = {
    536	.name = "format",
    537	.attrs = attrs_empty,
    538};
    539
    540static struct attribute_group empty_caps_group = {
    541	.name = "caps",
    542	.attrs = attrs_empty,
    543};
    544
    545static const struct attribute_group *empty_attr_groups[] = {
    546	&empty_format_group,
    547	&empty_caps_group,
    548	NULL,
    549};
    550
    551PMU_FORMAT_ATTR(rand_en,	"config:57");
    552PMU_FORMAT_ATTR(cnt_ctl,	"config:19");
    553PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59");
    554PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16");
    555PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1");
    556
    557static umode_t
    558zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int i)
    559{
    560	return ibs_caps & IBS_CAPS_ZEN4 ? attr->mode : 0;
    561}
    562
    563static struct attribute *rand_en_attrs[] = {
    564	&format_attr_rand_en.attr,
    565	NULL,
    566};
    567
    568static struct attribute *fetch_l3missonly_attrs[] = {
    569	&fetch_l3missonly.attr.attr,
    570	NULL,
    571};
    572
    573static struct attribute *zen4_ibs_extensions_attrs[] = {
    574	&zen4_ibs_extensions.attr.attr,
    575	NULL,
    576};
    577
    578static struct attribute_group group_rand_en = {
    579	.name = "format",
    580	.attrs = rand_en_attrs,
    581};
    582
    583static struct attribute_group group_fetch_l3missonly = {
    584	.name = "format",
    585	.attrs = fetch_l3missonly_attrs,
    586	.is_visible = zen4_ibs_extensions_is_visible,
    587};
    588
    589static struct attribute_group group_zen4_ibs_extensions = {
    590	.name = "caps",
    591	.attrs = zen4_ibs_extensions_attrs,
    592	.is_visible = zen4_ibs_extensions_is_visible,
    593};
    594
    595static const struct attribute_group *fetch_attr_groups[] = {
    596	&group_rand_en,
    597	&empty_caps_group,
    598	NULL,
    599};
    600
    601static const struct attribute_group *fetch_attr_update[] = {
    602	&group_fetch_l3missonly,
    603	&group_zen4_ibs_extensions,
    604	NULL,
    605};
    606
    607static umode_t
    608cnt_ctl_is_visible(struct kobject *kobj, struct attribute *attr, int i)
    609{
    610	return ibs_caps & IBS_CAPS_OPCNT ? attr->mode : 0;
    611}
    612
    613static struct attribute *cnt_ctl_attrs[] = {
    614	&format_attr_cnt_ctl.attr,
    615	NULL,
    616};
    617
    618static struct attribute *op_l3missonly_attrs[] = {
    619	&op_l3missonly.attr.attr,
    620	NULL,
    621};
    622
    623static struct attribute_group group_cnt_ctl = {
    624	.name = "format",
    625	.attrs = cnt_ctl_attrs,
    626	.is_visible = cnt_ctl_is_visible,
    627};
    628
    629static struct attribute_group group_op_l3missonly = {
    630	.name = "format",
    631	.attrs = op_l3missonly_attrs,
    632	.is_visible = zen4_ibs_extensions_is_visible,
    633};
    634
    635static const struct attribute_group *op_attr_update[] = {
    636	&group_cnt_ctl,
    637	&group_op_l3missonly,
    638	&group_zen4_ibs_extensions,
    639	NULL,
    640};
    641
    642static struct perf_ibs perf_ibs_fetch = {
    643	.pmu = {
    644		.task_ctx_nr	= perf_invalid_context,
    645
    646		.event_init	= perf_ibs_init,
    647		.add		= perf_ibs_add,
    648		.del		= perf_ibs_del,
    649		.start		= perf_ibs_start,
    650		.stop		= perf_ibs_stop,
    651		.read		= perf_ibs_read,
    652		.capabilities	= PERF_PMU_CAP_NO_EXCLUDE,
    653	},
    654	.msr			= MSR_AMD64_IBSFETCHCTL,
    655	.config_mask		= IBS_FETCH_CONFIG_MASK,
    656	.cnt_mask		= IBS_FETCH_MAX_CNT,
    657	.enable_mask		= IBS_FETCH_ENABLE,
    658	.valid_mask		= IBS_FETCH_VAL,
    659	.max_period		= IBS_FETCH_MAX_CNT << 4,
    660	.offset_mask		= { MSR_AMD64_IBSFETCH_REG_MASK },
    661	.offset_max		= MSR_AMD64_IBSFETCH_REG_COUNT,
    662
    663	.get_count		= get_ibs_fetch_count,
    664};
    665
    666static struct perf_ibs perf_ibs_op = {
    667	.pmu = {
    668		.task_ctx_nr	= perf_invalid_context,
    669
    670		.event_init	= perf_ibs_init,
    671		.add		= perf_ibs_add,
    672		.del		= perf_ibs_del,
    673		.start		= perf_ibs_start,
    674		.stop		= perf_ibs_stop,
    675		.read		= perf_ibs_read,
    676		.capabilities	= PERF_PMU_CAP_NO_EXCLUDE,
    677	},
    678	.msr			= MSR_AMD64_IBSOPCTL,
    679	.config_mask		= IBS_OP_CONFIG_MASK,
    680	.cnt_mask		= IBS_OP_MAX_CNT | IBS_OP_CUR_CNT |
    681				  IBS_OP_CUR_CNT_RAND,
    682	.enable_mask		= IBS_OP_ENABLE,
    683	.valid_mask		= IBS_OP_VAL,
    684	.max_period		= IBS_OP_MAX_CNT << 4,
    685	.offset_mask		= { MSR_AMD64_IBSOP_REG_MASK },
    686	.offset_max		= MSR_AMD64_IBSOP_REG_COUNT,
    687
    688	.get_count		= get_ibs_op_count,
    689};
    690
    691static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
    692{
    693	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
    694	struct perf_event *event = pcpu->event;
    695	struct hw_perf_event *hwc;
    696	struct perf_sample_data data;
    697	struct perf_raw_record raw;
    698	struct pt_regs regs;
    699	struct perf_ibs_data ibs_data;
    700	int offset, size, check_rip, offset_max, throttle = 0;
    701	unsigned int msr;
    702	u64 *buf, *config, period, new_config = 0;
    703
    704	if (!test_bit(IBS_STARTED, pcpu->state)) {
    705fail:
    706		/*
    707		 * Catch spurious interrupts after stopping IBS: After
    708		 * disabling IBS there could still be incoming NMIs
    709		 * with samples that even have the valid bit cleared.
    710		 * Mark all these NMIs as handled.
    711		 */
    712		if (test_and_clear_bit(IBS_STOPPED, pcpu->state))
    713			return 1;
    714
    715		return 0;
    716	}
    717
    718	if (WARN_ON_ONCE(!event))
    719		goto fail;
    720
    721	hwc = &event->hw;
    722	msr = hwc->config_base;
    723	buf = ibs_data.regs;
    724	rdmsrl(msr, *buf);
    725	if (!(*buf++ & perf_ibs->valid_mask))
    726		goto fail;
    727
    728	config = &ibs_data.regs[0];
    729	perf_ibs_event_update(perf_ibs, event, config);
    730	perf_sample_data_init(&data, 0, hwc->last_period);
    731	if (!perf_ibs_set_period(perf_ibs, hwc, &period))
    732		goto out;	/* no sw counter overflow */
    733
    734	ibs_data.caps = ibs_caps;
    735	size = 1;
    736	offset = 1;
    737	check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK));
    738	if (event->attr.sample_type & PERF_SAMPLE_RAW)
    739		offset_max = perf_ibs->offset_max;
    740	else if (check_rip)
    741		offset_max = 3;
    742	else
    743		offset_max = 1;
    744	do {
    745		rdmsrl(msr + offset, *buf++);
    746		size++;
    747		offset = find_next_bit(perf_ibs->offset_mask,
    748				       perf_ibs->offset_max,
    749				       offset + 1);
    750	} while (offset < offset_max);
    751	/*
    752	 * Read IbsBrTarget, IbsOpData4, and IbsExtdCtl separately
    753	 * depending on their availability.
    754	 * Can't add to offset_max as they are staggered.
    755	 */
    756	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
    757		if (perf_ibs == &perf_ibs_op) {
    758			if (ibs_caps & IBS_CAPS_BRNTRGT) {
    759				rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++);
    760				size++;
    761			}
    762			if (ibs_caps & IBS_CAPS_OPDATA4) {
    763				rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++);
    764				size++;
    765			}
    766		}
    767		if (perf_ibs == &perf_ibs_fetch && (ibs_caps & IBS_CAPS_FETCHCTLEXTD)) {
    768			rdmsrl(MSR_AMD64_ICIBSEXTDCTL, *buf++);
    769			size++;
    770		}
    771	}
    772	ibs_data.size = sizeof(u64) * size;
    773
    774	regs = *iregs;
    775	if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
    776		regs.flags &= ~PERF_EFLAGS_EXACT;
    777	} else {
    778		/* Workaround for erratum #1197 */
    779		if (perf_ibs->fetch_ignore_if_zero_rip && !(ibs_data.regs[1]))
    780			goto out;
    781
    782		set_linear_ip(&regs, ibs_data.regs[1]);
    783		regs.flags |= PERF_EFLAGS_EXACT;
    784	}
    785
    786	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
    787		raw = (struct perf_raw_record){
    788			.frag = {
    789				.size = sizeof(u32) + ibs_data.size,
    790				.data = ibs_data.data,
    791			},
    792		};
    793		data.raw = &raw;
    794	}
    795
    796	/*
    797	 * rip recorded by IbsOpRip will not be consistent with rsp and rbp
    798	 * recorded as part of interrupt regs. Thus we need to use rip from
    799	 * interrupt regs while unwinding call stack.
    800	 */
    801	if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
    802		data.callchain = perf_callchain(event, iregs);
    803
    804	throttle = perf_event_overflow(event, &data, &regs);
    805out:
    806	if (throttle) {
    807		perf_ibs_stop(event, 0);
    808	} else {
    809		if (perf_ibs == &perf_ibs_op) {
    810			if (ibs_caps & IBS_CAPS_OPCNTEXT) {
    811				new_config = period & IBS_OP_MAX_CNT_EXT_MASK;
    812				period &= ~IBS_OP_MAX_CNT_EXT_MASK;
    813			}
    814			if ((ibs_caps & IBS_CAPS_RDWROPCNT) && (*config & IBS_OP_CNT_CTL))
    815				new_config |= *config & IBS_OP_CUR_CNT_RAND;
    816		}
    817		new_config |= period >> 4;
    818
    819		perf_ibs_enable_event(perf_ibs, hwc, new_config);
    820	}
    821
    822	perf_event_update_userpage(event);
    823
    824	return 1;
    825}
    826
    827static int
    828perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
    829{
    830	u64 stamp = sched_clock();
    831	int handled = 0;
    832
    833	handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs);
    834	handled += perf_ibs_handle_irq(&perf_ibs_op, regs);
    835
    836	if (handled)
    837		inc_irq_stat(apic_perf_irqs);
    838
    839	perf_sample_event_took(sched_clock() - stamp);
    840
    841	return handled;
    842}
    843NOKPROBE_SYMBOL(perf_ibs_nmi_handler);
    844
    845static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
    846{
    847	struct cpu_perf_ibs __percpu *pcpu;
    848	int ret;
    849
    850	pcpu = alloc_percpu(struct cpu_perf_ibs);
    851	if (!pcpu)
    852		return -ENOMEM;
    853
    854	perf_ibs->pcpu = pcpu;
    855
    856	ret = perf_pmu_register(&perf_ibs->pmu, name, -1);
    857	if (ret) {
    858		perf_ibs->pcpu = NULL;
    859		free_percpu(pcpu);
    860	}
    861
    862	return ret;
    863}
    864
    865static __init int perf_ibs_fetch_init(void)
    866{
    867	/*
    868	 * Some chips fail to reset the fetch count when it is written; instead
    869	 * they need a 0-1 transition of IbsFetchEn.
    870	 */
    871	if (boot_cpu_data.x86 >= 0x16 && boot_cpu_data.x86 <= 0x18)
    872		perf_ibs_fetch.fetch_count_reset_broken = 1;
    873
    874	if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model < 0x10)
    875		perf_ibs_fetch.fetch_ignore_if_zero_rip = 1;
    876
    877	if (ibs_caps & IBS_CAPS_ZEN4)
    878		perf_ibs_fetch.config_mask |= IBS_FETCH_L3MISSONLY;
    879
    880	perf_ibs_fetch.pmu.attr_groups = fetch_attr_groups;
    881	perf_ibs_fetch.pmu.attr_update = fetch_attr_update;
    882
    883	return perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
    884}
    885
    886static __init int perf_ibs_op_init(void)
    887{
    888	if (ibs_caps & IBS_CAPS_OPCNT)
    889		perf_ibs_op.config_mask |= IBS_OP_CNT_CTL;
    890
    891	if (ibs_caps & IBS_CAPS_OPCNTEXT) {
    892		perf_ibs_op.max_period  |= IBS_OP_MAX_CNT_EXT_MASK;
    893		perf_ibs_op.config_mask	|= IBS_OP_MAX_CNT_EXT_MASK;
    894		perf_ibs_op.cnt_mask    |= IBS_OP_MAX_CNT_EXT_MASK;
    895	}
    896
    897	if (ibs_caps & IBS_CAPS_ZEN4)
    898		perf_ibs_op.config_mask |= IBS_OP_L3MISSONLY;
    899
    900	perf_ibs_op.pmu.attr_groups = empty_attr_groups;
    901	perf_ibs_op.pmu.attr_update = op_attr_update;
    902
    903	return perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
    904}
    905
    906static __init int perf_event_ibs_init(void)
    907{
    908	int ret;
    909
    910	ret = perf_ibs_fetch_init();
    911	if (ret)
    912		return ret;
    913
    914	ret = perf_ibs_op_init();
    915	if (ret)
    916		goto err_op;
    917
    918	ret = register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
    919	if (ret)
    920		goto err_nmi;
    921
    922	pr_info("perf: AMD IBS detected (0x%08x)\n", ibs_caps);
    923	return 0;
    924
    925err_nmi:
    926	perf_pmu_unregister(&perf_ibs_op.pmu);
    927	free_percpu(perf_ibs_op.pcpu);
    928	perf_ibs_op.pcpu = NULL;
    929err_op:
    930	perf_pmu_unregister(&perf_ibs_fetch.pmu);
    931	free_percpu(perf_ibs_fetch.pcpu);
    932	perf_ibs_fetch.pcpu = NULL;
    933
    934	return ret;
    935}
    936
    937#else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */
    938
    939static __init int perf_event_ibs_init(void)
    940{
    941	return 0;
    942}
    943
    944#endif
    945
    946/* IBS - apic initialization, for perf and oprofile */
    947
    948static __init u32 __get_ibs_caps(void)
    949{
    950	u32 caps;
    951	unsigned int max_level;
    952
    953	if (!boot_cpu_has(X86_FEATURE_IBS))
    954		return 0;
    955
    956	/* check IBS cpuid feature flags */
    957	max_level = cpuid_eax(0x80000000);
    958	if (max_level < IBS_CPUID_FEATURES)
    959		return IBS_CAPS_DEFAULT;
    960
    961	caps = cpuid_eax(IBS_CPUID_FEATURES);
    962	if (!(caps & IBS_CAPS_AVAIL))
    963		/* cpuid flags not valid */
    964		return IBS_CAPS_DEFAULT;
    965
    966	return caps;
    967}
    968
    969u32 get_ibs_caps(void)
    970{
    971	return ibs_caps;
    972}
    973
    974EXPORT_SYMBOL(get_ibs_caps);
    975
    976static inline int get_eilvt(int offset)
    977{
    978	return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1);
    979}
    980
    981static inline int put_eilvt(int offset)
    982{
    983	return !setup_APIC_eilvt(offset, 0, 0, 1);
    984}
    985
    986/*
    987 * Check and reserve APIC extended interrupt LVT offset for IBS if available.
    988 */
    989static inline int ibs_eilvt_valid(void)
    990{
    991	int offset;
    992	u64 val;
    993	int valid = 0;
    994
    995	preempt_disable();
    996
    997	rdmsrl(MSR_AMD64_IBSCTL, val);
    998	offset = val & IBSCTL_LVT_OFFSET_MASK;
    999
   1000	if (!(val & IBSCTL_LVT_OFFSET_VALID)) {
   1001		pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n",
   1002		       smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
   1003		goto out;
   1004	}
   1005
   1006	if (!get_eilvt(offset)) {
   1007		pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n",
   1008		       smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
   1009		goto out;
   1010	}
   1011
   1012	valid = 1;
   1013out:
   1014	preempt_enable();
   1015
   1016	return valid;
   1017}
   1018
   1019static int setup_ibs_ctl(int ibs_eilvt_off)
   1020{
   1021	struct pci_dev *cpu_cfg;
   1022	int nodes;
   1023	u32 value = 0;
   1024
   1025	nodes = 0;
   1026	cpu_cfg = NULL;
   1027	do {
   1028		cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD,
   1029					 PCI_DEVICE_ID_AMD_10H_NB_MISC,
   1030					 cpu_cfg);
   1031		if (!cpu_cfg)
   1032			break;
   1033		++nodes;
   1034		pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
   1035				       | IBSCTL_LVT_OFFSET_VALID);
   1036		pci_read_config_dword(cpu_cfg, IBSCTL, &value);
   1037		if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) {
   1038			pci_dev_put(cpu_cfg);
   1039			pr_debug("Failed to setup IBS LVT offset, IBSCTL = 0x%08x\n",
   1040				 value);
   1041			return -EINVAL;
   1042		}
   1043	} while (1);
   1044
   1045	if (!nodes) {
   1046		pr_debug("No CPU node configured for IBS\n");
   1047		return -ENODEV;
   1048	}
   1049
   1050	return 0;
   1051}
   1052
   1053/*
   1054 * This runs only on the current cpu. We try to find an LVT offset and
   1055 * set up the local APIC. For this we must disable preemption. On
   1056 * success we initialize all nodes with this offset. This then updates
   1057 * the offset in the per-node IBS_CTL msr. The per-core APIC setup of
   1058 * the IBS interrupt vector is handled by the cpu hotplug callback
   1059 * x86_pmu_amd_ibs_starting_cpu(), which uses the new offset.
   1060 */
   1061static void force_ibs_eilvt_setup(void)
   1062{
   1063	int offset;
   1064	int ret;
   1065
   1066	preempt_disable();
   1067	/* find the next free available EILVT entry, skip offset 0 */
   1068	for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) {
   1069		if (get_eilvt(offset))
   1070			break;
   1071	}
   1072	preempt_enable();
   1073
   1074	if (offset == APIC_EILVT_NR_MAX) {
   1075		pr_debug("No EILVT entry available\n");
   1076		return;
   1077	}
   1078
   1079	ret = setup_ibs_ctl(offset);
   1080	if (ret)
   1081		goto out;
   1082
   1083	if (!ibs_eilvt_valid())
   1084		goto out;
   1085
   1086	pr_info("LVT offset %d assigned\n", offset);
   1087
   1088	return;
   1089out:
   1090	preempt_disable();
   1091	put_eilvt(offset);
   1092	preempt_enable();
   1093	return;
   1094}
   1095
   1096static void ibs_eilvt_setup(void)
   1097{
   1098	/*
   1099	 * Force LVT offset assignment for family 10h: The offsets are
   1100	 * not assigned by the BIOS for this family, so the OS is
   1101	 * responsible for doing it. If the OS assignment fails, fall
   1102	 * back to the BIOS settings and try to set it up from there.
   1103	 */
   1104	if (boot_cpu_data.x86 == 0x10)
   1105		force_ibs_eilvt_setup();
   1106}
   1107
   1108static inline int get_ibs_lvt_offset(void)
   1109{
   1110	u64 val;
   1111
   1112	rdmsrl(MSR_AMD64_IBSCTL, val);
   1113	if (!(val & IBSCTL_LVT_OFFSET_VALID))
   1114		return -EINVAL;
   1115
   1116	return val & IBSCTL_LVT_OFFSET_MASK;
   1117}
   1118
   1119static void setup_APIC_ibs(void)
   1120{
   1121	int offset;
   1122
   1123	offset = get_ibs_lvt_offset();
   1124	if (offset < 0)
   1125		goto failed;
   1126
   1127	if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0))
   1128		return;
   1129failed:
   1130	pr_warn("perf: IBS APIC setup failed on cpu #%d\n",
   1131		smp_processor_id());
   1132}
   1133
   1134static void clear_APIC_ibs(void)
   1135{
   1136	int offset;
   1137
   1138	offset = get_ibs_lvt_offset();
   1139	if (offset >= 0)
   1140		setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
   1141}
   1142
   1143static int x86_pmu_amd_ibs_starting_cpu(unsigned int cpu)
   1144{
   1145	setup_APIC_ibs();
   1146	return 0;
   1147}
   1148
   1149#ifdef CONFIG_PM
   1150
   1151static int perf_ibs_suspend(void)
   1152{
   1153	clear_APIC_ibs();
   1154	return 0;
   1155}
   1156
   1157static void perf_ibs_resume(void)
   1158{
   1159	ibs_eilvt_setup();
   1160	setup_APIC_ibs();
   1161}
   1162
   1163static struct syscore_ops perf_ibs_syscore_ops = {
   1164	.resume		= perf_ibs_resume,
   1165	.suspend	= perf_ibs_suspend,
   1166};
   1167
   1168static void perf_ibs_pm_init(void)
   1169{
   1170	register_syscore_ops(&perf_ibs_syscore_ops);
   1171}
   1172
   1173#else
   1174
   1175static inline void perf_ibs_pm_init(void) { }
   1176
   1177#endif
   1178
   1179static int x86_pmu_amd_ibs_dying_cpu(unsigned int cpu)
   1180{
   1181	clear_APIC_ibs();
   1182	return 0;
   1183}
   1184
   1185static __init int amd_ibs_init(void)
   1186{
   1187	u32 caps;
   1188
   1189	caps = __get_ibs_caps();
   1190	if (!caps)
   1191		return -ENODEV;	/* ibs not supported by the cpu */
   1192
   1193	ibs_eilvt_setup();
   1194
   1195	if (!ibs_eilvt_valid())
   1196		return -EINVAL;
   1197
   1198	perf_ibs_pm_init();
   1199
   1200	ibs_caps = caps;
   1201	/* make ibs_caps visible to other cpus: */
   1202	smp_mb();
   1203	/*
   1204	 * x86_pmu_amd_ibs_starting_cpu will be called from core on
   1205	 * all online cpus.
   1206	 */
   1207	cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_IBS_STARTING,
   1208			  "perf/x86/amd/ibs:starting",
   1209			  x86_pmu_amd_ibs_starting_cpu,
   1210			  x86_pmu_amd_ibs_dying_cpu);
   1211
   1212	return perf_event_ibs_init();
   1213}
   1214
   1215/* Since we need the pci subsystem to init ibs we can't do this earlier: */
   1216device_initcall(amd_ibs_init);