cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

core-book3s.c (67109B)


      1// SPDX-License-Identifier: GPL-2.0-or-later
      2/*
      3 * Performance event support - powerpc architecture code
      4 *
      5 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
      6 */
      7#include <linux/kernel.h>
      8#include <linux/sched.h>
      9#include <linux/sched/clock.h>
     10#include <linux/perf_event.h>
     11#include <linux/percpu.h>
     12#include <linux/hardirq.h>
     13#include <linux/uaccess.h>
     14#include <asm/reg.h>
     15#include <asm/pmc.h>
     16#include <asm/machdep.h>
     17#include <asm/firmware.h>
     18#include <asm/ptrace.h>
     19#include <asm/code-patching.h>
     20#include <asm/hw_irq.h>
     21#include <asm/interrupt.h>
     22
     23#ifdef CONFIG_PPC64
     24#include "internal.h"
     25#endif
     26
     27#define BHRB_MAX_ENTRIES	32
     28#define BHRB_TARGET		0x0000000000000002
     29#define BHRB_PREDICTION		0x0000000000000001
     30#define BHRB_EA			0xFFFFFFFFFFFFFFFCUL
     31
     32struct cpu_hw_events {
     33	int n_events;
     34	int n_percpu;
     35	int disabled;
     36	int n_added;
     37	int n_limited;
     38	u8  pmcs_enabled;
     39	struct perf_event *event[MAX_HWEVENTS];
     40	u64 events[MAX_HWEVENTS];
     41	unsigned int flags[MAX_HWEVENTS];
     42	struct mmcr_regs mmcr;
     43	struct perf_event *limited_counter[MAX_LIMITED_HWCOUNTERS];
     44	u8  limited_hwidx[MAX_LIMITED_HWCOUNTERS];
     45	u64 alternatives[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];
     46	unsigned long amasks[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];
     47	unsigned long avalues[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];
     48
     49	unsigned int txn_flags;
     50	int n_txn_start;
     51
     52	/* BHRB bits */
     53	u64				bhrb_filter;	/* BHRB HW branch filter */
     54	unsigned int			bhrb_users;
     55	void				*bhrb_context;
     56	struct	perf_branch_stack	bhrb_stack;
     57	struct	perf_branch_entry	bhrb_entries[BHRB_MAX_ENTRIES];
     58	u64				ic_init;
     59
     60	/* Store the PMC values */
     61	unsigned long pmcs[MAX_HWEVENTS];
     62};
     63
     64static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
     65
     66static struct power_pmu *ppmu;
     67
     68/*
     69 * Normally, to ignore kernel events we set the FCS (freeze counters
     70 * in supervisor mode) bit in MMCR0, but if the kernel runs with the
     71 * hypervisor bit set in the MSR, or if we are running on a processor
     72 * where the hypervisor bit is forced to 1 (as on Apple G5 processors),
     73 * then we need to use the FCHV bit to ignore kernel events.
     74 */
     75static unsigned int freeze_events_kernel = MMCR0_FCS;
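
/*
 * Editor's note (illustrative, not part of the original file): the switch
 * from FCS to FCHV is made when the PMU is registered. A minimal sketch of
 * that check, assuming it runs somewhere mfmsr() is usable, would be:
 *
 *	if (mfmsr() & MSR_HV)
 *		freeze_events_kernel = MMCR0_FCHV;
 *
 * i.e. when the kernel itself runs with MSR[HV] set, freezing "supervisor"
 * counting would not stop kernel-mode events, so the hypervisor freeze bit
 * is used instead.
 */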
     76
     77/*
     78 * 32-bit doesn't have MMCRA but does have an MMCR2,
     79 * and a few other names are different.
     80 * Also 32-bit doesn't have MMCR3, SIER2 and SIER3.
     81 * Define them as zero knowing that any code path accessing
     82 * these registers (via mtspr/mfspr) are done under ppmu flag
     83 * check for PPMU_ARCH_31 and we will not enter that code path
     84 * for 32-bit.
     85 */
     86#ifdef CONFIG_PPC32
     87
     88#define MMCR0_FCHV		0
     89#define MMCR0_PMCjCE		MMCR0_PMCnCE
     90#define MMCR0_FC56		0
     91#define MMCR0_PMAO		0
     92#define MMCR0_EBE		0
     93#define MMCR0_BHRBA		0
     94#define MMCR0_PMCC		0
     95#define MMCR0_PMCC_U6		0
     96
     97#define SPRN_MMCRA		SPRN_MMCR2
     98#define SPRN_MMCR3		0
     99#define SPRN_SIER2		0
    100#define SPRN_SIER3		0
    101#define MMCRA_SAMPLE_ENABLE	0
    102#define MMCRA_BHRB_DISABLE     0
    103#define MMCR0_PMCCEXT		0
    104
    105static inline unsigned long perf_ip_adjust(struct pt_regs *regs)
    106{
    107	return 0;
    108}
    109static inline void perf_get_data_addr(struct perf_event *event, struct pt_regs *regs, u64 *addrp) { }
    110static inline u32 perf_get_misc_flags(struct pt_regs *regs)
    111{
    112	return 0;
    113}
    114static inline void perf_read_regs(struct pt_regs *regs)
    115{
    116	regs->result = 0;
    117}
    118
    119static inline int siar_valid(struct pt_regs *regs)
    120{
    121	return 1;
    122}
    123
    124static bool is_ebb_event(struct perf_event *event) { return false; }
    125static int ebb_event_check(struct perf_event *event) { return 0; }
    126static void ebb_event_add(struct perf_event *event) { }
    127static void ebb_switch_out(unsigned long mmcr0) { }
    128static unsigned long ebb_switch_in(bool ebb, struct cpu_hw_events *cpuhw)
    129{
    130	return cpuhw->mmcr.mmcr0;
    131}
    132
    133static inline void power_pmu_bhrb_enable(struct perf_event *event) {}
    134static inline void power_pmu_bhrb_disable(struct perf_event *event) {}
    135static void power_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) {}
    136static inline void power_pmu_bhrb_read(struct perf_event *event, struct cpu_hw_events *cpuhw) {}
    137static void pmao_restore_workaround(bool ebb) { }
    138#endif /* CONFIG_PPC32 */
    139
    140bool is_sier_available(void)
    141{
    142	if (!ppmu)
    143		return false;
    144
    145	if (ppmu->flags & PPMU_HAS_SIER)
    146		return true;
    147
    148	return false;
    149}
    150
    151/*
    152 * Return PMC value corresponding to the
    153 * index passed.
    154 */
    155unsigned long get_pmcs_ext_regs(int idx)
    156{
    157	struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events);
    158
    159	return cpuhw->pmcs[idx];
    160}
    161
    162static bool regs_use_siar(struct pt_regs *regs)
    163{
    164	/*
    165	 * When we take a performance monitor exception the regs are setup
    166	 * using perf_read_regs() which overloads some fields, in particular
    167	 * regs->result to tell us whether to use SIAR.
    168	 *
    169	 * However if the regs are from another exception, eg. a syscall, then
    170	 * they have not been setup using perf_read_regs() and so regs->result
    171	 * is something random.
    172	 */
    173	return ((TRAP(regs) == INTERRUPT_PERFMON) && regs->result);
    174}
    175
    176/*
    177 * Things that are specific to 64-bit implementations.
    178 */
    179#ifdef CONFIG_PPC64
    180
    181static inline unsigned long perf_ip_adjust(struct pt_regs *regs)
    182{
    183	unsigned long mmcra = regs->dsisr;
    184
    185	if ((ppmu->flags & PPMU_HAS_SSLOT) && (mmcra & MMCRA_SAMPLE_ENABLE)) {
    186		unsigned long slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT;
    187		if (slot > 1)
    188			return 4 * (slot - 1);
    189	}
    190
    191	return 0;
    192}
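
/*
 * Editor's note (illustrative, not part of the original file): a worked
 * example of the slot adjustment above. With MMCRA[SAMPLE_ENABLE] set and a
 * sampled slot field of 3, perf_ip_adjust() returns 4 * (3 - 1) = 8, so the
 * reported instruction pointer is SIAR plus two 4-byte instructions; slot
 * values 0 and 1 leave SIAR unadjusted.
 */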
    193
    194/*
    195 * The user wants a data address recorded.
    196 * If we're not doing instruction sampling, give them the SDAR
    197 * (sampled data address).  If we are doing instruction sampling, then
    198 * only give them the SDAR if it corresponds to the instruction
    199 * pointed to by SIAR; this is indicated by the [POWER6_]MMCRA_SDSYNC, the
    200 * [POWER7P_]MMCRA_SDAR_VALID bit in MMCRA, or the SDAR_VALID bit in SIER.
    201 */
    202static inline void perf_get_data_addr(struct perf_event *event, struct pt_regs *regs, u64 *addrp)
    203{
    204	unsigned long mmcra = regs->dsisr;
    205	bool sdar_valid;
    206
    207	if (ppmu->flags & PPMU_HAS_SIER)
    208		sdar_valid = regs->dar & SIER_SDAR_VALID;
    209	else {
    210		unsigned long sdsync;
    211
    212		if (ppmu->flags & PPMU_SIAR_VALID)
    213			sdsync = POWER7P_MMCRA_SDAR_VALID;
    214		else if (ppmu->flags & PPMU_ALT_SIPR)
    215			sdsync = POWER6_MMCRA_SDSYNC;
    216		else if (ppmu->flags & PPMU_NO_SIAR)
    217			sdsync = MMCRA_SAMPLE_ENABLE;
    218		else
    219			sdsync = MMCRA_SDSYNC;
    220
    221		sdar_valid = mmcra & sdsync;
    222	}
    223
    224	if (!(mmcra & MMCRA_SAMPLE_ENABLE) || sdar_valid)
    225		*addrp = mfspr(SPRN_SDAR);
    226
    227	if (is_kernel_addr(mfspr(SPRN_SDAR)) && event->attr.exclude_kernel)
    228		*addrp = 0;
    229}
    230
    231static bool regs_sihv(struct pt_regs *regs)
    232{
    233	unsigned long sihv = MMCRA_SIHV;
    234
    235	if (ppmu->flags & PPMU_HAS_SIER)
    236		return !!(regs->dar & SIER_SIHV);
    237
    238	if (ppmu->flags & PPMU_ALT_SIPR)
    239		sihv = POWER6_MMCRA_SIHV;
    240
    241	return !!(regs->dsisr & sihv);
    242}
    243
    244static bool regs_sipr(struct pt_regs *regs)
    245{
    246	unsigned long sipr = MMCRA_SIPR;
    247
    248	if (ppmu->flags & PPMU_HAS_SIER)
    249		return !!(regs->dar & SIER_SIPR);
    250
    251	if (ppmu->flags & PPMU_ALT_SIPR)
    252		sipr = POWER6_MMCRA_SIPR;
    253
    254	return !!(regs->dsisr & sipr);
    255}
    256
    257static inline u32 perf_flags_from_msr(struct pt_regs *regs)
    258{
    259	if (regs->msr & MSR_PR)
    260		return PERF_RECORD_MISC_USER;
    261	if ((regs->msr & MSR_HV) && freeze_events_kernel != MMCR0_FCHV)
    262		return PERF_RECORD_MISC_HYPERVISOR;
    263	return PERF_RECORD_MISC_KERNEL;
    264}
    265
    266static inline u32 perf_get_misc_flags(struct pt_regs *regs)
    267{
    268	bool use_siar = regs_use_siar(regs);
    269	unsigned long mmcra = regs->dsisr;
    270	int marked = mmcra & MMCRA_SAMPLE_ENABLE;
    271
    272	if (!use_siar)
    273		return perf_flags_from_msr(regs);
    274
    275	/*
    276	 * Check the address in SIAR to identify the
    277	 * privilege levels since the SIER[MSR_HV, MSR_PR]
    278	 * bits are not set for marked events in power10
    279	 * DD1.
    280	 */
    281	if (marked && (ppmu->flags & PPMU_P10_DD1)) {
    282		unsigned long siar = mfspr(SPRN_SIAR);
    283		if (siar) {
    284			if (is_kernel_addr(siar))
    285				return PERF_RECORD_MISC_KERNEL;
    286			return PERF_RECORD_MISC_USER;
    287		} else {
    288			if (is_kernel_addr(regs->nip))
    289				return PERF_RECORD_MISC_KERNEL;
    290			return PERF_RECORD_MISC_USER;
    291		}
    292	}
    293
    294	/*
    295	 * If we don't have flags in MMCRA, rather than using
    296	 * the MSR, we intuit the flags from the address in
    297	 * SIAR which should give slightly more reliable
    298	 * results
    299	 */
    300	if (ppmu->flags & PPMU_NO_SIPR) {
    301		unsigned long siar = mfspr(SPRN_SIAR);
    302		if (is_kernel_addr(siar))
    303			return PERF_RECORD_MISC_KERNEL;
    304		return PERF_RECORD_MISC_USER;
    305	}
    306
    307	/* PR has priority over HV, so order below is important */
    308	if (regs_sipr(regs))
    309		return PERF_RECORD_MISC_USER;
    310
    311	if (regs_sihv(regs) && (freeze_events_kernel != MMCR0_FCHV))
    312		return PERF_RECORD_MISC_HYPERVISOR;
    313
    314	return PERF_RECORD_MISC_KERNEL;
    315}
    316
    317/*
    318 * Overload regs->dsisr to store MMCRA so we only need to read it once
    319 * on each interrupt.
    320 * Overload regs->dar to store SIER if we have it.
    321 * Overload regs->result to specify whether we should use the MSR (result
    322 * is zero) or the SIAR (result is non zero).
    323 */
    324static inline void perf_read_regs(struct pt_regs *regs)
    325{
    326	unsigned long mmcra = mfspr(SPRN_MMCRA);
    327	int marked = mmcra & MMCRA_SAMPLE_ENABLE;
    328	int use_siar;
    329
    330	regs->dsisr = mmcra;
    331
    332	if (ppmu->flags & PPMU_HAS_SIER)
    333		regs->dar = mfspr(SPRN_SIER);
    334
    335	/*
    336	 * If this isn't a PMU exception (eg a software event) the SIAR is
    337	 * not valid. Use pt_regs.
    338	 *
    339	 * If it is a marked event use the SIAR.
    340	 *
    341	 * If the PMU doesn't update the SIAR for non marked events use
    342	 * pt_regs.
    343	 *
    344	 * If regs is a kernel interrupt, always use SIAR. Some PMUs have an
    345	 * issue with regs_sipr not being in synch with SIAR in interrupt entry
    346	 * and return sequences, which can result in regs_sipr being true for
    347	 * kernel interrupts and SIAR, which has the effect of causing samples
    348	 * to pile up at mtmsrd MSR[EE] 0->1 or pending irq replay around
    349	 * interrupt entry/exit.
    350	 *
    351	 * If the PMU has HV/PR flags then check to see if they
    352	 * place the exception in userspace. If so, use pt_regs. In
    353	 * continuous sampling mode the SIAR and the PMU exception are
    354	 * not synchronised, so they may be many instructions apart.
    355	 * This can result in confusing backtraces. We still want
    356	 * hypervisor samples as well as samples in the kernel with
    357	 * interrupts off hence the userspace check.
    358	 */
    359	if (TRAP(regs) != INTERRUPT_PERFMON)
    360		use_siar = 0;
    361	else if ((ppmu->flags & PPMU_NO_SIAR))
    362		use_siar = 0;
    363	else if (marked)
    364		use_siar = 1;
    365	else if ((ppmu->flags & PPMU_NO_CONT_SAMPLING))
    366		use_siar = 0;
    367	else if (!user_mode(regs))
    368		use_siar = 1;
    369	else if (!(ppmu->flags & PPMU_NO_SIPR) && regs_sipr(regs))
    370		use_siar = 0;
    371	else
    372		use_siar = 1;
    373
    374	regs->result = use_siar;
    375}
    376
    377/*
    378 * On processors like P7+ that have the SIAR-Valid bit, marked instructions
    379 * must be sampled only if the SIAR-valid bit is set.
    380 *
    381 * For unmarked instructions and for processors that don't have the SIAR-Valid
    382 * bit, assume that SIAR is valid.
    383 */
    384static inline int siar_valid(struct pt_regs *regs)
    385{
    386	unsigned long mmcra = regs->dsisr;
    387	int marked = mmcra & MMCRA_SAMPLE_ENABLE;
    388
    389	if (marked) {
    390		/*
    391		 * SIER[SIAR_VALID] is not set for some
    392		 * marked events on power10 DD1, so drop
    393		 * the check for SIER[SIAR_VALID] and return true.
    394		 */
    395		if (ppmu->flags & PPMU_P10_DD1)
    396			return 0x1;
    397		else if (ppmu->flags & PPMU_HAS_SIER)
    398			return regs->dar & SIER_SIAR_VALID;
    399
    400		if (ppmu->flags & PPMU_SIAR_VALID)
    401			return mmcra & POWER7P_MMCRA_SIAR_VALID;
    402	}
    403
    404	return 1;
    405}
    406
    407
    408/* Reset all possible BHRB entries */
    409static void power_pmu_bhrb_reset(void)
    410{
    411	asm volatile(PPC_CLRBHRB);
    412}
    413
    414static void power_pmu_bhrb_enable(struct perf_event *event)
    415{
    416	struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events);
    417
    418	if (!ppmu->bhrb_nr)
    419		return;
    420
    421	/* Clear BHRB if we changed task context to avoid data leaks */
    422	if (event->ctx->task && cpuhw->bhrb_context != event->ctx) {
    423		power_pmu_bhrb_reset();
    424		cpuhw->bhrb_context = event->ctx;
    425	}
    426	cpuhw->bhrb_users++;
    427	perf_sched_cb_inc(event->ctx->pmu);
    428}
    429
    430static void power_pmu_bhrb_disable(struct perf_event *event)
    431{
    432	struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events);
    433
    434	if (!ppmu->bhrb_nr)
    435		return;
    436
    437	WARN_ON_ONCE(!cpuhw->bhrb_users);
    438	cpuhw->bhrb_users--;
    439	perf_sched_cb_dec(event->ctx->pmu);
    440
    441	if (!cpuhw->disabled && !cpuhw->bhrb_users) {
    442		/* BHRB cannot be turned off when other
    443		 * events are active on the PMU.
    444		 */
    445
    446		/* avoid stale pointer */
    447		cpuhw->bhrb_context = NULL;
    448	}
    449}
    450
     451/* Called from ctxsw to prevent one process's branch entries from
     452 * mingling with another process's entries during a context switch.
    453 */
    454static void power_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
    455{
    456	if (!ppmu->bhrb_nr)
    457		return;
    458
    459	if (sched_in)
    460		power_pmu_bhrb_reset();
    461}
    462/* Calculate the to address for a branch */
    463static __u64 power_pmu_bhrb_to(u64 addr)
    464{
    465	unsigned int instr;
    466	__u64 target;
    467
    468	if (is_kernel_addr(addr)) {
    469		if (copy_from_kernel_nofault(&instr, (void *)addr,
    470				sizeof(instr)))
    471			return 0;
    472
    473		return branch_target(&instr);
    474	}
    475
     476	/* Userspace: need to copy the instruction here, then translate it */
    477	if (copy_from_user_nofault(&instr, (unsigned int __user *)addr,
    478			sizeof(instr)))
    479		return 0;
    480
    481	target = branch_target(&instr);
    482	if ((!target) || (instr & BRANCH_ABSOLUTE))
    483		return target;
    484
    485	/* Translate relative branch target from kernel to user address */
    486	return target - (unsigned long)&instr + addr;
    487}
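
/*
 * Editor's note (illustrative, not part of the original file): in the
 * relative-branch case above, branch_target() computes a target relative to
 * the kernel copy of the instruction at &instr, so the result is rebased to
 * the original user address:
 *
 *	user_target = target - (unsigned long)&instr + addr
 *	            = addr + branch displacement
 *
 * Absolute branches (BRANCH_ABSOLUTE) and failed lookups are returned as-is.
 */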
    488
    489/* Processing BHRB entries */
    490static void power_pmu_bhrb_read(struct perf_event *event, struct cpu_hw_events *cpuhw)
    491{
    492	u64 val;
    493	u64 addr;
    494	int r_index, u_index, pred;
    495
    496	r_index = 0;
    497	u_index = 0;
    498	while (r_index < ppmu->bhrb_nr) {
    499		/* Assembly read function */
    500		val = read_bhrb(r_index++);
    501		if (!val)
    502			/* Terminal marker: End of valid BHRB entries */
    503			break;
    504		else {
    505			addr = val & BHRB_EA;
    506			pred = val & BHRB_PREDICTION;
    507
    508			if (!addr)
    509				/* invalid entry */
    510				continue;
    511
    512			/*
     513			 * The BHRB rolling buffer could well contain kernel
     514			 * addresses at this point. Check the privileges before
     515			 * exporting them to userspace (avoid exposing regions
     516			 * where we could have speculative execution).
     517			 * In case of ISA v3.1, the BHRB captures only user-space
     518			 * addresses, hence the flag check before the filtering code.
    519			 */
    520			if (!(ppmu->flags & PPMU_ARCH_31) &&
    521			    is_kernel_addr(addr) && event->attr.exclude_kernel)
    522				continue;
    523
    524			/* Branches are read most recent first (ie. mfbhrb 0 is
    525			 * the most recent branch).
    526			 * There are two types of valid entries:
    527			 * 1) a target entry which is the to address of a
    528			 *    computed goto like a blr,bctr,btar.  The next
     529			 *    entry read from the bhrb will be the branch
    530			 *    corresponding to this target (ie. the actual
    531			 *    blr/bctr/btar instruction).
    532			 * 2) a from address which is an actual branch.  If a
     533			 *    target entry precedes this, then this is the
    534			 *    matching branch for that target.  If this is not
    535			 *    following a target entry, then this is a branch
    536			 *    where the target is given as an immediate field
    537			 *    in the instruction (ie. an i or b form branch).
    538			 *    In this case we need to read the instruction from
    539			 *    memory to determine the target/to address.
    540			 */
    541
    542			if (val & BHRB_TARGET) {
    543				/* Target branches use two entries
    544				 * (ie. computed gotos/XL form)
    545				 */
    546				cpuhw->bhrb_entries[u_index].to = addr;
    547				cpuhw->bhrb_entries[u_index].mispred = pred;
    548				cpuhw->bhrb_entries[u_index].predicted = ~pred;
    549
    550				/* Get from address in next entry */
    551				val = read_bhrb(r_index++);
    552				addr = val & BHRB_EA;
    553				if (val & BHRB_TARGET) {
    554					/* Shouldn't have two targets in a
     555					   row. Reset index and try again */
    556					r_index--;
    557					addr = 0;
    558				}
    559				cpuhw->bhrb_entries[u_index].from = addr;
    560			} else {
    561				/* Branches to immediate field 
    562				   (ie I or B form) */
    563				cpuhw->bhrb_entries[u_index].from = addr;
    564				cpuhw->bhrb_entries[u_index].to =
    565					power_pmu_bhrb_to(addr);
    566				cpuhw->bhrb_entries[u_index].mispred = pred;
    567				cpuhw->bhrb_entries[u_index].predicted = ~pred;
    568			}
    569			u_index++;
    570
    571		}
    572	}
    573	cpuhw->bhrb_stack.nr = u_index;
    574	cpuhw->bhrb_stack.hw_idx = -1ULL;
    575	return;
    576}
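
/*
 * Editor's note: a minimal, self-contained sketch (not part of the original
 * file) of how one raw 64-bit BHRB value splits into the fields consumed by
 * power_pmu_bhrb_read() above, using only the BHRB_* masks defined near the
 * top of this file. Purely illustrative; nothing calls it.
 */
static inline void bhrb_decode_sketch(u64 val)
{
	u64 ea = val & BHRB_EA;			/* branch address, low two bits masked off */
	bool target = val & BHRB_TARGET;	/* entry holds a branch *target* (XL-form) */
	bool mispred = val & BHRB_PREDICTION;	/* branch was mispredicted */

	/*
	 * Example: val == 0xc000000000123456
	 *   ea      == 0xc000000000123454
	 *   target  == true   (bit 1 set: expect the matching "from" entry next)
	 *   mispred == false  (bit 0 clear)
	 */
	(void)ea;
	(void)target;
	(void)mispred;
}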
    577
    578static bool is_ebb_event(struct perf_event *event)
    579{
    580	/*
    581	 * This could be a per-PMU callback, but we'd rather avoid the cost. We
    582	 * check that the PMU supports EBB, meaning those that don't can still
    583	 * use bit 63 of the event code for something else if they wish.
    584	 */
    585	return (ppmu->flags & PPMU_ARCH_207S) &&
    586	       ((event->attr.config >> PERF_EVENT_CONFIG_EBB_SHIFT) & 1);
    587}
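
/*
 * Editor's note (illustrative, not part of the original file): on PMUs with
 * PPMU_ARCH_207S the EBB selector is the top bit of attr.config
 * (PERF_EVENT_CONFIG_EBB_SHIFT, bit 63). A raw event coded as
 * 0x8000000000000001ULL therefore requests EBB delivery of event 0x1, while
 * plain 0x1 is an ordinary event; PMUs without that flag may reuse bit 63.
 */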
    588
    589static int ebb_event_check(struct perf_event *event)
    590{
    591	struct perf_event *leader = event->group_leader;
    592
    593	/* Event and group leader must agree on EBB */
    594	if (is_ebb_event(leader) != is_ebb_event(event))
    595		return -EINVAL;
    596
    597	if (is_ebb_event(event)) {
    598		if (!(event->attach_state & PERF_ATTACH_TASK))
    599			return -EINVAL;
    600
    601		if (!leader->attr.pinned || !leader->attr.exclusive)
    602			return -EINVAL;
    603
    604		if (event->attr.freq ||
    605		    event->attr.inherit ||
    606		    event->attr.sample_type ||
    607		    event->attr.sample_period ||
    608		    event->attr.enable_on_exec)
    609			return -EINVAL;
    610	}
    611
    612	return 0;
    613}
    614
    615static void ebb_event_add(struct perf_event *event)
    616{
    617	if (!is_ebb_event(event) || current->thread.used_ebb)
    618		return;
    619
    620	/*
    621	 * IFF this is the first time we've added an EBB event, set
    622	 * PMXE in the user MMCR0 so we can detect when it's cleared by
    623	 * userspace. We need this so that we can context switch while
    624	 * userspace is in the EBB handler (where PMXE is 0).
    625	 */
    626	current->thread.used_ebb = 1;
    627	current->thread.mmcr0 |= MMCR0_PMXE;
    628}
    629
    630static void ebb_switch_out(unsigned long mmcr0)
    631{
    632	if (!(mmcr0 & MMCR0_EBE))
    633		return;
    634
    635	current->thread.siar  = mfspr(SPRN_SIAR);
    636	current->thread.sier  = mfspr(SPRN_SIER);
    637	current->thread.sdar  = mfspr(SPRN_SDAR);
    638	current->thread.mmcr0 = mmcr0 & MMCR0_USER_MASK;
    639	current->thread.mmcr2 = mfspr(SPRN_MMCR2) & MMCR2_USER_MASK;
    640	if (ppmu->flags & PPMU_ARCH_31) {
    641		current->thread.mmcr3 = mfspr(SPRN_MMCR3);
    642		current->thread.sier2 = mfspr(SPRN_SIER2);
    643		current->thread.sier3 = mfspr(SPRN_SIER3);
    644	}
    645}
    646
    647static unsigned long ebb_switch_in(bool ebb, struct cpu_hw_events *cpuhw)
    648{
    649	unsigned long mmcr0 = cpuhw->mmcr.mmcr0;
    650
    651	if (!ebb)
    652		goto out;
    653
    654	/* Enable EBB and read/write to all 6 PMCs and BHRB for userspace */
    655	mmcr0 |= MMCR0_EBE | MMCR0_BHRBA | MMCR0_PMCC_U6;
    656
    657	/*
    658	 * Add any bits from the user MMCR0, FC or PMAO. This is compatible
    659	 * with pmao_restore_workaround() because we may add PMAO but we never
    660	 * clear it here.
    661	 */
    662	mmcr0 |= current->thread.mmcr0;
    663
    664	/*
    665	 * Be careful not to set PMXE if userspace had it cleared. This is also
    666	 * compatible with pmao_restore_workaround() because it has already
    667	 * cleared PMXE and we leave PMAO alone.
    668	 */
    669	if (!(current->thread.mmcr0 & MMCR0_PMXE))
    670		mmcr0 &= ~MMCR0_PMXE;
    671
    672	mtspr(SPRN_SIAR, current->thread.siar);
    673	mtspr(SPRN_SIER, current->thread.sier);
    674	mtspr(SPRN_SDAR, current->thread.sdar);
    675
    676	/*
    677	 * Merge the kernel & user values of MMCR2. The semantics we implement
    678	 * are that the user MMCR2 can set bits, ie. cause counters to freeze,
    679	 * but not clear bits. If a task wants to be able to clear bits, ie.
    680	 * unfreeze counters, it should not set exclude_xxx in its events and
    681	 * instead manage the MMCR2 entirely by itself.
    682	 */
    683	mtspr(SPRN_MMCR2, cpuhw->mmcr.mmcr2 | current->thread.mmcr2);
    684
    685	if (ppmu->flags & PPMU_ARCH_31) {
    686		mtspr(SPRN_MMCR3, current->thread.mmcr3);
    687		mtspr(SPRN_SIER2, current->thread.sier2);
    688		mtspr(SPRN_SIER3, current->thread.sier3);
    689	}
    690out:
    691	return mmcr0;
    692}
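
/*
 * Editor's note (illustrative, not part of the original file): the MMCR2
 * merge above is a plain bitwise OR, so any freeze bit the task set in its
 * user MMCR2 remains set on top of the kernel value, but the task can never
 * clear a freeze bit the kernel requested (for example one derived from an
 * exclude_* attribute).
 */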
    693
    694static void pmao_restore_workaround(bool ebb)
    695{
    696	unsigned pmcs[6];
    697
    698	if (!cpu_has_feature(CPU_FTR_PMAO_BUG))
    699		return;
    700
    701	/*
    702	 * On POWER8E there is a hardware defect which affects the PMU context
    703	 * switch logic, ie. power_pmu_disable/enable().
    704	 *
    705	 * When a counter overflows PMXE is cleared and FC/PMAO is set in MMCR0
    706	 * by the hardware. Sometime later the actual PMU exception is
    707	 * delivered.
    708	 *
    709	 * If we context switch, or simply disable/enable, the PMU prior to the
    710	 * exception arriving, the exception will be lost when we clear PMAO.
    711	 *
    712	 * When we reenable the PMU, we will write the saved MMCR0 with PMAO
    713	 * set, and this _should_ generate an exception. However because of the
    714	 * defect no exception is generated when we write PMAO, and we get
    715	 * stuck with no counters counting but no exception delivered.
    716	 *
    717	 * The workaround is to detect this case and tweak the hardware to
    718	 * create another pending PMU exception.
    719	 *
    720	 * We do that by setting up PMC6 (cycles) for an imminent overflow and
    721	 * enabling the PMU. That causes a new exception to be generated in the
    722	 * chip, but we don't take it yet because we have interrupts hard
    723	 * disabled. We then write back the PMU state as we want it to be seen
    724	 * by the exception handler. When we reenable interrupts the exception
    725	 * handler will be called and see the correct state.
    726	 *
    727	 * The logic is the same for EBB, except that the exception is gated by
    728	 * us having interrupts hard disabled as well as the fact that we are
    729	 * not in userspace. The exception is finally delivered when we return
    730	 * to userspace.
    731	 */
    732
    733	/* Only if PMAO is set and PMAO_SYNC is clear */
    734	if ((current->thread.mmcr0 & (MMCR0_PMAO | MMCR0_PMAO_SYNC)) != MMCR0_PMAO)
    735		return;
    736
    737	/* If we're doing EBB, only if BESCR[GE] is set */
    738	if (ebb && !(current->thread.bescr & BESCR_GE))
    739		return;
    740
    741	/*
    742	 * We are already soft-disabled in power_pmu_enable(). We need to hard
    743	 * disable to actually prevent the PMU exception from firing.
    744	 */
    745	hard_irq_disable();
    746
    747	/*
    748	 * This is a bit gross, but we know we're on POWER8E and have 6 PMCs.
    749	 * Using read/write_pmc() in a for loop adds 12 function calls and
    750	 * almost doubles our code size.
    751	 */
    752	pmcs[0] = mfspr(SPRN_PMC1);
    753	pmcs[1] = mfspr(SPRN_PMC2);
    754	pmcs[2] = mfspr(SPRN_PMC3);
    755	pmcs[3] = mfspr(SPRN_PMC4);
    756	pmcs[4] = mfspr(SPRN_PMC5);
    757	pmcs[5] = mfspr(SPRN_PMC6);
    758
    759	/* Ensure all freeze bits are unset */
    760	mtspr(SPRN_MMCR2, 0);
    761
    762	/* Set up PMC6 to overflow in one cycle */
    763	mtspr(SPRN_PMC6, 0x7FFFFFFE);
    764
    765	/* Enable exceptions and unfreeze PMC6 */
    766	mtspr(SPRN_MMCR0, MMCR0_PMXE | MMCR0_PMCjCE | MMCR0_PMAO);
    767
    768	/* Now we need to refreeze and restore the PMCs */
    769	mtspr(SPRN_MMCR0, MMCR0_FC | MMCR0_PMAO);
    770
    771	mtspr(SPRN_PMC1, pmcs[0]);
    772	mtspr(SPRN_PMC2, pmcs[1]);
    773	mtspr(SPRN_PMC3, pmcs[2]);
    774	mtspr(SPRN_PMC4, pmcs[3]);
    775	mtspr(SPRN_PMC5, pmcs[4]);
    776	mtspr(SPRN_PMC6, pmcs[5]);
    777}
    778
    779/*
    780 * If the perf subsystem wants performance monitor interrupts as soon as
    781 * possible (e.g., to sample the instruction address and stack chain),
    782 * this should return true. The IRQ masking code can then enable MSR[EE]
    783 * in some places (e.g., interrupt handlers) that allows PMI interrupts
    784 * through to improve accuracy of profiles, at the cost of some performance.
    785 *
    786 * The PMU counters can be enabled by other means (e.g., sysfs raw SPR
    787 * access), but in that case there is no need for prompt PMI handling.
    788 *
    789 * This currently returns true if any perf counter is being used. It
    790 * could possibly return false if only events are being counted rather than
    791 * samples being taken, but for now this is good enough.
    792 */
    793bool power_pmu_wants_prompt_pmi(void)
    794{
    795	struct cpu_hw_events *cpuhw;
    796
    797	/*
    798	 * This could simply test local_paca->pmcregs_in_use if that were not
    799	 * under ifdef KVM.
    800	 */
    801	if (!ppmu)
    802		return false;
    803
    804	cpuhw = this_cpu_ptr(&cpu_hw_events);
    805	return cpuhw->n_events;
    806}
    807#endif /* CONFIG_PPC64 */
    808
    809static void perf_event_interrupt(struct pt_regs *regs);
    810
    811/*
    812 * Read one performance monitor counter (PMC).
    813 */
    814static unsigned long read_pmc(int idx)
    815{
    816	unsigned long val;
    817
    818	switch (idx) {
    819	case 1:
    820		val = mfspr(SPRN_PMC1);
    821		break;
    822	case 2:
    823		val = mfspr(SPRN_PMC2);
    824		break;
    825	case 3:
    826		val = mfspr(SPRN_PMC3);
    827		break;
    828	case 4:
    829		val = mfspr(SPRN_PMC4);
    830		break;
    831	case 5:
    832		val = mfspr(SPRN_PMC5);
    833		break;
    834	case 6:
    835		val = mfspr(SPRN_PMC6);
    836		break;
    837#ifdef CONFIG_PPC64
    838	case 7:
    839		val = mfspr(SPRN_PMC7);
    840		break;
    841	case 8:
    842		val = mfspr(SPRN_PMC8);
    843		break;
    844#endif /* CONFIG_PPC64 */
    845	default:
    846		printk(KERN_ERR "oops trying to read PMC%d\n", idx);
    847		val = 0;
    848	}
    849	return val;
    850}
    851
    852/*
    853 * Write one PMC.
    854 */
    855static void write_pmc(int idx, unsigned long val)
    856{
    857	switch (idx) {
    858	case 1:
    859		mtspr(SPRN_PMC1, val);
    860		break;
    861	case 2:
    862		mtspr(SPRN_PMC2, val);
    863		break;
    864	case 3:
    865		mtspr(SPRN_PMC3, val);
    866		break;
    867	case 4:
    868		mtspr(SPRN_PMC4, val);
    869		break;
    870	case 5:
    871		mtspr(SPRN_PMC5, val);
    872		break;
    873	case 6:
    874		mtspr(SPRN_PMC6, val);
    875		break;
    876#ifdef CONFIG_PPC64
    877	case 7:
    878		mtspr(SPRN_PMC7, val);
    879		break;
    880	case 8:
    881		mtspr(SPRN_PMC8, val);
    882		break;
    883#endif /* CONFIG_PPC64 */
    884	default:
    885		printk(KERN_ERR "oops trying to write PMC%d\n", idx);
    886	}
    887}
    888
    889static int any_pmc_overflown(struct cpu_hw_events *cpuhw)
    890{
    891	int i, idx;
    892
    893	for (i = 0; i < cpuhw->n_events; i++) {
    894		idx = cpuhw->event[i]->hw.idx;
    895		if ((idx) && ((int)read_pmc(idx) < 0))
    896			return idx;
    897	}
    898
    899	return 0;
    900}
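
/*
 * Editor's note (illustrative, not part of the original file): "overflown"
 * here relies on the sign-bit convention used throughout this file. PMCs are
 * armed with 0x80000000 - period_left, so once a counter has counted its
 * remaining period its most significant bit is set and it reads back as a
 * negative 32-bit value; e.g. a PMC armed at 0x7ffffff0 reads 0x80000005
 * after 0x15 further events, and (int)0x80000005 < 0.
 */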
    901
    902/* Called from sysrq_handle_showregs() */
    903void perf_event_print_debug(void)
    904{
    905	unsigned long sdar, sier, flags;
    906	u32 pmcs[MAX_HWEVENTS];
    907	int i;
    908
    909	if (!ppmu) {
    910		pr_info("Performance monitor hardware not registered.\n");
    911		return;
    912	}
    913
    914	if (!ppmu->n_counter)
    915		return;
    916
    917	local_irq_save(flags);
    918
     919	pr_info("CPU: %d PMU registers, ppmu = %s n_counters = %d\n",
    920		 smp_processor_id(), ppmu->name, ppmu->n_counter);
    921
    922	for (i = 0; i < ppmu->n_counter; i++)
    923		pmcs[i] = read_pmc(i + 1);
    924
    925	for (; i < MAX_HWEVENTS; i++)
    926		pmcs[i] = 0xdeadbeef;
    927
    928	pr_info("PMC1:  %08x PMC2: %08x PMC3: %08x PMC4: %08x\n",
    929		 pmcs[0], pmcs[1], pmcs[2], pmcs[3]);
    930
    931	if (ppmu->n_counter > 4)
    932		pr_info("PMC5:  %08x PMC6: %08x PMC7: %08x PMC8: %08x\n",
    933			 pmcs[4], pmcs[5], pmcs[6], pmcs[7]);
    934
    935	pr_info("MMCR0: %016lx MMCR1: %016lx MMCRA: %016lx\n",
    936		mfspr(SPRN_MMCR0), mfspr(SPRN_MMCR1), mfspr(SPRN_MMCRA));
    937
    938	sdar = sier = 0;
    939#ifdef CONFIG_PPC64
    940	sdar = mfspr(SPRN_SDAR);
    941
    942	if (ppmu->flags & PPMU_HAS_SIER)
    943		sier = mfspr(SPRN_SIER);
    944
    945	if (ppmu->flags & PPMU_ARCH_207S) {
    946		pr_info("MMCR2: %016lx EBBHR: %016lx\n",
    947			mfspr(SPRN_MMCR2), mfspr(SPRN_EBBHR));
    948		pr_info("EBBRR: %016lx BESCR: %016lx\n",
    949			mfspr(SPRN_EBBRR), mfspr(SPRN_BESCR));
    950	}
    951
    952	if (ppmu->flags & PPMU_ARCH_31) {
    953		pr_info("MMCR3: %016lx SIER2: %016lx SIER3: %016lx\n",
    954			mfspr(SPRN_MMCR3), mfspr(SPRN_SIER2), mfspr(SPRN_SIER3));
    955	}
    956#endif
    957	pr_info("SIAR:  %016lx SDAR:  %016lx SIER:  %016lx\n",
    958		mfspr(SPRN_SIAR), sdar, sier);
    959
    960	local_irq_restore(flags);
    961}
    962
    963/*
    964 * Check if a set of events can all go on the PMU at once.
    965 * If they can't, this will look at alternative codes for the events
    966 * and see if any combination of alternative codes is feasible.
    967 * The feasible set is returned in event_id[].
    968 */
    969static int power_check_constraints(struct cpu_hw_events *cpuhw,
    970				   u64 event_id[], unsigned int cflags[],
    971				   int n_ev, struct perf_event **event)
    972{
    973	unsigned long mask, value, nv;
    974	unsigned long smasks[MAX_HWEVENTS], svalues[MAX_HWEVENTS];
    975	int n_alt[MAX_HWEVENTS], choice[MAX_HWEVENTS];
    976	int i, j;
    977	unsigned long addf = ppmu->add_fields;
    978	unsigned long tadd = ppmu->test_adder;
    979	unsigned long grp_mask = ppmu->group_constraint_mask;
    980	unsigned long grp_val = ppmu->group_constraint_val;
    981
    982	if (n_ev > ppmu->n_counter)
    983		return -1;
    984
    985	/* First see if the events will go on as-is */
    986	for (i = 0; i < n_ev; ++i) {
    987		if ((cflags[i] & PPMU_LIMITED_PMC_REQD)
    988		    && !ppmu->limited_pmc_event(event_id[i])) {
    989			ppmu->get_alternatives(event_id[i], cflags[i],
    990					       cpuhw->alternatives[i]);
    991			event_id[i] = cpuhw->alternatives[i][0];
    992		}
    993		if (ppmu->get_constraint(event_id[i], &cpuhw->amasks[i][0],
    994					 &cpuhw->avalues[i][0], event[i]->attr.config1))
    995			return -1;
    996	}
    997	value = mask = 0;
    998	for (i = 0; i < n_ev; ++i) {
    999		nv = (value | cpuhw->avalues[i][0]) +
   1000			(value & cpuhw->avalues[i][0] & addf);
   1001
   1002		if (((((nv + tadd) ^ value) & mask) & (~grp_mask)) != 0)
   1003			break;
   1004
   1005		if (((((nv + tadd) ^ cpuhw->avalues[i][0]) & cpuhw->amasks[i][0])
   1006			& (~grp_mask)) != 0)
   1007			break;
   1008
   1009		value = nv;
   1010		mask |= cpuhw->amasks[i][0];
   1011	}
   1012	if (i == n_ev) {
   1013		if ((value & mask & grp_mask) != (mask & grp_val))
   1014			return -1;
   1015		else
   1016			return 0;	/* all OK */
   1017	}
   1018
   1019	/* doesn't work, gather alternatives... */
   1020	if (!ppmu->get_alternatives)
   1021		return -1;
   1022	for (i = 0; i < n_ev; ++i) {
   1023		choice[i] = 0;
   1024		n_alt[i] = ppmu->get_alternatives(event_id[i], cflags[i],
   1025						  cpuhw->alternatives[i]);
   1026		for (j = 1; j < n_alt[i]; ++j)
   1027			ppmu->get_constraint(cpuhw->alternatives[i][j],
   1028					     &cpuhw->amasks[i][j],
   1029					     &cpuhw->avalues[i][j],
   1030					     event[i]->attr.config1);
   1031	}
   1032
   1033	/* enumerate all possibilities and see if any will work */
   1034	i = 0;
   1035	j = -1;
   1036	value = mask = nv = 0;
   1037	while (i < n_ev) {
   1038		if (j >= 0) {
   1039			/* we're backtracking, restore context */
   1040			value = svalues[i];
   1041			mask = smasks[i];
   1042			j = choice[i];
   1043		}
   1044		/*
   1045		 * See if any alternative k for event_id i,
   1046		 * where k > j, will satisfy the constraints.
   1047		 */
   1048		while (++j < n_alt[i]) {
   1049			nv = (value | cpuhw->avalues[i][j]) +
   1050				(value & cpuhw->avalues[i][j] & addf);
   1051			if ((((nv + tadd) ^ value) & mask) == 0 &&
   1052			    (((nv + tadd) ^ cpuhw->avalues[i][j])
   1053			     & cpuhw->amasks[i][j]) == 0)
   1054				break;
   1055		}
   1056		if (j >= n_alt[i]) {
   1057			/*
   1058			 * No feasible alternative, backtrack
   1059			 * to event_id i-1 and continue enumerating its
   1060			 * alternatives from where we got up to.
   1061			 */
   1062			if (--i < 0)
   1063				return -1;
   1064		} else {
   1065			/*
   1066			 * Found a feasible alternative for event_id i,
   1067			 * remember where we got up to with this event_id,
   1068			 * go on to the next event_id, and start with
   1069			 * the first alternative for it.
   1070			 */
   1071			choice[i] = j;
   1072			svalues[i] = value;
   1073			smasks[i] = mask;
   1074			value = nv;
   1075			mask |= cpuhw->amasks[i][j];
   1076			++i;
   1077			j = -1;
   1078		}
   1079	}
   1080
   1081	/* OK, we have a feasible combination, tell the caller the solution */
   1082	for (i = 0; i < n_ev; ++i)
   1083		event_id[i] = cpuhw->alternatives[i][choice[i]];
   1084	return 0;
   1085}
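
/*
 * Editor's note (illustrative, not part of the original file): the loop above
 * is a depth-first search over alternative event codes. choice[i] records
 * which alternative event i is currently using, while svalues[i]/smasks[i]
 * snapshot the accumulated constraint state before event i was placed, so
 * backtracking to event i-1 can resume its enumeration where it left off
 * instead of restarting from scratch.
 */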
   1086
   1087/*
   1088 * Check if newly-added events have consistent settings for
   1089 * exclude_{user,kernel,hv} with each other and any previously
   1090 * added events.
   1091 */
   1092static int check_excludes(struct perf_event **ctrs, unsigned int cflags[],
   1093			  int n_prev, int n_new)
   1094{
   1095	int eu = 0, ek = 0, eh = 0;
   1096	int i, n, first;
   1097	struct perf_event *event;
   1098
   1099	/*
   1100	 * If the PMU we're on supports per event exclude settings then we
   1101	 * don't need to do any of this logic. NB. This assumes no PMU has both
   1102	 * per event exclude and limited PMCs.
   1103	 */
   1104	if (ppmu->flags & PPMU_ARCH_207S)
   1105		return 0;
   1106
   1107	n = n_prev + n_new;
   1108	if (n <= 1)
   1109		return 0;
   1110
   1111	first = 1;
   1112	for (i = 0; i < n; ++i) {
   1113		if (cflags[i] & PPMU_LIMITED_PMC_OK) {
   1114			cflags[i] &= ~PPMU_LIMITED_PMC_REQD;
   1115			continue;
   1116		}
   1117		event = ctrs[i];
   1118		if (first) {
   1119			eu = event->attr.exclude_user;
   1120			ek = event->attr.exclude_kernel;
   1121			eh = event->attr.exclude_hv;
   1122			first = 0;
   1123		} else if (event->attr.exclude_user != eu ||
   1124			   event->attr.exclude_kernel != ek ||
   1125			   event->attr.exclude_hv != eh) {
   1126			return -EAGAIN;
   1127		}
   1128	}
   1129
   1130	if (eu || ek || eh)
   1131		for (i = 0; i < n; ++i)
   1132			if (cflags[i] & PPMU_LIMITED_PMC_OK)
   1133				cflags[i] |= PPMU_LIMITED_PMC_REQD;
   1134
   1135	return 0;
   1136}
   1137
   1138static u64 check_and_compute_delta(u64 prev, u64 val)
   1139{
   1140	u64 delta = (val - prev) & 0xfffffffful;
   1141
   1142	/*
   1143	 * POWER7 can roll back counter values, if the new value is smaller
   1144	 * than the previous value it will cause the delta and the counter to
   1145	 * have bogus values unless we rolled a counter over.  If a counter is
   1146	 * rolled back, it will be smaller, but within 256, which is the maximum
    1147	 * number of events to roll back at once.  If we detect a rollback
   1148	 * return 0.  This can lead to a small lack of precision in the
   1149	 * counters.
   1150	 */
   1151	if (prev > val && (prev - val) < 256)
   1152		delta = 0;
   1153
   1154	return delta;
   1155}
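
/*
 * Editor's note: two worked cases for check_and_compute_delta() above
 * (illustrative, not part of the original file):
 *
 *   prev = 0xfffffff0, val = 0x00000010
 *     delta = (0x00000010 - 0xfffffff0) & 0xffffffff = 0x20
 *     -> an ordinary 32-bit wrap; 32 events are credited.
 *
 *   prev = 0x00000100, val = 0x000000f0
 *     the raw delta would be 0xfffffff0, but prev > val and
 *     prev - val = 0x10 < 256, so this is treated as a POWER7 rollback
 *     and the delta is forced to 0.
 */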
   1156
   1157static void power_pmu_read(struct perf_event *event)
   1158{
   1159	s64 val, delta, prev;
   1160
   1161	if (event->hw.state & PERF_HES_STOPPED)
   1162		return;
   1163
   1164	if (!event->hw.idx)
   1165		return;
   1166
   1167	if (is_ebb_event(event)) {
   1168		val = read_pmc(event->hw.idx);
   1169		local64_set(&event->hw.prev_count, val);
   1170		return;
   1171	}
   1172
   1173	/*
   1174	 * Performance monitor interrupts come even when interrupts
   1175	 * are soft-disabled, as long as interrupts are hard-enabled.
   1176	 * Therefore we treat them like NMIs.
   1177	 */
   1178	do {
   1179		prev = local64_read(&event->hw.prev_count);
   1180		barrier();
   1181		val = read_pmc(event->hw.idx);
   1182		delta = check_and_compute_delta(prev, val);
   1183		if (!delta)
   1184			return;
   1185	} while (local64_cmpxchg(&event->hw.prev_count, prev, val) != prev);
   1186
   1187	local64_add(delta, &event->count);
   1188
   1189	/*
   1190	 * A number of places program the PMC with (0x80000000 - period_left).
   1191	 * We never want period_left to be less than 1 because we will program
    1192	 * the PMC with a value >= 0x80000000 and an edge detected PMC will
   1193	 * roll around to 0 before taking an exception. We have seen this
   1194	 * on POWER8.
   1195	 *
   1196	 * To fix this, clamp the minimum value of period_left to 1.
   1197	 */
   1198	do {
   1199		prev = local64_read(&event->hw.period_left);
   1200		val = prev - delta;
   1201		if (val < 1)
   1202			val = 1;
   1203	} while (local64_cmpxchg(&event->hw.period_left, prev, val) != prev);
   1204}
   1205
   1206/*
   1207 * On some machines, PMC5 and PMC6 can't be written, don't respect
   1208 * the freeze conditions, and don't generate interrupts.  This tells
   1209 * us if `event' is using such a PMC.
   1210 */
   1211static int is_limited_pmc(int pmcnum)
   1212{
   1213	return (ppmu->flags & PPMU_LIMITED_PMC5_6)
   1214		&& (pmcnum == 5 || pmcnum == 6);
   1215}
   1216
   1217static void freeze_limited_counters(struct cpu_hw_events *cpuhw,
   1218				    unsigned long pmc5, unsigned long pmc6)
   1219{
   1220	struct perf_event *event;
   1221	u64 val, prev, delta;
   1222	int i;
   1223
   1224	for (i = 0; i < cpuhw->n_limited; ++i) {
   1225		event = cpuhw->limited_counter[i];
   1226		if (!event->hw.idx)
   1227			continue;
   1228		val = (event->hw.idx == 5) ? pmc5 : pmc6;
   1229		prev = local64_read(&event->hw.prev_count);
   1230		event->hw.idx = 0;
   1231		delta = check_and_compute_delta(prev, val);
   1232		if (delta)
   1233			local64_add(delta, &event->count);
   1234	}
   1235}
   1236
   1237static void thaw_limited_counters(struct cpu_hw_events *cpuhw,
   1238				  unsigned long pmc5, unsigned long pmc6)
   1239{
   1240	struct perf_event *event;
   1241	u64 val, prev;
   1242	int i;
   1243
   1244	for (i = 0; i < cpuhw->n_limited; ++i) {
   1245		event = cpuhw->limited_counter[i];
   1246		event->hw.idx = cpuhw->limited_hwidx[i];
   1247		val = (event->hw.idx == 5) ? pmc5 : pmc6;
   1248		prev = local64_read(&event->hw.prev_count);
   1249		if (check_and_compute_delta(prev, val))
   1250			local64_set(&event->hw.prev_count, val);
   1251		perf_event_update_userpage(event);
   1252	}
   1253}
   1254
   1255/*
   1256 * Since limited events don't respect the freeze conditions, we
   1257 * have to read them immediately after freezing or unfreezing the
   1258 * other events.  We try to keep the values from the limited
   1259 * events as consistent as possible by keeping the delay (in
   1260 * cycles and instructions) between freezing/unfreezing and reading
   1261 * the limited events as small and consistent as possible.
   1262 * Therefore, if any limited events are in use, we read them
   1263 * both, and always in the same order, to minimize variability,
   1264 * and do it inside the same asm that writes MMCR0.
   1265 */
   1266static void write_mmcr0(struct cpu_hw_events *cpuhw, unsigned long mmcr0)
   1267{
   1268	unsigned long pmc5, pmc6;
   1269
   1270	if (!cpuhw->n_limited) {
   1271		mtspr(SPRN_MMCR0, mmcr0);
   1272		return;
   1273	}
   1274
   1275	/*
   1276	 * Write MMCR0, then read PMC5 and PMC6 immediately.
   1277	 * To ensure we don't get a performance monitor interrupt
   1278	 * between writing MMCR0 and freezing/thawing the limited
   1279	 * events, we first write MMCR0 with the event overflow
   1280	 * interrupt enable bits turned off.
   1281	 */
   1282	asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5"
   1283		     : "=&r" (pmc5), "=&r" (pmc6)
   1284		     : "r" (mmcr0 & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)),
   1285		       "i" (SPRN_MMCR0),
   1286		       "i" (SPRN_PMC5), "i" (SPRN_PMC6));
   1287
   1288	if (mmcr0 & MMCR0_FC)
   1289		freeze_limited_counters(cpuhw, pmc5, pmc6);
   1290	else
   1291		thaw_limited_counters(cpuhw, pmc5, pmc6);
   1292
   1293	/*
   1294	 * Write the full MMCR0 including the event overflow interrupt
   1295	 * enable bits, if necessary.
   1296	 */
   1297	if (mmcr0 & (MMCR0_PMC1CE | MMCR0_PMCjCE))
   1298		mtspr(SPRN_MMCR0, mmcr0);
   1299}
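
/*
 * Editor's note (illustrative, not part of the original file): the inline
 * asm above expands to roughly
 *
 *	mtspr	SPRN_MMCR0, <mmcr0 with PMC1CE/PMCjCE cleared>
 *	mfspr	pmc5, SPRN_PMC5
 *	mfspr	pmc6, SPRN_PMC6
 *
 * so the freeze/unfreeze and the two limited-counter reads execute back to
 * back with no intervening instructions, which is what bounds the skew
 * described in the comment before this function.
 */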
   1300
   1301/*
   1302 * Disable all events to prevent PMU interrupts and to allow
   1303 * events to be added or removed.
   1304 */
   1305static void power_pmu_disable(struct pmu *pmu)
   1306{
   1307	struct cpu_hw_events *cpuhw;
   1308	unsigned long flags, mmcr0, val, mmcra;
   1309
   1310	if (!ppmu)
   1311		return;
   1312	local_irq_save(flags);
   1313	cpuhw = this_cpu_ptr(&cpu_hw_events);
   1314
   1315	if (!cpuhw->disabled) {
   1316		/*
   1317		 * Check if we ever enabled the PMU on this cpu.
   1318		 */
   1319		if (!cpuhw->pmcs_enabled) {
   1320			ppc_enable_pmcs();
   1321			cpuhw->pmcs_enabled = 1;
   1322		}
   1323
   1324		/*
   1325		 * Set the 'freeze counters' bit, clear EBE/BHRBA/PMCC/PMAO/FC56
    1326		 * Also clear PMXE to disable PMIs from getting triggered in some
   1327		 * corner cases during PMU disable.
   1328		 */
   1329		val  = mmcr0 = mfspr(SPRN_MMCR0);
   1330		val |= MMCR0_FC;
   1331		val &= ~(MMCR0_EBE | MMCR0_BHRBA | MMCR0_PMCC | MMCR0_PMAO |
   1332			 MMCR0_PMXE | MMCR0_FC56);
   1333		/* Set mmcr0 PMCCEXT for p10 */
   1334		if (ppmu->flags & PPMU_ARCH_31)
   1335			val |= MMCR0_PMCCEXT;
   1336
   1337		/*
   1338		 * The barrier is to make sure the mtspr has been
   1339		 * executed and the PMU has frozen the events etc.
   1340		 * before we return.
   1341		 */
   1342		write_mmcr0(cpuhw, val);
   1343		mb();
   1344		isync();
   1345
   1346		/*
   1347		 * Some corner cases could clear the PMU counter overflow
   1348		 * while a masked PMI is pending. One such case is when
   1349		 * a PMI happens during interrupt replay and perf counter
   1350		 * values are cleared by PMU callbacks before replay.
   1351		 *
    1352		 * If any PMC corresponding to the active PMU events has
    1353		 * overflowed, disable the interrupt by clearing the paca
    1354		 * bit for PMI since we are disabling the PMU now.
    1355		 * Otherwise warn if a PMI is pending but no counter is
    1356		 * found to have overflowed.
   1357		 */
   1358		if (any_pmc_overflown(cpuhw)) {
   1359			/*
   1360			 * Since power_pmu_disable runs under local_irq_save, it
   1361			 * could happen that code hits a PMC overflow without PMI
   1362			 * pending in paca. Hence only clear PMI pending if it was
   1363			 * set.
   1364			 *
   1365			 * If a PMI is pending, then MSR[EE] must be disabled (because
    1366			 * the masked PMI handler disables EE). So it is safe to
   1367			 * call clear_pmi_irq_pending().
   1368			 */
   1369			if (pmi_irq_pending())
   1370				clear_pmi_irq_pending();
   1371		} else
   1372			WARN_ON(pmi_irq_pending());
   1373
   1374		val = mmcra = cpuhw->mmcr.mmcra;
   1375
   1376		/*
   1377		 * Disable instruction sampling if it was enabled
   1378		 */
   1379		if (cpuhw->mmcr.mmcra & MMCRA_SAMPLE_ENABLE)
   1380			val &= ~MMCRA_SAMPLE_ENABLE;
   1381
   1382		/* Disable BHRB via mmcra (BHRBRD) for p10 */
   1383		if (ppmu->flags & PPMU_ARCH_31)
   1384			val |= MMCRA_BHRB_DISABLE;
   1385
   1386		/*
   1387		 * Write SPRN_MMCRA if mmcra has either disabled
   1388		 * instruction sampling or BHRB.
   1389		 */
   1390		if (val != mmcra) {
   1391			mtspr(SPRN_MMCRA, mmcra);
   1392			mb();
   1393			isync();
   1394		}
   1395
   1396		cpuhw->disabled = 1;
   1397		cpuhw->n_added = 0;
   1398
   1399		ebb_switch_out(mmcr0);
   1400
   1401#ifdef CONFIG_PPC64
   1402		/*
   1403		 * These are readable by userspace, may contain kernel
   1404		 * addresses and are not switched by context switch, so clear
   1405		 * them now to avoid leaking anything to userspace in general
   1406		 * including to another process.
   1407		 */
   1408		if (ppmu->flags & PPMU_ARCH_207S) {
   1409			mtspr(SPRN_SDAR, 0);
   1410			mtspr(SPRN_SIAR, 0);
   1411		}
   1412#endif
   1413	}
   1414
   1415	local_irq_restore(flags);
   1416}
   1417
   1418/*
   1419 * Re-enable all events if disable == 0.
   1420 * If we were previously disabled and events were added, then
   1421 * put the new config on the PMU.
   1422 */
   1423static void power_pmu_enable(struct pmu *pmu)
   1424{
   1425	struct perf_event *event;
   1426	struct cpu_hw_events *cpuhw;
   1427	unsigned long flags;
   1428	long i;
   1429	unsigned long val, mmcr0;
   1430	s64 left;
   1431	unsigned int hwc_index[MAX_HWEVENTS];
   1432	int n_lim;
   1433	int idx;
   1434	bool ebb;
   1435
   1436	if (!ppmu)
   1437		return;
   1438	local_irq_save(flags);
   1439
   1440	cpuhw = this_cpu_ptr(&cpu_hw_events);
   1441	if (!cpuhw->disabled)
   1442		goto out;
   1443
   1444	if (cpuhw->n_events == 0) {
   1445		ppc_set_pmu_inuse(0);
   1446		goto out;
   1447	}
   1448
   1449	cpuhw->disabled = 0;
   1450
   1451	/*
   1452	 * EBB requires an exclusive group and all events must have the EBB
   1453	 * flag set, or not set, so we can just check a single event. Also we
   1454	 * know we have at least one event.
   1455	 */
   1456	ebb = is_ebb_event(cpuhw->event[0]);
   1457
   1458	/*
   1459	 * If we didn't change anything, or only removed events,
   1460	 * no need to recalculate MMCR* settings and reset the PMCs.
   1461	 * Just reenable the PMU with the current MMCR* settings
   1462	 * (possibly updated for removal of events).
   1463	 */
   1464	if (!cpuhw->n_added) {
   1465		/*
   1466		 * If there is any active event with an overflown PMC
   1467		 * value, set back PACA_IRQ_PMI which would have been
   1468		 * cleared in power_pmu_disable().
   1469		 */
   1470		hard_irq_disable();
   1471		if (any_pmc_overflown(cpuhw))
   1472			set_pmi_irq_pending();
   1473
   1474		mtspr(SPRN_MMCRA, cpuhw->mmcr.mmcra & ~MMCRA_SAMPLE_ENABLE);
   1475		mtspr(SPRN_MMCR1, cpuhw->mmcr.mmcr1);
   1476		if (ppmu->flags & PPMU_ARCH_31)
   1477			mtspr(SPRN_MMCR3, cpuhw->mmcr.mmcr3);
   1478		goto out_enable;
   1479	}
   1480
   1481	/*
   1482	 * Clear all MMCR settings and recompute them for the new set of events.
   1483	 */
   1484	memset(&cpuhw->mmcr, 0, sizeof(cpuhw->mmcr));
   1485
   1486	if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_events, hwc_index,
   1487			       &cpuhw->mmcr, cpuhw->event, ppmu->flags)) {
   1488		/* shouldn't ever get here */
   1489		printk(KERN_ERR "oops compute_mmcr failed\n");
   1490		goto out;
   1491	}
   1492
   1493	if (!(ppmu->flags & PPMU_ARCH_207S)) {
   1494		/*
   1495		 * Add in MMCR0 freeze bits corresponding to the attr.exclude_*
   1496		 * bits for the first event. We have already checked that all
   1497		 * events have the same value for these bits as the first event.
   1498		 */
   1499		event = cpuhw->event[0];
   1500		if (event->attr.exclude_user)
   1501			cpuhw->mmcr.mmcr0 |= MMCR0_FCP;
   1502		if (event->attr.exclude_kernel)
   1503			cpuhw->mmcr.mmcr0 |= freeze_events_kernel;
   1504		if (event->attr.exclude_hv)
   1505			cpuhw->mmcr.mmcr0 |= MMCR0_FCHV;
   1506	}
   1507
   1508	/*
   1509	 * Write the new configuration to MMCR* with the freeze
   1510	 * bit set and set the hardware events to their initial values.
   1511	 * Then unfreeze the events.
   1512	 */
   1513	ppc_set_pmu_inuse(1);
   1514	mtspr(SPRN_MMCRA, cpuhw->mmcr.mmcra & ~MMCRA_SAMPLE_ENABLE);
   1515	mtspr(SPRN_MMCR1, cpuhw->mmcr.mmcr1);
   1516	mtspr(SPRN_MMCR0, (cpuhw->mmcr.mmcr0 & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
   1517				| MMCR0_FC);
   1518	if (ppmu->flags & PPMU_ARCH_207S)
   1519		mtspr(SPRN_MMCR2, cpuhw->mmcr.mmcr2);
   1520
   1521	if (ppmu->flags & PPMU_ARCH_31)
   1522		mtspr(SPRN_MMCR3, cpuhw->mmcr.mmcr3);
   1523
   1524	/*
   1525	 * Read off any pre-existing events that need to move
   1526	 * to another PMC.
   1527	 */
   1528	for (i = 0; i < cpuhw->n_events; ++i) {
   1529		event = cpuhw->event[i];
   1530		if (event->hw.idx && event->hw.idx != hwc_index[i] + 1) {
   1531			power_pmu_read(event);
   1532			write_pmc(event->hw.idx, 0);
   1533			event->hw.idx = 0;
   1534		}
   1535	}
   1536
   1537	/*
   1538	 * Initialize the PMCs for all the new and moved events.
   1539	 */
   1540	cpuhw->n_limited = n_lim = 0;
   1541	for (i = 0; i < cpuhw->n_events; ++i) {
   1542		event = cpuhw->event[i];
   1543		if (event->hw.idx)
   1544			continue;
   1545		idx = hwc_index[i] + 1;
   1546		if (is_limited_pmc(idx)) {
   1547			cpuhw->limited_counter[n_lim] = event;
   1548			cpuhw->limited_hwidx[n_lim] = idx;
   1549			++n_lim;
   1550			continue;
   1551		}
   1552
   1553		if (ebb)
   1554			val = local64_read(&event->hw.prev_count);
   1555		else {
   1556			val = 0;
   1557			if (event->hw.sample_period) {
   1558				left = local64_read(&event->hw.period_left);
   1559				if (left < 0x80000000L)
   1560					val = 0x80000000L - left;
   1561			}
   1562			local64_set(&event->hw.prev_count, val);
   1563		}
   1564
   1565		event->hw.idx = idx;
   1566		if (event->hw.state & PERF_HES_STOPPED)
   1567			val = 0;
   1568		write_pmc(idx, val);
   1569
   1570		perf_event_update_userpage(event);
   1571	}
   1572	cpuhw->n_limited = n_lim;
   1573	cpuhw->mmcr.mmcr0 |= MMCR0_PMXE | MMCR0_FCECE;
   1574
   1575 out_enable:
   1576	pmao_restore_workaround(ebb);
   1577
   1578	mmcr0 = ebb_switch_in(ebb, cpuhw);
   1579
   1580	mb();
   1581	if (cpuhw->bhrb_users)
   1582		ppmu->config_bhrb(cpuhw->bhrb_filter);
   1583
   1584	write_mmcr0(cpuhw, mmcr0);
   1585
   1586	/*
   1587	 * Enable instruction sampling if necessary
   1588	 */
   1589	if (cpuhw->mmcr.mmcra & MMCRA_SAMPLE_ENABLE) {
   1590		mb();
   1591		mtspr(SPRN_MMCRA, cpuhw->mmcr.mmcra);
   1592	}
   1593
   1594 out:
   1595
   1596	local_irq_restore(flags);
   1597}
   1598
   1599static int collect_events(struct perf_event *group, int max_count,
   1600			  struct perf_event *ctrs[], u64 *events,
   1601			  unsigned int *flags)
   1602{
   1603	int n = 0;
   1604	struct perf_event *event;
   1605
   1606	if (group->pmu->task_ctx_nr == perf_hw_context) {
   1607		if (n >= max_count)
   1608			return -1;
   1609		ctrs[n] = group;
   1610		flags[n] = group->hw.event_base;
   1611		events[n++] = group->hw.config;
   1612	}
   1613	for_each_sibling_event(event, group) {
   1614		if (event->pmu->task_ctx_nr == perf_hw_context &&
   1615		    event->state != PERF_EVENT_STATE_OFF) {
   1616			if (n >= max_count)
   1617				return -1;
   1618			ctrs[n] = event;
   1619			flags[n] = event->hw.event_base;
   1620			events[n++] = event->hw.config;
   1621		}
   1622	}
   1623	return n;
   1624}
   1625
   1626/*
   1627 * Add an event to the PMU.
   1628 * If all events are not already frozen, then we disable and
   1629 * re-enable the PMU in order to get hw_perf_enable to do the
   1630 * actual work of reconfiguring the PMU.
   1631 */
   1632static int power_pmu_add(struct perf_event *event, int ef_flags)
   1633{
   1634	struct cpu_hw_events *cpuhw;
   1635	unsigned long flags;
   1636	int n0;
   1637	int ret = -EAGAIN;
   1638
   1639	local_irq_save(flags);
   1640	perf_pmu_disable(event->pmu);
   1641
   1642	/*
   1643	 * Add the event to the list (if there is room)
   1644	 * and check whether the total set is still feasible.
   1645	 */
   1646	cpuhw = this_cpu_ptr(&cpu_hw_events);
   1647	n0 = cpuhw->n_events;
   1648	if (n0 >= ppmu->n_counter)
   1649		goto out;
   1650	cpuhw->event[n0] = event;
   1651	cpuhw->events[n0] = event->hw.config;
   1652	cpuhw->flags[n0] = event->hw.event_base;
   1653
   1654	/*
   1655	 * This event may have been disabled/stopped in record_and_restart()
   1656	 * because we exceeded the ->event_limit. If re-starting the event,
   1657	 * clear the ->hw.state (STOPPED and UPTODATE flags), so the user
   1658	 * notification is re-enabled.
   1659	 */
   1660	if (!(ef_flags & PERF_EF_START))
   1661		event->hw.state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
   1662	else
   1663		event->hw.state = 0;
   1664
   1665	/*
    1666	 * If a group event scheduling transaction was started,
    1667	 * skip the schedulability test here; it will be performed
    1668	 * at commit time (->commit_txn) as a whole.
   1669	 */
   1670	if (cpuhw->txn_flags & PERF_PMU_TXN_ADD)
   1671		goto nocheck;
   1672
   1673	if (check_excludes(cpuhw->event, cpuhw->flags, n0, 1))
   1674		goto out;
   1675	if (power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n0 + 1, cpuhw->event))
   1676		goto out;
   1677	event->hw.config = cpuhw->events[n0];
   1678
   1679nocheck:
   1680	ebb_event_add(event);
   1681
   1682	++cpuhw->n_events;
   1683	++cpuhw->n_added;
   1684
   1685	ret = 0;
   1686 out:
   1687	if (has_branch_stack(event)) {
   1688		u64 bhrb_filter = -1;
   1689
   1690		if (ppmu->bhrb_filter_map)
   1691			bhrb_filter = ppmu->bhrb_filter_map(
   1692				event->attr.branch_sample_type);
   1693
   1694		if (bhrb_filter != -1) {
   1695			cpuhw->bhrb_filter = bhrb_filter;
   1696			power_pmu_bhrb_enable(event);
   1697		}
   1698	}
   1699
   1700	perf_pmu_enable(event->pmu);
   1701	local_irq_restore(flags);
   1702	return ret;
   1703}
   1704
   1705/*
   1706 * Remove an event from the PMU.
   1707 */
   1708static void power_pmu_del(struct perf_event *event, int ef_flags)
   1709{
   1710	struct cpu_hw_events *cpuhw;
   1711	long i;
   1712	unsigned long flags;
   1713
   1714	local_irq_save(flags);
   1715	perf_pmu_disable(event->pmu);
   1716
   1717	power_pmu_read(event);
   1718
   1719	cpuhw = this_cpu_ptr(&cpu_hw_events);
   1720	for (i = 0; i < cpuhw->n_events; ++i) {
   1721		if (event == cpuhw->event[i]) {
   1722			while (++i < cpuhw->n_events) {
   1723				cpuhw->event[i-1] = cpuhw->event[i];
   1724				cpuhw->events[i-1] = cpuhw->events[i];
   1725				cpuhw->flags[i-1] = cpuhw->flags[i];
   1726			}
   1727			--cpuhw->n_events;
   1728			ppmu->disable_pmc(event->hw.idx - 1, &cpuhw->mmcr);
   1729			if (event->hw.idx) {
   1730				write_pmc(event->hw.idx, 0);
   1731				event->hw.idx = 0;
   1732			}
   1733			perf_event_update_userpage(event);
   1734			break;
   1735		}
   1736	}
   1737	for (i = 0; i < cpuhw->n_limited; ++i)
   1738		if (event == cpuhw->limited_counter[i])
   1739			break;
   1740	if (i < cpuhw->n_limited) {
   1741		while (++i < cpuhw->n_limited) {
   1742			cpuhw->limited_counter[i-1] = cpuhw->limited_counter[i];
   1743			cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i];
   1744		}
   1745		--cpuhw->n_limited;
   1746	}
   1747	if (cpuhw->n_events == 0) {
   1748		/* disable exceptions if no events are running */
   1749		cpuhw->mmcr.mmcr0 &= ~(MMCR0_PMXE | MMCR0_FCECE);
   1750	}
   1751
   1752	if (has_branch_stack(event))
   1753		power_pmu_bhrb_disable(event);
   1754
   1755	perf_pmu_enable(event->pmu);
   1756	local_irq_restore(flags);
   1757}
   1758
   1759/*
    1760 * POWER-PMU does not support disabling individual counters; hence we
    1761 * program the counter to its max value and ignore the interrupts.
   1762 */
   1763
   1764static void power_pmu_start(struct perf_event *event, int ef_flags)
   1765{
   1766	unsigned long flags;
   1767	s64 left;
   1768	unsigned long val;
   1769
   1770	if (!event->hw.idx || !event->hw.sample_period)
   1771		return;
   1772
   1773	if (!(event->hw.state & PERF_HES_STOPPED))
   1774		return;
   1775
   1776	if (ef_flags & PERF_EF_RELOAD)
   1777		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
   1778
   1779	local_irq_save(flags);
   1780	perf_pmu_disable(event->pmu);
   1781
   1782	event->hw.state = 0;
   1783	left = local64_read(&event->hw.period_left);
   1784
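        	/*
        	 * Reprogram the PMC so that it goes negative (overflows) after
        	 * another "left" events; 0 is written when the remaining period
        	 * does not fit in 31 bits.
        	 */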
   1785	val = 0;
   1786	if (left < 0x80000000L)
   1787		val = 0x80000000L - left;
   1788
   1789	write_pmc(event->hw.idx, val);
   1790
   1791	perf_event_update_userpage(event);
   1792	perf_pmu_enable(event->pmu);
   1793	local_irq_restore(flags);
   1794}
   1795
   1796static void power_pmu_stop(struct perf_event *event, int ef_flags)
   1797{
   1798	unsigned long flags;
   1799
   1800	if (!event->hw.idx || !event->hw.sample_period)
   1801		return;
   1802
   1803	if (event->hw.state & PERF_HES_STOPPED)
   1804		return;
   1805
   1806	local_irq_save(flags);
   1807	perf_pmu_disable(event->pmu);
   1808
   1809	power_pmu_read(event);
   1810	event->hw.state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
   1811	write_pmc(event->hw.idx, 0);
   1812
   1813	perf_event_update_userpage(event);
   1814	perf_pmu_enable(event->pmu);
   1815	local_irq_restore(flags);
   1816}
   1817
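        /*
         * Group scheduling transactions: the perf core brackets the add of a
         * whole event group with start_txn()/commit_txn(), or cancel_txn() on
         * failure.  While a PERF_PMU_TXN_ADD transaction is open,
         * power_pmu_add() skips the per-event schedulability check and
         * commit_txn() validates the group as a whole.
         */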
   1818/*
    1819 * Start a group-event scheduling transaction.
    1820 * Set the flag so that pmu::add() does not perform the
    1821 * schedulability test; it will be performed at commit time.
   1822 *
   1823 * We only support PERF_PMU_TXN_ADD transactions. Save the
   1824 * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD
   1825 * transactions.
   1826 */
   1827static void power_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags)
   1828{
   1829	struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events);
   1830
   1831	WARN_ON_ONCE(cpuhw->txn_flags);		/* txn already in flight */
   1832
   1833	cpuhw->txn_flags = txn_flags;
   1834	if (txn_flags & ~PERF_PMU_TXN_ADD)
   1835		return;
   1836
   1837	perf_pmu_disable(pmu);
   1838	cpuhw->n_txn_start = cpuhw->n_events;
   1839}
   1840
   1841/*
    1842 * Cancel a group-event scheduling transaction.
    1843 * Clear the flag, and pmu::add() will perform the
   1844 * schedulability test.
   1845 */
   1846static void power_pmu_cancel_txn(struct pmu *pmu)
   1847{
   1848	struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events);
   1849	unsigned int txn_flags;
   1850
   1851	WARN_ON_ONCE(!cpuhw->txn_flags);	/* no txn in flight */
   1852
   1853	txn_flags = cpuhw->txn_flags;
   1854	cpuhw->txn_flags = 0;
   1855	if (txn_flags & ~PERF_PMU_TXN_ADD)
   1856		return;
   1857
   1858	perf_pmu_enable(pmu);
   1859}
   1860
   1861/*
    1862 * Commit a group-event scheduling transaction.
    1863 * Perform the schedulability test for the group as a whole
    1864 * and return 0 on success.
   1865 */
   1866static int power_pmu_commit_txn(struct pmu *pmu)
   1867{
   1868	struct cpu_hw_events *cpuhw;
   1869	long i, n;
   1870
   1871	if (!ppmu)
   1872		return -EAGAIN;
   1873
   1874	cpuhw = this_cpu_ptr(&cpu_hw_events);
   1875	WARN_ON_ONCE(!cpuhw->txn_flags);	/* no txn in flight */
   1876
   1877	if (cpuhw->txn_flags & ~PERF_PMU_TXN_ADD) {
   1878		cpuhw->txn_flags = 0;
   1879		return 0;
   1880	}
   1881
   1882	n = cpuhw->n_events;
   1883	if (check_excludes(cpuhw->event, cpuhw->flags, 0, n))
   1884		return -EAGAIN;
   1885	i = power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n, cpuhw->event);
   1886	if (i < 0)
   1887		return -EAGAIN;
   1888
   1889	for (i = cpuhw->n_txn_start; i < n; ++i)
   1890		cpuhw->event[i]->hw.config = cpuhw->events[i];
   1891
   1892	cpuhw->txn_flags = 0;
   1893	perf_pmu_enable(pmu);
   1894	return 0;
   1895}
   1896
   1897/*
   1898 * Return 1 if we might be able to put event on a limited PMC,
   1899 * or 0 if not.
   1900 * An event can only go on a limited PMC if it counts something
   1901 * that a limited PMC can count, doesn't require interrupts, and
   1902 * doesn't exclude any processor mode.
   1903 */
   1904static int can_go_on_limited_pmc(struct perf_event *event, u64 ev,
   1905				 unsigned int flags)
   1906{
   1907	int n;
   1908	u64 alt[MAX_EVENT_ALTERNATIVES];
   1909
   1910	if (event->attr.exclude_user
   1911	    || event->attr.exclude_kernel
   1912	    || event->attr.exclude_hv
   1913	    || event->attr.sample_period)
   1914		return 0;
   1915
   1916	if (ppmu->limited_pmc_event(ev))
   1917		return 1;
   1918
   1919	/*
   1920	 * The requested event_id isn't on a limited PMC already;
   1921	 * see if any alternative code goes on a limited PMC.
   1922	 */
   1923	if (!ppmu->get_alternatives)
   1924		return 0;
   1925
   1926	flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD;
   1927	n = ppmu->get_alternatives(ev, flags, alt);
   1928
   1929	return n > 0;
   1930}
   1931
   1932/*
   1933 * Find an alternative event_id that goes on a normal PMC, if possible,
   1934 * and return the event_id code, or 0 if there is no such alternative.
   1935 * (Note: event_id code 0 is "don't count" on all machines.)
   1936 */
   1937static u64 normal_pmc_alternative(u64 ev, unsigned long flags)
   1938{
   1939	u64 alt[MAX_EVENT_ALTERNATIVES];
   1940	int n;
   1941
   1942	flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD);
   1943	n = ppmu->get_alternatives(ev, flags, alt);
   1944	if (!n)
   1945		return 0;
   1946	return alt[0];
   1947}
   1948
   1949/* Number of perf_events counting hardware events */
   1950static atomic_t num_events;
   1951/* Used to avoid races in calling reserve/release_pmc_hardware */
   1952static DEFINE_MUTEX(pmc_reserve_mutex);
   1953
   1954/*
   1955 * Release the PMU if this is the last perf_event.
   1956 */
   1957static void hw_perf_event_destroy(struct perf_event *event)
   1958{
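        	/*
        	 * atomic_add_unless() refuses the decrement when num_events is 1,
        	 * i.e. we may be the last user: take the mutex so that the final
        	 * decrement and release_pmc_hardware() cannot race with a
        	 * concurrent reserve in power_pmu_event_init().
        	 */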
   1959	if (!atomic_add_unless(&num_events, -1, 1)) {
   1960		mutex_lock(&pmc_reserve_mutex);
   1961		if (atomic_dec_return(&num_events) == 0)
   1962			release_pmc_hardware();
   1963		mutex_unlock(&pmc_reserve_mutex);
   1964	}
   1965}
   1966
   1967/*
   1968 * Translate a generic cache event_id config to a raw event_id code.
   1969 */
   1970static int hw_perf_cache_event(u64 config, u64 *eventp)
   1971{
   1972	unsigned long type, op, result;
   1973	u64 ev;
   1974
   1975	if (!ppmu->cache_events)
   1976		return -EINVAL;
   1977
   1978	/* unpack config */
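        	/* config is packed as type | (op << 8) | (result << 16) */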
   1979	type = config & 0xff;
   1980	op = (config >> 8) & 0xff;
   1981	result = (config >> 16) & 0xff;
   1982
   1983	if (type >= PERF_COUNT_HW_CACHE_MAX ||
   1984	    op >= PERF_COUNT_HW_CACHE_OP_MAX ||
   1985	    result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
   1986		return -EINVAL;
   1987
   1988	ev = (*ppmu->cache_events)[type][op][result];
   1989	if (ev == 0)
   1990		return -EOPNOTSUPP;
   1991	if (ev == -1)
   1992		return -EINVAL;
   1993	*eventp = ev;
   1994	return 0;
   1995}
   1996
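        /*
         * Return true if the raw event code appears in this PMU's blacklist
         * (ppmu->blacklist_ev).
         */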
   1997static bool is_event_blacklisted(u64 ev)
   1998{
   1999	int i;
   2000
    2001	for (i = 0; i < ppmu->n_blacklist_ev; i++) {
   2002		if (ppmu->blacklist_ev[i] == ev)
   2003			return true;
   2004	}
   2005
   2006	return false;
   2007}
   2008
   2009static int power_pmu_event_init(struct perf_event *event)
   2010{
   2011	u64 ev;
   2012	unsigned long flags, irq_flags;
   2013	struct perf_event *ctrs[MAX_HWEVENTS];
   2014	u64 events[MAX_HWEVENTS];
   2015	unsigned int cflags[MAX_HWEVENTS];
   2016	int n;
   2017	int err;
   2018	struct cpu_hw_events *cpuhw;
   2019
   2020	if (!ppmu)
   2021		return -ENOENT;
   2022
   2023	if (has_branch_stack(event)) {
    2024		/* PMU has BHRB enabled */
   2025		if (!(ppmu->flags & PPMU_ARCH_207S))
   2026			return -EOPNOTSUPP;
   2027	}
   2028
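        	/*
        	 * Translate attr.type/attr.config into a raw PMU event code,
        	 * rejecting unsupported generic events and blacklisted codes.
        	 */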
   2029	switch (event->attr.type) {
   2030	case PERF_TYPE_HARDWARE:
   2031		ev = event->attr.config;
   2032		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
   2033			return -EOPNOTSUPP;
   2034
   2035		if (ppmu->blacklist_ev && is_event_blacklisted(ev))
   2036			return -EINVAL;
   2037		ev = ppmu->generic_events[ev];
   2038		break;
   2039	case PERF_TYPE_HW_CACHE:
   2040		err = hw_perf_cache_event(event->attr.config, &ev);
   2041		if (err)
   2042			return err;
   2043
   2044		if (ppmu->blacklist_ev && is_event_blacklisted(ev))
   2045			return -EINVAL;
   2046		break;
   2047	case PERF_TYPE_RAW:
   2048		ev = event->attr.config;
   2049
   2050		if (ppmu->blacklist_ev && is_event_blacklisted(ev))
   2051			return -EINVAL;
   2052		break;
   2053	default:
   2054		return -ENOENT;
   2055	}
   2056
   2057	/*
    2058	 * PMU config registers have reserved fields, and some specific
    2059	 * bit-field values are reserved as well.
    2060	 * For example, MMCRA[61:62] is the Random Sampling Mode (SM)
    2061	 * field, and the value 0b11 in that field is reserved.
    2062	 * Check attr.config for such invalid values.
   2063	 */
   2064	if (ppmu->check_attr_config &&
   2065	    ppmu->check_attr_config(event))
   2066		return -EINVAL;
   2067
   2068	event->hw.config_base = ev;
   2069	event->hw.idx = 0;
   2070
   2071	/*
   2072	 * If we are not running on a hypervisor, force the
   2073	 * exclude_hv bit to 0 so that we don't care what
   2074	 * the user set it to.
   2075	 */
   2076	if (!firmware_has_feature(FW_FEATURE_LPAR))
   2077		event->attr.exclude_hv = 0;
   2078
   2079	/*
   2080	 * If this is a per-task event, then we can use
    2081	 * PM_RUN_* events interchangeably with their non-RUN_*
   2082	 * equivalents, e.g. PM_RUN_CYC instead of PM_CYC.
   2083	 * XXX we should check if the task is an idle task.
   2084	 */
   2085	flags = 0;
   2086	if (event->attach_state & PERF_ATTACH_TASK)
   2087		flags |= PPMU_ONLY_COUNT_RUN;
   2088
   2089	/*
    2090	 * If this machine has limited PMCs, check whether this
    2091	 * event_id could go on a limited PMC.
   2092	 */
   2093	if (ppmu->flags & PPMU_LIMITED_PMC5_6) {
   2094		if (can_go_on_limited_pmc(event, ev, flags)) {
   2095			flags |= PPMU_LIMITED_PMC_OK;
   2096		} else if (ppmu->limited_pmc_event(ev)) {
   2097			/*
   2098			 * The requested event_id is on a limited PMC,
   2099			 * but we can't use a limited PMC; see if any
   2100			 * alternative goes on a normal PMC.
   2101			 */
   2102			ev = normal_pmc_alternative(ev, flags);
   2103			if (!ev)
   2104				return -EINVAL;
   2105		}
   2106	}
   2107
   2108	/* Extra checks for EBB */
   2109	err = ebb_event_check(event);
   2110	if (err)
   2111		return err;
   2112
   2113	/*
   2114	 * If this is in a group, check if it can go on with all the
   2115	 * other hardware events in the group.  We assume the event
   2116	 * hasn't been linked into its leader's sibling list at this point.
   2117	 */
   2118	n = 0;
   2119	if (event->group_leader != event) {
   2120		n = collect_events(event->group_leader, ppmu->n_counter - 1,
   2121				   ctrs, events, cflags);
   2122		if (n < 0)
   2123			return -EINVAL;
   2124	}
   2125	events[n] = ev;
   2126	ctrs[n] = event;
   2127	cflags[n] = flags;
   2128	if (check_excludes(ctrs, cflags, n, 1))
   2129		return -EINVAL;
   2130
   2131	local_irq_save(irq_flags);
   2132	cpuhw = this_cpu_ptr(&cpu_hw_events);
   2133
   2134	err = power_check_constraints(cpuhw, events, cflags, n + 1, ctrs);
   2135
   2136	if (has_branch_stack(event)) {
   2137		u64 bhrb_filter = -1;
   2138
   2139		if (ppmu->bhrb_filter_map)
   2140			bhrb_filter = ppmu->bhrb_filter_map(
   2141					event->attr.branch_sample_type);
   2142
   2143		if (bhrb_filter == -1) {
   2144			local_irq_restore(irq_flags);
   2145			return -EOPNOTSUPP;
   2146		}
   2147		cpuhw->bhrb_filter = bhrb_filter;
   2148	}
   2149
   2150	local_irq_restore(irq_flags);
   2151	if (err)
   2152		return -EINVAL;
   2153
   2154	event->hw.config = events[n];
   2155	event->hw.event_base = cflags[n];
   2156	event->hw.last_period = event->hw.sample_period;
   2157	local64_set(&event->hw.period_left, event->hw.last_period);
   2158
   2159	/*
    2160	 * For EBB events we just context-switch the PMC value; we don't do any
   2161	 * of the sample_period logic. We use hw.prev_count for this.
   2162	 */
   2163	if (is_ebb_event(event))
   2164		local64_set(&event->hw.prev_count, 0);
   2165
   2166	/*
   2167	 * See if we need to reserve the PMU.
   2168	 * If no events are currently in use, then we have to take a
   2169	 * mutex to ensure that we don't race with another task doing
   2170	 * reserve_pmc_hardware or release_pmc_hardware.
   2171	 */
   2172	err = 0;
   2173	if (!atomic_inc_not_zero(&num_events)) {
   2174		mutex_lock(&pmc_reserve_mutex);
   2175		if (atomic_read(&num_events) == 0 &&
   2176		    reserve_pmc_hardware(perf_event_interrupt))
   2177			err = -EBUSY;
   2178		else
   2179			atomic_inc(&num_events);
   2180		mutex_unlock(&pmc_reserve_mutex);
   2181	}
   2182	event->destroy = hw_perf_event_destroy;
   2183
   2184	return err;
   2185}
   2186
   2187static int power_pmu_event_idx(struct perf_event *event)
   2188{
   2189	return event->hw.idx;
   2190}
   2191
   2192ssize_t power_events_sysfs_show(struct device *dev,
   2193				struct device_attribute *attr, char *page)
   2194{
   2195	struct perf_pmu_events_attr *pmu_attr;
   2196
   2197	pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);
   2198
   2199	return sprintf(page, "event=0x%02llx\n", pmu_attr->id);
   2200}
   2201
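        /*
         * The core powerpc PMU description handed to the perf core via
         * perf_pmu_register() in register_power_pmu() below.
         */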
   2202static struct pmu power_pmu = {
   2203	.pmu_enable	= power_pmu_enable,
   2204	.pmu_disable	= power_pmu_disable,
   2205	.event_init	= power_pmu_event_init,
   2206	.add		= power_pmu_add,
   2207	.del		= power_pmu_del,
   2208	.start		= power_pmu_start,
   2209	.stop		= power_pmu_stop,
   2210	.read		= power_pmu_read,
   2211	.start_txn	= power_pmu_start_txn,
   2212	.cancel_txn	= power_pmu_cancel_txn,
   2213	.commit_txn	= power_pmu_commit_txn,
   2214	.event_idx	= power_pmu_event_idx,
   2215	.sched_task	= power_pmu_sched_task,
   2216};
   2217
   2218#define PERF_SAMPLE_ADDR_TYPE  (PERF_SAMPLE_ADDR |		\
   2219				PERF_SAMPLE_PHYS_ADDR |		\
   2220				PERF_SAMPLE_DATA_PAGE_SIZE)
   2221/*
   2222 * A counter has overflowed; update its count and record
   2223 * things if requested.  Note that interrupts are hard-disabled
   2224 * here so there is no possibility of being interrupted.
   2225 */
   2226static void record_and_restart(struct perf_event *event, unsigned long val,
   2227			       struct pt_regs *regs)
   2228{
   2229	u64 period = event->hw.sample_period;
   2230	s64 prev, delta, left;
   2231	int record = 0;
   2232
   2233	if (event->hw.state & PERF_HES_STOPPED) {
   2234		write_pmc(event->hw.idx, 0);
   2235		return;
   2236	}
   2237
   2238	/* we don't have to worry about interrupts here */
   2239	prev = local64_read(&event->hw.prev_count);
   2240	delta = check_and_compute_delta(prev, val);
   2241	local64_add(delta, &event->count);
   2242
   2243	/*
   2244	 * See if the total period for this event has expired,
   2245	 * and update for the next period.
   2246	 */
   2247	val = 0;
   2248	left = local64_read(&event->hw.period_left) - delta;
   2249	if (delta == 0)
   2250		left++;
   2251	if (period) {
   2252		if (left <= 0) {
   2253			left += period;
   2254			if (left <= 0)
   2255				left = period;
   2256
   2257			/*
    2258			 * If the instruction pointer is not requested in the
    2259			 * sample (PERF_SAMPLE_IP), just record the sample
    2260			 * irrespective of the SIAR valid check.
   2261			 */
   2262			if (event->attr.sample_type & PERF_SAMPLE_IP)
   2263				record = siar_valid(regs);
   2264			else
   2265				record = 1;
   2266
   2267			event->hw.last_period = event->hw.sample_period;
   2268		}
   2269		if (left < 0x80000000LL)
   2270			val = 0x80000000LL - left;
   2271	}
   2272
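        	/*
        	 * Write the new count so the PMC overflows again after "left"
        	 * more events (or after 2^31 if val is 0), and remember it as
        	 * the reference value for the next delta computation.
        	 */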
   2273	write_pmc(event->hw.idx, val);
   2274	local64_set(&event->hw.prev_count, val);
   2275	local64_set(&event->hw.period_left, left);
   2276	perf_event_update_userpage(event);
   2277
   2278	/*
    2279	 * Due to a hardware limitation, SIAR can sometimes sample a kernel
   2280	 * address even when freeze on supervisor state (kernel) is set in
   2281	 * MMCR2. Check attr.exclude_kernel and address to drop the sample in
   2282	 * these cases.
   2283	 */
   2284	if (event->attr.exclude_kernel &&
   2285	    (event->attr.sample_type & PERF_SAMPLE_IP) &&
   2286	    is_kernel_addr(mfspr(SPRN_SIAR)))
   2287		record = 0;
   2288
   2289	/*
   2290	 * Finally record data if requested.
   2291	 */
   2292	if (record) {
   2293		struct perf_sample_data data;
   2294
   2295		perf_sample_data_init(&data, ~0ULL, event->hw.last_period);
   2296
   2297		if (event->attr.sample_type & PERF_SAMPLE_ADDR_TYPE)
   2298			perf_get_data_addr(event, regs, &data.addr);
   2299
   2300		if (event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK) {
   2301			struct cpu_hw_events *cpuhw;
   2302			cpuhw = this_cpu_ptr(&cpu_hw_events);
   2303			power_pmu_bhrb_read(event, cpuhw);
   2304			data.br_stack = &cpuhw->bhrb_stack;
   2305		}
   2306
   2307		if (event->attr.sample_type & PERF_SAMPLE_DATA_SRC &&
   2308						ppmu->get_mem_data_src)
   2309			ppmu->get_mem_data_src(&data.data_src, ppmu->flags, regs);
   2310
   2311		if (event->attr.sample_type & PERF_SAMPLE_WEIGHT_TYPE &&
   2312						ppmu->get_mem_weight)
   2313			ppmu->get_mem_weight(&data.weight.full, event->attr.sample_type);
   2314
   2315		if (perf_event_overflow(event, &data, regs))
   2316			power_pmu_stop(event, 0);
   2317	} else if (period) {
   2318		/* Account for interrupt in case of invalid SIAR */
   2319		if (perf_event_account_interrupt(event))
   2320			power_pmu_stop(event, 0);
   2321	}
   2322}
   2323
   2324/*
   2325 * Called from generic code to get the misc flags (i.e. processor mode)
   2326 * for an event_id.
   2327 */
   2328unsigned long perf_misc_flags(struct pt_regs *regs)
   2329{
   2330	u32 flags = perf_get_misc_flags(regs);
   2331
   2332	if (flags)
   2333		return flags;
   2334	return user_mode(regs) ? PERF_RECORD_MISC_USER :
   2335		PERF_RECORD_MISC_KERNEL;
   2336}
   2337
   2338/*
   2339 * Called from generic code to get the instruction pointer
   2340 * for an event_id.
   2341 */
   2342unsigned long perf_instruction_pointer(struct pt_regs *regs)
   2343{
   2344	unsigned long siar = mfspr(SPRN_SIAR);
   2345
   2346	if (regs_use_siar(regs) && siar_valid(regs) && siar)
   2347		return siar + perf_ip_adjust(regs);
   2348	else
   2349		return regs->nip;
   2350}
   2351
   2352static bool pmc_overflow_power7(unsigned long val)
   2353{
   2354	/*
   2355	 * Events on POWER7 can roll back if a speculative event doesn't
   2356	 * eventually complete. Unfortunately in some rare cases they will
   2357	 * raise a performance monitor exception. We need to catch this to
   2358	 * ensure we reset the PMC. In all cases the PMC will be 256 or less
   2359	 * cycles from overflow.
   2360	 *
   2361	 * We only do this if the first pass fails to find any overflowing
   2362	 * PMCs because a user might set a period of less than 256 and we
   2363	 * don't want to mistakenly reset them.
   2364	 */
   2365	if ((0x80000000 - val) <= 256)
   2366		return true;
   2367
   2368	return false;
   2369}
   2370
   2371static bool pmc_overflow(unsigned long val)
   2372{
   2373	if ((int)val < 0)
   2374		return true;
   2375
   2376	return false;
   2377}
   2378
   2379/*
   2380 * Performance monitor interrupt stuff
   2381 */
   2382static void __perf_event_interrupt(struct pt_regs *regs)
   2383{
   2384	int i, j;
   2385	struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events);
   2386	struct perf_event *event;
   2387	int found, active;
   2388
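        	/*
        	 * Handle the limited counters (PMC5/6) first; they cannot raise
        	 * interrupts themselves and are skipped in the overflow scan below.
        	 */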
   2389	if (cpuhw->n_limited)
   2390		freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5),
   2391					mfspr(SPRN_PMC6));
   2392
   2393	perf_read_regs(regs);
   2394
   2395	/* Read all the PMCs since we'll need them a bunch of times */
   2396	for (i = 0; i < ppmu->n_counter; ++i)
   2397		cpuhw->pmcs[i] = read_pmc(i + 1);
   2398
   2399	/* Try to find what caused the IRQ */
   2400	found = 0;
   2401	for (i = 0; i < ppmu->n_counter; ++i) {
   2402		if (!pmc_overflow(cpuhw->pmcs[i]))
   2403			continue;
   2404		if (is_limited_pmc(i + 1))
   2405			continue; /* these won't generate IRQs */
   2406		/*
   2407		 * We've found one that's overflowed.  For active
   2408		 * counters we need to log this.  For inactive
    2409		 * counters, we need to reset them anyway.
   2410		 */
   2411		found = 1;
   2412		active = 0;
   2413		for (j = 0; j < cpuhw->n_events; ++j) {
   2414			event = cpuhw->event[j];
   2415			if (event->hw.idx == (i + 1)) {
   2416				active = 1;
   2417				record_and_restart(event, cpuhw->pmcs[i], regs);
   2418				break;
   2419			}
   2420		}
   2421
   2422		/*
   2423		 * Clear PACA_IRQ_PMI in case it was set by
   2424		 * set_pmi_irq_pending() when PMU was enabled
   2425		 * after accounting for interrupts.
   2426		 */
   2427		clear_pmi_irq_pending();
   2428
   2429		if (!active)
   2430			/* reset non active counters that have overflowed */
   2431			write_pmc(i + 1, 0);
   2432	}
   2433	if (!found && pvr_version_is(PVR_POWER7)) {
   2434		/* check active counters for special buggy p7 overflow */
   2435		for (i = 0; i < cpuhw->n_events; ++i) {
   2436			event = cpuhw->event[i];
   2437			if (!event->hw.idx || is_limited_pmc(event->hw.idx))
   2438				continue;
   2439			if (pmc_overflow_power7(cpuhw->pmcs[event->hw.idx - 1])) {
    2440				/* event has overflowed in a buggy way */
   2441				found = 1;
   2442				record_and_restart(event,
   2443						   cpuhw->pmcs[event->hw.idx - 1],
   2444						   regs);
   2445			}
   2446		}
   2447	}
   2448
   2449	/*
    2450	 * During system-wide profiling, or while a specific CPU is monitored for
    2451	 * an event, some corner cases could cause a PMC to overflow in the idle
    2452	 * path, triggering a PMI after waking up from idle. Since counter values
    2453	 * are _not_ saved/restored in the idle path, this can lead to the warning below.
   2454	 */
   2455	if (unlikely(!found) && !arch_irq_disabled_regs(regs))
   2456		printk_ratelimited(KERN_WARNING "Can't find PMC that caused IRQ\n");
   2457
   2458	/*
   2459	 * Reset MMCR0 to its normal value.  This will set PMXE and
   2460	 * clear FC (freeze counters) and PMAO (perf mon alert occurred)
   2461	 * and thus allow interrupts to occur again.
   2462	 * XXX might want to use MSR.PM to keep the events frozen until
   2463	 * we get back out of this interrupt.
   2464	 */
   2465	write_mmcr0(cpuhw, cpuhw->mmcr.mmcr0);
   2466
   2467	/* Clear the cpuhw->pmcs */
   2468	memset(&cpuhw->pmcs, 0, sizeof(cpuhw->pmcs));
   2469
   2470}
   2471
   2472static void perf_event_interrupt(struct pt_regs *regs)
   2473{
   2474	u64 start_clock = sched_clock();
   2475
   2476	__perf_event_interrupt(regs);
   2477	perf_sample_event_took(sched_clock() - start_clock);
   2478}
   2479
   2480static int power_pmu_prepare_cpu(unsigned int cpu)
   2481{
   2482	struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu);
   2483
   2484	if (ppmu) {
   2485		memset(cpuhw, 0, sizeof(*cpuhw));
   2486		cpuhw->mmcr.mmcr0 = MMCR0_FC;
   2487	}
   2488	return 0;
   2489}
   2490
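        /*
         * Register a processor-specific struct power_pmu description; the
         * per-processor PMU drivers call this from their init routines (on
         * ppc64 they are probed from init_ppc64_pmu() below).  Only one
         * description can be registered at a time.
         */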
   2491int __init register_power_pmu(struct power_pmu *pmu)
   2492{
   2493	if (ppmu)
   2494		return -EBUSY;		/* something's already registered */
   2495
   2496	ppmu = pmu;
   2497	pr_info("%s performance monitor hardware support registered\n",
   2498		pmu->name);
   2499
   2500	power_pmu.attr_groups = ppmu->attr_groups;
   2501	power_pmu.capabilities |= (ppmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS);
   2502
   2503#ifdef MSR_HV
   2504	/*
   2505	 * Use FCHV to ignore kernel events if MSR.HV is set.
   2506	 */
   2507	if (mfmsr() & MSR_HV)
   2508		freeze_events_kernel = MMCR0_FCHV;
    2509#endif /* MSR_HV */
   2510
   2511	perf_pmu_register(&power_pmu, "cpu", PERF_TYPE_RAW);
   2512	cpuhp_setup_state(CPUHP_PERF_POWER, "perf/powerpc:prepare",
   2513			  power_pmu_prepare_cpu, NULL);
   2514	return 0;
   2515}
   2516
   2517#ifdef CONFIG_PPC64
   2518static bool pmu_override = false;
   2519static unsigned long pmu_override_val;
   2520static void do_pmu_override(void *data)
   2521{
   2522	ppc_set_pmu_inuse(1);
   2523	if (pmu_override_val)
   2524		mtspr(SPRN_MMCR1, pmu_override_val);
   2525	mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_FC);
   2526}
   2527
   2528static int __init init_ppc64_pmu(void)
   2529{
   2530	if (cpu_has_feature(CPU_FTR_HVMODE) && pmu_override) {
   2531		pr_warn("disabling perf due to pmu_override= command line option.\n");
   2532		on_each_cpu(do_pmu_override, NULL, 1);
   2533		return 0;
   2534	}
   2535
   2536	/* run through all the pmu drivers one at a time */
   2537	if (!init_power5_pmu())
   2538		return 0;
   2539	else if (!init_power5p_pmu())
   2540		return 0;
   2541	else if (!init_power6_pmu())
   2542		return 0;
   2543	else if (!init_power7_pmu())
   2544		return 0;
   2545	else if (!init_power8_pmu())
   2546		return 0;
   2547	else if (!init_power9_pmu())
   2548		return 0;
   2549	else if (!init_power10_pmu())
   2550		return 0;
   2551	else if (!init_ppc970_pmu())
   2552		return 0;
   2553	else
   2554		return init_generic_compat_pmu();
   2555}
   2556early_initcall(init_ppc64_pmu);
   2557
   2558static int __init pmu_setup(char *str)
   2559{
   2560	unsigned long val;
   2561
   2562	if (!early_cpu_has_feature(CPU_FTR_HVMODE))
   2563		return 0;
   2564
   2565	pmu_override = true;
   2566
   2567	if (kstrtoul(str, 0, &val))
   2568		val = 0;
   2569
   2570	pmu_override_val = val;
   2571
   2572	return 1;
   2573}
   2574__setup("pmu_override=", pmu_setup);
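        /*
         * Example usage (hypervisor-mode kernels only): booting with
         * "pmu_override=<value>" disables perf's use of the PMU, writes
         * <value> (if non-zero) to MMCR1 on every CPU and unfreezes the
         * counters, so the PMU can then be programmed by hand.
         */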
   2575
   2576#endif