core.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
core.c (328951B)
      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Performance events core code:
      4 *
      5 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
      6 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
      7 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
      8 *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
      9 */
     10
     11#include <linux/fs.h>
     12#include <linux/mm.h>
     13#include <linux/cpu.h>
     14#include <linux/smp.h>
     15#include <linux/idr.h>
     16#include <linux/file.h>
     17#include <linux/poll.h>
     18#include <linux/slab.h>
     19#include <linux/hash.h>
     20#include <linux/tick.h>
     21#include <linux/sysfs.h>
     22#include <linux/dcache.h>
     23#include <linux/percpu.h>
     24#include <linux/ptrace.h>
     25#include <linux/reboot.h>
     26#include <linux/vmstat.h>
     27#include <linux/device.h>
     28#include <linux/export.h>
     29#include <linux/vmalloc.h>
     30#include <linux/hardirq.h>
     31#include <linux/hugetlb.h>
     32#include <linux/rculist.h>
     33#include <linux/uaccess.h>
     34#include <linux/syscalls.h>
     35#include <linux/anon_inodes.h>
     36#include <linux/kernel_stat.h>
     37#include <linux/cgroup.h>
     38#include <linux/perf_event.h>
     39#include <linux/trace_events.h>
     40#include <linux/hw_breakpoint.h>
     41#include <linux/mm_types.h>
     42#include <linux/module.h>
     43#include <linux/mman.h>
     44#include <linux/compat.h>
     45#include <linux/bpf.h>
     46#include <linux/filter.h>
     47#include <linux/namei.h>
     48#include <linux/parser.h>
     49#include <linux/sched/clock.h>
     50#include <linux/sched/mm.h>
     51#include <linux/proc_ns.h>
     52#include <linux/mount.h>
     53#include <linux/min_heap.h>
     54#include <linux/highmem.h>
     55#include <linux/pgtable.h>
     56#include <linux/buildid.h>
     57
     58#include "internal.h"
     59
     60#include <asm/irq_regs.h>
     61
     62typedef int (*remote_function_f)(void *);
     63
     64struct remote_function_call {
     65	struct task_struct	*p;
     66	remote_function_f	func;
     67	void			*info;
     68	int			ret;
     69};
     70
     71static void remote_function(void *data)
     72{
     73	struct remote_function_call *tfc = data;
     74	struct task_struct *p = tfc->p;
     75
     76	if (p) {
     77		/* -EAGAIN */
     78		if (task_cpu(p) != smp_processor_id())
     79			return;
     80
     81		/*
     82		 * Now that we're on right CPU with IRQs disabled, we can test
     83		 * if we hit the right task without races.
     84		 */
     85
     86		tfc->ret = -ESRCH; /* No such (running) process */
     87		if (p != current)
     88			return;
     89	}
     90
     91	tfc->ret = tfc->func(tfc->info);
     92}
     93
     94/**
     95 * task_function_call - call a function on the cpu on which a task runs
     96 * @p:		the task to evaluate
     97 * @func:	the function to be called
     98 * @info:	the function call argument
     99 *
    100 * Calls the function @func when the task is currently running. This might
    101 * be on the current CPU, which just calls the function directly.  This will
    102 * retry due to any failures in smp_call_function_single(), such as if the
    103 * task_cpu() goes offline concurrently.
    104 *
    105 * returns @func return value or -ESRCH or -ENXIO when the process isn't running
    106 */
    107static int
    108task_function_call(struct task_struct *p, remote_function_f func, void *info)
    109{
    110	struct remote_function_call data = {
    111		.p	= p,
    112		.func	= func,
    113		.info	= info,
    114		.ret	= -EAGAIN,
    115	};
    116	int ret;
    117
    118	for (;;) {
    119		ret = smp_call_function_single(task_cpu(p), remote_function,
    120					       &data, 1);
    121		if (!ret)
    122			ret = data.ret;
    123
    124		if (ret != -EAGAIN)
    125			break;
    126
    127		cond_resched();
    128	}
    129
    130	return ret;
    131}
    132
    133/**
    134 * cpu_function_call - call a function on the cpu
    135 * @cpu:	target cpu to queue this function
    136 * @func:	the function to be called
    137 * @info:	the function call argument
    138 *
    139 * Calls the function @func on the remote cpu.
    140 *
    141 * returns: @func return value or -ENXIO when the cpu is offline
    142 */
    143static int cpu_function_call(int cpu, remote_function_f func, void *info)
    144{
    145	struct remote_function_call data = {
    146		.p	= NULL,
    147		.func	= func,
    148		.info	= info,
    149		.ret	= -ENXIO, /* No such CPU */
    150	};
    151
    152	smp_call_function_single(cpu, remote_function, &data, 1);
    153
    154	return data.ret;
    155}
    156
    157static inline struct perf_cpu_context *
    158__get_cpu_context(struct perf_event_context *ctx)
    159{
    160	return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
    161}
    162
    163static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
    164			  struct perf_event_context *ctx)
    165{
    166	raw_spin_lock(&cpuctx->ctx.lock);
    167	if (ctx)
    168		raw_spin_lock(&ctx->lock);
    169}
    170
    171static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
    172			    struct perf_event_context *ctx)
    173{
    174	if (ctx)
    175		raw_spin_unlock(&ctx->lock);
    176	raw_spin_unlock(&cpuctx->ctx.lock);
    177}
    178
    179#define TASK_TOMBSTONE ((void *)-1L)
    180
    181static bool is_kernel_event(struct perf_event *event)
    182{
    183	return READ_ONCE(event->owner) == TASK_TOMBSTONE;
    184}
    185
    186/*
    187 * On task ctx scheduling...
    188 *
    189 * When !ctx->nr_events a task context will not be scheduled. This means
    190 * we can disable the scheduler hooks (for performance) without leaving
    191 * pending task ctx state.
    192 *
    193 * This however results in two special cases:
    194 *
    195 *  - removing the last event from a task ctx; this is relatively straight
    196 *    forward and is done in __perf_remove_from_context.
    197 *
    198 *  - adding the first event to a task ctx; this is tricky because we cannot
    199 *    rely on ctx->is_active and therefore cannot use event_function_call().
    200 *    See perf_install_in_context().
    201 *
    202 * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
    203 */
    204
    205typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
    206			struct perf_event_context *, void *);
    207
    208struct event_function_struct {
    209	struct perf_event *event;
    210	event_f func;
    211	void *data;
    212};
    213
    214static int event_function(void *info)
    215{
    216	struct event_function_struct *efs = info;
    217	struct perf_event *event = efs->event;
    218	struct perf_event_context *ctx = event->ctx;
    219	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
    220	struct perf_event_context *task_ctx = cpuctx->task_ctx;
    221	int ret = 0;
    222
    223	lockdep_assert_irqs_disabled();
    224
    225	perf_ctx_lock(cpuctx, task_ctx);
    226	/*
    227	 * Since we do the IPI call without holding ctx->lock things can have
    228	 * changed, double check we hit the task we set out to hit.
    229	 */
    230	if (ctx->task) {
    231		if (ctx->task != current) {
    232			ret = -ESRCH;
    233			goto unlock;
    234		}
    235
    236		/*
    237		 * We only use event_function_call() on established contexts,
    238		 * and event_function() is only ever called when active (or
    239		 * rather, we'll have bailed in task_function_call() or the
    240		 * above ctx->task != current test), therefore we must have
    241		 * ctx->is_active here.
    242		 */
    243		WARN_ON_ONCE(!ctx->is_active);
    244		/*
    245		 * And since we have ctx->is_active, cpuctx->task_ctx must
    246		 * match.
    247		 */
    248		WARN_ON_ONCE(task_ctx != ctx);
    249	} else {
    250		WARN_ON_ONCE(&cpuctx->ctx != ctx);
    251	}
    252
    253	efs->func(event, cpuctx, ctx, efs->data);
    254unlock:
    255	perf_ctx_unlock(cpuctx, task_ctx);
    256
    257	return ret;
    258}
    259
    260static void event_function_call(struct perf_event *event, event_f func, void *data)
    261{
    262	struct perf_event_context *ctx = event->ctx;
    263	struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
    264	struct event_function_struct efs = {
    265		.event = event,
    266		.func = func,
    267		.data = data,
    268	};
    269
    270	if (!event->parent) {
    271		/*
    272		 * If this is a !child event, we must hold ctx::mutex to
    273		 * stabilize the event->ctx relation. See
    274		 * perf_event_ctx_lock().
    275		 */
    276		lockdep_assert_held(&ctx->mutex);
    277	}
    278
    279	if (!task) {
    280		cpu_function_call(event->cpu, event_function, &efs);
    281		return;
    282	}
    283
    284	if (task == TASK_TOMBSTONE)
    285		return;
    286
    287again:
    288	if (!task_function_call(task, event_function, &efs))
    289		return;
    290
    291	raw_spin_lock_irq(&ctx->lock);
    292	/*
    293	 * Reload the task pointer, it might have been changed by
    294	 * a concurrent perf_event_context_sched_out().
    295	 */
    296	task = ctx->task;
    297	if (task == TASK_TOMBSTONE) {
    298		raw_spin_unlock_irq(&ctx->lock);
    299		return;
    300	}
    301	if (ctx->is_active) {
    302		raw_spin_unlock_irq(&ctx->lock);
    303		goto again;
    304	}
    305	func(event, NULL, ctx, data);
    306	raw_spin_unlock_irq(&ctx->lock);
    307}
    308
    309/*
    310 * Similar to event_function_call() + event_function(), but hard assumes IRQs
    311 * are already disabled and we're on the right CPU.
    312 */
    313static void event_function_local(struct perf_event *event, event_f func, void *data)
    314{
    315	struct perf_event_context *ctx = event->ctx;
    316	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
    317	struct task_struct *task = READ_ONCE(ctx->task);
    318	struct perf_event_context *task_ctx = NULL;
    319
    320	lockdep_assert_irqs_disabled();
    321
    322	if (task) {
    323		if (task == TASK_TOMBSTONE)
    324			return;
    325
    326		task_ctx = ctx;
    327	}
    328
    329	perf_ctx_lock(cpuctx, task_ctx);
    330
    331	task = ctx->task;
    332	if (task == TASK_TOMBSTONE)
    333		goto unlock;
    334
    335	if (task) {
    336		/*
    337		 * We must be either inactive or active and the right task,
    338		 * otherwise we're screwed, since we cannot IPI to somewhere
    339		 * else.
    340		 */
    341		if (ctx->is_active) {
    342			if (WARN_ON_ONCE(task != current))
    343				goto unlock;
    344
    345			if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
    346				goto unlock;
    347		}
    348	} else {
    349		WARN_ON_ONCE(&cpuctx->ctx != ctx);
    350	}
    351
    352	func(event, cpuctx, ctx, data);
    353unlock:
    354	perf_ctx_unlock(cpuctx, task_ctx);
    355}
    356
    357#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
    358		       PERF_FLAG_FD_OUTPUT  |\
    359		       PERF_FLAG_PID_CGROUP |\
    360		       PERF_FLAG_FD_CLOEXEC)
    361
    362/*
    363 * branch priv levels that need permission checks
    364 */
    365#define PERF_SAMPLE_BRANCH_PERM_PLM \
    366	(PERF_SAMPLE_BRANCH_KERNEL |\
    367	 PERF_SAMPLE_BRANCH_HV)
    368
    369enum event_type_t {
    370	EVENT_FLEXIBLE = 0x1,
    371	EVENT_PINNED = 0x2,
    372	EVENT_TIME = 0x4,
    373	/* see ctx_resched() for details */
    374	EVENT_CPU = 0x8,
    375	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
    376};
    377
    378/*
    379 * perf_sched_events : >0 events exist
    380 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
    381 */
    382
    383static void perf_sched_delayed(struct work_struct *work);
    384DEFINE_STATIC_KEY_FALSE(perf_sched_events);
    385static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
    386static DEFINE_MUTEX(perf_sched_mutex);
    387static atomic_t perf_sched_count;
    388
    389static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
    390static DEFINE_PER_CPU(int, perf_sched_cb_usages);
    391static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
    392
    393static atomic_t nr_mmap_events __read_mostly;
    394static atomic_t nr_comm_events __read_mostly;
    395static atomic_t nr_namespaces_events __read_mostly;
    396static atomic_t nr_task_events __read_mostly;
    397static atomic_t nr_freq_events __read_mostly;
    398static atomic_t nr_switch_events __read_mostly;
    399static atomic_t nr_ksymbol_events __read_mostly;
    400static atomic_t nr_bpf_events __read_mostly;
    401static atomic_t nr_cgroup_events __read_mostly;
    402static atomic_t nr_text_poke_events __read_mostly;
    403static atomic_t nr_build_id_events __read_mostly;
    404
    405static LIST_HEAD(pmus);
    406static DEFINE_MUTEX(pmus_lock);
    407static struct srcu_struct pmus_srcu;
    408static cpumask_var_t perf_online_mask;
    409static struct kmem_cache *perf_event_cache;
    410
    411/*
    412 * perf event paranoia level:
    413 *  -1 - not paranoid at all
    414 *   0 - disallow raw tracepoint access for unpriv
    415 *   1 - disallow cpu events for unpriv
    416 *   2 - disallow kernel profiling for unpriv
    417 */
    418int sysctl_perf_event_paranoid __read_mostly = 2;
    419
    420/* Minimum for 512 kiB + 1 user control page */
    421int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
    422
    423/*
    424 * max perf event sample rate
    425 */
    426#define DEFAULT_MAX_SAMPLE_RATE		100000
    427#define DEFAULT_SAMPLE_PERIOD_NS	(NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
    428#define DEFAULT_CPU_TIME_MAX_PERCENT	25
    429
    430int sysctl_perf_event_sample_rate __read_mostly	= DEFAULT_MAX_SAMPLE_RATE;
    431
    432static int max_samples_per_tick __read_mostly	= DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
    433static int perf_sample_period_ns __read_mostly	= DEFAULT_SAMPLE_PERIOD_NS;
    434
    435static int perf_sample_allowed_ns __read_mostly =
    436	DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
    437
    438static void update_perf_cpu_limits(void)
    439{
    440	u64 tmp = perf_sample_period_ns;
    441
    442	tmp *= sysctl_perf_cpu_time_max_percent;
    443	tmp = div_u64(tmp, 100);
    444	if (!tmp)
    445		tmp = 1;
    446
    447	WRITE_ONCE(perf_sample_allowed_ns, tmp);
    448}
    449
    450static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
    451
    452int perf_proc_update_handler(struct ctl_table *table, int write,
    453		void *buffer, size_t *lenp, loff_t *ppos)
    454{
    455	int ret;
    456	int perf_cpu = sysctl_perf_cpu_time_max_percent;
    457	/*
    458	 * If throttling is disabled don't allow the write:
    459	 */
    460	if (write && (perf_cpu == 100 || perf_cpu == 0))
    461		return -EINVAL;
    462
    463	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
    464	if (ret || !write)
    465		return ret;
    466
    467	max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
    468	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
    469	update_perf_cpu_limits();
    470
    471	return 0;
    472}
    473
    474int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
    475
    476int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
    477		void *buffer, size_t *lenp, loff_t *ppos)
    478{
    479	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
    480
    481	if (ret || !write)
    482		return ret;
    483
    484	if (sysctl_perf_cpu_time_max_percent == 100 ||
    485	    sysctl_perf_cpu_time_max_percent == 0) {
    486		printk(KERN_WARNING
    487		       "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
    488		WRITE_ONCE(perf_sample_allowed_ns, 0);
    489	} else {
    490		update_perf_cpu_limits();
    491	}
    492
    493	return 0;
    494}
    495
    496/*
    497 * perf samples are done in some very critical code paths (NMIs).
    498 * If they take too much CPU time, the system can lock up and not
    499 * get any real work done.  This will drop the sample rate when
    500 * we detect that events are taking too long.
    501 */
    502#define NR_ACCUMULATED_SAMPLES 128
    503static DEFINE_PER_CPU(u64, running_sample_length);
    504
    505static u64 __report_avg;
    506static u64 __report_allowed;
    507
    508static void perf_duration_warn(struct irq_work *w)
    509{
    510	printk_ratelimited(KERN_INFO
    511		"perf: interrupt took too long (%lld > %lld), lowering "
    512		"kernel.perf_event_max_sample_rate to %d\n",
    513		__report_avg, __report_allowed,
    514		sysctl_perf_event_sample_rate);
    515}
    516
    517static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
    518
    519void perf_sample_event_took(u64 sample_len_ns)
    520{
    521	u64 max_len = READ_ONCE(perf_sample_allowed_ns);
    522	u64 running_len;
    523	u64 avg_len;
    524	u32 max;
    525
    526	if (max_len == 0)
    527		return;
    528
    529	/* Decay the counter by 1 average sample. */
    530	running_len = __this_cpu_read(running_sample_length);
    531	running_len -= running_len/NR_ACCUMULATED_SAMPLES;
    532	running_len += sample_len_ns;
    533	__this_cpu_write(running_sample_length, running_len);
    534
    535	/*
    536	 * Note: this will be biased artifically low until we have
    537	 * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
    538	 * from having to maintain a count.
    539	 */
    540	avg_len = running_len/NR_ACCUMULATED_SAMPLES;
    541	if (avg_len <= max_len)
    542		return;
    543
    544	__report_avg = avg_len;
    545	__report_allowed = max_len;
    546
    547	/*
    548	 * Compute a throttle threshold 25% below the current duration.
    549	 */
    550	avg_len += avg_len / 4;
    551	max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
    552	if (avg_len < max)
    553		max /= (u32)avg_len;
    554	else
    555		max = 1;
    556
    557	WRITE_ONCE(perf_sample_allowed_ns, avg_len);
    558	WRITE_ONCE(max_samples_per_tick, max);
    559
    560	sysctl_perf_event_sample_rate = max * HZ;
    561	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
    562
    563	if (!irq_work_queue(&perf_duration_work)) {
    564		early_printk("perf: interrupt took too long (%lld > %lld), lowering "
    565			     "kernel.perf_event_max_sample_rate to %d\n",
    566			     __report_avg, __report_allowed,
    567			     sysctl_perf_event_sample_rate);
    568	}
    569}
    570
    571static atomic64_t perf_event_id;
    572
    573static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
    574			      enum event_type_t event_type);
    575
    576static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
    577			     enum event_type_t event_type);
    578
    579static void update_context_time(struct perf_event_context *ctx);
    580static u64 perf_event_time(struct perf_event *event);
    581
    582void __weak perf_event_print_debug(void)	{ }
    583
    584static inline u64 perf_clock(void)
    585{
    586	return local_clock();
    587}
    588
    589static inline u64 perf_event_clock(struct perf_event *event)
    590{
    591	return event->clock();
    592}
    593
/*
 * State based event timekeeping...
 *
 * The basic idea is to use event->state to determine which (if any) time
 * fields to increment with the current delta. This means we only need to
 * update timestamps when we change state or when they are explicitly requested
 * (read).
 *
 * Event groups make things a little more complicated, but not terribly so. The
 * rules for a group are that if the group leader is OFF the entire group is
 * OFF, irrespective of what the group member states are. This results in
 * __perf_effective_state().
 *
 * A further ramification is that when a group leader flips between OFF and
 * !OFF, we need to update all group member times.
 *
 *
 * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
 * need to make sure the relevant context time is updated before we try and
 * update our timestamps.
 */
    615
    616static __always_inline enum perf_event_state
    617__perf_effective_state(struct perf_event *event)
    618{
    619	struct perf_event *leader = event->group_leader;
    620
    621	if (leader->state <= PERF_EVENT_STATE_OFF)
    622		return leader->state;
    623
    624	return event->state;
    625}
    626
    627static __always_inline void
    628__perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
    629{
    630	enum perf_event_state state = __perf_effective_state(event);
    631	u64 delta = now - event->tstamp;
    632
    633	*enabled = event->total_time_enabled;
    634	if (state >= PERF_EVENT_STATE_INACTIVE)
    635		*enabled += delta;
    636
    637	*running = event->total_time_running;
    638	if (state >= PERF_EVENT_STATE_ACTIVE)
    639		*running += delta;
    640}
    641
    642static void perf_event_update_time(struct perf_event *event)
    643{
    644	u64 now = perf_event_time(event);
    645
    646	__perf_update_times(event, now, &event->total_time_enabled,
    647					&event->total_time_running);
    648	event->tstamp = now;
    649}
    650
    651static void perf_event_update_sibling_time(struct perf_event *leader)
    652{
    653	struct perf_event *sibling;
    654
    655	for_each_sibling_event(sibling, leader)
    656		perf_event_update_time(sibling);
    657}
    658
    659static void
    660perf_event_set_state(struct perf_event *event, enum perf_event_state state)
    661{
    662	if (event->state == state)
    663		return;
    664
    665	perf_event_update_time(event);
    666	/*
    667	 * If a group leader gets enabled/disabled all its siblings
    668	 * are affected too.
    669	 */
    670	if ((event->state < 0) ^ (state < 0))
    671		perf_event_update_sibling_time(event);
    672
    673	WRITE_ONCE(event->state, state);
    674}
    675
    676/*
    677 * UP store-release, load-acquire
    678 */
    679
    680#define __store_release(ptr, val)					\
    681do {									\
    682	barrier();							\
    683	WRITE_ONCE(*(ptr), (val));					\
    684} while (0)
    685
    686#define __load_acquire(ptr)						\
    687({									\
    688	__unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr));	\
    689	barrier();							\
    690	___p;								\
    691})
    692
    693#ifdef CONFIG_CGROUP_PERF
    694
    695static inline bool
    696perf_cgroup_match(struct perf_event *event)
    697{
    698	struct perf_event_context *ctx = event->ctx;
    699	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
    700
    701	/* @event doesn't care about cgroup */
    702	if (!event->cgrp)
    703		return true;
    704
    705	/* wants specific cgroup scope but @cpuctx isn't associated with any */
    706	if (!cpuctx->cgrp)
    707		return false;
    708
    709	/*
    710	 * Cgroup scoping is recursive.  An event enabled for a cgroup is
    711	 * also enabled for all its descendant cgroups.  If @cpuctx's
    712	 * cgroup is a descendant of @event's (the test covers identity
    713	 * case), it's a match.
    714	 */
    715	return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
    716				    event->cgrp->css.cgroup);
    717}
    718
    719static inline void perf_detach_cgroup(struct perf_event *event)
    720{
    721	css_put(&event->cgrp->css);
    722	event->cgrp = NULL;
    723}
    724
    725static inline int is_cgroup_event(struct perf_event *event)
    726{
    727	return event->cgrp != NULL;
    728}
    729
    730static inline u64 perf_cgroup_event_time(struct perf_event *event)
    731{
    732	struct perf_cgroup_info *t;
    733
    734	t = per_cpu_ptr(event->cgrp->info, event->cpu);
    735	return t->time;
    736}
    737
    738static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
    739{
    740	struct perf_cgroup_info *t;
    741
    742	t = per_cpu_ptr(event->cgrp->info, event->cpu);
    743	if (!__load_acquire(&t->active))
    744		return t->time;
    745	now += READ_ONCE(t->timeoffset);
    746	return now;
    747}
    748
    749static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv)
    750{
    751	if (adv)
    752		info->time += now - info->timestamp;
    753	info->timestamp = now;
    754	/*
    755	 * see update_context_time()
    756	 */
    757	WRITE_ONCE(info->timeoffset, info->time - info->timestamp);
    758}
    759
    760static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
    761{
    762	struct perf_cgroup *cgrp = cpuctx->cgrp;
    763	struct cgroup_subsys_state *css;
    764	struct perf_cgroup_info *info;
    765
    766	if (cgrp) {
    767		u64 now = perf_clock();
    768
    769		for (css = &cgrp->css; css; css = css->parent) {
    770			cgrp = container_of(css, struct perf_cgroup, css);
    771			info = this_cpu_ptr(cgrp->info);
    772
    773			__update_cgrp_time(info, now, true);
    774			if (final)
    775				__store_release(&info->active, 0);
    776		}
    777	}
    778}
    779
    780static inline void update_cgrp_time_from_event(struct perf_event *event)
    781{
    782	struct perf_cgroup_info *info;
    783
    784	/*
    785	 * ensure we access cgroup data only when needed and
    786	 * when we know the cgroup is pinned (css_get)
    787	 */
    788	if (!is_cgroup_event(event))
    789		return;
    790
    791	info = this_cpu_ptr(event->cgrp->info);
    792	/*
    793	 * Do not update time when cgroup is not active
    794	 */
    795	if (info->active)
    796		__update_cgrp_time(info, perf_clock(), true);
    797}
    798
    799static inline void
    800perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
    801{
    802	struct perf_event_context *ctx = &cpuctx->ctx;
    803	struct perf_cgroup *cgrp = cpuctx->cgrp;
    804	struct perf_cgroup_info *info;
    805	struct cgroup_subsys_state *css;
    806
    807	/*
    808	 * ctx->lock held by caller
    809	 * ensure we do not access cgroup data
    810	 * unless we have the cgroup pinned (css_get)
    811	 */
    812	if (!cgrp)
    813		return;
    814
    815	WARN_ON_ONCE(!ctx->nr_cgroups);
    816
    817	for (css = &cgrp->css; css; css = css->parent) {
    818		cgrp = container_of(css, struct perf_cgroup, css);
    819		info = this_cpu_ptr(cgrp->info);
    820		__update_cgrp_time(info, ctx->timestamp, false);
    821		__store_release(&info->active, 1);
    822	}
    823}
    824
    825static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
    826
    827/*
    828 * reschedule events based on the cgroup constraint of task.
    829 */
    830static void perf_cgroup_switch(struct task_struct *task)
    831{
    832	struct perf_cgroup *cgrp;
    833	struct perf_cpu_context *cpuctx, *tmp;
    834	struct list_head *list;
    835	unsigned long flags;
    836
    837	/*
    838	 * Disable interrupts and preemption to avoid this CPU's
    839	 * cgrp_cpuctx_entry to change under us.
    840	 */
    841	local_irq_save(flags);
    842
    843	cgrp = perf_cgroup_from_task(task, NULL);
    844
    845	list = this_cpu_ptr(&cgrp_cpuctx_list);
    846	list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
    847		WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
    848		if (READ_ONCE(cpuctx->cgrp) == cgrp)
    849			continue;
    850
    851		perf_ctx_lock(cpuctx, cpuctx->task_ctx);
    852		perf_pmu_disable(cpuctx->ctx.pmu);
    853
    854		cpu_ctx_sched_out(cpuctx, EVENT_ALL);
    855		/*
    856		 * must not be done before ctxswout due
    857		 * to update_cgrp_time_from_cpuctx() in
    858		 * ctx_sched_out()
    859		 */
    860		cpuctx->cgrp = cgrp;
    861		/*
    862		 * set cgrp before ctxsw in to allow
    863		 * perf_cgroup_set_timestamp() in ctx_sched_in()
    864		 * to not have to pass task around
    865		 */
    866		cpu_ctx_sched_in(cpuctx, EVENT_ALL);
    867
    868		perf_pmu_enable(cpuctx->ctx.pmu);
    869		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
    870	}
    871
    872	local_irq_restore(flags);
    873}
    874
    875static int perf_cgroup_ensure_storage(struct perf_event *event,
    876				struct cgroup_subsys_state *css)
    877{
    878	struct perf_cpu_context *cpuctx;
    879	struct perf_event **storage;
    880	int cpu, heap_size, ret = 0;
    881
    882	/*
    883	 * Allow storage to have sufficent space for an iterator for each
    884	 * possibly nested cgroup plus an iterator for events with no cgroup.
    885	 */
    886	for (heap_size = 1; css; css = css->parent)
    887		heap_size++;
    888
    889	for_each_possible_cpu(cpu) {
    890		cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu);
    891		if (heap_size <= cpuctx->heap_size)
    892			continue;
    893
    894		storage = kmalloc_node(heap_size * sizeof(struct perf_event *),
    895				       GFP_KERNEL, cpu_to_node(cpu));
    896		if (!storage) {
    897			ret = -ENOMEM;
    898			break;
    899		}
    900
    901		raw_spin_lock_irq(&cpuctx->ctx.lock);
    902		if (cpuctx->heap_size < heap_size) {
    903			swap(cpuctx->heap, storage);
    904			if (storage == cpuctx->heap_default)
    905				storage = NULL;
    906			cpuctx->heap_size = heap_size;
    907		}
    908		raw_spin_unlock_irq(&cpuctx->ctx.lock);
    909
    910		kfree(storage);
    911	}
    912
    913	return ret;
    914}
    915
    916static inline int perf_cgroup_connect(int fd, struct perf_event *event,
    917				      struct perf_event_attr *attr,
    918				      struct perf_event *group_leader)
    919{
    920	struct perf_cgroup *cgrp;
    921	struct cgroup_subsys_state *css;
    922	struct fd f = fdget(fd);
    923	int ret = 0;
    924
    925	if (!f.file)
    926		return -EBADF;
    927
    928	css = css_tryget_online_from_dir(f.file->f_path.dentry,
    929					 &perf_event_cgrp_subsys);
    930	if (IS_ERR(css)) {
    931		ret = PTR_ERR(css);
    932		goto out;
    933	}
    934
    935	ret = perf_cgroup_ensure_storage(event, css);
    936	if (ret)
    937		goto out;
    938
    939	cgrp = container_of(css, struct perf_cgroup, css);
    940	event->cgrp = cgrp;
    941
    942	/*
    943	 * all events in a group must monitor
    944	 * the same cgroup because a task belongs
    945	 * to only one perf cgroup at a time
    946	 */
    947	if (group_leader && group_leader->cgrp != cgrp) {
    948		perf_detach_cgroup(event);
    949		ret = -EINVAL;
    950	}
    951out:
    952	fdput(f);
    953	return ret;
    954}
    955
    956static inline void
    957perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
    958{
    959	struct perf_cpu_context *cpuctx;
    960
    961	if (!is_cgroup_event(event))
    962		return;
    963
    964	/*
    965	 * Because cgroup events are always per-cpu events,
    966	 * @ctx == &cpuctx->ctx.
    967	 */
    968	cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
    969
    970	if (ctx->nr_cgroups++)
    971		return;
    972
    973	cpuctx->cgrp = perf_cgroup_from_task(current, ctx);
    974	list_add(&cpuctx->cgrp_cpuctx_entry,
    975			per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
    976}
    977
    978static inline void
    979perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
    980{
    981	struct perf_cpu_context *cpuctx;
    982
    983	if (!is_cgroup_event(event))
    984		return;
    985
    986	/*
    987	 * Because cgroup events are always per-cpu events,
    988	 * @ctx == &cpuctx->ctx.
    989	 */
    990	cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
    991
    992	if (--ctx->nr_cgroups)
    993		return;
    994
    995	cpuctx->cgrp = NULL;
    996	list_del(&cpuctx->cgrp_cpuctx_entry);
    997}
    998
    999#else /* !CONFIG_CGROUP_PERF */
   1000
   1001static inline bool
   1002perf_cgroup_match(struct perf_event *event)
   1003{
   1004	return true;
   1005}
   1006
   1007static inline void perf_detach_cgroup(struct perf_event *event)
   1008{}
   1009
   1010static inline int is_cgroup_event(struct perf_event *event)
   1011{
   1012	return 0;
   1013}
   1014
   1015static inline void update_cgrp_time_from_event(struct perf_event *event)
   1016{
   1017}
   1018
   1019static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
   1020						bool final)
   1021{
   1022}
   1023
   1024static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
   1025				      struct perf_event_attr *attr,
   1026				      struct perf_event *group_leader)
   1027{
   1028	return -EINVAL;
   1029}
   1030
   1031static inline void
   1032perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
   1033{
   1034}
   1035
   1036static inline u64 perf_cgroup_event_time(struct perf_event *event)
   1037{
   1038	return 0;
   1039}
   1040
   1041static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
   1042{
   1043	return 0;
   1044}
   1045
   1046static inline void
   1047perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
   1048{
   1049}
   1050
   1051static inline void
   1052perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
   1053{
   1054}
   1055
   1056static void perf_cgroup_switch(struct task_struct *task)
   1057{
   1058}
   1059#endif
   1060
   1061/*
   1062 * set default to be dependent on timer tick just
   1063 * like original code
   1064 */
   1065#define PERF_CPU_HRTIMER (1000 / HZ)
   1066/*
   1067 * function must be called with interrupts disabled
   1068 */
   1069static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
   1070{
   1071	struct perf_cpu_context *cpuctx;
   1072	bool rotations;
   1073
   1074	lockdep_assert_irqs_disabled();
   1075
   1076	cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
   1077	rotations = perf_rotate_context(cpuctx);
   1078
   1079	raw_spin_lock(&cpuctx->hrtimer_lock);
   1080	if (rotations)
   1081		hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
   1082	else
   1083		cpuctx->hrtimer_active = 0;
   1084	raw_spin_unlock(&cpuctx->hrtimer_lock);
   1085
   1086	return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
   1087}
   1088
   1089static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
   1090{
   1091	struct hrtimer *timer = &cpuctx->hrtimer;
   1092	struct pmu *pmu = cpuctx->ctx.pmu;
   1093	u64 interval;
   1094
   1095	/* no multiplexing needed for SW PMU */
   1096	if (pmu->task_ctx_nr == perf_sw_context)
   1097		return;
   1098
   1099	/*
   1100	 * check default is sane, if not set then force to
   1101	 * default interval (1/tick)
   1102	 */
   1103	interval = pmu->hrtimer_interval_ms;
   1104	if (interval < 1)
   1105		interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
   1106
   1107	cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
   1108
   1109	raw_spin_lock_init(&cpuctx->hrtimer_lock);
   1110	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
   1111	timer->function = perf_mux_hrtimer_handler;
   1112}
   1113
   1114static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
   1115{
   1116	struct hrtimer *timer = &cpuctx->hrtimer;
   1117	struct pmu *pmu = cpuctx->ctx.pmu;
   1118	unsigned long flags;
   1119
   1120	/* not for SW PMU */
   1121	if (pmu->task_ctx_nr == perf_sw_context)
   1122		return 0;
   1123
   1124	raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
   1125	if (!cpuctx->hrtimer_active) {
   1126		cpuctx->hrtimer_active = 1;
   1127		hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
   1128		hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
   1129	}
   1130	raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
   1131
   1132	return 0;
   1133}
   1134
   1135void perf_pmu_disable(struct pmu *pmu)
   1136{
   1137	int *count = this_cpu_ptr(pmu->pmu_disable_count);
   1138	if (!(*count)++)
   1139		pmu->pmu_disable(pmu);
   1140}
   1141
   1142void perf_pmu_enable(struct pmu *pmu)
   1143{
   1144	int *count = this_cpu_ptr(pmu->pmu_disable_count);
   1145	if (!--(*count))
   1146		pmu->pmu_enable(pmu);
   1147}
   1148
   1149static DEFINE_PER_CPU(struct list_head, active_ctx_list);
   1150
   1151/*
   1152 * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
   1153 * perf_event_task_tick() are fully serialized because they're strictly cpu
   1154 * affine and perf_event_ctx{activate,deactivate} are called with IRQs
   1155 * disabled, while perf_event_task_tick is called from IRQ context.
   1156 */
   1157static void perf_event_ctx_activate(struct perf_event_context *ctx)
   1158{
   1159	struct list_head *head = this_cpu_ptr(&active_ctx_list);
   1160
   1161	lockdep_assert_irqs_disabled();
   1162
   1163	WARN_ON(!list_empty(&ctx->active_ctx_list));
   1164
   1165	list_add(&ctx->active_ctx_list, head);
   1166}
   1167
   1168static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
   1169{
   1170	lockdep_assert_irqs_disabled();
   1171
   1172	WARN_ON(list_empty(&ctx->active_ctx_list));
   1173
   1174	list_del_init(&ctx->active_ctx_list);
   1175}
   1176
   1177static void get_ctx(struct perf_event_context *ctx)
   1178{
   1179	refcount_inc(&ctx->refcount);
   1180}
   1181
   1182static void *alloc_task_ctx_data(struct pmu *pmu)
   1183{
   1184	if (pmu->task_ctx_cache)
   1185		return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
   1186
   1187	return NULL;
   1188}
   1189
   1190static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
   1191{
   1192	if (pmu->task_ctx_cache && task_ctx_data)
   1193		kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
   1194}
   1195
   1196static void free_ctx(struct rcu_head *head)
   1197{
   1198	struct perf_event_context *ctx;
   1199
   1200	ctx = container_of(head, struct perf_event_context, rcu_head);
   1201	free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);
   1202	kfree(ctx);
   1203}
   1204
   1205static void put_ctx(struct perf_event_context *ctx)
   1206{
   1207	if (refcount_dec_and_test(&ctx->refcount)) {
   1208		if (ctx->parent_ctx)
   1209			put_ctx(ctx->parent_ctx);
   1210		if (ctx->task && ctx->task != TASK_TOMBSTONE)
   1211			put_task_struct(ctx->task);
   1212		call_rcu(&ctx->rcu_head, free_ctx);
   1213	}
   1214}
   1215
   1216/*
   1217 * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
   1218 * perf_pmu_migrate_context() we need some magic.
   1219 *
   1220 * Those places that change perf_event::ctx will hold both
   1221 * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
   1222 *
   1223 * Lock ordering is by mutex address. There are two other sites where
   1224 * perf_event_context::mutex nests and those are:
   1225 *
   1226 *  - perf_event_exit_task_context()	[ child , 0 ]
   1227 *      perf_event_exit_event()
   1228 *        put_event()			[ parent, 1 ]
   1229 *
   1230 *  - perf_event_init_context()		[ parent, 0 ]
   1231 *      inherit_task_group()
   1232 *        inherit_group()
   1233 *          inherit_event()
   1234 *            perf_event_alloc()
   1235 *              perf_init_event()
   1236 *                perf_try_init_event()	[ child , 1 ]
   1237 *
   1238 * While it appears there is an obvious deadlock here -- the parent and child
   1239 * nesting levels are inverted between the two. This is in fact safe because
   1240 * life-time rules separate them. That is an exiting task cannot fork, and a
   1241 * spawning task cannot (yet) exit.
   1242 *
   1243 * But remember that these are parent<->child context relations, and
   1244 * migration does not affect children, therefore these two orderings should not
   1245 * interact.
   1246 *
   1247 * The change in perf_event::ctx does not affect children (as claimed above)
   1248 * because the sys_perf_event_open() case will install a new event and break
   1249 * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
   1250 * concerned with cpuctx and that doesn't have children.
   1251 *
   1252 * The places that change perf_event::ctx will issue:
   1253 *
   1254 *   perf_remove_from_context();
   1255 *   synchronize_rcu();
   1256 *   perf_install_in_context();
   1257 *
   1258 * to affect the change. The remove_from_context() + synchronize_rcu() should
   1259 * quiesce the event, after which we can install it in the new location. This
   1260 * means that only external vectors (perf_fops, prctl) can perturb the event
   1261 * while in transit. Therefore all such accessors should also acquire
   1262 * perf_event_context::mutex to serialize against this.
   1263 *
   1264 * However; because event->ctx can change while we're waiting to acquire
   1265 * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
   1266 * function.
   1267 *
   1268 * Lock order:
   1269 *    exec_update_lock
   1270 *	task_struct::perf_event_mutex
   1271 *	  perf_event_context::mutex
   1272 *	    perf_event::child_mutex;
   1273 *	      perf_event_context::lock
   1274 *	    perf_event::mmap_mutex
   1275 *	    mmap_lock
   1276 *	      perf_addr_filters_head::lock
   1277 *
   1278 *    cpu_hotplug_lock
   1279 *      pmus_lock
   1280 *	  cpuctx->mutex / perf_event_context::mutex
   1281 */
   1282static struct perf_event_context *
   1283perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
   1284{
   1285	struct perf_event_context *ctx;
   1286
   1287again:
   1288	rcu_read_lock();
   1289	ctx = READ_ONCE(event->ctx);
   1290	if (!refcount_inc_not_zero(&ctx->refcount)) {
   1291		rcu_read_unlock();
   1292		goto again;
   1293	}
   1294	rcu_read_unlock();
   1295
   1296	mutex_lock_nested(&ctx->mutex, nesting);
   1297	if (event->ctx != ctx) {
   1298		mutex_unlock(&ctx->mutex);
   1299		put_ctx(ctx);
   1300		goto again;
   1301	}
   1302
   1303	return ctx;
   1304}
   1305
   1306static inline struct perf_event_context *
   1307perf_event_ctx_lock(struct perf_event *event)
   1308{
   1309	return perf_event_ctx_lock_nested(event, 0);
   1310}
   1311
   1312static void perf_event_ctx_unlock(struct perf_event *event,
   1313				  struct perf_event_context *ctx)
   1314{
   1315	mutex_unlock(&ctx->mutex);
   1316	put_ctx(ctx);
   1317}
   1318
   1319/*
   1320 * This must be done under the ctx->lock, such as to serialize against
   1321 * context_equiv(), therefore we cannot call put_ctx() since that might end up
   1322 * calling scheduler related locks and ctx->lock nests inside those.
   1323 */
   1324static __must_check struct perf_event_context *
   1325unclone_ctx(struct perf_event_context *ctx)
   1326{
   1327	struct perf_event_context *parent_ctx = ctx->parent_ctx;
   1328
   1329	lockdep_assert_held(&ctx->lock);
   1330
   1331	if (parent_ctx)
   1332		ctx->parent_ctx = NULL;
   1333	ctx->generation++;
   1334
   1335	return parent_ctx;
   1336}
   1337
   1338static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
   1339				enum pid_type type)
   1340{
   1341	u32 nr;
   1342	/*
   1343	 * only top level events have the pid namespace they were created in
   1344	 */
   1345	if (event->parent)
   1346		event = event->parent;
   1347
   1348	nr = __task_pid_nr_ns(p, type, event->ns);
   1349	/* avoid -1 if it is idle thread or runs in another ns */
   1350	if (!nr && !pid_alive(p))
   1351		nr = -1;
   1352	return nr;
   1353}
   1354
   1355static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
   1356{
   1357	return perf_event_pid_type(event, p, PIDTYPE_TGID);
   1358}
   1359
   1360static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
   1361{
   1362	return perf_event_pid_type(event, p, PIDTYPE_PID);
   1363}
   1364
   1365/*
   1366 * If we inherit events we want to return the parent event id
   1367 * to userspace.
   1368 */
   1369static u64 primary_event_id(struct perf_event *event)
   1370{
   1371	u64 id = event->id;
   1372
   1373	if (event->parent)
   1374		id = event->parent->id;
   1375
   1376	return id;
   1377}
   1378
   1379/*
   1380 * Get the perf_event_context for a task and lock it.
   1381 *
   1382 * This has to cope with the fact that until it is locked,
   1383 * the context could get moved to another task.
   1384 */
   1385static struct perf_event_context *
   1386perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
   1387{
   1388	struct perf_event_context *ctx;
   1389
   1390retry:
   1391	/*
   1392	 * One of the few rules of preemptible RCU is that one cannot do
   1393	 * rcu_read_unlock() while holding a scheduler (or nested) lock when
   1394	 * part of the read side critical section was irqs-enabled -- see
   1395	 * rcu_read_unlock_special().
   1396	 *
   1397	 * Since ctx->lock nests under rq->lock we must ensure the entire read
   1398	 * side critical section has interrupts disabled.
   1399	 */
   1400	local_irq_save(*flags);
   1401	rcu_read_lock();
   1402	ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
   1403	if (ctx) {
   1404		/*
   1405		 * If this context is a clone of another, it might
   1406		 * get swapped for another underneath us by
   1407		 * perf_event_task_sched_out, though the
   1408		 * rcu_read_lock() protects us from any context
   1409		 * getting freed.  Lock the context and check if it
   1410		 * got swapped before we could get the lock, and retry
   1411		 * if so.  If we locked the right context, then it
   1412		 * can't get swapped on us any more.
   1413		 */
   1414		raw_spin_lock(&ctx->lock);
   1415		if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
   1416			raw_spin_unlock(&ctx->lock);
   1417			rcu_read_unlock();
   1418			local_irq_restore(*flags);
   1419			goto retry;
   1420		}
   1421
   1422		if (ctx->task == TASK_TOMBSTONE ||
   1423		    !refcount_inc_not_zero(&ctx->refcount)) {
   1424			raw_spin_unlock(&ctx->lock);
   1425			ctx = NULL;
   1426		} else {
   1427			WARN_ON_ONCE(ctx->task != task);
   1428		}
   1429	}
   1430	rcu_read_unlock();
   1431	if (!ctx)
   1432		local_irq_restore(*flags);
   1433	return ctx;
   1434}
   1435
   1436/*
   1437 * Get the context for a task and increment its pin_count so it
   1438 * can't get swapped to another task.  This also increments its
   1439 * reference count so that the context can't get freed.
   1440 */
   1441static struct perf_event_context *
   1442perf_pin_task_context(struct task_struct *task, int ctxn)
   1443{
   1444	struct perf_event_context *ctx;
   1445	unsigned long flags;
   1446
   1447	ctx = perf_lock_task_context(task, ctxn, &flags);
   1448	if (ctx) {
   1449		++ctx->pin_count;
   1450		raw_spin_unlock_irqrestore(&ctx->lock, flags);
   1451	}
   1452	return ctx;
   1453}
   1454
   1455static void perf_unpin_context(struct perf_event_context *ctx)
   1456{
   1457	unsigned long flags;
   1458
   1459	raw_spin_lock_irqsave(&ctx->lock, flags);
   1460	--ctx->pin_count;
   1461	raw_spin_unlock_irqrestore(&ctx->lock, flags);
   1462}
   1463
   1464/*
   1465 * Update the record of the current time in a context.
   1466 */
   1467static void __update_context_time(struct perf_event_context *ctx, bool adv)
   1468{
   1469	u64 now = perf_clock();
   1470
   1471	if (adv)
   1472		ctx->time += now - ctx->timestamp;
   1473	ctx->timestamp = now;
   1474
   1475	/*
   1476	 * The above: time' = time + (now - timestamp), can be re-arranged
   1477	 * into: time` = now + (time - timestamp), which gives a single value
   1478	 * offset to compute future time without locks on.
   1479	 *
   1480	 * See perf_event_time_now(), which can be used from NMI context where
   1481	 * it's (obviously) not possible to acquire ctx->lock in order to read
   1482	 * both the above values in a consistent manner.
   1483	 */
   1484	WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
   1485}
   1486
   1487static void update_context_time(struct perf_event_context *ctx)
   1488{
   1489	__update_context_time(ctx, true);
   1490}
   1491
   1492static u64 perf_event_time(struct perf_event *event)
   1493{
   1494	struct perf_event_context *ctx = event->ctx;
   1495
   1496	if (unlikely(!ctx))
   1497		return 0;
   1498
   1499	if (is_cgroup_event(event))
   1500		return perf_cgroup_event_time(event);
   1501
   1502	return ctx->time;
   1503}
   1504
   1505static u64 perf_event_time_now(struct perf_event *event, u64 now)
   1506{
   1507	struct perf_event_context *ctx = event->ctx;
   1508
   1509	if (unlikely(!ctx))
   1510		return 0;
   1511
   1512	if (is_cgroup_event(event))
   1513		return perf_cgroup_event_time_now(event, now);
   1514
   1515	if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
   1516		return ctx->time;
   1517
   1518	now += READ_ONCE(ctx->timeoffset);
   1519	return now;
   1520}
   1521
   1522static enum event_type_t get_event_type(struct perf_event *event)
   1523{
   1524	struct perf_event_context *ctx = event->ctx;
   1525	enum event_type_t event_type;
   1526
   1527	lockdep_assert_held(&ctx->lock);
   1528
   1529	/*
   1530	 * It's 'group type', really, because if our group leader is
   1531	 * pinned, so are we.
   1532	 */
   1533	if (event->group_leader != event)
   1534		event = event->group_leader;
   1535
   1536	event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
   1537	if (!ctx->task)
   1538		event_type |= EVENT_CPU;
   1539
   1540	return event_type;
   1541}
   1542
   1543/*
   1544 * Helper function to initialize event group nodes.
   1545 */
   1546static void init_event_group(struct perf_event *event)
   1547{
   1548	RB_CLEAR_NODE(&event->group_node);
   1549	event->group_index = 0;
   1550}
   1551
   1552/*
   1553 * Extract pinned or flexible groups from the context
   1554 * based on event attrs bits.
   1555 */
   1556static struct perf_event_groups *
   1557get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
   1558{
   1559	if (event->attr.pinned)
   1560		return &ctx->pinned_groups;
   1561	else
   1562		return &ctx->flexible_groups;
   1563}
   1564
   1565/*
    1566 * Helper function to initialize a perf_event_groups tree.
   1567 */
   1568static void perf_event_groups_init(struct perf_event_groups *groups)
   1569{
   1570	groups->tree = RB_ROOT;
   1571	groups->index = 0;
   1572}
   1573
   1574static inline struct cgroup *event_cgroup(const struct perf_event *event)
   1575{
   1576	struct cgroup *cgroup = NULL;
   1577
   1578#ifdef CONFIG_CGROUP_PERF
   1579	if (event->cgrp)
   1580		cgroup = event->cgrp->css.cgroup;
   1581#endif
   1582
   1583	return cgroup;
   1584}
   1585
   1586/*
   1587 * Compare function for event groups;
   1588 *
    1589 * Implements a complex key that sorts by CPU, then (CONFIG_CGROUP_PERF) by cgroup,
    1590 * and finally by virtual index, which orders groups of the same CPU for rotation.
   1591 */
   1592static __always_inline int
   1593perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup,
   1594		      const u64 left_group_index, const struct perf_event *right)
   1595{
   1596	if (left_cpu < right->cpu)
   1597		return -1;
   1598	if (left_cpu > right->cpu)
   1599		return 1;
   1600
   1601#ifdef CONFIG_CGROUP_PERF
   1602	{
   1603		const struct cgroup *right_cgroup = event_cgroup(right);
   1604
   1605		if (left_cgroup != right_cgroup) {
   1606			if (!left_cgroup) {
   1607				/*
   1608				 * Left has no cgroup but right does, no
   1609				 * cgroups come first.
   1610				 */
   1611				return -1;
   1612			}
   1613			if (!right_cgroup) {
   1614				/*
   1615				 * Right has no cgroup but left does, no
   1616				 * cgroups come first.
   1617				 */
   1618				return 1;
   1619			}
   1620			/* Two dissimilar cgroups, order by id. */
   1621			if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup))
   1622				return -1;
   1623
   1624			return 1;
   1625		}
   1626	}
   1627#endif
   1628
   1629	if (left_group_index < right->group_index)
   1630		return -1;
   1631	if (left_group_index > right->group_index)
   1632		return 1;
   1633
   1634	return 0;
   1635}
   1636
   1637#define __node_2_pe(node) \
   1638	rb_entry((node), struct perf_event, group_node)
   1639
   1640static inline bool __group_less(struct rb_node *a, const struct rb_node *b)
   1641{
   1642	struct perf_event *e = __node_2_pe(a);
   1643	return perf_event_groups_cmp(e->cpu, event_cgroup(e), e->group_index,
   1644				     __node_2_pe(b)) < 0;
   1645}
   1646
   1647struct __group_key {
   1648	int cpu;
   1649	struct cgroup *cgroup;
   1650};
   1651
   1652static inline int __group_cmp(const void *key, const struct rb_node *node)
   1653{
   1654	const struct __group_key *a = key;
   1655	const struct perf_event *b = __node_2_pe(node);
   1656
   1657	/* partial/subtree match: @cpu, @cgroup; ignore: @group_index */
   1658	return perf_event_groups_cmp(a->cpu, a->cgroup, b->group_index, b);
   1659}
   1660
   1661/*
   1662 * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
    1663 * key (see __group_less()). This places it last inside the CPU
   1664 * subtree.
   1665 */
   1666static void
   1667perf_event_groups_insert(struct perf_event_groups *groups,
   1668			 struct perf_event *event)
   1669{
   1670	event->group_index = ++groups->index;
   1671
   1672	rb_add(&event->group_node, &groups->tree, __group_less);
   1673}
   1674
   1675/*
   1676 * Helper function to insert event into the pinned or flexible groups.
   1677 */
   1678static void
   1679add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
   1680{
   1681	struct perf_event_groups *groups;
   1682
   1683	groups = get_event_groups(event, ctx);
   1684	perf_event_groups_insert(groups, event);
   1685}
   1686
   1687/*
   1688 * Delete a group from a tree.
   1689 */
   1690static void
   1691perf_event_groups_delete(struct perf_event_groups *groups,
   1692			 struct perf_event *event)
   1693{
   1694	WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
   1695		     RB_EMPTY_ROOT(&groups->tree));
   1696
   1697	rb_erase(&event->group_node, &groups->tree);
   1698	init_event_group(event);
   1699}
   1700
   1701/*
   1702 * Helper function to delete event from its groups.
   1703 */
   1704static void
   1705del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
   1706{
   1707	struct perf_event_groups *groups;
   1708
   1709	groups = get_event_groups(event, ctx);
   1710	perf_event_groups_delete(groups, event);
   1711}
   1712
   1713/*
   1714 * Get the leftmost event in the cpu/cgroup subtree.
   1715 */
   1716static struct perf_event *
   1717perf_event_groups_first(struct perf_event_groups *groups, int cpu,
   1718			struct cgroup *cgrp)
   1719{
   1720	struct __group_key key = {
   1721		.cpu = cpu,
   1722		.cgroup = cgrp,
   1723	};
   1724	struct rb_node *node;
   1725
   1726	node = rb_find_first(&key, &groups->tree, __group_cmp);
   1727	if (node)
   1728		return __node_2_pe(node);
   1729
   1730	return NULL;
   1731}
   1732
   1733/*
    1734 * Like rb_entry_next_safe() for the cpu/cgroup subtree of @event.
   1735 */
   1736static struct perf_event *
   1737perf_event_groups_next(struct perf_event *event)
   1738{
   1739	struct __group_key key = {
   1740		.cpu = event->cpu,
   1741		.cgroup = event_cgroup(event),
   1742	};
   1743	struct rb_node *next;
   1744
   1745	next = rb_next_match(&key, &event->group_node, __group_cmp);
   1746	if (next)
   1747		return __node_2_pe(next);
   1748
   1749	return NULL;
   1750}
   1751
   1752/*
   1753 * Iterate through the whole groups tree.
   1754 */
   1755#define perf_event_groups_for_each(event, groups)			\
   1756	for (event = rb_entry_safe(rb_first(&((groups)->tree)),		\
   1757				typeof(*event), group_node); event;	\
   1758		event = rb_entry_safe(rb_next(&event->group_node),	\
   1759				typeof(*event), group_node))
   1760
   1761/*
    1762 * Add an event to the lists for its context.
   1763 * Must be called with ctx->mutex and ctx->lock held.
   1764 */
   1765static void
   1766list_add_event(struct perf_event *event, struct perf_event_context *ctx)
   1767{
   1768	lockdep_assert_held(&ctx->lock);
   1769
   1770	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
   1771	event->attach_state |= PERF_ATTACH_CONTEXT;
   1772
   1773	event->tstamp = perf_event_time(event);
   1774
   1775	/*
   1776	 * If we're a stand alone event or group leader, we go to the context
   1777	 * list, group events are kept attached to the group so that
   1778	 * perf_group_detach can, at all times, locate all siblings.
   1779	 */
   1780	if (event->group_leader == event) {
   1781		event->group_caps = event->event_caps;
   1782		add_event_to_groups(event, ctx);
   1783	}
   1784
   1785	list_add_rcu(&event->event_entry, &ctx->event_list);
   1786	ctx->nr_events++;
   1787	if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
   1788		ctx->nr_user++;
   1789	if (event->attr.inherit_stat)
   1790		ctx->nr_stat++;
   1791
   1792	if (event->state > PERF_EVENT_STATE_OFF)
   1793		perf_cgroup_event_enable(event, ctx);
   1794
   1795	ctx->generation++;
   1796}
   1797
   1798/*
   1799 * Initialize event state based on the perf_event_attr::disabled.
   1800 */
   1801static inline void perf_event__state_init(struct perf_event *event)
   1802{
   1803	event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
   1804					      PERF_EVENT_STATE_INACTIVE;
   1805}
   1806
   1807static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
   1808{
   1809	int entry = sizeof(u64); /* value */
   1810	int size = 0;
   1811	int nr = 1;
   1812
   1813	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
   1814		size += sizeof(u64);
   1815
   1816	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
   1817		size += sizeof(u64);
   1818
   1819	if (event->attr.read_format & PERF_FORMAT_ID)
   1820		entry += sizeof(u64);
   1821
   1822	if (event->attr.read_format & PERF_FORMAT_GROUP) {
   1823		nr += nr_siblings;
   1824		size += sizeof(u64);
   1825	}
   1826
   1827	size += entry * nr;
   1828	event->read_size = size;
   1829}
   1830
   1831static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
   1832{
   1833	struct perf_sample_data *data;
   1834	u16 size = 0;
   1835
   1836	if (sample_type & PERF_SAMPLE_IP)
   1837		size += sizeof(data->ip);
   1838
   1839	if (sample_type & PERF_SAMPLE_ADDR)
   1840		size += sizeof(data->addr);
   1841
   1842	if (sample_type & PERF_SAMPLE_PERIOD)
   1843		size += sizeof(data->period);
   1844
   1845	if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
   1846		size += sizeof(data->weight.full);
   1847
   1848	if (sample_type & PERF_SAMPLE_READ)
   1849		size += event->read_size;
   1850
   1851	if (sample_type & PERF_SAMPLE_DATA_SRC)
   1852		size += sizeof(data->data_src.val);
   1853
   1854	if (sample_type & PERF_SAMPLE_TRANSACTION)
   1855		size += sizeof(data->txn);
   1856
   1857	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
   1858		size += sizeof(data->phys_addr);
   1859
   1860	if (sample_type & PERF_SAMPLE_CGROUP)
   1861		size += sizeof(data->cgroup);
   1862
   1863	if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
   1864		size += sizeof(data->data_page_size);
   1865
   1866	if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
   1867		size += sizeof(data->code_page_size);
   1868
   1869	event->header_size = size;
   1870}
   1871
   1872/*
   1873 * Called at perf_event creation and when events are attached/detached from a
   1874 * group.
   1875 */
   1876static void perf_event__header_size(struct perf_event *event)
   1877{
   1878	__perf_event_read_size(event,
   1879			       event->group_leader->nr_siblings);
   1880	__perf_event_header_size(event, event->attr.sample_type);
   1881}
   1882
   1883static void perf_event__id_header_size(struct perf_event *event)
   1884{
   1885	struct perf_sample_data *data;
   1886	u64 sample_type = event->attr.sample_type;
   1887	u16 size = 0;
   1888
   1889	if (sample_type & PERF_SAMPLE_TID)
   1890		size += sizeof(data->tid_entry);
   1891
   1892	if (sample_type & PERF_SAMPLE_TIME)
   1893		size += sizeof(data->time);
   1894
   1895	if (sample_type & PERF_SAMPLE_IDENTIFIER)
   1896		size += sizeof(data->id);
   1897
   1898	if (sample_type & PERF_SAMPLE_ID)
   1899		size += sizeof(data->id);
   1900
   1901	if (sample_type & PERF_SAMPLE_STREAM_ID)
   1902		size += sizeof(data->stream_id);
   1903
   1904	if (sample_type & PERF_SAMPLE_CPU)
   1905		size += sizeof(data->cpu_entry);
   1906
   1907	event->id_header_size = size;
   1908}
   1909
   1910static bool perf_event_validate_size(struct perf_event *event)
   1911{
   1912	/*
   1913	 * The values computed here will be over-written when we actually
   1914	 * attach the event.
   1915	 */
   1916	__perf_event_read_size(event, event->group_leader->nr_siblings + 1);
   1917	__perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
   1918	perf_event__id_header_size(event);
   1919
   1920	/*
   1921	 * Sum the lot; should not exceed the 64k limit we have on records.
   1922	 * Conservative limit to allow for callchains and other variable fields.
   1923	 */
   1924	if (event->read_size + event->header_size +
   1925	    event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
   1926		return false;
   1927
   1928	return true;
   1929}
   1930
   1931static void perf_group_attach(struct perf_event *event)
   1932{
   1933	struct perf_event *group_leader = event->group_leader, *pos;
   1934
   1935	lockdep_assert_held(&event->ctx->lock);
   1936
   1937	/*
   1938	 * We can have double attach due to group movement in perf_event_open.
   1939	 */
   1940	if (event->attach_state & PERF_ATTACH_GROUP)
   1941		return;
   1942
   1943	event->attach_state |= PERF_ATTACH_GROUP;
   1944
   1945	if (group_leader == event)
   1946		return;
   1947
   1948	WARN_ON_ONCE(group_leader->ctx != event->ctx);
   1949
   1950	group_leader->group_caps &= event->event_caps;
   1951
   1952	list_add_tail(&event->sibling_list, &group_leader->sibling_list);
   1953	group_leader->nr_siblings++;
   1954
   1955	perf_event__header_size(group_leader);
   1956
   1957	for_each_sibling_event(pos, group_leader)
   1958		perf_event__header_size(pos);
   1959}
   1960
   1961/*
   1962 * Remove an event from the lists for its context.
   1963 * Must be called with ctx->mutex and ctx->lock held.
   1964 */
   1965static void
   1966list_del_event(struct perf_event *event, struct perf_event_context *ctx)
   1967{
   1968	WARN_ON_ONCE(event->ctx != ctx);
   1969	lockdep_assert_held(&ctx->lock);
   1970
   1971	/*
   1972	 * We can have double detach due to exit/hot-unplug + close.
   1973	 */
   1974	if (!(event->attach_state & PERF_ATTACH_CONTEXT))
   1975		return;
   1976
   1977	event->attach_state &= ~PERF_ATTACH_CONTEXT;
   1978
   1979	ctx->nr_events--;
   1980	if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
   1981		ctx->nr_user--;
   1982	if (event->attr.inherit_stat)
   1983		ctx->nr_stat--;
   1984
   1985	list_del_rcu(&event->event_entry);
   1986
   1987	if (event->group_leader == event)
   1988		del_event_from_groups(event, ctx);
   1989
   1990	/*
   1991	 * If event was in error state, then keep it
   1992	 * that way, otherwise bogus counts will be
   1993	 * returned on read(). The only way to get out
   1994	 * of error state is by explicit re-enabling
   1995	 * of the event
   1996	 */
   1997	if (event->state > PERF_EVENT_STATE_OFF) {
   1998		perf_cgroup_event_disable(event, ctx);
   1999		perf_event_set_state(event, PERF_EVENT_STATE_OFF);
   2000	}
   2001
   2002	ctx->generation++;
   2003}
   2004
   2005static int
   2006perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
   2007{
   2008	if (!has_aux(aux_event))
   2009		return 0;
   2010
   2011	if (!event->pmu->aux_output_match)
   2012		return 0;
   2013
   2014	return event->pmu->aux_output_match(aux_event);
   2015}
   2016
   2017static void put_event(struct perf_event *event);
   2018static void event_sched_out(struct perf_event *event,
   2019			    struct perf_cpu_context *cpuctx,
   2020			    struct perf_event_context *ctx);
   2021
   2022static void perf_put_aux_event(struct perf_event *event)
   2023{
   2024	struct perf_event_context *ctx = event->ctx;
   2025	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
   2026	struct perf_event *iter;
   2027
   2028	/*
   2029	 * If event uses aux_event tear down the link
   2030	 */
   2031	if (event->aux_event) {
   2032		iter = event->aux_event;
   2033		event->aux_event = NULL;
   2034		put_event(iter);
   2035		return;
   2036	}
   2037
   2038	/*
   2039	 * If the event is an aux_event, tear down all links to
   2040	 * it from other events.
   2041	 */
   2042	for_each_sibling_event(iter, event->group_leader) {
   2043		if (iter->aux_event != event)
   2044			continue;
   2045
   2046		iter->aux_event = NULL;
   2047		put_event(event);
   2048
   2049		/*
   2050		 * If it's ACTIVE, schedule it out and put it into ERROR
   2051		 * state so that we don't try to schedule it again. Note
   2052		 * that perf_event_enable() will clear the ERROR status.
   2053		 */
   2054		event_sched_out(iter, cpuctx, ctx);
   2055		perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
   2056	}
   2057}
   2058
   2059static bool perf_need_aux_event(struct perf_event *event)
   2060{
   2061	return !!event->attr.aux_output || !!event->attr.aux_sample_size;
   2062}
   2063
   2064static int perf_get_aux_event(struct perf_event *event,
   2065			      struct perf_event *group_leader)
   2066{
   2067	/*
   2068	 * Our group leader must be an aux event if we want to be
   2069	 * an aux_output. This way, the aux event will precede its
   2070	 * aux_output events in the group, and therefore will always
   2071	 * schedule first.
   2072	 */
   2073	if (!group_leader)
   2074		return 0;
   2075
   2076	/*
   2077	 * aux_output and aux_sample_size are mutually exclusive.
   2078	 */
   2079	if (event->attr.aux_output && event->attr.aux_sample_size)
   2080		return 0;
   2081
   2082	if (event->attr.aux_output &&
   2083	    !perf_aux_output_match(event, group_leader))
   2084		return 0;
   2085
   2086	if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
   2087		return 0;
   2088
   2089	if (!atomic_long_inc_not_zero(&group_leader->refcount))
   2090		return 0;
   2091
   2092	/*
   2093	 * Link aux_outputs to their aux event; this is undone in
   2094	 * perf_group_detach() by perf_put_aux_event(). When the
   2095	 * group in torn down, the aux_output events loose their
   2096	 * link to the aux_event and can't schedule any more.
   2097	 */
   2098	event->aux_event = group_leader;
   2099
   2100	return 1;
   2101}
   2102
   2103static inline struct list_head *get_event_list(struct perf_event *event)
   2104{
   2105	struct perf_event_context *ctx = event->ctx;
   2106	return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;
   2107}
   2108
   2109/*
   2110 * Events that have PERF_EV_CAP_SIBLING require being part of a group and
   2111 * cannot exist on their own, schedule them out and move them into the ERROR
   2112 * state. Also see _perf_event_enable(), it will not be able to recover
   2113 * this ERROR state.
   2114 */
   2115static inline void perf_remove_sibling_event(struct perf_event *event)
   2116{
   2117	struct perf_event_context *ctx = event->ctx;
   2118	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
   2119
   2120	event_sched_out(event, cpuctx, ctx);
   2121	perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
   2122}
   2123
/*
 * Undo perf_group_attach() for @event. Caller must hold event->ctx->lock.
 *
 * A sibling is unlinked from its leader's sibling list. When a leader is
 * detached, each of its siblings becomes its own group leader. In both
 * cases perf_event__header_size() is re-run for the old leader and for
 * whatever siblings it still has.
 */
static void perf_group_detach(struct perf_event *event)
{
	struct perf_event *leader = event->group_leader;
	struct perf_event *sibling, *tmp;
	struct perf_event_context *ctx = event->ctx;

	lockdep_assert_held(&ctx->lock);

	/*
	 * We can have double detach due to exit/hot-unplug + close.
	 */
	if (!(event->attach_state & PERF_ATTACH_GROUP))
		return;

	event->attach_state &= ~PERF_ATTACH_GROUP;

	/* Drop aux_event links to or from this event before regrouping. */
	perf_put_aux_event(event);

	/*
	 * If this is a sibling, remove it from its group.
	 */
	if (leader != event) {
		list_del_init(&event->sibling_list);
		event->group_leader->nr_siblings--;
		goto out;
	}

	/*
	 * If this was a group event with sibling events then
	 * upgrade the siblings to singleton events by adding them
	 * to whatever list we are on.
	 */
	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {

		/* Siblings that cannot live alone are scheduled out into ERROR. */
		if (sibling->event_caps & PERF_EV_CAP_SIBLING)
			perf_remove_sibling_event(sibling);

		sibling->group_leader = sibling;
		list_del_init(&sibling->sibling_list);

		/* Inherit group flags from the previous leader */
		sibling->group_caps = event->group_caps;

		/*
		 * Only insert the new singleton into the groups structure
		 * when the old leader's group_node is itself still linked.
		 */
		if (!RB_EMPTY_NODE(&event->group_node)) {
			add_event_to_groups(sibling, event->ctx);

			/* A still-running sibling also goes onto the active list. */
			if (sibling->state == PERF_EVENT_STATE_ACTIVE)
				list_add_tail(&sibling->active_list, get_event_list(sibling));
		}

		WARN_ON_ONCE(sibling->ctx != event->ctx);
	}

out:
	/* @tmp is reused here purely as the iteration cursor. */
	for_each_sibling_event(tmp, leader)
		perf_event__header_size(tmp);

	perf_event__header_size(leader);
}
   2183
   2184static void sync_child_event(struct perf_event *child_event);
   2185
   2186static void perf_child_detach(struct perf_event *event)
   2187{
   2188	struct perf_event *parent_event = event->parent;
   2189
   2190	if (!(event->attach_state & PERF_ATTACH_CHILD))
   2191		return;
   2192
   2193	event->attach_state &= ~PERF_ATTACH_CHILD;
   2194
   2195	if (WARN_ON_ONCE(!parent_event))
   2196		return;
   2197
   2198	lockdep_assert_held(&parent_event->child_mutex);
   2199
   2200	sync_child_event(event);
   2201	list_del_init(&event->child_list);
   2202}
   2203
/*
 * An event is treated as orphaned exactly when its state is
 * PERF_EVENT_STATE_DEAD.
 */
static bool is_orphaned_event(struct perf_event *event)
{
	return event->state == PERF_EVENT_STATE_DEAD;
}
   2208
   2209static inline int __pmu_filter_match(struct perf_event *event)
   2210{
   2211	struct pmu *pmu = event->pmu;
   2212	return pmu->filter_match ? pmu->filter_match(event) : 1;
   2213}
   2214
   2215/*
   2216 * Check whether we should attempt to schedule an event group based on
   2217 * PMU-specific filtering. An event group can consist of HW and SW events,
   2218 * potentially with a SW leader, so we must check all the filters, to
   2219 * determine whether a group is schedulable:
   2220 */
   2221static inline int pmu_filter_match(struct perf_event *event)
   2222{
   2223	struct perf_event *sibling;
   2224
   2225	if (!__pmu_filter_match(event))
   2226		return 0;
   2227
   2228	for_each_sibling_event(sibling, event) {
   2229		if (!__pmu_filter_match(sibling))
   2230			return 0;
   2231	}
   2232
   2233	return 1;
   2234}
   2235
   2236static inline int
   2237event_filter_match(struct perf_event *event)
   2238{
   2239	return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
   2240	       perf_cgroup_match(event) && pmu_filter_match(event);
   2241}
   2242
/*
 * Take a single ACTIVE event off its PMU and update the bookkeeping in
 * @cpuctx and @ctx. Caller must hold ctx->lock. Events in any other
 * state are left untouched.
 *
 * The event ends up INACTIVE, or OFF when a pending disable was recorded
 * in event->pending_disable.
 */
static void
event_sched_out(struct perf_event *event,
		  struct perf_cpu_context *cpuctx,
		  struct perf_event_context *ctx)
{
	enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;

	WARN_ON_ONCE(event->ctx != ctx);
	lockdep_assert_held(&ctx->lock);

	if (event->state != PERF_EVENT_STATE_ACTIVE)
		return;

	/*
	 * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but
	 * we can schedule events _OUT_ individually through things like
	 * __perf_remove_from_context().
	 */
	list_del_init(&event->active_list);

	perf_pmu_disable(event->pmu);

	event->pmu->del(event, 0);
	event->oncpu = -1;

	/*
	 * pending_disable >= 0 means a disable request is outstanding (see
	 * perf_event_disable_inatomic(), which stores a CPU number there).
	 * Consume it and leave the event OFF instead of INACTIVE.
	 */
	if (READ_ONCE(event->pending_disable) >= 0) {
		WRITE_ONCE(event->pending_disable, -1);
		perf_cgroup_event_disable(event, ctx);
		state = PERF_EVENT_STATE_OFF;
	}
	perf_event_set_state(event, state);

	/* Mirror image of the accounting done in event_sched_in(). */
	if (!is_software_event(event))
		cpuctx->active_oncpu--;
	if (!--ctx->nr_active)
		perf_event_ctx_deactivate(ctx);
	if (event->attr.freq && event->attr.sample_freq)
		ctx->nr_freq--;
	/* Exclusivity ends with an exclusive event or the last hardware event. */
	if (event->attr.exclusive || !cpuctx->active_oncpu)
		cpuctx->exclusive = 0;

	perf_pmu_enable(event->pmu);
}
   2286
   2287static void
   2288group_sched_out(struct perf_event *group_event,
   2289		struct perf_cpu_context *cpuctx,
   2290		struct perf_event_context *ctx)
   2291{
   2292	struct perf_event *event;
   2293
   2294	if (group_event->state != PERF_EVENT_STATE_ACTIVE)
   2295		return;
   2296
   2297	perf_pmu_disable(ctx->pmu);
   2298
   2299	event_sched_out(group_event, cpuctx, ctx);
   2300
   2301	/*
   2302	 * Schedule out siblings (if any):
   2303	 */
   2304	for_each_sibling_event(event, group_event)
   2305		event_sched_out(event, cpuctx, ctx);
   2306
   2307	perf_pmu_enable(ctx->pmu);
   2308}
   2309
   2310#define DETACH_GROUP	0x01UL
   2311#define DETACH_CHILD	0x02UL
   2312
/*
 * Cross CPU call to remove a performance event
 *
 * We disable the event on the hardware level first. After that we
 * remove it from the context list.
 *
 * @info carries the DETACH_GROUP / DETACH_CHILD flags, cast to a pointer
 * by the caller.
 */
static void
__perf_remove_from_context(struct perf_event *event,
			   struct perf_cpu_context *cpuctx,
			   struct perf_event_context *ctx,
			   void *info)
{
	unsigned long flags = (unsigned long)info;

	/* Bring the context and cgroup clocks up to date before removal. */
	if (ctx->is_active & EVENT_TIME) {
		update_context_time(ctx);
		update_cgrp_time_from_cpuctx(cpuctx, false);
	}

	event_sched_out(event, cpuctx, ctx);
	if (flags & DETACH_GROUP)
		perf_group_detach(event);
	if (flags & DETACH_CHILD)
		perf_child_detach(event);
	list_del_event(event, ctx);

	/* That was the last event of an active context: deactivate it. */
	if (!ctx->nr_events && ctx->is_active) {
		if (ctx == &cpuctx->ctx)
			update_cgrp_time_from_cpuctx(cpuctx, true);

		ctx->is_active = 0;
		ctx->rotate_necessary = 0;
		/* A task context must also be unhooked from the CPU context. */
		if (ctx->task) {
			WARN_ON_ONCE(cpuctx->task_ctx != ctx);
			cpuctx->task_ctx = NULL;
		}
	}
}
   2351
   2352/*
   2353 * Remove the event from a task's (or a CPU's) list of events.
   2354 *
   2355 * If event->ctx is a cloned context, callers must make sure that
   2356 * every task struct that event->ctx->task could possibly point to
   2357 * remains valid.  This is OK when called from perf_release since
   2358 * that only calls us on the top-level context, which can't be a clone.
   2359 * When called from perf_event_exit_task, it's OK because the
   2360 * context has been detached from its task.
   2361 */
   2362static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
   2363{
   2364	struct perf_event_context *ctx = event->ctx;
   2365
   2366	lockdep_assert_held(&ctx->mutex);
   2367
   2368	/*
   2369	 * Because of perf_event_exit_task(), perf_remove_from_context() ought
   2370	 * to work in the face of TASK_TOMBSTONE, unlike every other
   2371	 * event_function_call() user.
   2372	 */
   2373	raw_spin_lock_irq(&ctx->lock);
   2374	/*
   2375	 * Cgroup events are per-cpu events, and must IPI because of
   2376	 * cgrp_cpuctx_list.
   2377	 */
   2378	if (!ctx->is_active && !is_cgroup_event(event)) {
   2379		__perf_remove_from_context(event, __get_cpu_context(ctx),
   2380					   ctx, (void *)flags);
   2381		raw_spin_unlock_irq(&ctx->lock);
   2382		return;
   2383	}
   2384	raw_spin_unlock_irq(&ctx->lock);
   2385
   2386	event_function_call(event, __perf_remove_from_context, (void *)flags);
   2387}
   2388
   2389/*
   2390 * Cross CPU call to disable a performance event
   2391 */
   2392static void __perf_event_disable(struct perf_event *event,
   2393				 struct perf_cpu_context *cpuctx,
   2394				 struct perf_event_context *ctx,
   2395				 void *info)
   2396{
   2397	if (event->state < PERF_EVENT_STATE_INACTIVE)
   2398		return;
   2399
   2400	if (ctx->is_active & EVENT_TIME) {
   2401		update_context_time(ctx);
   2402		update_cgrp_time_from_event(event);
   2403	}
   2404
   2405	if (event == event->group_leader)
   2406		group_sched_out(event, cpuctx, ctx);
   2407	else
   2408		event_sched_out(event, cpuctx, ctx);
   2409
   2410	perf_event_set_state(event, PERF_EVENT_STATE_OFF);
   2411	perf_cgroup_event_disable(event, ctx);
   2412}
   2413
   2414/*
   2415 * Disable an event.
   2416 *
   2417 * If event->ctx is a cloned context, callers must make sure that
   2418 * every task struct that event->ctx->task could possibly point to
   2419 * remains valid.  This condition is satisfied when called through
   2420 * perf_event_for_each_child or perf_event_for_each because they
   2421 * hold the top-level event's child_mutex, so any descendant that
   2422 * goes to exit will block in perf_event_exit_event().
   2423 *
   2424 * When called from perf_pending_event it's OK because event->ctx
   2425 * is the current context on this CPU and preemption is disabled,
   2426 * hence we can't get into perf_event_task_sched_out for this context.
   2427 */
   2428static void _perf_event_disable(struct perf_event *event)
   2429{
   2430	struct perf_event_context *ctx = event->ctx;
   2431
   2432	raw_spin_lock_irq(&ctx->lock);
   2433	if (event->state <= PERF_EVENT_STATE_OFF) {
   2434		raw_spin_unlock_irq(&ctx->lock);
   2435		return;
   2436	}
   2437	raw_spin_unlock_irq(&ctx->lock);
   2438
   2439	event_function_call(event, __perf_event_disable, NULL);
   2440}
   2441
/*
 * Disable @event by running __perf_event_disable() through
 * event_function_local() instead of event_function_call().
 */
void perf_event_disable_local(struct perf_event *event)
{
	event_function_local(event, __perf_event_disable, NULL);
}
   2446
/*
 * Exported entry point for disabling an event: wraps
 * _perf_event_disable() in perf_event_ctx_lock()/perf_event_ctx_unlock().
 *
 * Strictly speaking kernel users cannot create groups and therefore this
 * interface does not need the perf_event_ctx_lock() magic.
 */
void perf_event_disable(struct perf_event *event)
{
	struct perf_event_context *ctx;

	ctx = perf_event_ctx_lock(event);
	_perf_event_disable(event);
	perf_event_ctx_unlock(event, ctx);
}
EXPORT_SYMBOL_GPL(perf_event_disable);
   2460
/*
 * Request a deferred disable: record the current CPU in
 * event->pending_disable and queue the event's irq_work (event->pending).
 * event_sched_out() treats pending_disable >= 0 as a request to leave the
 * event OFF.
 */
void perf_event_disable_inatomic(struct perf_event *event)
{
	WRITE_ONCE(event->pending_disable, smp_processor_id());
	/* can fail, see perf_pending_event_disable() */
	irq_work_queue(&event->pending);
}
   2467
   2468#define MAX_INTERRUPTS (~0ULL)
   2469
   2470static void perf_log_throttle(struct perf_event *event, int enable);
   2471static void perf_log_itrace_start(struct perf_event *event);
   2472
/*
 * Put a single event onto its PMU. Caller must hold ctx->lock.
 *
 * Returns 0 when the event was added, or when its state is OFF or below
 * and nothing was done. Returns -EAGAIN when the PMU's add() callback
 * fails; the event is then put back to INACTIVE with oncpu == -1.
 */
static int
event_sched_in(struct perf_event *event,
		 struct perf_cpu_context *cpuctx,
		 struct perf_event_context *ctx)
{
	int ret = 0;

	WARN_ON_ONCE(event->ctx != ctx);

	lockdep_assert_held(&ctx->lock);

	if (event->state <= PERF_EVENT_STATE_OFF)
		return 0;

	WRITE_ONCE(event->oncpu, smp_processor_id());
	/*
	 * Order event::oncpu write to happen before the ACTIVE state is
	 * visible. This allows perf_event_{stop,read}() to observe the correct
	 * ->oncpu if it sees ACTIVE.
	 */
	smp_wmb();
	perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);

	/*
	 * Unthrottle events, since we scheduled we might have missed several
	 * ticks already, also for a heavily scheduling task there is little
	 * guarantee it'll get a tick in a timely manner.
	 */
	if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
		perf_log_throttle(event, 1);
		event->hw.interrupts = 0;
	}

	perf_pmu_disable(event->pmu);

	perf_log_itrace_start(event);

	/* On failure undo the ACTIVE state set above and report -EAGAIN. */
	if (event->pmu->add(event, PERF_EF_START)) {
		perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
		event->oncpu = -1;
		ret = -EAGAIN;
		goto out;
	}

	/* Accounting; event_sched_out() reverses each of these. */
	if (!is_software_event(event))
		cpuctx->active_oncpu++;
	if (!ctx->nr_active++)
		perf_event_ctx_activate(ctx);
	if (event->attr.freq && event->attr.sample_freq)
		ctx->nr_freq++;

	if (event->attr.exclusive)
		cpuctx->exclusive = 1;

out:
	perf_pmu_enable(event->pmu);

	return ret;
}
   2532
/*
 * Schedule in a group (leader plus all siblings) inside one PMU
 * transaction.
 *
 * Returns 0 when the leader is OFF (nothing to do) or when the whole
 * group was added and the transaction committed. Returns -EAGAIN
 * otherwise, after scheduling out whatever part of the group had already
 * been added and cancelling the transaction.
 */
static int
group_sched_in(struct perf_event *group_event,
	       struct perf_cpu_context *cpuctx,
	       struct perf_event_context *ctx)
{
	struct perf_event *event, *partial_group = NULL;
	struct pmu *pmu = ctx->pmu;

	if (group_event->state == PERF_EVENT_STATE_OFF)
		return 0;

	pmu->start_txn(pmu, PERF_PMU_TXN_ADD);

	/* Leader failed: nothing was added, only the transaction is undone. */
	if (event_sched_in(group_event, cpuctx, ctx))
		goto error;

	/*
	 * Schedule in siblings as one group (if any):
	 */
	for_each_sibling_event(event, group_event) {
		if (event_sched_in(event, cpuctx, ctx)) {
			partial_group = event;
			goto group_error;
		}
	}

	/*
	 * A non-zero commit_txn() result falls through to group_error with
	 * partial_group still NULL, so every sibling is scheduled out again.
	 */
	if (!pmu->commit_txn(pmu))
		return 0;

group_error:
	/*
	 * Groups can be scheduled in as one unit only, so undo any
	 * partial group before returning:
	 * The events up to the failed event are scheduled out normally.
	 */
	for_each_sibling_event(event, group_event) {
		if (event == partial_group)
			break;

		event_sched_out(event, cpuctx, ctx);
	}
	event_sched_out(group_event, cpuctx, ctx);

error:
	pmu->cancel_txn(pmu);
	return -EAGAIN;
}
   2580
   2581/*
   2582 * Work out whether we can put this event group on the CPU now.
   2583 */
   2584static int group_can_go_on(struct perf_event *event,
   2585			   struct perf_cpu_context *cpuctx,
   2586			   int can_add_hw)
   2587{
   2588	/*
   2589	 * Groups consisting entirely of software events can always go on.
   2590	 */
   2591	if (event->group_caps & PERF_EV_CAP_SOFTWARE)
   2592		return 1;
   2593	/*
   2594	 * If an exclusive group is already on, no other hardware
   2595	 * events can go on.
   2596	 */
   2597	if (cpuctx->exclusive)
   2598		return 0;
   2599	/*
   2600	 * If this group is exclusive and there are already
   2601	 * events on the CPU, it can't go on.
   2602	 */
   2603	if (event->attr.exclusive && !list_empty(get_event_list(event)))
   2604		return 0;
   2605	/*
   2606	 * Otherwise, try to add it if all previous groups were able
   2607	 * to go on.
   2608	 */
   2609	return can_add_hw;
   2610}
   2611
/*
 * Add @event to @ctx: list_add_event() first, then perf_group_attach().
 * perf_group_attach() asserts that event->ctx->lock is held.
 */
static void add_event_to_ctx(struct perf_event *event,
			       struct perf_event_context *ctx)
{
	list_add_event(event, ctx);
	perf_group_attach(event);
}
   2618
   2619static void ctx_sched_out(struct perf_event_context *ctx,
   2620			  struct perf_cpu_context *cpuctx,
   2621			  enum event_type_t event_type);
   2622static void
   2623ctx_sched_in(struct perf_event_context *ctx,
   2624	     struct perf_cpu_context *cpuctx,
   2625	     enum event_type_t event_type);
   2626
   2627static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
   2628			       struct perf_event_context *ctx,
   2629			       enum event_type_t event_type)
   2630{
   2631	if (!cpuctx->task_ctx)
   2632		return;
   2633
   2634	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
   2635		return;
   2636
   2637	ctx_sched_out(ctx, cpuctx, event_type);
   2638}
   2639
/*
 * Schedule events back in, in this fixed order: CPU pinned, task pinned,
 * CPU flexible, task flexible. @ctx is the task context and may be NULL,
 * in which case only the CPU context is scheduled in.
 */
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
				struct perf_event_context *ctx)
{
	cpu_ctx_sched_in(cpuctx, EVENT_PINNED);
	if (ctx)
		ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
	if (ctx)
		ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
}
   2650
   2651/*
   2652 * We want to maintain the following priority of scheduling:
   2653 *  - CPU pinned (EVENT_CPU | EVENT_PINNED)
   2654 *  - task pinned (EVENT_PINNED)
   2655 *  - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
   2656 *  - task flexible (EVENT_FLEXIBLE).
   2657 *
   2658 * In order to avoid unscheduling and scheduling back in everything every
   2659 * time an event is added, only do it for the groups of equal priority and
   2660 * below.
   2661 *
   2662 * This can be called after a batch operation on task events, in which case
   2663 * event_type is a bit mask of the types of events involved. For CPU events,
   2664 * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
   2665 */
   2666static void ctx_resched(struct perf_cpu_context *cpuctx,
   2667			struct perf_event_context *task_ctx,
   2668			enum event_type_t event_type)
   2669{
   2670	enum event_type_t ctx_event_type;
   2671	bool cpu_event = !!(event_type & EVENT_CPU);
   2672
   2673	/*
   2674	 * If pinned groups are involved, flexible groups also need to be
   2675	 * scheduled out.
   2676	 */
   2677	if (event_type & EVENT_PINNED)
   2678		event_type |= EVENT_FLEXIBLE;
   2679
   2680	ctx_event_type = event_type & EVENT_ALL;
   2681
   2682	perf_pmu_disable(cpuctx->ctx.pmu);
   2683	if (task_ctx)
   2684		task_ctx_sched_out(cpuctx, task_ctx, event_type);
   2685
   2686	/*
   2687	 * Decide which cpu ctx groups to schedule out based on the types
   2688	 * of events that caused rescheduling:
   2689	 *  - EVENT_CPU: schedule out corresponding groups;
   2690	 *  - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
   2691	 *  - otherwise, do nothing more.
   2692	 */
   2693	if (cpu_event)
   2694		cpu_ctx_sched_out(cpuctx, ctx_event_type);
   2695	else if (ctx_event_type & EVENT_PINNED)
   2696		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
   2697
   2698	perf_event_sched_in(cpuctx, task_ctx);
   2699	perf_pmu_enable(cpuctx->ctx.pmu);
   2700}
   2701
/*
 * Reschedule every event type (EVENT_ALL|EVENT_CPU) for @pmu's per-CPU
 * context on the current CPU, together with the task context currently
 * installed there, all under perf_ctx_lock().
 */
void perf_pmu_resched(struct pmu *pmu)
{
	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
	struct perf_event_context *task_ctx = cpuctx->task_ctx;

	perf_ctx_lock(cpuctx, task_ctx);
	ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
	perf_ctx_unlock(cpuctx, task_ctx);
}
   2711
/*
 * Cross CPU call to install and enable a performance event
 *
 * Very similar to remote_function() + event_function() but cannot assume that
 * things like ctx->is_active and cpuctx->task_ctx are set.
 *
 * @info is the struct perf_event to install; its ->ctx must already be set.
 *
 * Returns 0 once the event has been added to its context, or -ESRCH when
 * the context's task is running but is not the current task on this CPU.
 */
static int  __perf_install_in_context(void *info)
{
	struct perf_event *event = info;
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
	struct perf_event_context *task_ctx = cpuctx->task_ctx;
	bool reprogram = true;
	int ret = 0;

	/*
	 * The CPU context lock is always taken. For a task context its own
	 * lock is taken as well; for a CPU context the lock of whatever task
	 * context is installed on this CPU, if any.
	 */
	raw_spin_lock(&cpuctx->ctx.lock);
	if (ctx->task) {
		raw_spin_lock(&ctx->lock);
		task_ctx = ctx;

		reprogram = (ctx->task == current);

		/*
		 * If the task is running, it must be running on this CPU,
		 * otherwise we cannot reprogram things.
		 *
		 * If it's not running, we don't care, ctx->lock will
		 * serialize against it becoming runnable.
		 */
		if (task_curr(ctx->task) && !reprogram) {
			ret = -ESRCH;
			goto unlock;
		}

		WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
	} else if (task_ctx) {
		raw_spin_lock(&task_ctx->lock);
	}

#ifdef CONFIG_CGROUP_PERF
	if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) {
		/*
		 * If the current cgroup doesn't match the event's
		 * cgroup, we should not try to schedule it.
		 */
		struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
		reprogram = cgroup_is_descendant(cgrp->css.cgroup,
					event->cgrp->css.cgroup);
	}
#endif

	/*
	 * Either way the event is added to the context; only the reprogram
	 * case also reschedules the hardware around the addition.
	 */
	if (reprogram) {
		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
		add_event_to_ctx(event, ctx);
		ctx_resched(cpuctx, task_ctx, get_event_type(event));
	} else {
		add_event_to_ctx(event, ctx);
	}

unlock:
	perf_ctx_unlock(cpuctx, task_ctx);

	return ret;
}
   2776
   2777static bool exclusive_event_installable(struct perf_event *event,
   2778					struct perf_event_context *ctx);
   2779
/*
 * Attach a performance event to a context.
 *
 * Very similar to event_function_call, see comment there.
 *
 * The caller must hold ctx->mutex. @cpu replaces event->cpu unless the
 * event is not bound to a CPU (event->cpu == -1); for a CPU context it is
 * also the CPU the install function is run on.
 */
static void
perf_install_in_context(struct perf_event_context *ctx,
			struct perf_event *event,
			int cpu)
{
	struct task_struct *task = READ_ONCE(ctx->task);

	lockdep_assert_held(&ctx->mutex);

	WARN_ON_ONCE(!exclusive_event_installable(event, ctx));

	if (event->cpu != -1)
		event->cpu = cpu;

	/*
	 * Ensures that if we can observe event->ctx, both the event and ctx
	 * will be 'complete'. See perf_iterate_sb_cpu().
	 */
	smp_store_release(&event->ctx, ctx);

	/*
	 * perf_event_attr::disabled events will not run and can be initialized
	 * without IPI. Except when this is the first event for the context, in
	 * that case we need the magic of the IPI to set ctx->is_active.
	 * Similarly, cgroup events for the context also need the IPI to
	 * manipulate the cgrp_cpuctx_list.
	 *
	 * The IOC_ENABLE that is sure to follow the creation of a disabled
	 * event will issue the IPI and reprogram the hardware.
	 */
	if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF &&
	    ctx->nr_events && !is_cgroup_event(event)) {
		raw_spin_lock_irq(&ctx->lock);
		/* The context is already dead: do not add the event. */
		if (ctx->task == TASK_TOMBSTONE) {
			raw_spin_unlock_irq(&ctx->lock);
			return;
		}
		add_event_to_ctx(event, ctx);
		raw_spin_unlock_irq(&ctx->lock);
		return;
	}

	/* CPU context: run the install function on the target CPU. */
	if (!task) {
		cpu_function_call(cpu, __perf_install_in_context, event);
		return;
	}

	/*
	 * Should not happen, we validate the ctx is still alive before calling.
	 */
	if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
		return;

	/*
	 * Installing events is tricky because we cannot rely on ctx->is_active
	 * to be set in case this is the nr_events 0 -> 1 transition.
	 *
	 * Instead we use task_curr(), which tells us if the task is running.
	 * However, since we use task_curr() outside of rq::lock, we can race
	 * against the actual state. This means the result can be wrong.
	 *
	 * If we get a false positive, we retry, this is harmless.
	 *
	 * If we get a false negative, things are complicated. If we are after
	 * perf_event_context_sched_in() ctx::lock will serialize us, and the
	 * value must be correct. If we're before, it doesn't matter since
	 * perf_event_context_sched_in() will program the counter.
	 *
	 * However, this hinges on the remote context switch having observed
	 * our task->perf_event_ctxp[] store, such that it will in fact take
	 * ctx::lock in perf_event_context_sched_in().
	 *
	 * We do this by task_function_call(), if the IPI fails to hit the task
	 * we know any future context switch of task must see the
	 * perf_event_ctxp[] store.
	 */

	/*
	 * This smp_mb() orders the task->perf_event_ctxp[] store with the
	 * task_cpu() load, such that if the IPI then does not find the task
	 * running, a future context switch of that task must observe the
	 * store.
	 */
	smp_mb();
again:
	/* A zero return means the cross call installed the event. */
	if (!task_function_call(task, __perf_install_in_context, event))
		return;

	raw_spin_lock_irq(&ctx->lock);
	task = ctx->task;
	if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
		/*
		 * Cannot happen because we already checked above (which also
		 * cannot happen), and we hold ctx->mutex, which serializes us
		 * against perf_event_exit_task_context().
		 */
		raw_spin_unlock_irq(&ctx->lock);
		return;
	}
	/*
	 * If the task is not running, ctx->lock will avoid it becoming so,
	 * thus we can safely install the event.
	 */
	if (task_curr(task)) {
		raw_spin_unlock_irq(&ctx->lock);
		goto again;
	}
	add_event_to_ctx(event, ctx);
	raw_spin_unlock_irq(&ctx->lock);
}
   2895
/*
 * Cross CPU call to enable a performance event
 *
 * Only acts on events whose state lies strictly between ERROR and
 * INACTIVE; such an event is moved to INACTIVE and, if the context is
 * active and the event is eligible, the context is rescheduled.
 * @info is unused.
 */
static void __perf_event_enable(struct perf_event *event,
				struct perf_cpu_context *cpuctx,
				struct perf_event_context *ctx,
				void *info)
{
	struct perf_event *leader = event->group_leader;
	struct perf_event_context *task_ctx;

	if (event->state >= PERF_EVENT_STATE_INACTIVE ||
	    event->state <= PERF_EVENT_STATE_ERROR)
		return;

	/* Schedule out EVENT_TIME around the state change. */
	if (ctx->is_active)
		ctx_sched_out(ctx, cpuctx, EVENT_TIME);

	perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
	perf_cgroup_event_enable(event, ctx);

	if (!ctx->is_active)
		return;

	/* Not eligible here: just schedule EVENT_TIME back in. */
	if (!event_filter_match(event)) {
		ctx_sched_in(ctx, cpuctx, EVENT_TIME);
		return;
	}

	/*
	 * If the event is in a group and isn't the group leader,
	 * then don't put it on unless the group is on.
	 */
	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
		ctx_sched_in(ctx, cpuctx, EVENT_TIME);
		return;
	}

	task_ctx = cpuctx->task_ctx;
	if (ctx->task)
		WARN_ON_ONCE(task_ctx != ctx);

	ctx_resched(cpuctx, task_ctx, get_event_type(event));
}
   2940
/*
 * Enable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_for_each as described
 * for perf_event_disable.
 */
static void _perf_event_enable(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;

	raw_spin_lock_irq(&ctx->lock);
	/*
	 * Nothing to do for events that are already INACTIVE or above, or
	 * that are below ERROR. The "out" label sits inside this block so
	 * that the bail-out further down shares the unlock-and-return path.
	 */
	if (event->state >= PERF_EVENT_STATE_INACTIVE ||
	    event->state <  PERF_EVENT_STATE_ERROR) {
out:
		raw_spin_unlock_irq(&ctx->lock);
		return;
	}

	/*
	 * If the event is in error state, clear that first.
	 *
	 * That way, if we see the event in error state below, we know that it
	 * has gone back into error state, as distinct from the task having
	 * been scheduled away before the cross-call arrived.
	 */
	if (event->state == PERF_EVENT_STATE_ERROR) {
		/*
		 * Detached SIBLING events cannot leave ERROR state.
		 */
		if (event->event_caps & PERF_EV_CAP_SIBLING &&
		    event->group_leader == event)
			goto out;

		event->state = PERF_EVENT_STATE_OFF;
	}
	raw_spin_unlock_irq(&ctx->lock);

	event_function_call(event, __perf_event_enable, NULL);
}
   2983
   2984/*
   2985 * See perf_event_disable();
   2986 */
   2987void perf_event_enable(struct perf_event *event)
   2988{
   2989	struct perf_event_context *ctx;
   2990
   2991	ctx = perf_event_ctx_lock(event);
   2992	_perf_event_enable(event);
   2993	perf_event_ctx_unlock(event, ctx);
   2994}
   2995EXPORT_SYMBOL_GPL(perf_event_enable);
   2996
/* Argument bundle that perf_event_stop() hands to __perf_event_stop(). */
struct stop_event_data {
	struct perf_event	*event;		/* event to stop */
	unsigned int		restart;	/* non-zero: start the event again after stopping it */
};
   3001
/*
 * Cross-call target used by perf_event_stop().
 *
 * @info points to a struct stop_event_data.  Stops the event through its
 * pmu->stop() callback and, if sd->restart is set, starts it again.
 *
 * Returns 0 when the event is not ACTIVE or has been handled, -EAGAIN
 * when the event is not on the CPU this function runs on.
 */
static int __perf_event_stop(void *info)
{
	struct stop_event_data *sd = info;
	struct perf_event *event = sd->event;

	/* if it's already INACTIVE, do nothing */
	if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
		return 0;

	/* matches smp_wmb() in event_sched_in() */
	smp_rmb();

	/*
	 * There is a window with interrupts enabled before we get here,
	 * so we need to check again lest we try to stop another CPU's event.
	 */
	if (READ_ONCE(event->oncpu) != smp_processor_id())
		return -EAGAIN;

	event->pmu->stop(event, PERF_EF_UPDATE);

	/*
	 * May race with the actual stop (through perf_pmu_output_stop()),
	 * but it is only used for events with AUX ring buffer, and such
	 * events will refuse to restart because of rb::aux_mmap_count==0,
	 * see comments in perf_aux_output_begin().
	 *
	 * Since this is happening on an event-local CPU, no trace is lost
	 * while restarting.
	 */
	if (sd->restart)
		event->pmu->start(event, 0);

	return 0;
}
   3037
/*
 * Stop @event on the CPU recorded in event->oncpu; when @restart is
 * non-zero the event is started again right after being stopped.
 *
 * The cross-call is repeated for as long as __perf_event_stop() reports
 * -EAGAIN.  Returns 0 if the event is not ACTIVE, otherwise the final
 * result of cpu_function_call().
 */
static int perf_event_stop(struct perf_event *event, int restart)
{
	struct stop_event_data sd = {
		.event		= event,
		.restart	= restart,
	};
	int ret = 0;

	do {
		if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
			return 0;

		/* matches smp_wmb() in event_sched_in() */
		smp_rmb();

		/*
		 * We only want to restart ACTIVE events, so if the event goes
		 * inactive here (event->oncpu==-1), there's nothing more to do;
		 * fall through with ret==-ENXIO.
		 */
		ret = cpu_function_call(READ_ONCE(event->oncpu),
					__perf_event_stop, &sd);
	} while (ret == -EAGAIN);

	return ret;
}
   3064
   3065/*
   3066 * In order to contain the amount of racy and tricky in the address filter
   3067 * configuration management, it is a two part process:
   3068 *
   3069 * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
   3070 *      we update the addresses of corresponding vmas in
   3071 *	event::addr_filter_ranges array and bump the event::addr_filters_gen;
   3072 * (p2) when an event is scheduled in (pmu::add), it calls
   3073 *      perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
   3074 *      if the generation has changed since the previous call.
   3075 *
   3076 * If (p1) happens while the event is active, we restart it to force (p2).
   3077 *
   3078 * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
   3079 *     pre-existing mappings, called once when new filters arrive via SET_FILTER
   3080 *     ioctl;
   3081 * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
   3082 *     registered mapping, called for every new mmap(), with mm::mmap_lock down
   3083 *     for reading;
   3084 * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
   3085 *     of exec.
   3086 */
   3087void perf_event_addr_filters_sync(struct perf_event *event)
   3088{
   3089	struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
   3090
   3091	if (!has_addr_filter(event))
   3092		return;
   3093
   3094	raw_spin_lock(&ifh->lock);
   3095	if (event->addr_filters_gen != event->hw.addr_filters_gen) {
   3096		event->pmu->addr_filters_sync(event);
   3097		event->hw.addr_filters_gen = event->addr_filters_gen;
   3098	}
   3099	raw_spin_unlock(&ifh->lock);
   3100}
   3101EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
   3102
   3103static int _perf_event_refresh(struct perf_event *event, int refresh)
   3104{
   3105	/*
   3106	 * not supported on inherited events
   3107	 */
   3108	if (event->attr.inherit || !is_sampling_event(event))
   3109		return -EINVAL;
   3110
   3111	atomic_add(refresh, &event->event_limit);
   3112	_perf_event_enable(event);
   3113
   3114	return 0;
   3115}
   3116
   3117/*
   3118 * See perf_event_disable()
   3119 */
   3120int perf_event_refresh(struct perf_event *event, int refresh)
   3121{
   3122	struct perf_event_context *ctx;
   3123	int ret;
   3124
   3125	ctx = perf_event_ctx_lock(event);
   3126	ret = _perf_event_refresh(event, refresh);
   3127	perf_event_ctx_unlock(event, ctx);
   3128
   3129	return ret;
   3130}
   3131EXPORT_SYMBOL_GPL(perf_event_refresh);
   3132
/*
 * Apply @attr to the breakpoint event @bp.
 *
 * The event is disabled around modify_user_hw_breakpoint_check() and
 * enabled again unless bp->attr.disabled is set.  The re-enable happens
 * regardless of whether the modification succeeded.
 *
 * Returns the result of modify_user_hw_breakpoint_check().
 */
static int perf_event_modify_breakpoint(struct perf_event *bp,
					 struct perf_event_attr *attr)
{
	int err;

	_perf_event_disable(bp);

	err = modify_user_hw_breakpoint_check(bp, attr, true);

	/* Checked after the modification, so it sees whatever bp->attr holds now. */
	if (!bp->attr.disabled)
		_perf_event_enable(bp);

	return err;
}
   3147
/*
 * Copy event-type-independent attributes that may be modified.
 *
 * Only sig_data is copied from @from to @to at present.
 */
static void perf_event_modify_copy_attr(struct perf_event_attr *to,
					const struct perf_event_attr *from)
{
	to->sig_data = from->sig_data;
}
   3156
   3157static int perf_event_modify_attr(struct perf_event *event,
   3158				  struct perf_event_attr *attr)
   3159{
   3160	int (*func)(struct perf_event *, struct perf_event_attr *);
   3161	struct perf_event *child;
   3162	int err;
   3163
   3164	if (event->attr.type != attr->type)
   3165		return -EINVAL;
   3166
   3167	switch (event->attr.type) {
   3168	case PERF_TYPE_BREAKPOINT:
   3169		func = perf_event_modify_breakpoint;
   3170		break;
   3171	default:
   3172		/* Place holder for future additions. */
   3173		return -EOPNOTSUPP;
   3174	}
   3175
   3176	WARN_ON_ONCE(event->ctx->parent_ctx);
   3177
   3178	mutex_lock(&event->child_mutex);
   3179	/*
   3180	 * Event-type-independent attributes must be copied before event-type
   3181	 * modification, which will validate that final attributes match the
   3182	 * source attributes after all relevant attributes have been copied.
   3183	 */
   3184	perf_event_modify_copy_attr(&event->attr, attr);
   3185	err = func(event, attr);
   3186	if (err)
   3187		goto out;
   3188	list_for_each_entry(child, &event->child_list, child_list) {
   3189		perf_event_modify_copy_attr(&child->attr, attr);
   3190		err = func(child, attr);
   3191		if (err)
   3192			goto out;
   3193	}
   3194out:
   3195	mutex_unlock(&event->child_mutex);
   3196	return err;
   3197}
   3198
/*
 * Schedule out the event classes selected by @event_type from @ctx.
 *
 * Clears the @event_type bits in ctx->is_active, then calls
 * group_sched_out() on the pinned and/or flexible active lists whose
 * bits actually changed.  The caller must hold ctx->lock.
 */
static void ctx_sched_out(struct perf_event_context *ctx,
			  struct perf_cpu_context *cpuctx,
			  enum event_type_t event_type)
{
	struct perf_event *event, *tmp;
	int is_active = ctx->is_active;	/* snapshot taken before any bits are cleared */

	lockdep_assert_held(&ctx->lock);

	if (likely(!ctx->nr_events)) {
		/*
		 * See __perf_remove_from_context().
		 */
		WARN_ON_ONCE(ctx->is_active);
		if (ctx->task)
			WARN_ON_ONCE(cpuctx->task_ctx);
		return;
	}

	/*
	 * Always update time if it was set; not only when it changes.
	 * Otherwise we can 'forget' to update time for any but the last
	 * context we sched out. For example:
	 *
	 *   ctx_sched_out(.event_type = EVENT_FLEXIBLE)
	 *   ctx_sched_out(.event_type = EVENT_PINNED)
	 *
	 * would only update time for the pinned events.
	 */
	if (is_active & EVENT_TIME) {
		/* update (and stop) ctx time */
		update_context_time(ctx);
		update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
		/*
		 * CPU-release for the below ->is_active store,
		 * see __load_acquire() in perf_event_time_now()
		 */
		barrier();
	}

	/* With no EVENT_ALL bit left, drop the remaining bits as well. */
	ctx->is_active &= ~event_type;
	if (!(ctx->is_active & EVENT_ALL))
		ctx->is_active = 0;

	if (ctx->task) {
		WARN_ON_ONCE(cpuctx->task_ctx != ctx);
		if (!ctx->is_active)
			cpuctx->task_ctx = NULL;
	}

	is_active ^= ctx->is_active; /* changed bits */

	if (!ctx->nr_active || !(is_active & EVENT_ALL))
		return;

	perf_pmu_disable(ctx->pmu);
	if (is_active & EVENT_PINNED) {
		list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
			group_sched_out(event, cpuctx, ctx);
	}

	if (is_active & EVENT_FLEXIBLE) {
		list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
			group_sched_out(event, cpuctx, ctx);

		/*
		 * Since we cleared EVENT_FLEXIBLE, also clear
		 * rotate_necessary; it will be reset by
		 * ctx_flexible_sched_in() when needed.
		 */
		ctx->rotate_necessary = 0;
	}
	perf_pmu_enable(ctx->pmu);
}
   3273
   3274/*
   3275 * Test whether two contexts are equivalent, i.e. whether they have both been
   3276 * cloned from the same version of the same context.
   3277 *
   3278 * Equivalence is measured using a generation number in the context that is
   3279 * incremented on each modification to it; see unclone_ctx(), list_add_event()
   3280 * and list_del_event().
   3281 */
   3282static int context_equiv(struct perf_event_context *ctx1,
   3283			 struct perf_event_context *ctx2)
   3284{
   3285	lockdep_assert_held(&ctx1->lock);
   3286	lockdep_assert_held(&ctx2->lock);
   3287
   3288	/* Pinning disables the swap optimization */
   3289	if (ctx1->pin_count || ctx2->pin_count)
   3290		return 0;
   3291
   3292	/* If ctx1 is the parent of ctx2 */
   3293	if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
   3294		return 1;
   3295
   3296	/* If ctx2 is the parent of ctx1 */
   3297	if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
   3298		return 1;
   3299
   3300	/*
   3301	 * If ctx1 and ctx2 have the same parent; we flatten the parent
   3302	 * hierarchy, see perf_event_init_context().
   3303	 */
   3304	if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
   3305			ctx1->parent_gen == ctx2->parent_gen)
   3306		return 1;
   3307
   3308	/* Unmatched */
   3309	return 0;
   3310}
   3311
/*
 * Exchange the counter value and the enabled/running times between
 * @event and @next_event, then refresh both user pages.
 *
 * Does nothing unless event->attr.inherit_stat is set.
 */
static void __perf_event_sync_stat(struct perf_event *event,
				     struct perf_event *next_event)
{
	u64 value;

	if (!event->attr.inherit_stat)
		return;

	/*
	 * Update the event value, we cannot use perf_event_read()
	 * because we're in the middle of a context switch and have IRQs
	 * disabled, which upsets smp_call_function_single(), however
	 * we know the event must be on the current CPU, therefore we
	 * don't need to use it.
	 */
	if (event->state == PERF_EVENT_STATE_ACTIVE)
		event->pmu->read(event);

	perf_event_update_time(event);

	/*
	 * In order to keep per-task stats reliable we need to flip the event
	 * values when we flip the contexts.
	 */
	value = local64_read(&next_event->count);
	value = local64_xchg(&event->count, value);	/* value now holds event's old count */
	local64_set(&next_event->count, value);

	swap(event->total_time_enabled, next_event->total_time_enabled);
	swap(event->total_time_running, next_event->total_time_running);

	/*
	 * Since we swizzled the values, update the user visible data too.
	 */
	perf_event_update_userpage(event);
	perf_event_update_userpage(next_event);
}
   3349
   3350static void perf_event_sync_stat(struct perf_event_context *ctx,
   3351				   struct perf_event_context *next_ctx)
   3352{
   3353	struct perf_event *event, *next_event;
   3354
   3355	if (!ctx->nr_stat)
   3356		return;
   3357
   3358	update_context_time(ctx);
   3359
   3360	event = list_first_entry(&ctx->event_list,
   3361				   struct perf_event, event_entry);
   3362
   3363	next_event = list_first_entry(&next_ctx->event_list,
   3364					struct perf_event, event_entry);
   3365
   3366	while (&event->event_entry != &ctx->event_list &&
   3367	       &next_event->event_entry != &next_ctx->event_list) {
   3368
   3369		__perf_event_sync_stat(event, next_event);
   3370
   3371		event = list_next_entry(event, event_entry);
   3372		next_event = list_next_entry(next_event, event_entry);
   3373	}
   3374}
   3375
/*
 * Schedule out context number @ctxn of @task when switching to @next.
 *
 * If the contexts of @task and @next turn out to be equivalent clones
 * (see context_equiv()), the two contexts are swapped between the tasks
 * instead of being scheduled out.  Otherwise the context is scheduled out
 * through task_ctx_sched_out().
 */
static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
					 struct task_struct *next)
{
	struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
	struct perf_event_context *next_ctx;
	struct perf_event_context *parent, *next_parent;
	struct perf_cpu_context *cpuctx;
	int do_switch = 1;	/* cleared once the contexts have been swapped */
	struct pmu *pmu;

	if (likely(!ctx))
		return;

	pmu = ctx->pmu;
	cpuctx = __get_cpu_context(ctx);
	if (!cpuctx->task_ctx)
		return;

	rcu_read_lock();
	next_ctx = next->perf_event_ctxp[ctxn];
	if (!next_ctx)
		goto unlock;

	parent = rcu_dereference(ctx->parent_ctx);
	next_parent = rcu_dereference(next_ctx->parent_ctx);

	/* If neither context has a parent context, they cannot be clones. */
	if (!parent && !next_parent)
		goto unlock;

	if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
		/*
		 * Looks like the two contexts are clones, so we might be
		 * able to optimize the context switch.  We lock both
		 * contexts and check that they are clones under the
		 * lock (including re-checking that neither has been
		 * uncloned in the meantime).  It doesn't matter which
		 * order we take the locks because no other cpu could
		 * be trying to lock both of these tasks.
		 */
		raw_spin_lock(&ctx->lock);
		raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
		if (context_equiv(ctx, next_ctx)) {

			WRITE_ONCE(ctx->task, next);
			WRITE_ONCE(next_ctx->task, task);

			perf_pmu_disable(pmu);

			if (cpuctx->sched_cb_usage && pmu->sched_task)
				pmu->sched_task(ctx, false);

			/*
			 * PMU specific parts of task perf context can require
			 * additional synchronization. As an example of such
			 * synchronization see implementation details of Intel
			 * LBR call stack data profiling;
			 */
			if (pmu->swap_task_ctx)
				pmu->swap_task_ctx(ctx, next_ctx);
			else
				swap(ctx->task_ctx_data, next_ctx->task_ctx_data);

			perf_pmu_enable(pmu);

			/*
			 * RCU_INIT_POINTER here is safe because we've not
			 * modified the ctx and the above modification of
			 * ctx->task and ctx->task_ctx_data are immaterial
			 * since those values are always verified under
			 * ctx->lock which we're now holding.
			 */
			RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
			RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);

			do_switch = 0;

			perf_event_sync_stat(ctx, next_ctx);
		}
		raw_spin_unlock(&next_ctx->lock);
		raw_spin_unlock(&ctx->lock);
	}
unlock:
	rcu_read_unlock();

	/* No swap happened: schedule the context out the regular way. */
	if (do_switch) {
		raw_spin_lock(&ctx->lock);
		perf_pmu_disable(pmu);

		if (cpuctx->sched_cb_usage && pmu->sched_task)
			pmu->sched_task(ctx, false);
		task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);

		perf_pmu_enable(pmu);
		raw_spin_unlock(&ctx->lock);
	}
}
   3473
/*
 * Per-CPU list of perf_cpu_context entries, linked through sched_cb_entry.
 * perf_sched_cb_inc() adds an entry on its first use and
 * perf_sched_cb_dec() removes it when sched_cb_usage drops to zero.
 */
static DEFINE_PER_CPU(struct list_head, sched_cb_list);
   3475
   3476void perf_sched_cb_dec(struct pmu *pmu)
   3477{
   3478	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
   3479
   3480	this_cpu_dec(perf_sched_cb_usages);
   3481
   3482	if (!--cpuctx->sched_cb_usage)
   3483		list_del(&cpuctx->sched_cb_entry);
   3484}
   3485
   3486
   3487void perf_sched_cb_inc(struct pmu *pmu)
   3488{
   3489	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
   3490
   3491	if (!cpuctx->sched_cb_usage++)
   3492		list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
   3493
   3494	this_cpu_inc(perf_sched_cb_usages);
   3495}
   3496
/*
 * This function provides the context switch callback to the lower code
 * layer. It is invoked ONLY when the context switch callback is enabled.
 *
 * This callback is relevant even to per-cpu events; for example multi event
 * PEBS requires this to provide PID/TID information. This requires we flush
 * all queued PEBS records before we context switch to a new task.
 *
 * @cpuctx:   CPU context whose PMU callback is invoked
 * @sched_in: passed through unchanged to pmu->sched_task()
 */
static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)
{
	struct pmu *pmu;

	pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */

	if (WARN_ON_ONCE(!pmu->sched_task))
		return;

	/* The callback runs with the context locked and the PMU disabled. */
	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
	perf_pmu_disable(pmu);

	pmu->sched_task(cpuctx->task_ctx, sched_in);

	perf_pmu_enable(pmu);
	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
   3522
   3523static void perf_pmu_sched_task(struct task_struct *prev,
   3524				struct task_struct *next,
   3525				bool sched_in)
   3526{
   3527	struct perf_cpu_context *cpuctx;
   3528
   3529	if (prev == next)
   3530		return;
   3531
   3532	list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
   3533		/* will be handled in perf_event_context_sched_in/out */
   3534		if (cpuctx->task_ctx)
   3535			continue;
   3536
   3537		__perf_pmu_sched_task(cpuctx, sched_in);
   3538	}
   3539}
   3540
/* Forward declaration; used by the task sched in/out hooks below. */
static void perf_event_switch(struct task_struct *task,
			      struct task_struct *next_prev, bool sched_in);

/* Iterate @ctxn over the task context indices 0 .. perf_nr_task_contexts - 1. */
#define for_each_task_context_nr(ctxn)					\
	for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
   3546
/*
 * Called from scheduler to remove the events of the current task,
 * with interrupts disabled.
 *
 * We stop each event and update the event value in event->count.
 *
 * This does not protect us against NMI, but disable()
 * sets the disabled bit in the control field of event _before_
 * accessing the event control register. If a NMI hits, then it will
 * not restart the event.
 *
 * @task: task being switched out
 * @next: task being switched in
 */
void __perf_event_task_sched_out(struct task_struct *task,
				 struct task_struct *next)
{
	int ctxn;

	/* PMU sched_task callbacks, only if any are registered on this CPU. */
	if (__this_cpu_read(perf_sched_cb_usages))
		perf_pmu_sched_task(task, next, false);

	/* Only called while nr_switch_events is non-zero. */
	if (atomic_read(&nr_switch_events))
		perf_event_switch(task, next, false);

	for_each_task_context_nr(ctxn)
		perf_event_context_sched_out(task, ctxn, next);

	/*
	 * if cgroup events exist on this CPU, then we need
	 * to check if we have to switch out PMU state.
	 * cgroup event are system-wide mode only
	 */
	if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
		perf_cgroup_switch(next);
}
   3580
/*
 * Schedule out the @event_type events of the CPU context embedded in
 * @cpuctx; thin wrapper around ctx_sched_out().
 *
 * Called with IRQs disabled
 */
static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
			      enum event_type_t event_type)
{
	ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
}
   3589
   3590static bool perf_less_group_idx(const void *l, const void *r)
   3591{
   3592	const struct perf_event *le = *(const struct perf_event **)l;
   3593	const struct perf_event *re = *(const struct perf_event **)r;
   3594
   3595	return le->group_index < re->group_index;
   3596}
   3597
   3598static void swap_ptr(void *l, void *r)
   3599{
   3600	void **lp = l, **rp = r;
   3601
   3602	swap(*lp, *rp);
   3603}
   3604
/*
 * Min-heap callbacks for a heap whose elements are struct perf_event
 * pointers, ordered by group_index.
 */
static const struct min_heap_callbacks perf_min_heap = {
	.elem_size = sizeof(struct perf_event *),
	.less = perf_less_group_idx,
	.swp = swap_ptr,
};
   3610
   3611static void __heap_add(struct min_heap *heap, struct perf_event *event)
   3612{
   3613	struct perf_event **itrs = heap->data;
   3614
   3615	if (event) {
   3616		itrs[heap->nr] = event;
   3617		heap->nr++;
   3618	}
   3619}
   3620
/*
 * Call @func on events of @groups, merging several iterators through a
 * min-heap ordered by group_index (see perf_min_heap).
 *
 * With a non-NULL @cpuctx the heap storage is cpuctx->heap, and under
 * CONFIG_CGROUP_PERF one iterator is added for cpuctx->cgrp and each of
 * its ancestors.  With a NULL @cpuctx a local two-slot array is used and
 * an extra iterator for cpu == -1 is added.  An iterator for @cpu is
 * added in both cases.
 *
 * @data is passed through to @func.  Returns the first non-zero value
 * returned by @func, or 0 once all iterators are exhausted.
 */
static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
				struct perf_event_groups *groups, int cpu,
				int (*func)(struct perf_event *, void *),
				void *data)
{
#ifdef CONFIG_CGROUP_PERF
	struct cgroup_subsys_state *css = NULL;
#endif
	/* Space for per CPU and/or any CPU event iterators. */
	struct perf_event *itrs[2];
	struct min_heap event_heap;
	struct perf_event **evt;
	int ret;

	if (cpuctx) {
		event_heap = (struct min_heap){
			.data = cpuctx->heap,
			.nr = 0,
			.size = cpuctx->heap_size,
		};

		lockdep_assert_held(&cpuctx->ctx.lock);

#ifdef CONFIG_CGROUP_PERF
		if (cpuctx->cgrp)
			css = &cpuctx->cgrp->css;
#endif
	} else {
		event_heap = (struct min_heap){
			.data = itrs,
			.nr = 0,
			.size = ARRAY_SIZE(itrs),
		};
		/* Events not within a CPU context may be on any CPU. */
		__heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL));
	}
	/* evt aliases the first heap slot, which is the one visited each round. */
	evt = event_heap.data;

	__heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));

#ifdef CONFIG_CGROUP_PERF
	for (; css; css = css->parent)
		__heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup));
#endif

	min_heapify_all(&event_heap, &perf_min_heap);

	while (event_heap.nr) {
		ret = func(*evt, data);
		if (ret)
			return ret;

		/* Advance the visited iterator; drop it from the heap when exhausted. */
		*evt = perf_event_groups_next(*evt);
		if (*evt)
			min_heapify(&event_heap, 0, &perf_min_heap);
		else
			min_heap_pop(&event_heap, &perf_min_heap);
	}

	return 0;
}
   3682
   3683/*
   3684 * Because the userpage is strictly per-event (there is no concept of context,
   3685 * so there cannot be a context indirection), every userpage must be updated
   3686 * when context time starts :-(
   3687 *
   3688 * IOW, we must not miss EVENT_TIME edges.
   3689 */
   3690static inline bool event_update_userpage(struct perf_event *event)
   3691{
   3692	if (likely(!atomic_read(&event->mmap_count)))
   3693		return false;
   3694
   3695	perf_event_update_time(event);
   3696	perf_event_update_userpage(event);
   3697
   3698	return true;
   3699}
   3700
   3701static inline void group_update_userpage(struct perf_event *group_event)
   3702{
   3703	struct perf_event *event;
   3704
   3705	if (!event_update_userpage(group_event))
   3706		return;
   3707
   3708	for_each_sibling_event(event, group_event)
   3709		event_update_userpage(event);
   3710}
   3711
/*
 * visit_groups_merge() callback: try to schedule in the group led by
 * @event.
 *
 * @data points to an int "can_add_hw" flag, which is cleared as soon as
 * one group is still INACTIVE after the attempt.  Always returns 0, so
 * the walk is never cut short.
 */
static int merge_sched_in(struct perf_event *event, void *data)
{
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
	int *can_add_hw = data;

	/* Skip events in OFF state or below. */
	if (event->state <= PERF_EVENT_STATE_OFF)
		return 0;

	if (!event_filter_match(event))
		return 0;

	if (group_can_go_on(event, cpuctx, *can_add_hw)) {
		/* Link the group onto its active list when sched-in returns 0. */
		if (!group_sched_in(event, cpuctx, ctx))
			list_add_tail(&event->active_list, get_event_list(event));
	}

	/* Still INACTIVE: the group did not go on. */
	if (event->state == PERF_EVENT_STATE_INACTIVE) {
		*can_add_hw = 0;
		if (event->attr.pinned) {
			/* A pinned event that is still INACTIVE is put into ERROR state. */
			perf_cgroup_event_disable(event, ctx);
			perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
		} else {
			/* A flexible event: flag the context for rotation instead. */
			ctx->rotate_necessary = 1;
			perf_mux_hrtimer_restart(cpuctx);
			group_update_userpage(event);
		}
	}

	return 0;
}
   3743
   3744static void
   3745ctx_pinned_sched_in(struct perf_event_context *ctx,
   3746		    struct perf_cpu_context *cpuctx)
   3747{
   3748	int can_add_hw = 1;
   3749
   3750	if (ctx != &cpuctx->ctx)
   3751		cpuctx = NULL;
   3752
   3753	visit_groups_merge(cpuctx, &ctx->pinned_groups,
   3754			   smp_processor_id(),
   3755			   merge_sched_in, &can_add_hw);
   3756}
   3757
   3758static void
   3759ctx_flexible_sched_in(struct perf_event_context *ctx,
   3760		      struct perf_cpu_context *cpuctx)
   3761{
   3762	int can_add_hw = 1;
   3763
   3764	if (ctx != &cpuctx->ctx)
   3765		cpuctx = NULL;
   3766
   3767	visit_groups_merge(cpuctx, &ctx->flexible_groups,
   3768			   smp_processor_id(),
   3769			   merge_sched_in, &can_add_hw);
   3770}
   3771
/*
 * Schedule in the event classes selected by @event_type for @ctx.
 *
 * Sets @event_type and EVENT_TIME in ctx->is_active, then schedules in
 * the pinned and/or flexible groups whose bits were newly set.  The
 * caller must hold ctx->lock.
 */
static void
ctx_sched_in(struct perf_event_context *ctx,
	     struct perf_cpu_context *cpuctx,
	     enum event_type_t event_type)
{
	int is_active = ctx->is_active;	/* snapshot taken before any bits are set */

	lockdep_assert_held(&ctx->lock);

	if (likely(!ctx->nr_events))
		return;

	/*
	 * NOTE(review): the XOR is non-zero whenever is_active differs from
	 * exactly EVENT_TIME, so this branch also runs when EVENT_TIME is
	 * already set together with other bits - confirm this is intended.
	 */
	if (is_active ^ EVENT_TIME) {
		/* start ctx time */
		__update_context_time(ctx, false);
		perf_cgroup_set_timestamp(cpuctx);
		/*
		 * CPU-release for the below ->is_active store,
		 * see __load_acquire() in perf_event_time_now()
		 */
		barrier();
	}

	ctx->is_active |= (event_type | EVENT_TIME);
	if (ctx->task) {
		/* First activation of a task context: attach it to the CPU context. */
		if (!is_active)
			cpuctx->task_ctx = ctx;
		else
			WARN_ON_ONCE(cpuctx->task_ctx != ctx);
	}

	is_active ^= ctx->is_active; /* changed bits */

	/*
	 * First go through the list and put on any pinned groups
	 * in order to give them the best chance of going on.
	 */
	if (is_active & EVENT_PINNED)
		ctx_pinned_sched_in(ctx, cpuctx);

	/* Then walk through the lower prio flexible groups */
	if (is_active & EVENT_FLEXIBLE)
		ctx_flexible_sched_in(ctx, cpuctx);
}
   3816
   3817static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
   3818			     enum event_type_t event_type)
   3819{
   3820	struct perf_event_context *ctx = &cpuctx->ctx;
   3821
   3822	ctx_sched_in(ctx, cpuctx, event_type);
   3823}
   3824
/*
 * Schedule in the task context @ctx on the current CPU.
 *
 * If @ctx is already the CPU context's task_ctx, only the PMU sched_task
 * callback is run (when sched_cb_usage is set).  Otherwise the events are
 * scheduled in under the context locks with the PMU disabled.
 *
 * @task is not referenced in the function body.
 */
static void perf_event_context_sched_in(struct perf_event_context *ctx,
					struct task_struct *task)
{
	struct perf_cpu_context *cpuctx;
	struct pmu *pmu;

	cpuctx = __get_cpu_context(ctx);

	/*
	 * HACK: for HETEROGENEOUS the task context might have switched to a
	 * different PMU, force (re)set the context,
	 */
	pmu = ctx->pmu = cpuctx->ctx.pmu;

	if (cpuctx->task_ctx == ctx) {
		if (cpuctx->sched_cb_usage)
			__perf_pmu_sched_task(cpuctx, true);
		return;
	}

	perf_ctx_lock(cpuctx, ctx);
	/*
	 * We must check ctx->nr_events while holding ctx->lock, such
	 * that we serialize against perf_install_in_context().
	 */
	if (!ctx->nr_events)
		goto unlock;

	perf_pmu_disable(pmu);
	/*
	 * We want to keep the following priority order:
	 * cpu pinned (that don't need to move), task pinned,
	 * cpu flexible, task flexible.
	 *
	 * However, if task's ctx is not carrying any pinned
	 * events, no need to flip the cpuctx's events around.
	 */
	if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
	perf_event_sched_in(cpuctx, ctx);

	if (cpuctx->sched_cb_usage && pmu->sched_task)
		pmu->sched_task(cpuctx->task_ctx, true);

	perf_pmu_enable(pmu);

unlock:
	perf_ctx_unlock(cpuctx, ctx);
}
   3874
/*
 * Called from scheduler to add the events of the current task
 * with interrupts disabled.
 *
 * We restore the event value and then enable it.
 *
 * This does not protect us against NMI, but enable()
 * sets the enabled bit in the control field of event _before_
 * accessing the event control register. If a NMI hits, then it will
 * keep the event running.
 *
 * @prev: task that was switched out
 * @task: task being switched in
 */
void __perf_event_task_sched_in(struct task_struct *prev,
				struct task_struct *task)
{
	struct perf_event_context *ctx;
	int ctxn;

	/* Schedule in every task context that @task actually has. */
	for_each_task_context_nr(ctxn) {
		ctx = task->perf_event_ctxp[ctxn];
		if (likely(!ctx))
			continue;

		perf_event_context_sched_in(ctx, task);
	}

	/* Only called while nr_switch_events is non-zero. */
	if (atomic_read(&nr_switch_events))
		perf_event_switch(task, prev, true);

	/* PMU sched_task callbacks, only if any are registered on this CPU. */
	if (__this_cpu_read(perf_sched_cb_usages))
		perf_pmu_sched_task(prev, task, true);
}
   3906
/*
 * Compute the sample period that would yield event->attr.sample_freq
 * samples per second, given that @count events were observed in @nsec
 * nanoseconds.
 *
 * The operands are reduced in precision, one bit at a time, until both
 * products fit in a u64.  When the divisor ends up as zero the dividend
 * is returned unchanged.
 */
static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
{
	u64 frequency = event->attr.sample_freq;
	u64 sec = NSEC_PER_SEC;
	u64 divisor, dividend;

	/* Bit widths (fls64) of the four operands, tracked during reduction. */
	int count_fls, nsec_fls, frequency_fls, sec_fls;

	count_fls = fls64(count);
	nsec_fls = fls64(nsec);
	frequency_fls = fls64(frequency);
	sec_fls = 30;	/* bit width of 10^9, which lies between 2^29 and 2^30 */

	/*
	 * We got @count in @nsec, with a target of sample_freq HZ
	 * the target period becomes:
	 *
	 *             @count * 10^9
	 * period = -------------------
	 *          @nsec * sample_freq
	 *
	 */

	/*
	 * Reduce accuracy by one bit such that @a and @b converge
	 * to a similar magnitude.
	 */
#define REDUCE_FLS(a, b)		\
do {					\
	if (a##_fls > b##_fls) {	\
		a >>= 1;		\
		a##_fls--;		\
	} else {			\
		b >>= 1;		\
		b##_fls--;		\
	}				\
} while (0)

	/*
	 * Reduce accuracy until either term fits in a u64, then proceed with
	 * the other, so that finally we can do a u64/u64 division.
	 */
	while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
		REDUCE_FLS(nsec, frequency);
		REDUCE_FLS(sec, count);
	}

	if (count_fls + sec_fls > 64) {
		/* The divisor already fits; shrink the dividend, halving the divisor in step. */
		divisor = nsec * frequency;

		while (count_fls + sec_fls > 64) {
			REDUCE_FLS(count, sec);
			divisor >>= 1;
		}

		dividend = count * sec;
	} else {
		/* The dividend already fits; shrink the divisor, halving the dividend in step. */
		dividend = count * sec;

		while (nsec_fls + frequency_fls > 64) {
			REDUCE_FLS(nsec, frequency);
			dividend >>= 1;
		}

		divisor = nsec * frequency;
	}

	/* Avoid dividing by zero. */
	if (!divisor)
		return dividend;

	return div64_u64(dividend, divisor);
}
   3979
/*
 * Per-CPU throttling bookkeeping. On every call perf_event_task_tick()
 * increments perf_throttled_seq and atomically resets perf_throttled_count
 * to zero, handing the previous count to the unthrottle pass.
 */
static DEFINE_PER_CPU(int, perf_throttled_count);
static DEFINE_PER_CPU(u64, perf_throttled_seq);
   3982
   3983static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
   3984{
   3985	struct hw_perf_event *hwc = &event->hw;
   3986	s64 period, sample_period;
   3987	s64 delta;
   3988
   3989	period = perf_calculate_period(event, nsec, count);
   3990
   3991	delta = (s64)(period - hwc->sample_period);
   3992	delta = (delta + 7) / 8; /* low pass filter */
   3993
   3994	sample_period = hwc->sample_period + delta;
   3995
   3996	if (!sample_period)
   3997		sample_period = 1;
   3998
   3999	hwc->sample_period = sample_period;
   4000
   4001	if (local64_read(&hwc->period_left) > 8*sample_period) {
   4002		if (disable)
   4003			event->pmu->stop(event, PERF_EF_UPDATE);
   4004
   4005		local64_set(&hwc->period_left, 0);
   4006
   4007		if (disable)
   4008			event->pmu->start(event, PERF_EF_RELOAD);
   4009	}
   4010}
   4011
   4012/*
   4013 * combine freq adjustment with unthrottling to avoid two passes over the
   4014 * events. At the same time, make sure, having freq events does not change
   4015 * the rate of unthrottling as that would introduce bias.
   4016 */
static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
					   int needs_unthr)
{
	struct perf_event *event;
	struct hw_perf_event *hwc;
	u64 now, period = TICK_NSEC;
	s64 delta;

	/*
	 * We only need to iterate over all events iff:
	 * - the context has events in frequency mode (needs freq adjust)
	 * - there are events to unthrottle on this cpu
	 */
	if (!(ctx->nr_freq || needs_unthr))
		return;

	raw_spin_lock(&ctx->lock);
	perf_pmu_disable(ctx->pmu);

	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
		/* Only events that are currently counting are of interest. */
		if (event->state != PERF_EVENT_STATE_ACTIVE)
			continue;

		if (!event_filter_match(event))
			continue;

		/* Disable the event's own PMU as well; re-enabled at "next". */
		perf_pmu_disable(event->pmu);

		hwc = &event->hw;

		/* Unthrottle: reset the interrupt count, log it and restart. */
		if (hwc->interrupts == MAX_INTERRUPTS) {
			hwc->interrupts = 0;
			perf_log_throttle(event, 1);
			event->pmu->start(event, 0);
		}

		/* Fixed-period events need no frequency adjustment. */
		if (!event->attr.freq || !event->attr.sample_freq)
			goto next;

		/*
		 * stop the event and update event->count
		 */
		event->pmu->stop(event, PERF_EF_UPDATE);

		/* Number of events counted since the previous adjustment. */
		now = local64_read(&event->count);
		delta = now - hwc->freq_count_stamp;
		hwc->freq_count_stamp = now;

		/*
		 * restart the event
		 * reload only if value has changed
		 * we have stopped the event so tell that
		 * to perf_adjust_period() to avoid stopping it
		 * twice.
		 */
		if (delta > 0)
			perf_adjust_period(event, period, delta, false);

		event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
	next:
		perf_pmu_enable(event->pmu);
	}

	perf_pmu_enable(ctx->pmu);
	raw_spin_unlock(&ctx->lock);
}
   4083
   4084/*
 * Move @event to the tail of the @ctx's eligible events.
   4086 */
   4087static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
   4088{
   4089	/*
   4090	 * Rotate the first entry last of non-pinned groups. Rotation might be
   4091	 * disabled by the inheritance code.
   4092	 */
   4093	if (ctx->rotate_disable)
   4094		return;
   4095
   4096	perf_event_groups_delete(&ctx->flexible_groups, event);
   4097	perf_event_groups_insert(&ctx->flexible_groups, event);
   4098}
   4099
   4100/* pick an event from the flexible_groups to rotate */
   4101static inline struct perf_event *
   4102ctx_event_to_rotate(struct perf_event_context *ctx)
   4103{
   4104	struct perf_event *event;
   4105
   4106	/* pick the first active flexible event */
   4107	event = list_first_entry_or_null(&ctx->flexible_active,
   4108					 struct perf_event, active_list);
   4109
   4110	/* if no active flexible event, pick the first event */
   4111	if (!event) {
   4112		event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
   4113				      typeof(*event), group_node);
   4114	}
   4115
   4116	/*
   4117	 * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
   4118	 * finds there are unschedulable events, it will set it again.
   4119	 */
   4120	ctx->rotate_necessary = 0;
   4121
   4122	return event;
   4123}
   4124
/*
 * Rotate the flexible events of the CPU context and of the task context
 * currently attached to @cpuctx (if any), for whichever of the two has
 * rotate_necessary set.
 *
 * Returns false when neither context asked for rotation, true otherwise.
 */
static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
{
	struct perf_event *cpu_event = NULL, *task_event = NULL;
	struct perf_event_context *task_ctx = NULL;
	int cpu_rotate, task_rotate;

	/*
	 * Since we run this from IRQ context, nobody can install new
	 * events, thus the event count values are stable.
	 */

	cpu_rotate = cpuctx->ctx.rotate_necessary;
	task_ctx = cpuctx->task_ctx;
	task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;

	/* Nothing to do unless at least one context requested rotation. */
	if (!(cpu_rotate || task_rotate))
		return false;

	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
	perf_pmu_disable(cpuctx->ctx.pmu);

	/* Pick the events to rotate; this also clears rotate_necessary. */
	if (task_rotate)
		task_event = ctx_event_to_rotate(task_ctx);
	if (cpu_rotate)
		cpu_event = ctx_event_to_rotate(&cpuctx->ctx);

	/*
	 * As per the order given at ctx_resched() first 'pop' task flexible
	 * and then, if needed CPU flexible.
	 */
	if (task_event || (task_ctx && cpu_event))
		ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
	if (cpu_event)
		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);

	/* With the flexible events scheduled out, requeue the chosen ones. */
	if (task_event)
		rotate_ctx(task_ctx, task_event);
	if (cpu_event)
		rotate_ctx(&cpuctx->ctx, cpu_event);

	/* Schedule everything back in using the new ordering. */
	perf_event_sched_in(cpuctx, task_ctx);

	perf_pmu_enable(cpuctx->ctx.pmu);
	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);

	return true;
}
   4172
   4173void perf_event_task_tick(void)
   4174{
   4175	struct list_head *head = this_cpu_ptr(&active_ctx_list);
   4176	struct perf_event_context *ctx, *tmp;
   4177	int throttled;
   4178
   4179	lockdep_assert_irqs_disabled();
   4180
   4181	__this_cpu_inc(perf_throttled_seq);
   4182	throttled = __this_cpu_xchg(perf_throttled_count, 0);
   4183	tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
   4184
   4185	list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
   4186		perf_adjust_freq_unthr_context(ctx, throttled);
   4187}
   4188
   4189static int event_enable_on_exec(struct perf_event *event,
   4190				struct perf_event_context *ctx)
   4191{
   4192	if (!event->attr.enable_on_exec)
   4193		return 0;
   4194
   4195	event->attr.enable_on_exec = 0;
   4196	if (event->state >= PERF_EVENT_STATE_INACTIVE)
   4197		return 0;
   4198
   4199	perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
   4200
   4201	return 1;
   4202}
   4203
   4204/*
   4205 * Enable all of a task's events that have been marked enable-on-exec.
   4206 * This expects task == current.
   4207 */
static void perf_event_enable_on_exec(int ctxn)
{
	struct perf_event_context *ctx, *clone_ctx = NULL;
	enum event_type_t event_type = 0;
	struct perf_cpu_context *cpuctx;
	struct perf_event *event;
	unsigned long flags;
	int enabled = 0;

	/* Interrupts stay disabled while current's context is manipulated. */
	local_irq_save(flags);
	ctx = current->perf_event_ctxp[ctxn];
	if (!ctx || !ctx->nr_events)
		goto out;

	cpuctx = __get_cpu_context(ctx);
	perf_ctx_lock(cpuctx, ctx);
	ctx_sched_out(ctx, cpuctx, EVENT_TIME);
	/*
	 * Flip every event marked enable-on-exec. event_type accumulates the
	 * type of every event in the list, not only of the enabled ones.
	 */
	list_for_each_entry(event, &ctx->event_list, event_entry) {
		enabled |= event_enable_on_exec(event, ctx);
		event_type |= get_event_type(event);
	}

	/*
	 * Unclone and reschedule this context if we enabled any event.
	 */
	if (enabled) {
		clone_ctx = unclone_ctx(ctx);
		ctx_resched(cpuctx, ctx, event_type);
	} else {
		/* Nothing changed: undo the EVENT_TIME sched-out from above. */
		ctx_sched_in(ctx, cpuctx, EVENT_TIME);
	}
	perf_ctx_unlock(cpuctx, ctx);

out:
	local_irq_restore(flags);

	/* Release what unclone_ctx() handed back, outside the locked region. */
	if (clone_ctx)
		put_ctx(clone_ctx);
}
   4247
/*
 * Forward declarations for helpers that perf_event_remove_on_exec() calls
 * before their definitions appear in this file.
 */
static void perf_remove_from_owner(struct perf_event *event);
static void perf_event_exit_event(struct perf_event *event,
				  struct perf_event_context *ctx);
   4251
   4252/*
   4253 * Removes all events from the current task that have been marked
   4254 * remove-on-exec, and feeds their values back to parent events.
   4255 */
   4256static void perf_event_remove_on_exec(int ctxn)
   4257{
   4258	struct perf_event_context *ctx, *clone_ctx = NULL;
   4259	struct perf_event *event, *next;
   4260	unsigned long flags;
   4261	bool modified = false;
   4262
   4263	ctx = perf_pin_task_context(current, ctxn);
   4264	if (!ctx)
   4265		return;
   4266
   4267	mutex_lock(&ctx->mutex);
   4268
   4269	if (WARN_ON_ONCE(ctx->task != current))
   4270		goto unlock;
   4271
   4272	list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) {
   4273		if (!event->attr.remove_on_exec)
   4274			continue;
   4275
   4276		if (!is_kernel_event(event))
   4277			perf_remove_from_owner(event);
   4278
   4279		modified = true;
   4280
   4281		perf_event_exit_event(event, ctx);
   4282	}
   4283
   4284	raw_spin_lock_irqsave(&ctx->lock, flags);
   4285	if (modified)
   4286		clone_ctx = unclone_ctx(ctx);
   4287	--ctx->pin_count;
   4288	raw_spin_unlock_irqrestore(&ctx->lock, flags);
   4289
   4290unlock:
   4291	mutex_unlock(&ctx->mutex);
   4292
   4293	put_ctx(ctx);
   4294	if (clone_ctx)
   4295		put_ctx(clone_ctx);
   4296}
   4297
/* Argument/result bundle passed to __perf_event_read() via the cross call. */
struct perf_read_data {
	struct perf_event *event;	/* event to read */
	bool group;			/* also read the event's siblings */
	int ret;			/* 0, or the pmu->commit_txn() result for group reads */
};
   4303
   4304static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
   4305{
   4306	u16 local_pkg, event_pkg;
   4307
   4308	if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
   4309		int local_cpu = smp_processor_id();
   4310
   4311		event_pkg = topology_physical_package_id(event_cpu);
   4312		local_pkg = topology_physical_package_id(local_cpu);
   4313
   4314		if (event_pkg == local_pkg)
   4315			return local_cpu;
   4316	}
   4317
   4318	return event_cpu;
   4319}
   4320
   4321/*
   4322 * Cross CPU call to read the hardware event
   4323 */
static void __perf_event_read(void *info)
{
	struct perf_read_data *data = info;
	struct perf_event *sub, *event = data->event;
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
	struct pmu *pmu = event->pmu;

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu.  If not it has been
	 * scheduled out before the smp call arrived.  In that case
	 * event->count would have been updated to a recent sample
	 * when the event was scheduled out.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

	raw_spin_lock(&ctx->lock);
	/* Context time can only be advanced while the context is active. */
	if (ctx->is_active & EVENT_TIME) {
		update_context_time(ctx);
		update_cgrp_time_from_event(event);
	}

	perf_event_update_time(event);
	if (data->group)
		perf_event_update_sibling_time(event);

	/* Only an ACTIVE event has a hardware value to read; data->ret is left untouched otherwise. */
	if (event->state != PERF_EVENT_STATE_ACTIVE)
		goto unlock;

	/* Single-event read: no transaction needed. */
	if (!data->group) {
		pmu->read(event);
		data->ret = 0;
		goto unlock;
	}

	/* Group read: wrap leader and sibling reads in one read transaction. */
	pmu->start_txn(pmu, PERF_PMU_TXN_READ);

	pmu->read(event);

	for_each_sibling_event(sub, event) {
		if (sub->state == PERF_EVENT_STATE_ACTIVE) {
			/*
			 * Use sibling's PMU rather than @event's since
			 * sibling could be on different (eg: software) PMU.
			 */
			sub->pmu->read(sub);
		}
	}

	/* The transaction result is reported back to the caller. */
	data->ret = pmu->commit_txn(pmu);

unlock:
	raw_spin_unlock(&ctx->lock);
}
   4380
   4381static inline u64 perf_event_count(struct perf_event *event)
   4382{
   4383	return local64_read(&event->count) + atomic64_read(&event->child_count);
   4384}
   4385
   4386static void calc_timer_values(struct perf_event *event,
   4387				u64 *now,
   4388				u64 *enabled,
   4389				u64 *running)
   4390{
   4391	u64 ctx_time;
   4392
   4393	*now = perf_clock();
   4394	ctx_time = perf_event_time_now(event, *now);
   4395	__perf_update_times(event, ctx_time, enabled, running);
   4396}
   4397
   4398/*
   4399 * NMI-safe method to read a local event, that is an event that
   4400 * is:
   4401 *   - either for the current task, or for this CPU
   4402 *   - does not have inherit set, for inherited task events
   4403 *     will not be local and we cannot read them atomically
   4404 *   - must not have a pmu::count method
   4405 */
   4406int perf_event_read_local(struct perf_event *event, u64 *value,
   4407			  u64 *enabled, u64 *running)
   4408{
   4409	unsigned long flags;
   4410	int ret = 0;
   4411
   4412	/*
   4413	 * Disabling interrupts avoids all counter scheduling (context
   4414	 * switches, timer based rotation and IPIs).
   4415	 */
   4416	local_irq_save(flags);
   4417
   4418	/*
   4419	 * It must not be an event with inherit set, we cannot read
   4420	 * all child counters from atomic context.
   4421	 */
   4422	if (event->attr.inherit) {
   4423		ret = -EOPNOTSUPP;
   4424		goto out;
   4425	}
   4426
   4427	/* If this is a per-task event, it must be for current */
   4428	if ((event->attach_state & PERF_ATTACH_TASK) &&
   4429	    event->hw.target != current) {
   4430		ret = -EINVAL;
   4431		goto out;
   4432	}
   4433
   4434	/* If this is a per-CPU event, it must be for this CPU */
   4435	if (!(event->attach_state & PERF_ATTACH_TASK) &&
   4436	    event->cpu != smp_processor_id()) {
   4437		ret = -EINVAL;
   4438		goto out;
   4439	}
   4440
   4441	/* If this is a pinned event it must be running on this CPU */
   4442	if (event->attr.pinned && event->oncpu != smp_processor_id()) {
   4443		ret = -EBUSY;
   4444		goto out;
   4445	}
   4446
   4447	/*
   4448	 * If the event is currently on this CPU, its either a per-task event,
   4449	 * or local to this CPU. Furthermore it means its ACTIVE (otherwise
   4450	 * oncpu == -1).
   4451	 */
   4452	if (event->oncpu == smp_processor_id())
   4453		event->pmu->read(event);
   4454
   4455	*value = local64_read(&event->count);
   4456	if (enabled || running) {
   4457		u64 __enabled, __running, __now;;
   4458
   4459		calc_timer_values(event, &__now, &__enabled, &__running);
   4460		if (enabled)
   4461			*enabled = __enabled;
   4462		if (running)
   4463			*running = __running;
   4464	}
   4465out:
   4466	local_irq_restore(flags);
   4467
   4468	return ret;
   4469}
   4470
/*
 * Bring the count and times of @event (and, if @group, of its siblings)
 * up to date.
 *
 * An ACTIVE event is read through a cross call to __perf_event_read() on
 * the chosen CPU; an INACTIVE event only has its times updated under
 * ctx->lock. Events in any other state are left alone.
 *
 * Returns 0, or the result __perf_event_read() stored for the cross call.
 */
static int perf_event_read(struct perf_event *event, bool group)
{
	enum perf_event_state state = READ_ONCE(event->state);
	int event_cpu, ret = 0;

	/*
	 * If event is enabled and currently active on a CPU, update the
	 * value in the event structure:
	 */
again:
	if (state == PERF_EVENT_STATE_ACTIVE) {
		struct perf_read_data data;

		/*
		 * Orders the ->state and ->oncpu loads such that if we see
		 * ACTIVE we must also see the right ->oncpu.
		 *
		 * Matches the smp_wmb() from event_sched_in().
		 */
		smp_rmb();

		/* The unsigned compare also rejects a negative ->oncpu. */
		event_cpu = READ_ONCE(event->oncpu);
		if ((unsigned)event_cpu >= nr_cpu_ids)
			return 0;

		data = (struct perf_read_data){
			.event = event,
			.group = group,
			.ret = 0,
		};

		/* Stay on this CPU while the target CPU is chosen and called. */
		preempt_disable();
		event_cpu = __perf_event_read_cpu(event, event_cpu);

		/*
		 * Purposely ignore the smp_call_function_single() return
		 * value.
		 *
		 * If event_cpu isn't a valid CPU it means the event got
		 * scheduled out and that will have updated the event count.
		 *
		 * Therefore, either way, we'll have an up-to-date event count
		 * after this.
		 */
		(void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
		preempt_enable();
		ret = data.ret;

	} else if (state == PERF_EVENT_STATE_INACTIVE) {
		struct perf_event_context *ctx = event->ctx;
		unsigned long flags;

		/* Re-check the state under the lock; retry if it changed. */
		raw_spin_lock_irqsave(&ctx->lock, flags);
		state = event->state;
		if (state != PERF_EVENT_STATE_INACTIVE) {
			raw_spin_unlock_irqrestore(&ctx->lock, flags);
			goto again;
		}

		/*
		 * May read while context is not active (e.g., thread is
		 * blocked), in that case we cannot update context time
		 */
		if (ctx->is_active & EVENT_TIME) {
			update_context_time(ctx);
			update_cgrp_time_from_event(event);
		}

		perf_event_update_time(event);
		if (group)
			perf_event_update_sibling_time(event);
		raw_spin_unlock_irqrestore(&ctx->lock, flags);
	}

	return ret;
}
   4547
   4548/*
   4549 * Initialize the perf_event context in a task_struct:
   4550 */
   4551static void __perf_event_init_context(struct perf_event_context *ctx)
   4552{
   4553	raw_spin_lock_init(&ctx->lock);
   4554	mutex_init(&ctx->mutex);
   4555	INIT_LIST_HEAD(&ctx->active_ctx_list);
   4556	perf_event_groups_init(&ctx->pinned_groups);
   4557	perf_event_groups_init(&ctx->flexible_groups);
   4558	INIT_LIST_HEAD(&ctx->event_list);
   4559	INIT_LIST_HEAD(&ctx->pinned_active);
   4560	INIT_LIST_HEAD(&ctx->flexible_active);
   4561	refcount_set(&ctx->refcount, 1);
   4562}
   4563
   4564static struct perf_event_context *
   4565alloc_perf_context(struct pmu *pmu, struct task_struct *task)
   4566{
   4567	struct perf_event_context *ctx;
   4568
   4569	ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
   4570	if (!ctx)
   4571		return NULL;
   4572
   4573	__perf_event_init_context(ctx);
   4574	if (task)
   4575		ctx->task = get_task_struct(task);
   4576	ctx->pmu = pmu;
   4577
   4578	return ctx;
   4579}
   4580
   4581static struct task_struct *
   4582find_lively_task_by_vpid(pid_t vpid)
   4583{
   4584	struct task_struct *task;
   4585
   4586	rcu_read_lock();
   4587	if (!vpid)
   4588		task = current;
   4589	else
   4590		task = find_task_by_vpid(vpid);
   4591	if (task)
   4592		get_task_struct(task);
   4593	rcu_read_unlock();
   4594
   4595	if (!task)
   4596		return ERR_PTR(-ESRCH);
   4597
   4598	return task;
   4599}
   4600
   4601/*
   4602 * Returns a matching context with refcount and pincount.
   4603 */
static struct perf_event_context *
find_get_context(struct pmu *pmu, struct task_struct *task,
		struct perf_event *event)
{
	struct perf_event_context *ctx, *clone_ctx = NULL;
	struct perf_cpu_context *cpuctx;
	void *task_ctx_data = NULL;
	unsigned long flags;
	int ctxn, err;
	int cpu = event->cpu;

	/* No task: use the per-CPU context of @pmu for event->cpu. */
	if (!task) {
		/* Must be root to operate on a CPU event: */
		err = perf_allow_cpu(&event->attr);
		if (err)
			return ERR_PTR(err);

		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
		ctx = &cpuctx->ctx;
		get_ctx(ctx);
		raw_spin_lock_irqsave(&ctx->lock, flags);
		++ctx->pin_count;
		raw_spin_unlock_irqrestore(&ctx->lock, flags);

		return ctx;
	}

	/* A negative task_ctx_nr means this PMU has no per-task context. */
	err = -EINVAL;
	ctxn = pmu->task_ctx_nr;
	if (ctxn < 0)
		goto errout;

	/* Allocate the PMU's task data up front, before any lock is taken. */
	if (event->attach_state & PERF_ATTACH_TASK_DATA) {
		task_ctx_data = alloc_task_ctx_data(pmu);
		if (!task_ctx_data) {
			err = -ENOMEM;
			goto errout;
		}
	}

retry:
	ctx = perf_lock_task_context(task, ctxn, &flags);
	if (ctx) {
		/* Existing context: unclone it, pin it, attach data if missing. */
		clone_ctx = unclone_ctx(ctx);
		++ctx->pin_count;

		if (task_ctx_data && !ctx->task_ctx_data) {
			ctx->task_ctx_data = task_ctx_data;
			task_ctx_data = NULL;
		}
		raw_spin_unlock_irqrestore(&ctx->lock, flags);

		if (clone_ctx)
			put_ctx(clone_ctx);
	} else {
		/* No context yet: allocate one and try to install it. */
		ctx = alloc_perf_context(pmu, task);
		err = -ENOMEM;
		if (!ctx)
			goto errout;

		if (task_ctx_data) {
			ctx->task_ctx_data = task_ctx_data;
			task_ctx_data = NULL;
		}

		err = 0;
		mutex_lock(&task->perf_event_mutex);
		/*
		 * If the task has already passed perf_event_exit_task(),
		 * we must see PF_EXITING; it takes this mutex too.
		 */
		if (task->flags & PF_EXITING)
			err = -ESRCH;
		else if (task->perf_event_ctxp[ctxn])
			err = -EAGAIN;	/* someone else installed one first */
		else {
			get_ctx(ctx);
			++ctx->pin_count;
			rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
		}
		mutex_unlock(&task->perf_event_mutex);

		if (unlikely(err)) {
			/* Drop the context we allocated but could not install. */
			put_ctx(ctx);

			if (err == -EAGAIN)
				goto retry;
			goto errout;
		}
	}

	/* Frees the task data only if it was not handed over to a context. */
	free_task_ctx_data(pmu, task_ctx_data);
	return ctx;

errout:
	free_task_ctx_data(pmu, task_ctx_data);
	return ERR_PTR(err);
}
   4702
/* Forward declaration: called by free_event_rcu() before its definition. */
static void perf_event_free_filter(struct perf_event *event);
   4704
   4705static void free_event_rcu(struct rcu_head *head)
   4706{
   4707	struct perf_event *event;
   4708
   4709	event = container_of(head, struct perf_event, rcu_head);
   4710	if (event->ns)
   4711		put_pid_ns(event->ns);
   4712	perf_event_free_filter(event);
   4713	kmem_cache_free(perf_event_cache, event);
   4714}
   4715
/* Forward declaration: called by _free_event() before its definition. */
static void ring_buffer_attach(struct perf_event *event,
			       struct perf_buffer *rb);
   4718
   4719static void detach_sb_event(struct perf_event *event)
   4720{
   4721	struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
   4722
   4723	raw_spin_lock(&pel->lock);
   4724	list_del_rcu(&event->sb_list);
   4725	raw_spin_unlock(&pel->lock);
   4726}
   4727
   4728static bool is_sb_event(struct perf_event *event)
   4729{
   4730	struct perf_event_attr *attr = &event->attr;
   4731
   4732	if (event->parent)
   4733		return false;
   4734
   4735	if (event->attach_state & PERF_ATTACH_TASK)
   4736		return false;
   4737
   4738	if (attr->mmap || attr->mmap_data || attr->mmap2 ||
   4739	    attr->comm || attr->comm_exec ||
   4740	    attr->task || attr->ksymbol ||
   4741	    attr->context_switch || attr->text_poke ||
   4742	    attr->bpf_event)
   4743		return true;
   4744	return false;
   4745}
   4746
   4747static void unaccount_pmu_sb_event(struct perf_event *event)
   4748{
   4749	if (is_sb_event(event))
   4750		detach_sb_event(event);
   4751}
   4752
   4753static void unaccount_event_cpu(struct perf_event *event, int cpu)
   4754{
   4755	if (event->parent)
   4756		return;
   4757
   4758	if (is_cgroup_event(event))
   4759		atomic_dec(&per_cpu(perf_cgroup_events, cpu));
   4760}
   4761
#ifdef CONFIG_NO_HZ_FULL
/* Held around the nr_freq_events update and the matching tick-dependency change. */
static DEFINE_SPINLOCK(nr_freq_lock);
#endif
   4765
/*
 * Drop one frequency-event count; when the count reaches zero the perf
 * tick dependency is cleared. The body is empty unless CONFIG_NO_HZ_FULL
 * is set.
 */
static void unaccount_freq_event_nohz(void)
{
#ifdef CONFIG_NO_HZ_FULL
	/* The lock keeps the counter test and the dependency clear together. */
	spin_lock(&nr_freq_lock);
	if (atomic_dec_and_test(&nr_freq_events))
		tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
	spin_unlock(&nr_freq_lock);
#endif
}
   4775
   4776static void unaccount_freq_event(void)
   4777{
   4778	if (tick_nohz_full_enabled())
   4779		unaccount_freq_event_nohz();
   4780	else
   4781		atomic_dec(&nr_freq_events);
   4782}
   4783
   4784static void unaccount_event(struct perf_event *event)
   4785{
   4786	bool dec = false;
   4787
   4788	if (event->parent)
   4789		return;
   4790
   4791	if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
   4792		dec = true;
   4793	if (event->attr.mmap || event->attr.mmap_data)
   4794		atomic_dec(&nr_mmap_events);
   4795	if (event->attr.build_id)
   4796		atomic_dec(&nr_build_id_events);
   4797	if (event->attr.comm)
   4798		atomic_dec(&nr_comm_events);
   4799	if (event->attr.namespaces)
   4800		atomic_dec(&nr_namespaces_events);
   4801	if (event->attr.cgroup)
   4802		atomic_dec(&nr_cgroup_events);
   4803	if (event->attr.task)
   4804		atomic_dec(&nr_task_events);
   4805	if (event->attr.freq)
   4806		unaccount_freq_event();
   4807	if (event->attr.context_switch) {
   4808		dec = true;
   4809		atomic_dec(&nr_switch_events);
   4810	}
   4811	if (is_cgroup_event(event))
   4812		dec = true;
   4813	if (has_branch_stack(event))
   4814		dec = true;
   4815	if (event->attr.ksymbol)
   4816		atomic_dec(&nr_ksymbol_events);
   4817	if (event->attr.bpf_event)
   4818		atomic_dec(&nr_bpf_events);
   4819	if (event->attr.text_poke)
   4820		atomic_dec(&nr_text_poke_events);
   4821
   4822	if (dec) {
   4823		if (!atomic_add_unless(&perf_sched_count, -1, 1))
   4824			schedule_delayed_work(&perf_sched_work, HZ);
   4825	}
   4826
   4827	unaccount_event_cpu(event, event->cpu);
   4828
   4829	unaccount_pmu_sb_event(event);
   4830}
   4831
/*
 * Work handler (presumably the one behind perf_sched_work - confirm at its
 * definition): drops one perf_sched_count reference under perf_sched_mutex
 * and disables the perf_sched_events static branch when the count reaches
 * zero. @work is not used.
 */
static void perf_sched_delayed(struct work_struct *work)
{
	mutex_lock(&perf_sched_mutex);
	if (atomic_dec_and_test(&perf_sched_count))
		static_branch_disable(&perf_sched_events);
	mutex_unlock(&perf_sched_mutex);
}
   4839
   4840/*
   4841 * The following implement mutual exclusion of events on "exclusive" pmus
   4842 * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
   4843 * at a time, so we disallow creating events that might conflict, namely:
   4844 *
   4845 *  1) cpu-wide events in the presence of per-task events,
   4846 *  2) per-task events in the presence of cpu-wide events,
   4847 *  3) two matching events on the same context.
   4848 *
   4849 * The former two cases are handled in the allocation path (perf_event_alloc(),
   4850 * _free_event()), the latter -- before the first perf_install_in_context().
   4851 */
   4852static int exclusive_event_init(struct perf_event *event)
   4853{
   4854	struct pmu *pmu = event->pmu;
   4855
   4856	if (!is_exclusive_pmu(pmu))
   4857		return 0;
   4858
   4859	/*
   4860	 * Prevent co-existence of per-task and cpu-wide events on the
   4861	 * same exclusive pmu.
   4862	 *
   4863	 * Negative pmu::exclusive_cnt means there are cpu-wide
   4864	 * events on this "exclusive" pmu, positive means there are
   4865	 * per-task events.
   4866	 *
   4867	 * Since this is called in perf_event_alloc() path, event::ctx
   4868	 * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
   4869	 * to mean "per-task event", because unlike other attach states it
   4870	 * never gets cleared.
   4871	 */
   4872	if (event->attach_state & PERF_ATTACH_TASK) {
   4873		if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
   4874			return -EBUSY;
   4875	} else {
   4876		if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
   4877			return -EBUSY;
   4878	}
   4879
   4880	return 0;
   4881}
   4882
   4883static void exclusive_event_destroy(struct perf_event *event)
   4884{
   4885	struct pmu *pmu = event->pmu;
   4886
   4887	if (!is_exclusive_pmu(pmu))
   4888		return;
   4889
   4890	/* see comment in exclusive_event_init() */
   4891	if (event->attach_state & PERF_ATTACH_TASK)
   4892		atomic_dec(&pmu->exclusive_cnt);
   4893	else
   4894		atomic_inc(&pmu->exclusive_cnt);
   4895}
   4896
   4897static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
   4898{
   4899	if ((e1->pmu == e2->pmu) &&
   4900	    (e1->cpu == e2->cpu ||
   4901	     e1->cpu == -1 ||
   4902	     e2->cpu == -1))
   4903		return true;
   4904	return false;
   4905}
   4906
   4907static bool exclusive_event_installable(struct perf_event *event,
   4908					struct perf_event_context *ctx)
   4909{
   4910	struct perf_event *iter_event;
   4911	struct pmu *pmu = event->pmu;
   4912
   4913	lockdep_assert_held(&ctx->mutex);
   4914
   4915	if (!is_exclusive_pmu(pmu))
   4916		return true;
   4917
   4918	list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
   4919		if (exclusive_event_match(iter_event, event))
   4920			return false;
   4921	}
   4922
   4923	return true;
   4924}
   4925
/* Forward declaration: called by _free_event() before its definition. */
static void perf_addr_filters_splice(struct perf_event *event,
				       struct list_head *head);
   4928
/*
 * Tear down @event and queue its memory for release after an RCU grace
 * period. The order of the steps below matters; see the inline comments.
 */
static void _free_event(struct perf_event *event)
{
	/* Wait for the event's pending irq_work before dismantling anything. */
	irq_work_sync(&event->pending);

	unaccount_event(event);

	security_perf_event_free(event);

	if (event->rb) {
		/*
		 * Can happen when we close an event with re-directed output.
		 *
		 * Since we have a 0 refcount, perf_mmap_close() will skip
		 * over us; possibly making our ring_buffer_put() the last.
		 */
		mutex_lock(&event->mmap_mutex);
		ring_buffer_attach(event, NULL);
		mutex_unlock(&event->mmap_mutex);
	}

	if (is_cgroup_event(event))
		perf_detach_cgroup(event);

	/* Callchain buffers are only released for events without a parent. */
	if (!event->parent) {
		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
			put_callchain_buffers();
	}

	perf_event_free_bpf_prog(event);
	perf_addr_filters_splice(event, NULL);
	kfree(event->addr_filter_ranges);

	/* Let the event's destroy hook, if any, run. */
	if (event->destroy)
		event->destroy(event);

	/*
	 * Must be after ->destroy(), due to uprobe_perf_close() using
	 * hw.target.
	 */
	if (event->hw.target)
		put_task_struct(event->hw.target);

	/*
	 * perf_event_free_task() relies on put_ctx() being 'last', in particular
	 * all task references must be cleaned up.
	 */
	if (event->ctx)
		put_ctx(event->ctx);

	exclusive_event_destroy(event);
	module_put(event->pmu->module);

	/* The structure itself is freed by free_event_rcu() after a grace period. */
	call_rcu(&event->rcu_head, free_event_rcu);
}
   4983
   4984/*
   4985 * Used to free events which have a known refcount of 1, such as in error paths
   4986 * where the event isn't exposed yet and inherited events.
   4987 */
   4988static void free_event(struct perf_event *event)
   4989{
   4990	if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
   4991				"unexpected event refcount: %ld; ptr=%p\n",
   4992				atomic_long_read(&event->refcount), event)) {
   4993		/* leak to avoid use-after-free */
   4994		return;
   4995	}
   4996
   4997	_free_event(event);
   4998}
   4999
   5000/*
   5001 * Remove user event from the owner task.
   5002 */
   5003static void perf_remove_from_owner(struct perf_event *event)
   5004{
   5005	struct task_struct *owner;
   5006
   5007	rcu_read_lock();
   5008	/*
   5009	 * Matches the smp_store_release() in perf_event_exit_task(). If we
   5010	 * observe !owner it means the list deletion is complete and we can
   5011	 * indeed free this event, otherwise we need to serialize on
   5012	 * owner->perf_event_mutex.
   5013	 */
   5014	owner = READ_ONCE(event->owner);
   5015	if (owner) {
   5016		/*
   5017		 * Since delayed_put_task_struct() also drops the last
   5018		 * task reference we can safely take a new reference
   5019		 * while holding the rcu_read_lock().
   5020		 */
   5021		get_task_struct(owner);
   5022	}
   5023	rcu_read_unlock();
   5024
   5025	if (owner) {
   5026		/*
   5027		 * If we're here through perf_event_exit_task() we're already
   5028		 * holding ctx->mutex which would be an inversion wrt. the
   5029		 * normal lock order.
   5030		 *
   5031		 * However we can safely take this lock because its the child
   5032		 * ctx->mutex.
   5033		 */
   5034		mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
   5035
   5036		/*
   5037		 * We have to re-check the event->owner field, if it is cleared
   5038		 * we raced with perf_event_exit_task(), acquiring the mutex
   5039		 * ensured they're done, and we can proceed with freeing the
   5040		 * event.
   5041		 */
   5042		if (event->owner) {
   5043			list_del_init(&event->owner_entry);
   5044			smp_store_release(&event->owner, NULL);
   5045		}
   5046		mutex_unlock(&owner->perf_event_mutex);
   5047		put_task_struct(owner);
   5048	}
   5049}
   5050
   5051static void put_event(struct perf_event *event)
   5052{
   5053	if (!atomic_long_dec_and_test(&event->refcount))
   5054		return;
   5055
   5056	_free_event(event);
   5057}
   5058
   5059/*
   5060 * Kill an event dead; while event:refcount will preserve the event
   5061 * object, it will not preserve its functionality. Once the last 'user'
   5062 * gives up the object, we'll destroy the thing.
   5063 */
   5064int perf_event_release_kernel(struct perf_event *event)
   5065{
   5066	struct perf_event_context *ctx = event->ctx;
   5067	struct perf_event *child, *tmp;
   5068	LIST_HEAD(free_list);
   5069
   5070	/*
   5071	 * If we got here through err_file: fput(event_file); we will not have
   5072	 * attached to a context yet.
   5073	 */
   5074	if (!ctx) {
   5075		WARN_ON_ONCE(event->attach_state &
   5076				(PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
   5077		goto no_ctx;
   5078	}
   5079
   5080	if (!is_kernel_event(event))
   5081		perf_remove_from_owner(event);
   5082
   5083	ctx = perf_event_ctx_lock(event);
   5084	WARN_ON_ONCE(ctx->parent_ctx);
   5085	perf_remove_from_context(event, DETACH_GROUP);
   5086
   5087	raw_spin_lock_irq(&ctx->lock);
   5088	/*
   5089	 * Mark this event as STATE_DEAD, there is no external reference to it
   5090	 * anymore.
   5091	 *
   5092	 * Anybody acquiring event->child_mutex after the below loop _must_
   5093	 * also see this, most importantly inherit_event() which will avoid
   5094	 * placing more children on the list.
   5095	 *
   5096	 * Thus this guarantees that we will in fact observe and kill _ALL_
   5097	 * child events.
   5098	 */
   5099	event->state = PERF_EVENT_STATE_DEAD;
   5100	raw_spin_unlock_irq(&ctx->lock);
   5101
   5102	perf_event_ctx_unlock(event, ctx);
   5103
   5104again:
   5105	mutex_lock(&event->child_mutex);
   5106	list_for_each_entry(child, &event->child_list, child_list) {
   5107
   5108		/*
   5109		 * Cannot change, child events are not migrated, see the
   5110		 * comment with perf_event_ctx_lock_nested().
   5111		 */
   5112		ctx = READ_ONCE(child->ctx);
   5113		/*
   5114		 * Since child_mutex nests inside ctx::mutex, we must jump
   5115		 * through hoops. We start by grabbing a reference on the ctx.
   5116		 *
   5117		 * Since the event cannot get freed while we hold the
   5118		 * child_mutex, the context must also exist and have a !0
   5119		 * reference count.
   5120		 */
   5121		get_ctx(ctx);
   5122
   5123		/*
   5124		 * Now that we have a ctx ref, we can drop child_mutex, and
   5125		 * acquire ctx::mutex without fear of it going away. Then we
   5126		 * can re-acquire child_mutex.
   5127		 */
   5128		mutex_unlock(&event->child_mutex);
   5129		mutex_lock(&ctx->mutex);
   5130		mutex_lock(&event->child_mutex);
   5131
   5132		/*
   5133		 * Now that we hold ctx::mutex and child_mutex, revalidate our
   5134		 * state, if child is still the first entry, it didn't get freed
   5135		 * and we can continue doing so.
   5136		 */
   5137		tmp = list_first_entry_or_null(&event->child_list,
   5138					       struct perf_event, child_list);
   5139		if (tmp == child) {
   5140			perf_remove_from_context(child, DETACH_GROUP);
   5141			list_move(&child->child_list, &free_list);
   5142			/*
   5143			 * This matches the refcount bump in inherit_event();
   5144			 * this can't be the last reference.
   5145			 */
   5146			put_event(event);
   5147		}
   5148
   5149		mutex_unlock(&event->child_mutex);
   5150		mutex_unlock(&ctx->mutex);
   5151		put_ctx(ctx);
   5152		goto again;
   5153	}
   5154	mutex_unlock(&event->child_mutex);
   5155
   5156	list_for_each_entry_safe(child, tmp, &free_list, child_list) {
   5157		void *var = &child->ctx->refcount;
   5158
   5159		list_del(&child->child_list);
   5160		free_event(child);
   5161
   5162		/*
   5163		 * Wake any perf_event_free_task() waiting for this event to be
   5164		 * freed.
   5165		 */
   5166		smp_mb(); /* pairs with wait_var_event() */
   5167		wake_up_var(var);
   5168	}
   5169
   5170no_ctx:
   5171	put_event(event); /* Must be the 'last' reference */
   5172	return 0;
   5173}
   5174EXPORT_SYMBOL_GPL(perf_event_release_kernel);
   5175
   5176/*
   5177 * Called when the last reference to the file is gone.
   5178 */
   5179static int perf_release(struct inode *inode, struct file *file)
   5180{
   5181	perf_event_release_kernel(file->private_data);
   5182	return 0;
   5183}
   5184
   5185static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
   5186{
   5187	struct perf_event *child;
   5188	u64 total = 0;
   5189
   5190	*enabled = 0;
   5191	*running = 0;
   5192
   5193	mutex_lock(&event->child_mutex);
   5194
   5195	(void)perf_event_read(event, false);
   5196	total += perf_event_count(event);
   5197
   5198	*enabled += event->total_time_enabled +
   5199			atomic64_read(&event->child_total_time_enabled);
   5200	*running += event->total_time_running +
   5201			atomic64_read(&event->child_total_time_running);
   5202
   5203	list_for_each_entry(child, &event->child_list, child_list) {
   5204		(void)perf_event_read(child, false);
   5205		total += perf_event_count(child);
   5206		*enabled += child->total_time_enabled;
   5207		*running += child->total_time_running;
   5208	}
   5209	mutex_unlock(&event->child_mutex);
   5210
   5211	return total;
   5212}
   5213
   5214u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
   5215{
   5216	struct perf_event_context *ctx;
   5217	u64 count;
   5218
   5219	ctx = perf_event_ctx_lock(event);
   5220	count = __perf_event_read_value(event, enabled, running);
   5221	perf_event_ctx_unlock(event, ctx);
   5222
   5223	return count;
   5224}
   5225EXPORT_SYMBOL_GPL(perf_event_read_value);
   5226
   5227static int __perf_read_group_add(struct perf_event *leader,
   5228					u64 read_format, u64 *values)
   5229{
   5230	struct perf_event_context *ctx = leader->ctx;
   5231	struct perf_event *sub;
   5232	unsigned long flags;
   5233	int n = 1; /* skip @nr */
   5234	int ret;
   5235
   5236	ret = perf_event_read(leader, true);
   5237	if (ret)
   5238		return ret;
   5239
   5240	raw_spin_lock_irqsave(&ctx->lock, flags);
   5241
   5242	/*
   5243	 * Since we co-schedule groups, {enabled,running} times of siblings
   5244	 * will be identical to those of the leader, so we only publish one
   5245	 * set.
   5246	 */
   5247	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
   5248		values[n++] += leader->total_time_enabled +
   5249			atomic64_read(&leader->child_total_time_enabled);
   5250	}
   5251
   5252	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
   5253		values[n++] += leader->total_time_running +
   5254			atomic64_read(&leader->child_total_time_running);
   5255	}
   5256
   5257	/*
   5258	 * Write {count,id} tuples for every sibling.
   5259	 */
   5260	values[n++] += perf_event_count(leader);
   5261	if (read_format & PERF_FORMAT_ID)
   5262		values[n++] = primary_event_id(leader);
   5263
   5264	for_each_sibling_event(sub, leader) {
   5265		values[n++] += perf_event_count(sub);
   5266		if (read_format & PERF_FORMAT_ID)
   5267			values[n++] = primary_event_id(sub);
   5268	}
   5269
   5270	raw_spin_unlock_irqrestore(&ctx->lock, flags);
   5271	return 0;
   5272}
   5273
   5274static int perf_read_group(struct perf_event *event,
   5275				   u64 read_format, char __user *buf)
   5276{
   5277	struct perf_event *leader = event->group_leader, *child;
   5278	struct perf_event_context *ctx = leader->ctx;
   5279	int ret;
   5280	u64 *values;
   5281
   5282	lockdep_assert_held(&ctx->mutex);
   5283
   5284	values = kzalloc(event->read_size, GFP_KERNEL);
   5285	if (!values)
   5286		return -ENOMEM;
   5287
   5288	values[0] = 1 + leader->nr_siblings;
   5289
   5290	/*
   5291	 * By locking the child_mutex of the leader we effectively
   5292	 * lock the child list of all siblings.. XXX explain how.
   5293	 */
   5294	mutex_lock(&leader->child_mutex);
   5295
   5296	ret = __perf_read_group_add(leader, read_format, values);
   5297	if (ret)
   5298		goto unlock;
   5299
   5300	list_for_each_entry(child, &leader->child_list, child_list) {
   5301		ret = __perf_read_group_add(child, read_format, values);
   5302		if (ret)
   5303			goto unlock;
   5304	}
   5305
   5306	mutex_unlock(&leader->child_mutex);
   5307
   5308	ret = event->read_size;
   5309	if (copy_to_user(buf, values, event->read_size))
   5310		ret = -EFAULT;
   5311	goto out;
   5312
   5313unlock:
   5314	mutex_unlock(&leader->child_mutex);
   5315out:
   5316	kfree(values);
   5317	return ret;
   5318}
   5319
   5320static int perf_read_one(struct perf_event *event,
   5321				 u64 read_format, char __user *buf)
   5322{
   5323	u64 enabled, running;
   5324	u64 values[4];
   5325	int n = 0;
   5326
   5327	values[n++] = __perf_event_read_value(event, &enabled, &running);
   5328	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
   5329		values[n++] = enabled;
   5330	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
   5331		values[n++] = running;
   5332	if (read_format & PERF_FORMAT_ID)
   5333		values[n++] = primary_event_id(event);
   5334
   5335	if (copy_to_user(buf, values, n * sizeof(u64)))
   5336		return -EFAULT;
   5337
   5338	return n * sizeof(u64);
   5339}
   5340
   5341static bool is_event_hup(struct perf_event *event)
   5342{
   5343	bool no_children;
   5344
   5345	if (event->state > PERF_EVENT_STATE_EXIT)
   5346		return false;
   5347
   5348	mutex_lock(&event->child_mutex);
   5349	no_children = list_empty(&event->child_list);
   5350	mutex_unlock(&event->child_mutex);
   5351	return no_children;
   5352}
   5353
   5354/*
   5355 * Read the performance event - simple non blocking version for now
   5356 */
   5357static ssize_t
   5358__perf_read(struct perf_event *event, char __user *buf, size_t count)
   5359{
   5360	u64 read_format = event->attr.read_format;
   5361	int ret;
   5362
   5363	/*
   5364	 * Return end-of-file for a read on an event that is in
   5365	 * error state (i.e. because it was pinned but it couldn't be
   5366	 * scheduled on to the CPU at some point).
   5367	 */
   5368	if (event->state == PERF_EVENT_STATE_ERROR)
   5369		return 0;
   5370
   5371	if (count < event->read_size)
   5372		return -ENOSPC;
   5373
   5374	WARN_ON_ONCE(event->ctx->parent_ctx);
   5375	if (read_format & PERF_FORMAT_GROUP)
   5376		ret = perf_read_group(event, read_format, buf);
   5377	else
   5378		ret = perf_read_one(event, read_format, buf);
   5379
   5380	return ret;
   5381}
   5382
   5383static ssize_t
   5384perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
   5385{
   5386	struct perf_event *event = file->private_data;
   5387	struct perf_event_context *ctx;
   5388	int ret;
   5389
   5390	ret = security_perf_event_read(event);
   5391	if (ret)
   5392		return ret;
   5393
   5394	ctx = perf_event_ctx_lock(event);
   5395	ret = __perf_read(event, buf, count);
   5396	perf_event_ctx_unlock(event, ctx);
   5397
   5398	return ret;
   5399}
   5400
   5401static __poll_t perf_poll(struct file *file, poll_table *wait)
   5402{
   5403	struct perf_event *event = file->private_data;
   5404	struct perf_buffer *rb;
   5405	__poll_t events = EPOLLHUP;
   5406
   5407	poll_wait(file, &event->waitq, wait);
   5408
   5409	if (is_event_hup(event))
   5410		return events;
   5411
   5412	/*
   5413	 * Pin the event->rb by taking event->mmap_mutex; otherwise
   5414	 * perf_event_set_output() can swizzle our rb and make us miss wakeups.
   5415	 */
   5416	mutex_lock(&event->mmap_mutex);
   5417	rb = event->rb;
   5418	if (rb)
   5419		events = atomic_xchg(&rb->poll, 0);
   5420	mutex_unlock(&event->mmap_mutex);
   5421	return events;
   5422}
   5423
   5424static void _perf_event_reset(struct perf_event *event)
   5425{
   5426	(void)perf_event_read(event, false);
   5427	local64_set(&event->count, 0);
   5428	perf_event_update_userpage(event);
   5429}
   5430
   5431/* Assume it's not an event with inherit set. */
   5432u64 perf_event_pause(struct perf_event *event, bool reset)
   5433{
   5434	struct perf_event_context *ctx;
   5435	u64 count;
   5436
   5437	ctx = perf_event_ctx_lock(event);
   5438	WARN_ON_ONCE(event->attr.inherit);
   5439	_perf_event_disable(event);
   5440	count = local64_read(&event->count);
   5441	if (reset)
   5442		local64_set(&event->count, 0);
   5443	perf_event_ctx_unlock(event, ctx);
   5444
   5445	return count;
   5446}
   5447EXPORT_SYMBOL_GPL(perf_event_pause);
   5448
   5449/*
   5450 * Holding the top-level event's child_mutex means that any
   5451 * descendant process that has inherited this event will block
   5452 * in perf_event_exit_event() if it goes to exit, thus satisfying the
   5453 * task existence requirements of perf_event_enable/disable.
   5454 */
   5455static void perf_event_for_each_child(struct perf_event *event,
   5456					void (*func)(struct perf_event *))
   5457{
   5458	struct perf_event *child;
   5459
   5460	WARN_ON_ONCE(event->ctx->parent_ctx);
   5461
   5462	mutex_lock(&event->child_mutex);
   5463	func(event);
   5464	list_for_each_entry(child, &event->child_list, child_list)
   5465		func(child);
   5466	mutex_unlock(&event->child_mutex);
   5467}
   5468
   5469static void perf_event_for_each(struct perf_event *event,
   5470				  void (*func)(struct perf_event *))
   5471{
   5472	struct perf_event_context *ctx = event->ctx;
   5473	struct perf_event *sibling;
   5474
   5475	lockdep_assert_held(&ctx->mutex);
   5476
   5477	event = event->group_leader;
   5478
   5479	perf_event_for_each_child(event, func);
   5480	for_each_sibling_event(sibling, event)
   5481		perf_event_for_each_child(sibling, func);
   5482}
   5483
   5484static void __perf_event_period(struct perf_event *event,
   5485				struct perf_cpu_context *cpuctx,
   5486				struct perf_event_context *ctx,
   5487				void *info)
   5488{
   5489	u64 value = *((u64 *)info);
   5490	bool active;
   5491
   5492	if (event->attr.freq) {
   5493		event->attr.sample_freq = value;
   5494	} else {
   5495		event->attr.sample_period = value;
   5496		event->hw.sample_period = value;
   5497	}
   5498
   5499	active = (event->state == PERF_EVENT_STATE_ACTIVE);
   5500	if (active) {
   5501		perf_pmu_disable(ctx->pmu);
   5502		/*
   5503		 * We could be throttled; unthrottle now to avoid the tick
   5504		 * trying to unthrottle while we already re-started the event.
   5505		 */
   5506		if (event->hw.interrupts == MAX_INTERRUPTS) {
   5507			event->hw.interrupts = 0;
   5508			perf_log_throttle(event, 1);
   5509		}
   5510		event->pmu->stop(event, PERF_EF_UPDATE);
   5511	}
   5512
   5513	local64_set(&event->hw.period_left, 0);
   5514
   5515	if (active) {
   5516		event->pmu->start(event, PERF_EF_RELOAD);
   5517		perf_pmu_enable(ctx->pmu);
   5518	}
   5519}
   5520
   5521static int perf_event_check_period(struct perf_event *event, u64 value)
   5522{
   5523	return event->pmu->check_period(event, value);
   5524}
   5525
   5526static int _perf_event_period(struct perf_event *event, u64 value)
   5527{
   5528	if (!is_sampling_event(event))
   5529		return -EINVAL;
   5530
   5531	if (!value)
   5532		return -EINVAL;
   5533
   5534	if (event->attr.freq && value > sysctl_perf_event_sample_rate)
   5535		return -EINVAL;
   5536
   5537	if (perf_event_check_period(event, value))
   5538		return -EINVAL;
   5539
   5540	if (!event->attr.freq && (value & (1ULL << 63)))
   5541		return -EINVAL;
   5542
   5543	event_function_call(event, __perf_event_period, &value);
   5544
   5545	return 0;
   5546}
   5547
   5548int perf_event_period(struct perf_event *event, u64 value)
   5549{
   5550	struct perf_event_context *ctx;
   5551	int ret;
   5552
   5553	ctx = perf_event_ctx_lock(event);
   5554	ret = _perf_event_period(event, value);
   5555	perf_event_ctx_unlock(event, ctx);
   5556
   5557	return ret;
   5558}
   5559EXPORT_SYMBOL_GPL(perf_event_period);
   5560
   5561static const struct file_operations perf_fops;
   5562
   5563static inline int perf_fget_light(int fd, struct fd *p)
   5564{
   5565	struct fd f = fdget(fd);
   5566	if (!f.file)
   5567		return -EBADF;
   5568
   5569	if (f.file->f_op != &perf_fops) {
   5570		fdput(f);
   5571		return -EBADF;
   5572	}
   5573	*p = f;
   5574	return 0;
   5575}
   5576
   5577static int perf_event_set_output(struct perf_event *event,
   5578				 struct perf_event *output_event);
   5579static int perf_event_set_filter(struct perf_event *event, void __user *arg);
   5580static int perf_copy_attr(struct perf_event_attr __user *uattr,
   5581			  struct perf_event_attr *attr);
   5582
   5583static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
   5584{
   5585	void (*func)(struct perf_event *);
   5586	u32 flags = arg;
   5587
   5588	switch (cmd) {
   5589	case PERF_EVENT_IOC_ENABLE:
   5590		func = _perf_event_enable;
   5591		break;
   5592	case PERF_EVENT_IOC_DISABLE:
   5593		func = _perf_event_disable;
   5594		break;
   5595	case PERF_EVENT_IOC_RESET:
   5596		func = _perf_event_reset;
   5597		break;
   5598
   5599	case PERF_EVENT_IOC_REFRESH:
   5600		return _perf_event_refresh(event, arg);
   5601
   5602	case PERF_EVENT_IOC_PERIOD:
   5603	{
   5604		u64 value;
   5605
   5606		if (copy_from_user(&value, (u64 __user *)arg, sizeof(value)))
   5607			return -EFAULT;
   5608
   5609		return _perf_event_period(event, value);
   5610	}
   5611	case PERF_EVENT_IOC_ID:
   5612	{
   5613		u64 id = primary_event_id(event);
   5614
   5615		if (copy_to_user((void __user *)arg, &id, sizeof(id)))
   5616			return -EFAULT;
   5617		return 0;
   5618	}
   5619
   5620	case PERF_EVENT_IOC_SET_OUTPUT:
   5621	{
   5622		int ret;
   5623		if (arg != -1) {
   5624			struct perf_event *output_event;
   5625			struct fd output;
   5626			ret = perf_fget_light(arg, &output);
   5627			if (ret)
   5628				return ret;
   5629			output_event = output.file->private_data;
   5630			ret = perf_event_set_output(event, output_event);
   5631			fdput(output);
   5632		} else {
   5633			ret = perf_event_set_output(event, NULL);
   5634		}
   5635		return ret;
   5636	}
   5637
   5638	case PERF_EVENT_IOC_SET_FILTER:
   5639		return perf_event_set_filter(event, (void __user *)arg);
   5640
   5641	case PERF_EVENT_IOC_SET_BPF:
   5642	{
   5643		struct bpf_prog *prog;
   5644		int err;
   5645
   5646		prog = bpf_prog_get(arg);
   5647		if (IS_ERR(prog))
   5648			return PTR_ERR(prog);
   5649
   5650		err = perf_event_set_bpf_prog(event, prog, 0);
   5651		if (err) {
   5652			bpf_prog_put(prog);
   5653			return err;
   5654		}
   5655
   5656		return 0;
   5657	}
   5658
   5659	case PERF_EVENT_IOC_PAUSE_OUTPUT: {
   5660		struct perf_buffer *rb;
   5661
   5662		rcu_read_lock();
   5663		rb = rcu_dereference(event->rb);
   5664		if (!rb || !rb->nr_pages) {
   5665			rcu_read_unlock();
   5666			return -EINVAL;
   5667		}
   5668		rb_toggle_paused(rb, !!arg);
   5669		rcu_read_unlock();
   5670		return 0;
   5671	}
   5672
   5673	case PERF_EVENT_IOC_QUERY_BPF:
   5674		return perf_event_query_prog_array(event, (void __user *)arg);
   5675
   5676	case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
   5677		struct perf_event_attr new_attr;
   5678		int err = perf_copy_attr((struct perf_event_attr __user *)arg,
   5679					 &new_attr);
   5680
   5681		if (err)
   5682			return err;
   5683
   5684		return perf_event_modify_attr(event,  &new_attr);
   5685	}
   5686	default:
   5687		return -ENOTTY;
   5688	}
   5689
   5690	if (flags & PERF_IOC_FLAG_GROUP)
   5691		perf_event_for_each(event, func);
   5692	else
   5693		perf_event_for_each_child(event, func);
   5694
   5695	return 0;
   5696}
   5697
   5698static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
   5699{
   5700	struct perf_event *event = file->private_data;
   5701	struct perf_event_context *ctx;
   5702	long ret;
   5703
   5704	/* Treat ioctl like writes as it is likely a mutating operation. */
   5705	ret = security_perf_event_write(event);
   5706	if (ret)
   5707		return ret;
   5708
   5709	ctx = perf_event_ctx_lock(event);
   5710	ret = _perf_ioctl(event, cmd, arg);
   5711	perf_event_ctx_unlock(event, ctx);
   5712
   5713	return ret;
   5714}
   5715
   5716#ifdef CONFIG_COMPAT
   5717static long perf_compat_ioctl(struct file *file, unsigned int cmd,
   5718				unsigned long arg)
   5719{
   5720	switch (_IOC_NR(cmd)) {
   5721	case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
   5722	case _IOC_NR(PERF_EVENT_IOC_ID):
   5723	case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF):
   5724	case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES):
   5725		/* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */
   5726		if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
   5727			cmd &= ~IOCSIZE_MASK;
   5728			cmd |= sizeof(void *) << IOCSIZE_SHIFT;
   5729		}
   5730		break;
   5731	}
   5732	return perf_ioctl(file, cmd, arg);
   5733}
   5734#else
   5735# define perf_compat_ioctl NULL
   5736#endif
   5737
   5738int perf_event_task_enable(void)
   5739{
   5740	struct perf_event_context *ctx;
   5741	struct perf_event *event;
   5742
   5743	mutex_lock(&current->perf_event_mutex);
   5744	list_for_each_entry(event, &current->perf_event_list, owner_entry) {
   5745		ctx = perf_event_ctx_lock(event);
   5746		perf_event_for_each_child(event, _perf_event_enable);
   5747		perf_event_ctx_unlock(event, ctx);
   5748	}
   5749	mutex_unlock(&current->perf_event_mutex);
   5750
   5751	return 0;
   5752}
   5753
   5754int perf_event_task_disable(void)
   5755{
   5756	struct perf_event_context *ctx;
   5757	struct perf_event *event;
   5758
   5759	mutex_lock(&current->perf_event_mutex);
   5760	list_for_each_entry(event, &current->perf_event_list, owner_entry) {
   5761		ctx = perf_event_ctx_lock(event);
   5762		perf_event_for_each_child(event, _perf_event_disable);
   5763		perf_event_ctx_unlock(event, ctx);
   5764	}
   5765	mutex_unlock(&current->perf_event_mutex);
   5766
   5767	return 0;
   5768}
   5769
   5770static int perf_event_index(struct perf_event *event)
   5771{
   5772	if (event->hw.state & PERF_HES_STOPPED)
   5773		return 0;
   5774
   5775	if (event->state != PERF_EVENT_STATE_ACTIVE)
   5776		return 0;
   5777
   5778	return event->pmu->event_idx(event);
   5779}
   5780
   5781static void perf_event_init_userpage(struct perf_event *event)
   5782{
   5783	struct perf_event_mmap_page *userpg;
   5784	struct perf_buffer *rb;
   5785
   5786	rcu_read_lock();
   5787	rb = rcu_dereference(event->rb);
   5788	if (!rb)
   5789		goto unlock;
   5790
   5791	userpg = rb->user_page;
   5792
   5793	/* Allow new userspace to detect that bit 0 is deprecated */
   5794	userpg->cap_bit0_is_deprecated = 1;
   5795	userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
   5796	userpg->data_offset = PAGE_SIZE;
   5797	userpg->data_size = perf_data_size(rb);
   5798
   5799unlock:
   5800	rcu_read_unlock();
   5801}
   5802
   5803void __weak arch_perf_update_userpage(
   5804	struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
   5805{
   5806}
   5807
   5808/*
   5809 * Callers need to ensure there can be no nesting of this function, otherwise
   5810 * the seqlock logic goes bad. We can not serialize this because the arch
   5811 * code calls this from NMI context.
   5812 */
   5813void perf_event_update_userpage(struct perf_event *event)
   5814{
   5815	struct perf_event_mmap_page *userpg;
   5816	struct perf_buffer *rb;
   5817	u64 enabled, running, now;
   5818
   5819	rcu_read_lock();
   5820	rb = rcu_dereference(event->rb);
   5821	if (!rb)
   5822		goto unlock;
   5823
   5824	/*
   5825	 * compute total_time_enabled, total_time_running
   5826	 * based on snapshot values taken when the event
   5827	 * was last scheduled in.
   5828	 *
   5829	 * we cannot simply called update_context_time()
   5830	 * because of locking issue as we can be called in
   5831	 * NMI context
   5832	 */
   5833	calc_timer_values(event, &now, &enabled, &running);
   5834
   5835	userpg = rb->user_page;
   5836	/*
   5837	 * Disable preemption to guarantee consistent time stamps are stored to
   5838	 * the user page.
   5839	 */
   5840	preempt_disable();
   5841	++userpg->lock;
   5842	barrier();
   5843	userpg->index = perf_event_index(event);
   5844	userpg->offset = perf_event_count(event);
   5845	if (userpg->index)
   5846		userpg->offset -= local64_read(&event->hw.prev_count);
   5847
   5848	userpg->time_enabled = enabled +
   5849			atomic64_read(&event->child_total_time_enabled);
   5850
   5851	userpg->time_running = running +
   5852			atomic64_read(&event->child_total_time_running);
   5853
   5854	arch_perf_update_userpage(event, userpg, now);
   5855
   5856	barrier();
   5857	++userpg->lock;
   5858	preempt_enable();
   5859unlock:
   5860	rcu_read_unlock();
   5861}
   5862EXPORT_SYMBOL_GPL(perf_event_update_userpage);
   5863
   5864static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
   5865{
   5866	struct perf_event *event = vmf->vma->vm_file->private_data;
   5867	struct perf_buffer *rb;
   5868	vm_fault_t ret = VM_FAULT_SIGBUS;
   5869
   5870	if (vmf->flags & FAULT_FLAG_MKWRITE) {
   5871		if (vmf->pgoff == 0)
   5872			ret = 0;
   5873		return ret;
   5874	}
   5875
   5876	rcu_read_lock();
   5877	rb = rcu_dereference(event->rb);
   5878	if (!rb)
   5879		goto unlock;
   5880
   5881	if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
   5882		goto unlock;
   5883
   5884	vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
   5885	if (!vmf->page)
   5886		goto unlock;
   5887
   5888	get_page(vmf->page);
   5889	vmf->page->mapping = vmf->vma->vm_file->f_mapping;
   5890	vmf->page->index   = vmf->pgoff;
   5891
   5892	ret = 0;
   5893unlock:
   5894	rcu_read_unlock();
   5895
   5896	return ret;
   5897}
   5898
   5899static void ring_buffer_attach(struct perf_event *event,
   5900			       struct perf_buffer *rb)
   5901{
   5902	struct perf_buffer *old_rb = NULL;
   5903	unsigned long flags;
   5904
   5905	WARN_ON_ONCE(event->parent);
   5906
   5907	if (event->rb) {
   5908		/*
   5909		 * Should be impossible, we set this when removing
   5910		 * event->rb_entry and wait/clear when adding event->rb_entry.
   5911		 */
   5912		WARN_ON_ONCE(event->rcu_pending);
   5913
   5914		old_rb = event->rb;
   5915		spin_lock_irqsave(&old_rb->event_lock, flags);
   5916		list_del_rcu(&event->rb_entry);
   5917		spin_unlock_irqrestore(&old_rb->event_lock, flags);
   5918
   5919		event->rcu_batches = get_state_synchronize_rcu();
   5920		event->rcu_pending = 1;
   5921	}
   5922
   5923	if (rb) {
   5924		if (event->rcu_pending) {
   5925			cond_synchronize_rcu(event->rcu_batches);
   5926			event->rcu_pending = 0;
   5927		}
   5928
   5929		spin_lock_irqsave(&rb->event_lock, flags);
   5930		list_add_rcu(&event->rb_entry, &rb->event_list);
   5931		spin_unlock_irqrestore(&rb->event_lock, flags);
   5932	}
   5933
   5934	/*
   5935	 * Avoid racing with perf_mmap_close(AUX): stop the event
   5936	 * before swizzling the event::rb pointer; if it's getting
   5937	 * unmapped, its aux_mmap_count will be 0 and it won't
   5938	 * restart. See the comment in __perf_pmu_output_stop().
   5939	 *
   5940	 * Data will inevitably be lost when set_output is done in
   5941	 * mid-air, but then again, whoever does it like this is
   5942	 * not in for the data anyway.
   5943	 */
   5944	if (has_aux(event))
   5945		perf_event_stop(event, 0);
   5946
   5947	rcu_assign_pointer(event->rb, rb);
   5948
   5949	if (old_rb) {
   5950		ring_buffer_put(old_rb);
   5951		/*
   5952		 * Since we detached before setting the new rb, so that we
   5953		 * could attach the new rb, we could have missed a wakeup.
   5954		 * Provide it now.
   5955		 */
   5956		wake_up_all(&event->waitq);
   5957	}
   5958}
   5959
   5960static void ring_buffer_wakeup(struct perf_event *event)
   5961{
   5962	struct perf_buffer *rb;
   5963
   5964	if (event->parent)
   5965		event = event->parent;
   5966
   5967	rcu_read_lock();
   5968	rb = rcu_dereference(event->rb);
   5969	if (rb) {
   5970		list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
   5971			wake_up_all(&event->waitq);
   5972	}
   5973	rcu_read_unlock();
   5974}
   5975
   5976struct perf_buffer *ring_buffer_get(struct perf_event *event)
   5977{
   5978	struct perf_buffer *rb;
   5979
   5980	if (event->parent)
   5981		event = event->parent;
   5982
   5983	rcu_read_lock();
   5984	rb = rcu_dereference(event->rb);
   5985	if (rb) {
   5986		if (!refcount_inc_not_zero(&rb->refcount))
   5987			rb = NULL;
   5988	}
   5989	rcu_read_unlock();
   5990
   5991	return rb;
   5992}
   5993
   5994void ring_buffer_put(struct perf_buffer *rb)
   5995{
   5996	if (!refcount_dec_and_test(&rb->refcount))
   5997		return;
   5998
   5999	WARN_ON_ONCE(!list_empty(&rb->event_list));
   6000
   6001	call_rcu(&rb->rcu_head, rb_free_rcu);
   6002}
   6003
   6004static void perf_mmap_open(struct vm_area_struct *vma)
   6005{
   6006	struct perf_event *event = vma->vm_file->private_data;
   6007
   6008	atomic_inc(&event->mmap_count);
   6009	atomic_inc(&event->rb->mmap_count);
   6010
   6011	if (vma->vm_pgoff)
   6012		atomic_inc(&event->rb->aux_mmap_count);
   6013
   6014	if (event->pmu->event_mapped)
   6015		event->pmu->event_mapped(event, vma->vm_mm);
   6016}
   6017
   6018static void perf_pmu_output_stop(struct perf_event *event);
   6019
/*
 * A buffer can be mmap()ed multiple times; either directly through the same
 * event, or through other events by use of perf_event_set_output().
 *
 * In order to undo the VM accounting done by perf_mmap() we need to destroy
 * the buffer here, where we still have a VM context. This means we need
 * to detach all events redirecting to us.
 *
 * Outline:
 *  1. notify the PMU through ->event_unmapped(), if implemented;
 *  2. if this was the last AUX mapping, stop AUX output, undo the AUX
 *     accounting and free the AUX pages;
 *  3. drop rb->mmap_count and event->mmap_count; when the latter hits zero,
 *     detach this event from the buffer;
 *  4. when rb->mmap_count hit zero as well, detach every remaining event
 *     from the buffer and undo the data-area accounting.
 */
static void perf_mmap_close(struct vm_area_struct *vma)
{
	struct perf_event *event = vma->vm_file->private_data;
	/* Private reference on the buffer; released at out_put. */
	struct perf_buffer *rb = ring_buffer_get(event);
	/*
	 * Capture everything the final accounting needs up front, before
	 * any event is detached from the buffer below.
	 */
	struct user_struct *mmap_user = rb->mmap_user;
	int mmap_locked = rb->mmap_locked;
	unsigned long size = perf_data_size(rb);
	/* Set when this close dropped the buffer's last mapping. */
	bool detach_rest = false;

	if (event->pmu->event_unmapped)
		event->pmu->event_unmapped(event, vma->vm_mm);

	/*
	 * rb->aux_mmap_count will always drop before rb->mmap_count and
	 * event->mmap_count, so it is ok to use event->mmap_mutex to
	 * serialize with perf_mmap here.
	 */
	if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
	    atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
		/*
		 * Stop all AUX events that are writing to this buffer,
		 * so that we can free its AUX pages and corresponding PMU
		 * data. Note that after rb::aux_mmap_count dropped to zero,
		 * they won't start any more (see perf_aux_output_begin()).
		 */
		perf_pmu_output_stop(event);

		/* now it's safe to free the pages */
		atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
		atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);

		/* this has to be the last one */
		rb_free_aux(rb);
		WARN_ON_ONCE(refcount_read(&rb->aux_refcount));

		mutex_unlock(&event->mmap_mutex);
	}

	/* Remember whether we removed the last mapping of the buffer itself. */
	if (atomic_dec_and_test(&rb->mmap_count))
		detach_rest = true;

	/*
	 * Only the last unmap of this particular event continues; on that
	 * path mmap_mutex is held on return from atomic_dec_and_mutex_lock().
	 */
	if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
		goto out_put;

	ring_buffer_attach(event, NULL);
	mutex_unlock(&event->mmap_mutex);

	/* If there's still other mmap()s of this buffer, we're done. */
	if (!detach_rest)
		goto out_put;

	/*
	 * No other mmap()s, detach from all other events that might redirect
	 * into the now unreachable buffer. Somewhat complicated by the
	 * fact that rb::event_lock otherwise nests inside mmap_mutex.
	 */
again:
	rcu_read_lock();
	list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
		if (!atomic_long_inc_not_zero(&event->refcount)) {
			/*
			 * This event is en-route to free_event() which will
			 * detach it and remove it from the list.
			 */
			continue;
		}
		/*
		 * We now hold a reference on the event, so the RCU read-side
		 * section can be left before taking the mutex.
		 */
		rcu_read_unlock();

		mutex_lock(&event->mmap_mutex);
		/*
		 * Check we didn't race with perf_event_set_output() which can
		 * swizzle the rb from under us while we were waiting to
		 * acquire mmap_mutex.
		 *
		 * If we find a different rb; ignore this event, a next
		 * iteration will no longer find it on the list. We have to
		 * still restart the iteration to make sure we're not now
		 * iterating the wrong list.
		 */
		if (event->rb == rb)
			ring_buffer_attach(event, NULL);

		mutex_unlock(&event->mmap_mutex);
		put_event(event);

		/*
		 * Restart the iteration; either we're on the wrong list or
		 * destroyed its integrity by doing a deletion.
		 */
		goto again;
	}
	rcu_read_unlock();

	/*
	 * It could be there's still a few 0-ref events on the list; they'll
	 * get cleaned up by free_event() -- they'll also still have their
	 * ref on the rb and will free it whenever they are done with it.
	 *
	 * Aside from that, this buffer is 'fully' detached and unmapped,
	 * undo the VM accounting.
	 */

	/* (size >> PAGE_SHIFT) data pages plus one more page, minus the pinned part. */
	atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
			&mmap_user->locked_vm);
	atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
	free_uid(mmap_user);

out_put:
	ring_buffer_put(rb); /* could be last */
}
   6138
   6139static const struct vm_operations_struct perf_mmap_vmops = {
   6140	.open		= perf_mmap_open,
   6141	.close		= perf_mmap_close, /* non mergeable */
   6142	.fault		= perf_mmap_fault,
   6143	.page_mkwrite	= perf_mmap_fault,
   6144};
   6145
   6146static int perf_mmap(struct file *file, struct vm_area_struct *vma)
   6147{
   6148	struct perf_event *event = file->private_data;
   6149	unsigned long user_locked, user_lock_limit;
   6150	struct user_struct *user = current_user();
   6151	struct perf_buffer *rb = NULL;
   6152	unsigned long locked, lock_limit;
   6153	unsigned long vma_size;
   6154	unsigned long nr_pages;
   6155	long user_extra = 0, extra = 0;
   6156	int ret = 0, flags = 0;
   6157
   6158	/*
   6159	 * Don't allow mmap() of inherited per-task counters. This would
   6160	 * create a performance issue due to all children writing to the
   6161	 * same rb.
   6162	 */
   6163	if (event->cpu == -1 && event->attr.inherit)
   6164		return -EINVAL;
   6165
   6166	if (!(vma->vm_flags & VM_SHARED))
   6167		return -EINVAL;
   6168
   6169	ret = security_perf_event_read(event);
   6170	if (ret)
   6171		return ret;
   6172
   6173	vma_size = vma->vm_end - vma->vm_start;
   6174
   6175	if (vma->vm_pgoff == 0) {
   6176		nr_pages = (vma_size / PAGE_SIZE) - 1;
   6177	} else {
   6178		/*
   6179		 * AUX area mapping: if rb->aux_nr_pages != 0, it's already
   6180		 * mapped, all subsequent mappings should have the same size
   6181		 * and offset. Must be above the normal perf buffer.
   6182		 */
   6183		u64 aux_offset, aux_size;
   6184
   6185		if (!event->rb)
   6186			return -EINVAL;
   6187
   6188		nr_pages = vma_size / PAGE_SIZE;
   6189
   6190		mutex_lock(&event->mmap_mutex);
   6191		ret = -EINVAL;
   6192
   6193		rb = event->rb;
   6194		if (!rb)
   6195			goto aux_unlock;
   6196
   6197		aux_offset = READ_ONCE(rb->user_page->aux_offset);
   6198		aux_size = READ_ONCE(rb->user_page->aux_size);
   6199
   6200		if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
   6201			goto aux_unlock;
   6202
   6203		if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
   6204			goto aux_unlock;
   6205
   6206		/* already mapped with a different offset */
   6207		if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
   6208			goto aux_unlock;
   6209
   6210		if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
   6211			goto aux_unlock;
   6212
   6213		/* already mapped with a different size */
   6214		if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
   6215			goto aux_unlock;
   6216
   6217		if (!is_power_of_2(nr_pages))
   6218			goto aux_unlock;
   6219
   6220		if (!atomic_inc_not_zero(&rb->mmap_count))
   6221			goto aux_unlock;
   6222
   6223		if (rb_has_aux(rb)) {
   6224			atomic_inc(&rb->aux_mmap_count);
   6225			ret = 0;
   6226			goto unlock;
   6227		}
   6228
   6229		atomic_set(&rb->aux_mmap_count, 1);
   6230		user_extra = nr_pages;
   6231
   6232		goto accounting;
   6233	}
   6234
   6235	/*
   6236	 * If we have rb pages ensure they're a power-of-two number, so we
   6237	 * can do bitmasks instead of modulo.
   6238	 */
   6239	if (nr_pages != 0 && !is_power_of_2(nr_pages))
   6240		return -EINVAL;
   6241
   6242	if (vma_size != PAGE_SIZE * (1 + nr_pages))
   6243		return -EINVAL;
   6244
   6245	WARN_ON_ONCE(event->ctx->parent_ctx);
   6246again:
   6247	mutex_lock(&event->mmap_mutex);
   6248	if (event->rb) {
   6249		if (data_page_nr(event->rb) != nr_pages) {
   6250			ret = -EINVAL;
   6251			goto unlock;
   6252		}
   6253
   6254		if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
   6255			/*
   6256			 * Raced against perf_mmap_close() through
   6257			 * perf_event_set_output(). Try again, hope for better
   6258			 * luck.
   6259			 */
   6260			mutex_unlock(&event->mmap_mutex);
   6261			goto again;
   6262		}
   6263
   6264		goto unlock;
   6265	}
   6266
   6267	user_extra = nr_pages + 1;
   6268
   6269accounting:
   6270	user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
   6271
   6272	/*
   6273	 * Increase the limit linearly with more CPUs:
   6274	 */
   6275	user_lock_limit *= num_online_cpus();
   6276
   6277	user_locked = atomic_long_read(&user->locked_vm);
   6278
   6279	/*
   6280	 * sysctl_perf_event_mlock may have changed, so that
   6281	 *     user->locked_vm > user_lock_limit
   6282	 */
   6283	if (user_locked > user_lock_limit)
   6284		user_locked = user_lock_limit;
   6285	user_locked += user_extra;
   6286
   6287	if (user_locked > user_lock_limit) {
   6288		/*
   6289		 * charge locked_vm until it hits user_lock_limit;
   6290		 * charge the rest from pinned_vm
   6291		 */
   6292		extra = user_locked - user_lock_limit;
   6293		user_extra -= extra;
   6294	}
   6295
   6296	lock_limit = rlimit(RLIMIT_MEMLOCK);
   6297	lock_limit >>= PAGE_SHIFT;
   6298	locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
   6299
   6300	if ((locked > lock_limit) && perf_is_paranoid() &&
   6301		!capable(CAP_IPC_LOCK)) {
   6302		ret = -EPERM;
   6303		goto unlock;
   6304	}
   6305
   6306	WARN_ON(!rb && event->rb);
   6307
   6308	if (vma->vm_flags & VM_WRITE)
   6309		flags |= RING_BUFFER_WRITABLE;
   6310
   6311	if (!rb) {
   6312		rb = rb_alloc(nr_pages,
   6313			      event->attr.watermark ? event->attr.wakeup_watermark : 0,
   6314			      event->cpu, flags);
   6315
   6316		if (!rb) {
   6317			ret = -ENOMEM;
   6318			goto unlock;
   6319		}
   6320
   6321		atomic_set(&rb->mmap_count, 1);
   6322		rb->mmap_user = get_current_user();
   6323		rb->mmap_locked = extra;
   6324
   6325		ring_buffer_attach(event, rb);
   6326
   6327		perf_event_update_time(event);
   6328		perf_event_init_userpage(event);
   6329		perf_event_update_userpage(event);
   6330	} else {
   6331		ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
   6332				   event->attr.aux_watermark, flags);
   6333		if (!ret)
   6334			rb->aux_mmap_locked = extra;
   6335	}
   6336
   6337unlock:
   6338	if (!ret) {
   6339		atomic_long_add(user_extra, &user->locked_vm);
   6340		atomic64_add(extra, &vma->vm_mm->pinned_vm);
   6341
   6342		atomic_inc(&event->mmap_count);
   6343	} else if (rb) {
   6344		atomic_dec(&rb->mmap_count);
   6345	}
   6346aux_unlock:
   6347	mutex_unlock(&event->mmap_mutex);
   6348
   6349	/*
   6350	 * Since pinned accounting is per vm we cannot allow fork() to copy our
   6351	 * vma.
   6352	 */
   6353	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
   6354	vma->vm_ops = &perf_mmap_vmops;
   6355
   6356	if (event->pmu->event_mapped)
   6357		event->pmu->event_mapped(event, vma->vm_mm);
   6358
   6359	return ret;
   6360}
   6361
   6362static int perf_fasync(int fd, struct file *filp, int on)
   6363{
   6364	struct inode *inode = file_inode(filp);
   6365	struct perf_event *event = filp->private_data;
   6366	int retval;
   6367
   6368	inode_lock(inode);
   6369	retval = fasync_helper(fd, filp, on, &event->fasync);
   6370	inode_unlock(inode);
   6371
   6372	if (retval < 0)
   6373		return retval;
   6374
   6375	return 0;
   6376}
   6377
   6378static const struct file_operations perf_fops = {
   6379	.llseek			= no_llseek,
   6380	.release		= perf_release,
   6381	.read			= perf_read,
   6382	.poll			= perf_poll,
   6383	.unlocked_ioctl		= perf_ioctl,
   6384	.compat_ioctl		= perf_compat_ioctl,
   6385	.mmap			= perf_mmap,
   6386	.fasync			= perf_fasync,
   6387};
   6388
   6389/*
   6390 * Perf event wakeup
   6391 *
   6392 * If there's data, ensure we set the poll() state and publish everything
   6393 * to user-space before waking everybody up.
   6394 */
   6395
   6396static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
   6397{
   6398	/* only the parent has fasync state */
   6399	if (event->parent)
   6400		event = event->parent;
   6401	return &event->fasync;
   6402}
   6403
   6404void perf_event_wakeup(struct perf_event *event)
   6405{
   6406	ring_buffer_wakeup(event);
   6407
   6408	if (event->pending_kill) {
   6409		kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
   6410		event->pending_kill = 0;
   6411	}
   6412}
   6413
   6414static void perf_sigtrap(struct perf_event *event)
   6415{
   6416	/*
   6417	 * We'd expect this to only occur if the irq_work is delayed and either
   6418	 * ctx->task or current has changed in the meantime. This can be the
   6419	 * case on architectures that do not implement arch_irq_work_raise().
   6420	 */
   6421	if (WARN_ON_ONCE(event->ctx->task != current))
   6422		return;
   6423
   6424	/*
   6425	 * perf_pending_event() can race with the task exiting.
   6426	 */
   6427	if (current->flags & PF_EXITING)
   6428		return;
   6429
   6430	send_sig_perf((void __user *)event->pending_addr,
   6431		      event->attr.type, event->attr.sig_data);
   6432}
   6433
   6434static void perf_pending_event_disable(struct perf_event *event)
   6435{
   6436	int cpu = READ_ONCE(event->pending_disable);
   6437
   6438	if (cpu < 0)
   6439		return;
   6440
   6441	if (cpu == smp_processor_id()) {
   6442		WRITE_ONCE(event->pending_disable, -1);
   6443
   6444		if (event->attr.sigtrap) {
   6445			perf_sigtrap(event);
   6446			atomic_set_release(&event->event_limit, 1); /* rearm event */
   6447			return;
   6448		}
   6449
   6450		perf_event_disable_local(event);
   6451		return;
   6452	}
   6453
   6454	/*
   6455	 *  CPU-A			CPU-B
   6456	 *
   6457	 *  perf_event_disable_inatomic()
   6458	 *    @pending_disable = CPU-A;
   6459	 *    irq_work_queue();
   6460	 *
   6461	 *  sched-out
   6462	 *    @pending_disable = -1;
   6463	 *
   6464	 *				sched-in
   6465	 *				perf_event_disable_inatomic()
   6466	 *				  @pending_disable = CPU-B;
   6467	 *				  irq_work_queue(); // FAILS
   6468	 *
   6469	 *  irq_work_run()
   6470	 *    perf_pending_event()
   6471	 *
   6472	 * But the event runs on CPU-B and wants disabling there.
   6473	 */
   6474	irq_work_queue_on(&event->pending, cpu);
   6475}
   6476
   6477static void perf_pending_event(struct irq_work *entry)
   6478{
   6479	struct perf_event *event = container_of(entry, struct perf_event, pending);
   6480	int rctx;
   6481
   6482	rctx = perf_swevent_get_recursion_context();
   6483	/*
   6484	 * If we 'fail' here, that's OK, it means recursion is already disabled
   6485	 * and we won't recurse 'further'.
   6486	 */
   6487
   6488	perf_pending_event_disable(event);
   6489
   6490	if (event->pending_wakeup) {
   6491		event->pending_wakeup = 0;
   6492		perf_event_wakeup(event);
   6493	}
   6494
   6495	if (rctx >= 0)
   6496		perf_swevent_put_recursion_context(rctx);
   6497}
   6498
#ifdef CONFIG_GUEST_PERF_EVENTS
/* Currently registered guest-info callbacks, or NULL; RCU protected. */
struct perf_guest_info_callbacks __rcu *perf_guest_cbs;

/*
 * Static calls mirroring the members of the registered callbacks. They are
 * defined with the RET0 variant and are reset to __static_call_return0 on
 * unregistration.
 */
DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state);
DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip);
DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr);

/*
 * Install @cbs as the guest-info callbacks.
 *
 * Only one set of callbacks can be registered at a time; a second
 * registration triggers a one-time warning and is ignored. On success the
 * pointer is published with rcu_assign_pointer() and the static calls are
 * pointed at the corresponding members of @cbs.
 */
void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
	if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs)))
		return;

	rcu_assign_pointer(perf_guest_cbs, cbs);
	static_call_update(__perf_guest_state, cbs->state);
	static_call_update(__perf_guest_get_ip, cbs->get_ip);

	/* Implementing ->handle_intel_pt_intr is optional. */
	if (cbs->handle_intel_pt_intr)
		static_call_update(__perf_guest_handle_intel_pt_intr,
				   cbs->handle_intel_pt_intr);
}
EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);

/*
 * Remove @cbs as the guest-info callbacks.
 *
 * @cbs must be the set that is currently registered; otherwise a one-time
 * warning is emitted and nothing changes. The pointer is cleared, all three
 * static calls are reset to __static_call_return0, and the function then
 * waits in synchronize_rcu() before returning.
 */
void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
	if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs))
		return;

	rcu_assign_pointer(perf_guest_cbs, NULL);
	static_call_update(__perf_guest_state, (void *)&__static_call_return0);
	static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0);
	static_call_update(__perf_guest_handle_intel_pt_intr,
			   (void *)&__static_call_return0);
	/*
	 * NOTE(review): presumably this lets RCU readers that still see the
	 * old pointer finish before the caller tears @cbs down -- confirm
	 * against the users of perf_guest_cbs.
	 */
	synchronize_rcu();
}
EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
#endif
   6536
   6537static void
   6538perf_output_sample_regs(struct perf_output_handle *handle,
   6539			struct pt_regs *regs, u64 mask)
   6540{
   6541	int bit;
   6542	DECLARE_BITMAP(_mask, 64);
   6543
   6544	bitmap_from_u64(_mask, mask);
   6545	for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
   6546		u64 val;
   6547
   6548		val = perf_reg_value(regs, bit);
   6549		perf_output_put(handle, val);
   6550	}
   6551}
   6552
   6553static void perf_sample_regs_user(struct perf_regs *regs_user,
   6554				  struct pt_regs *regs)
   6555{
   6556	if (user_mode(regs)) {
   6557		regs_user->abi = perf_reg_abi(current);
   6558		regs_user->regs = regs;
   6559	} else if (!(current->flags & PF_KTHREAD)) {
   6560		perf_get_regs_user(regs_user, regs);
   6561	} else {
   6562		regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
   6563		regs_user->regs = NULL;
   6564	}
   6565}
   6566
   6567static void perf_sample_regs_intr(struct perf_regs *regs_intr,
   6568				  struct pt_regs *regs)
   6569{
   6570	regs_intr->regs = regs;
   6571	regs_intr->abi  = perf_reg_abi(current);
   6572}
   6573
   6574
   6575/*
   6576 * Get remaining task size from user stack pointer.
   6577 *
   6578 * It'd be better to take stack vma map and limit this more
   6579 * precisely, but there's no way to get it safely under interrupt,
   6580 * so using TASK_SIZE as limit.
   6581 */
   6582static u64 perf_ustack_task_size(struct pt_regs *regs)
   6583{
   6584	unsigned long addr = perf_user_stack_pointer(regs);
   6585
   6586	if (!addr || addr >= TASK_SIZE)
   6587		return 0;
   6588
   6589	return TASK_SIZE - addr;
   6590}
   6591
   6592static u16
   6593perf_sample_ustack_size(u16 stack_size, u16 header_size,
   6594			struct pt_regs *regs)
   6595{
   6596	u64 task_size;
   6597
   6598	/* No regs, no stack pointer, no dump. */
   6599	if (!regs)
   6600		return 0;
   6601
   6602	/*
   6603	 * Check if we fit in with the requested stack size into the:
   6604	 * - TASK_SIZE
   6605	 *   If we don't, we limit the size to the TASK_SIZE.
   6606	 *
   6607	 * - remaining sample size
   6608	 *   If we don't, we customize the stack size to
   6609	 *   fit in to the remaining sample size.
   6610	 */
   6611
   6612	task_size  = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
   6613	stack_size = min(stack_size, (u16) task_size);
   6614
   6615	/* Current header size plus static size and dynamic size. */
   6616	header_size += 2 * sizeof(u64);
   6617
   6618	/* Do we fit in with the current stack dump size? */
   6619	if ((u16) (header_size + stack_size) < header_size) {
   6620		/*
   6621		 * If we overflow the maximum size for the sample,
   6622		 * we customize the stack dump size to fit in.
   6623		 */
   6624		stack_size = USHRT_MAX - header_size - sizeof(u64);
   6625		stack_size = round_up(stack_size, sizeof(u64));
   6626	}
   6627
   6628	return stack_size;
   6629}
   6630
   6631static void
   6632perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
   6633			  struct pt_regs *regs)
   6634{
   6635	/* Case of a kernel thread, nothing to dump */
   6636	if (!regs) {
   6637		u64 size = 0;
   6638		perf_output_put(handle, size);
   6639	} else {
   6640		unsigned long sp;
   6641		unsigned int rem;
   6642		u64 dyn_size;
   6643
   6644		/*
   6645		 * We dump:
   6646		 * static size
   6647		 *   - the size requested by user or the best one we can fit
   6648		 *     in to the sample max size
   6649		 * data
   6650		 *   - user stack dump data
   6651		 * dynamic size
   6652		 *   - the actual dumped size
   6653		 */
   6654
   6655		/* Static size. */
   6656		perf_output_put(handle, dump_size);
   6657
   6658		/* Data. */
   6659		sp = perf_user_stack_pointer(regs);
   6660		rem = __output_copy_user(handle, (void *) sp, dump_size);
   6661		dyn_size = dump_size - rem;
   6662
   6663		perf_output_skip(handle, rem);
   6664
   6665		/* Dynamic size. */
   6666		perf_output_put(handle, dyn_size);
   6667	}
   6668}
   6669
   6670static unsigned long perf_prepare_sample_aux(struct perf_event *event,
   6671					  struct perf_sample_data *data,
   6672					  size_t size)
   6673{
   6674	struct perf_event *sampler = event->aux_event;
   6675	struct perf_buffer *rb;
   6676
   6677	data->aux_size = 0;
   6678
   6679	if (!sampler)
   6680		goto out;
   6681
   6682	if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))
   6683		goto out;
   6684
   6685	if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
   6686		goto out;
   6687
   6688	rb = ring_buffer_get(sampler);
   6689	if (!rb)
   6690		goto out;
   6691
   6692	/*
   6693	 * If this is an NMI hit inside sampling code, don't take
   6694	 * the sample. See also perf_aux_sample_output().
   6695	 */
   6696	if (READ_ONCE(rb->aux_in_sampling)) {
   6697		data->aux_size = 0;
   6698	} else {
   6699		size = min_t(size_t, size, perf_aux_size(rb));
   6700		data->aux_size = ALIGN(size, sizeof(u64));
   6701	}
   6702	ring_buffer_put(rb);
   6703
   6704out:
   6705	return data->aux_size;
   6706}
   6707
   6708static long perf_pmu_snapshot_aux(struct perf_buffer *rb,
   6709                                 struct perf_event *event,
   6710                                 struct perf_output_handle *handle,
   6711                                 unsigned long size)
   6712{
   6713	unsigned long flags;
   6714	long ret;
   6715
   6716	/*
   6717	 * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler
   6718	 * paths. If we start calling them in NMI context, they may race with
   6719	 * the IRQ ones, that is, for example, re-starting an event that's just
   6720	 * been stopped, which is why we're using a separate callback that
   6721	 * doesn't change the event state.
   6722	 *
   6723	 * IRQs need to be disabled to prevent IPIs from racing with us.
   6724	 */
   6725	local_irq_save(flags);
   6726	/*
   6727	 * Guard against NMI hits inside the critical section;
   6728	 * see also perf_prepare_sample_aux().
   6729	 */
   6730	WRITE_ONCE(rb->aux_in_sampling, 1);
   6731	barrier();
   6732
   6733	ret = event->pmu->snapshot_aux(event, handle, size);
   6734
   6735	barrier();
   6736	WRITE_ONCE(rb->aux_in_sampling, 0);
   6737	local_irq_restore(flags);
   6738
   6739	return ret;
   6740}
   6741
   6742static void perf_aux_sample_output(struct perf_event *event,
   6743				   struct perf_output_handle *handle,
   6744				   struct perf_sample_data *data)
   6745{
   6746	struct perf_event *sampler = event->aux_event;
   6747	struct perf_buffer *rb;
   6748	unsigned long pad;
   6749	long size;
   6750
   6751	if (WARN_ON_ONCE(!sampler || !data->aux_size))
   6752		return;
   6753
   6754	rb = ring_buffer_get(sampler);
   6755	if (!rb)
   6756		return;
   6757
   6758	size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size);
   6759
   6760	/*
   6761	 * An error here means that perf_output_copy() failed (returned a
   6762	 * non-zero surplus that it didn't copy), which in its current
   6763	 * enlightened implementation is not possible. If that changes, we'd
   6764	 * like to know.
   6765	 */
   6766	if (WARN_ON_ONCE(size < 0))
   6767		goto out_put;
   6768
   6769	/*
   6770	 * The pad comes from ALIGN()ing data->aux_size up to u64 in
   6771	 * perf_prepare_sample_aux(), so should not be more than that.
   6772	 */
   6773	pad = data->aux_size - size;
   6774	if (WARN_ON_ONCE(pad >= sizeof(u64)))
   6775		pad = 8;
   6776
   6777	if (pad) {
   6778		u64 zero = 0;
   6779		perf_output_copy(handle, &zero, pad);
   6780	}
   6781
   6782out_put:
   6783	ring_buffer_put(rb);
   6784}
   6785
   6786static void __perf_event_header__init_id(struct perf_event_header *header,
   6787					 struct perf_sample_data *data,
   6788					 struct perf_event *event)
   6789{
   6790	u64 sample_type = event->attr.sample_type;
   6791
   6792	data->type = sample_type;
   6793	header->size += event->id_header_size;
   6794
   6795	if (sample_type & PERF_SAMPLE_TID) {
   6796		/* namespace issues */
   6797		data->tid_entry.pid = perf_event_pid(event, current);
   6798		data->tid_entry.tid = perf_event_tid(event, current);
   6799	}
   6800
   6801	if (sample_type & PERF_SAMPLE_TIME)
   6802		data->time = perf_event_clock(event);
   6803
   6804	if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
   6805		data->id = primary_event_id(event);
   6806
   6807	if (sample_type & PERF_SAMPLE_STREAM_ID)
   6808		data->stream_id = event->id;
   6809
   6810	if (sample_type & PERF_SAMPLE_CPU) {
   6811		data->cpu_entry.cpu	 = raw_smp_processor_id();
   6812		data->cpu_entry.reserved = 0;
   6813	}
   6814}
   6815
   6816void perf_event_header__init_id(struct perf_event_header *header,
   6817				struct perf_sample_data *data,
   6818				struct perf_event *event)
   6819{
   6820	if (event->attr.sample_id_all)
   6821		__perf_event_header__init_id(header, data, event);
   6822}
   6823
   6824static void __perf_event__output_id_sample(struct perf_output_handle *handle,
   6825					   struct perf_sample_data *data)
   6826{
   6827	u64 sample_type = data->type;
   6828
   6829	if (sample_type & PERF_SAMPLE_TID)
   6830		perf_output_put(handle, data->tid_entry);
   6831
   6832	if (sample_type & PERF_SAMPLE_TIME)
   6833		perf_output_put(handle, data->time);
   6834
   6835	if (sample_type & PERF_SAMPLE_ID)
   6836		perf_output_put(handle, data->id);
   6837
   6838	if (sample_type & PERF_SAMPLE_STREAM_ID)
   6839		perf_output_put(handle, data->stream_id);
   6840
   6841	if (sample_type & PERF_SAMPLE_CPU)
   6842		perf_output_put(handle, data->cpu_entry);
   6843
   6844	if (sample_type & PERF_SAMPLE_IDENTIFIER)
   6845		perf_output_put(handle, data->id);
   6846}
   6847
   6848void perf_event__output_id_sample(struct perf_event *event,
   6849				  struct perf_output_handle *handle,
   6850				  struct perf_sample_data *sample)
   6851{
   6852	if (event->attr.sample_id_all)
   6853		__perf_event__output_id_sample(handle, sample);
   6854}
   6855
   6856static void perf_output_read_one(struct perf_output_handle *handle,
   6857				 struct perf_event *event,
   6858				 u64 enabled, u64 running)
   6859{
   6860	u64 read_format = event->attr.read_format;
   6861	u64 values[4];
   6862	int n = 0;
   6863
   6864	values[n++] = perf_event_count(event);
   6865	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
   6866		values[n++] = enabled +
   6867			atomic64_read(&event->child_total_time_enabled);
   6868	}
   6869	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
   6870		values[n++] = running +
   6871			atomic64_read(&event->child_total_time_running);
   6872	}
   6873	if (read_format & PERF_FORMAT_ID)
   6874		values[n++] = primary_event_id(event);
   6875
   6876	__output_copy(handle, values, n * sizeof(u64));
   6877}
   6878
   6879static void perf_output_read_group(struct perf_output_handle *handle,
   6880			    struct perf_event *event,
   6881			    u64 enabled, u64 running)
   6882{
   6883	struct perf_event *leader = event->group_leader, *sub;
   6884	u64 read_format = event->attr.read_format;
   6885	u64 values[5];
   6886	int n = 0;
   6887
   6888	values[n++] = 1 + leader->nr_siblings;
   6889
   6890	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
   6891		values[n++] = enabled;
   6892
   6893	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
   6894		values[n++] = running;
   6895
   6896	if ((leader != event) &&
   6897	    (leader->state == PERF_EVENT_STATE_ACTIVE))
   6898		leader->pmu->read(leader);
   6899
   6900	values[n++] = perf_event_count(leader);
   6901	if (read_format & PERF_FORMAT_ID)
   6902		values[n++] = primary_event_id(leader);
   6903
   6904	__output_copy(handle, values, n * sizeof(u64));
   6905
   6906	for_each_sibling_event(sub, leader) {
   6907		n = 0;
   6908
   6909		if ((sub != event) &&
   6910		    (sub->state == PERF_EVENT_STATE_ACTIVE))
   6911			sub->pmu->read(sub);
   6912
   6913		values[n++] = perf_event_count(sub);
   6914		if (read_format & PERF_FORMAT_ID)
   6915			values[n++] = primary_event_id(sub);
   6916
   6917		__output_copy(handle, values, n * sizeof(u64));
   6918	}
   6919}
   6920
   6921#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
   6922				 PERF_FORMAT_TOTAL_TIME_RUNNING)
   6923
   6924/*
   6925 * XXX PERF_SAMPLE_READ vs inherited events seems difficult.
   6926 *
   6927 * The problem is that its both hard and excessively expensive to iterate the
   6928 * child list, not to mention that its impossible to IPI the children running
   6929 * on another CPU, from interrupt/NMI context.
   6930 */
   6931static void perf_output_read(struct perf_output_handle *handle,
   6932			     struct perf_event *event)
   6933{
   6934	u64 enabled = 0, running = 0, now;
   6935	u64 read_format = event->attr.read_format;
   6936
   6937	/*
   6938	 * compute total_time_enabled, total_time_running
   6939	 * based on snapshot values taken when the event
   6940	 * was last scheduled in.
   6941	 *
   6942	 * we cannot simply called update_context_time()
   6943	 * because of locking issue as we are called in
   6944	 * NMI context
   6945	 */
   6946	if (read_format & PERF_FORMAT_TOTAL_TIMES)
   6947		calc_timer_values(event, &now, &enabled, &running);
   6948
   6949	if (event->attr.read_format & PERF_FORMAT_GROUP)
   6950		perf_output_read_group(handle, event, enabled, running);
   6951	else
   6952		perf_output_read_one(handle, event, enabled, running);
   6953}
   6954
   6955static inline bool perf_sample_save_hw_index(struct perf_event *event)
   6956{
   6957	return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX;
   6958}
   6959
/*
 * Write one sample record through @handle.
 *
 * The record starts with *@header and is followed by every field selected
 * in data->type, emitted in the fixed order of the checks below. The sizes
 * written here have to agree with what perf_prepare_sample() accounted in
 * header->size.
 *
 * After the record is written, and only when the event does not use
 * attr.watermark, the record is counted against attr.wakeup_events.
 */
void perf_output_sample(struct perf_output_handle *handle,
			struct perf_event_header *header,
			struct perf_sample_data *data,
			struct perf_event *event)
{
	u64 sample_type = data->type;

	perf_output_put(handle, *header);

	if (sample_type & PERF_SAMPLE_IDENTIFIER)
		perf_output_put(handle, data->id);

	if (sample_type & PERF_SAMPLE_IP)
		perf_output_put(handle, data->ip);

	if (sample_type & PERF_SAMPLE_TID)
		perf_output_put(handle, data->tid_entry);

	if (sample_type & PERF_SAMPLE_TIME)
		perf_output_put(handle, data->time);

	if (sample_type & PERF_SAMPLE_ADDR)
		perf_output_put(handle, data->addr);

	if (sample_type & PERF_SAMPLE_ID)
		perf_output_put(handle, data->id);

	if (sample_type & PERF_SAMPLE_STREAM_ID)
		perf_output_put(handle, data->stream_id);

	if (sample_type & PERF_SAMPLE_CPU)
		perf_output_put(handle, data->cpu_entry);

	if (sample_type & PERF_SAMPLE_PERIOD)
		perf_output_put(handle, data->period);

	if (sample_type & PERF_SAMPLE_READ)
		perf_output_read(handle, event);

	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
		/* (1 + nr) u64 words are copied, starting at data->callchain */
		int size = 1;

		size += data->callchain->nr;
		size *= sizeof(u64);
		__output_copy(handle, data->callchain, size);
	}

	if (sample_type & PERF_SAMPLE_RAW) {
		struct perf_raw_record *raw = data->raw;

		if (raw) {
			struct perf_raw_frag *frag = &raw->frag;

			/* size first, then every fragment up to the last one */
			perf_output_put(handle, raw->size);
			do {
				if (frag->copy) {
					__output_custom(handle, frag->copy,
							frag->data, frag->size);
				} else {
					__output_copy(handle, frag->data,
						      frag->size);
				}
				if (perf_raw_frag_last(frag))
					break;
				frag = frag->next;
			} while (1);
			/* pad of the last fragment, set by perf_prepare_sample() */
			if (frag->pad)
				__output_skip(handle, NULL, frag->pad);
		} else {
			/*
			 * No raw data: emit a placeholder made of a u32 size
			 * (sizeof(u32)) followed by one zeroed u32.
			 */
			struct {
				u32	size;
				u32	data;
			} raw = {
				.size = sizeof(u32),
				.data = 0,
			};
			perf_output_put(handle, raw);
		}
	}

	if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
		if (data->br_stack) {
			size_t size;

			size = data->br_stack->nr
			     * sizeof(struct perf_branch_entry);

			/* nr, optional hw_idx, then nr branch entries */
			perf_output_put(handle, data->br_stack->nr);
			if (perf_sample_save_hw_index(event))
				perf_output_put(handle, data->br_stack->hw_idx);
			perf_output_copy(handle, data->br_stack->entries, size);
		} else {
			/*
			 * we always store at least the value of nr
			 */
			u64 nr = 0;
			perf_output_put(handle, nr);
		}
	}

	if (sample_type & PERF_SAMPLE_REGS_USER) {
		u64 abi = data->regs_user.abi;

		/*
		 * If there are no regs to dump, notice it through
		 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
		 */
		perf_output_put(handle, abi);

		if (abi) {
			u64 mask = event->attr.sample_regs_user;
			perf_output_sample_regs(handle,
						data->regs_user.regs,
						mask);
		}
	}

	if (sample_type & PERF_SAMPLE_STACK_USER) {
		/* stack_user_size was computed by perf_prepare_sample() */
		perf_output_sample_ustack(handle,
					  data->stack_user_size,
					  data->regs_user.regs);
	}

	if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
		perf_output_put(handle, data->weight.full);

	if (sample_type & PERF_SAMPLE_DATA_SRC)
		perf_output_put(handle, data->data_src.val);

	if (sample_type & PERF_SAMPLE_TRANSACTION)
		perf_output_put(handle, data->txn);

	if (sample_type & PERF_SAMPLE_REGS_INTR) {
		u64 abi = data->regs_intr.abi;
		/*
		 * If there are no regs to dump, notice it through
		 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
		 */
		perf_output_put(handle, abi);

		if (abi) {
			u64 mask = event->attr.sample_regs_intr;

			perf_output_sample_regs(handle,
						data->regs_intr.regs,
						mask);
		}
	}

	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
		perf_output_put(handle, data->phys_addr);

	if (sample_type & PERF_SAMPLE_CGROUP)
		perf_output_put(handle, data->cgroup);

	if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
		perf_output_put(handle, data->data_page_size);

	if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
		perf_output_put(handle, data->code_page_size);

	if (sample_type & PERF_SAMPLE_AUX) {
		/* the size is always written, the AUX data only when non-zero */
		perf_output_put(handle, data->aux_size);

		if (data->aux_size)
			perf_aux_sample_output(event, handle, data);
	}

	if (!event->attr.watermark) {
		int wakeup_events = event->attr.wakeup_events;

		if (wakeup_events) {
			struct perf_buffer *rb = handle->rb;
			int events = local_inc_return(&rb->events);

			/*
			 * Once wakeup_events records have accumulated,
			 * consume them from the counter and bump rb->wakeup.
			 */
			if (events >= wakeup_events) {
				local_sub(wakeup_events, &rb->events);
				local_inc(&rb->wakeup);
			}
		}
	}
}
   7142
   7143static u64 perf_virt_to_phys(u64 virt)
   7144{
   7145	u64 phys_addr = 0;
   7146
   7147	if (!virt)
   7148		return 0;
   7149
   7150	if (virt >= TASK_SIZE) {
   7151		/* If it's vmalloc()d memory, leave phys_addr as 0 */
   7152		if (virt_addr_valid((void *)(uintptr_t)virt) &&
   7153		    !(virt >= VMALLOC_START && virt < VMALLOC_END))
   7154			phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
   7155	} else {
   7156		/*
   7157		 * Walking the pages tables for user address.
   7158		 * Interrupts are disabled, so it prevents any tear down
   7159		 * of the page tables.
   7160		 * Try IRQ-safe get_user_page_fast_only first.
   7161		 * If failed, leave phys_addr as 0.
   7162		 */
   7163		if (current->mm != NULL) {
   7164			struct page *p;
   7165
   7166			pagefault_disable();
   7167			if (get_user_page_fast_only(virt, 0, &p)) {
   7168				phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
   7169				put_page(p);
   7170			}
   7171			pagefault_enable();
   7172		}
   7173	}
   7174
   7175	return phys_addr;
   7176}
   7177
/*
 * Return the pagetable size of a given virtual address.
 *
 * The page tables of @mm are walked from the pgd downwards. Each entry is
 * read once into a local copy and the walk stops at the first level whose
 * entry is a leaf, returning that level's leaf size. 0 is returned when an
 * entry on the way is none/not present, and always when
 * CONFIG_HAVE_FAST_GUP is not set.
 *
 * The only caller in this file, perf_get_page_size(), invokes this with
 * interrupts disabled.
 */
static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
{
	u64 size = 0;

#ifdef CONFIG_HAVE_FAST_GUP
	pgd_t *pgdp, pgd;
	p4d_t *p4dp, p4d;
	pud_t *pudp, pud;
	pmd_t *pmdp, pmd;
	pte_t *ptep, pte;

	pgdp = pgd_offset(mm, addr);
	pgd = READ_ONCE(*pgdp);
	if (pgd_none(pgd))
		return 0;

	if (pgd_leaf(pgd))
		return pgd_leaf_size(pgd);

	/* descend using the local copy of the upper-level entry */
	p4dp = p4d_offset_lockless(pgdp, pgd, addr);
	p4d = READ_ONCE(*p4dp);
	if (!p4d_present(p4d))
		return 0;

	if (p4d_leaf(p4d))
		return p4d_leaf_size(p4d);

	pudp = pud_offset_lockless(p4dp, p4d, addr);
	pud = READ_ONCE(*pudp);
	if (!pud_present(pud))
		return 0;

	if (pud_leaf(pud))
		return pud_leaf_size(pud);

	pmdp = pmd_offset_lockless(pudp, pud, addr);
	pmd = READ_ONCE(*pmdp);
	if (!pmd_present(pmd))
		return 0;

	if (pmd_leaf(pmd))
		return pmd_leaf_size(pmd);

	/* last level: the pte is mapped from the local pmd copy */
	ptep = pte_offset_map(&pmd, addr);
	pte = ptep_get_lockless(ptep);
	if (pte_present(pte))
		size = pte_leaf_size(pte);
	pte_unmap(ptep);
#endif /* CONFIG_HAVE_FAST_GUP */

	return size;
}
   7233
   7234static u64 perf_get_page_size(unsigned long addr)
   7235{
   7236	struct mm_struct *mm;
   7237	unsigned long flags;
   7238	u64 size;
   7239
   7240	if (!addr)
   7241		return 0;
   7242
   7243	/*
   7244	 * Software page-table walkers must disable IRQs,
   7245	 * which prevents any tear down of the page tables.
   7246	 */
   7247	local_irq_save(flags);
   7248
   7249	mm = current->mm;
   7250	if (!mm) {
   7251		/*
   7252		 * For kernel threads and the like, use init_mm so that
   7253		 * we can find kernel memory.
   7254		 */
   7255		mm = &init_mm;
   7256	}
   7257
   7258	size = perf_get_pgtable_size(mm, addr);
   7259
   7260	local_irq_restore(flags);
   7261
   7262	return size;
   7263}
   7264
   7265static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
   7266
   7267struct perf_callchain_entry *
   7268perf_callchain(struct perf_event *event, struct pt_regs *regs)
   7269{
   7270	bool kernel = !event->attr.exclude_callchain_kernel;
   7271	bool user   = !event->attr.exclude_callchain_user;
   7272	/* Disallow cross-task user callchains. */
   7273	bool crosstask = event->ctx->task && event->ctx->task != current;
   7274	const u32 max_stack = event->attr.sample_max_stack;
   7275	struct perf_callchain_entry *callchain;
   7276
   7277	if (!kernel && !user)
   7278		return &__empty_callchain;
   7279
   7280	callchain = get_perf_callchain(regs, 0, kernel, user,
   7281				       max_stack, crosstask, true);
   7282	return callchain ?: &__empty_callchain;
   7283}
   7284
/*
 * Prepare @header and @data for a PERF_RECORD_SAMPLE of @event taken at
 * @regs.
 *
 * header->size starts at sizeof(*header) + event->header_size and is grown
 * by every variable-sized field selected in attr.sample_type. Along the way
 * the fields of @data that are derived here (ip, callchain, user/intr regs,
 * phys_addr, cgroup, page sizes, stack/aux sizes) are filled in so that
 * perf_output_sample() can later write them out.
 */
void perf_prepare_sample(struct perf_event_header *header,
			 struct perf_sample_data *data,
			 struct perf_event *event,
			 struct pt_regs *regs)
{
	u64 sample_type = event->attr.sample_type;

	header->type = PERF_RECORD_SAMPLE;
	header->size = sizeof(*header) + event->header_size;

	header->misc = 0;
	header->misc |= perf_misc_flags(regs);

	__perf_event_header__init_id(header, data, event);

	/* the ip is also needed to look up the code page size further down */
	if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE))
		data->ip = perf_instruction_pointer(regs);

	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
		int size = 1;

		/*
		 * With __PERF_SAMPLE_CALLCHAIN_EARLY set, data->callchain is
		 * used as found (presumably collected earlier by the PMU
		 * code; confirm against the callers).
		 */
		if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
			data->callchain = perf_callchain(event, regs);

		size += data->callchain->nr;

		header->size += size * sizeof(u64);
	}

	if (sample_type & PERF_SAMPLE_RAW) {
		struct perf_raw_record *raw = data->raw;
		int size;

		if (raw) {
			struct perf_raw_frag *frag = &raw->frag;
			u32 sum = 0;

			/* total payload over all fragments */
			do {
				sum += frag->size;
				if (perf_raw_frag_last(frag))
					break;
				frag = frag->next;
			} while (1);

			/*
			 * The u32 size field plus payload is rounded up to a
			 * multiple of u64; raw->size excludes the size field
			 * and the difference becomes the last fragment's pad.
			 */
			size = round_up(sum + sizeof(u32), sizeof(u64));
			raw->size = size - sizeof(u32);
			frag->pad = raw->size - sum;
		} else {
			/* room for the placeholder written when there is no raw data */
			size = sizeof(u64);
		}

		header->size += size;
	}

	if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
		int size = sizeof(u64); /* nr */
		if (data->br_stack) {
			if (perf_sample_save_hw_index(event))
				size += sizeof(u64);

			size += data->br_stack->nr
			      * sizeof(struct perf_branch_entry);
		}
		header->size += size;
	}

	/* user regs are needed for the regs dump and for the stack dump */
	if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
		perf_sample_regs_user(&data->regs_user, regs);

	if (sample_type & PERF_SAMPLE_REGS_USER) {
		/* regs dump ABI info */
		int size = sizeof(u64);

		if (data->regs_user.regs) {
			/* one u64 per bit set in the requested mask */
			u64 mask = event->attr.sample_regs_user;
			size += hweight64(mask) * sizeof(u64);
		}

		header->size += size;
	}

	if (sample_type & PERF_SAMPLE_STACK_USER) {
		/*
		 * Either we need PERF_SAMPLE_STACK_USER bit to be always
		 * processed as the last one or have additional check added
		 * in case new sample type is added, because we could eat
		 * up the rest of the sample size.
		 */
		u16 stack_size = event->attr.sample_stack_user;
		u16 size = sizeof(u64);

		stack_size = perf_sample_ustack_size(stack_size, header->size,
						     data->regs_user.regs);

		/*
		 * If there is something to dump, add space for the dump
		 * itself and for the field that tells the dynamic size,
		 * which is how many have been actually dumped.
		 */
		if (stack_size)
			size += sizeof(u64) + stack_size;

		data->stack_user_size = stack_size;
		header->size += size;
	}

	if (sample_type & PERF_SAMPLE_REGS_INTR) {
		/* regs dump ABI info */
		int size = sizeof(u64);

		perf_sample_regs_intr(&data->regs_intr, regs);

		if (data->regs_intr.regs) {
			u64 mask = event->attr.sample_regs_intr;

			size += hweight64(mask) * sizeof(u64);
		}

		header->size += size;
	}

	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
		data->phys_addr = perf_virt_to_phys(data->addr);

#ifdef CONFIG_CGROUP_PERF
	if (sample_type & PERF_SAMPLE_CGROUP) {
		struct cgroup *cgrp;

		/* protected by RCU */
		cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
		data->cgroup = cgroup_id(cgrp);
	}
#endif

	/*
	 * PERF_DATA_PAGE_SIZE requires PERF_SAMPLE_ADDR. If the user doesn't
	 * require PERF_SAMPLE_ADDR, kernel implicitly retrieve the data->addr,
	 * but the value will not dump to the userspace.
	 */
	if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
		data->data_page_size = perf_get_page_size(data->addr);

	if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
		data->code_page_size = perf_get_page_size(data->ip);

	if (sample_type & PERF_SAMPLE_AUX) {
		u64 size;

		header->size += sizeof(u64); /* size */

		/*
		 * Given the 16bit nature of header::size, an AUX sample can
		 * easily overflow it, what with all the preceding sample bits.
		 * Make sure this doesn't happen by using up to U16_MAX bytes
		 * per sample in total (rounded down to 8 byte boundary).
		 */
		size = min_t(size_t, U16_MAX - header->size,
			     event->attr.aux_sample_size);
		size = rounddown(size, 8);
		size = perf_prepare_sample_aux(event, data, size);

		WARN_ON_ONCE(size + header->size > U16_MAX);
		header->size += size;
	}
	/*
	 * If you're adding more sample types here, you likely need to do
	 * something about the overflowing header::size, like repurpose the
	 * lowest 3 bits of size, which should be always zero at the moment.
	 * This raises a more important question, do we really need 512k sized
	 * samples and why, so good argumentation is in order for whatever you
	 * do here next.
	 */
	WARN_ON_ONCE(header->size & 7);
}
   7459
   7460static __always_inline int
   7461__perf_event_output(struct perf_event *event,
   7462		    struct perf_sample_data *data,
   7463		    struct pt_regs *regs,
   7464		    int (*output_begin)(struct perf_output_handle *,
   7465					struct perf_sample_data *,
   7466					struct perf_event *,
   7467					unsigned int))
   7468{
   7469	struct perf_output_handle handle;
   7470	struct perf_event_header header;
   7471	int err;
   7472
   7473	/* protect the callchain buffers */
   7474	rcu_read_lock();
   7475
   7476	perf_prepare_sample(&header, data, event, regs);
   7477
   7478	err = output_begin(&handle, data, event, header.size);
   7479	if (err)
   7480		goto exit;
   7481
   7482	perf_output_sample(&handle, &header, data, event);
   7483
   7484	perf_output_end(&handle);
   7485
   7486exit:
   7487	rcu_read_unlock();
   7488	return err;
   7489}
   7490
/*
 * Output a sample for @event, reserving buffer space with
 * perf_output_begin_forward(). The result of the output attempt is
 * discarded.
 */
void
perf_event_output_forward(struct perf_event *event,
			 struct perf_sample_data *data,
			 struct pt_regs *regs)
{
	__perf_event_output(event, data, regs, perf_output_begin_forward);
}
   7498
/*
 * Output a sample for @event, reserving buffer space with
 * perf_output_begin_backward(). The result of the output attempt is
 * discarded.
 */
void
perf_event_output_backward(struct perf_event *event,
			   struct perf_sample_data *data,
			   struct pt_regs *regs)
{
	__perf_event_output(event, data, regs, perf_output_begin_backward);
}
   7506
/*
 * Output a sample for @event, reserving buffer space with
 * perf_output_begin().
 *
 * Returns 0 on success or the error from perf_output_begin().
 */
int
perf_event_output(struct perf_event *event,
		  struct perf_sample_data *data,
		  struct pt_regs *regs)
{
	return __perf_event_output(event, data, regs, perf_output_begin);
}
   7514
/*
 * read event_id
 *
 * Fixed leading part of a PERF_RECORD_READ record as written by
 * perf_event_read_event(): the record header followed by the pid and tid
 * of the task the read is reported for.
 */

struct perf_read_event {
	struct perf_event_header	header;

	u32				pid;
	u32				tid;
};
   7525
   7526static void
   7527perf_event_read_event(struct perf_event *event,
   7528			struct task_struct *task)
   7529{
   7530	struct perf_output_handle handle;
   7531	struct perf_sample_data sample;
   7532	struct perf_read_event read_event = {
   7533		.header = {
   7534			.type = PERF_RECORD_READ,
   7535			.misc = 0,
   7536			.size = sizeof(read_event) + event->read_size,
   7537		},
   7538		.pid = perf_event_pid(event, task),
   7539		.tid = perf_event_tid(event, task),
   7540	};
   7541	int ret;
   7542
   7543	perf_event_header__init_id(&read_event.header, &sample, event);
   7544	ret = perf_output_begin(&handle, &sample, event, read_event.header.size);
   7545	if (ret)
   7546		return;
   7547
   7548	perf_output_put(&handle, read_event);
   7549	perf_output_read(&handle, event);
   7550	perf_event__output_id_sample(event, &handle, &sample);
   7551
   7552	perf_output_end(&handle);
   7553}
   7554
/*
 * Callback type used by the perf_iterate_*() helpers: invoked once per
 * visited event with the caller-supplied opaque @data.
 */
typedef void (perf_iterate_f)(struct perf_event *event, void *data);
   7556
   7557static void
   7558perf_iterate_ctx(struct perf_event_context *ctx,
   7559		   perf_iterate_f output,
   7560		   void *data, bool all)
   7561{
   7562	struct perf_event *event;
   7563
   7564	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
   7565		if (!all) {
   7566			if (event->state < PERF_EVENT_STATE_INACTIVE)
   7567				continue;
   7568			if (!event_filter_match(event))
   7569				continue;
   7570		}
   7571
   7572		output(event, data);
   7573	}
   7574}
   7575
   7576static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
   7577{
   7578	struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
   7579	struct perf_event *event;
   7580
   7581	list_for_each_entry_rcu(event, &pel->list, sb_list) {
   7582		/*
   7583		 * Skip events that are not fully formed yet; ensure that
   7584		 * if we observe event->ctx, both event and ctx will be
   7585		 * complete enough. See perf_install_in_context().
   7586		 */
   7587		if (!smp_load_acquire(&event->ctx))
   7588			continue;
   7589
   7590		if (event->state < PERF_EVENT_STATE_INACTIVE)
   7591			continue;
   7592		if (!event_filter_match(event))
   7593			continue;
   7594		output(event, data);
   7595	}
   7596}
   7597
   7598/*
   7599 * Iterate all events that need to receive side-band events.
   7600 *
   7601 * For new callers; ensure that account_pmu_sb_event() includes
   7602 * your event, otherwise it might not get delivered.
   7603 */
   7604static void
   7605perf_iterate_sb(perf_iterate_f output, void *data,
   7606	       struct perf_event_context *task_ctx)
   7607{
   7608	struct perf_event_context *ctx;
   7609	int ctxn;
   7610
   7611	rcu_read_lock();
   7612	preempt_disable();
   7613
   7614	/*
   7615	 * If we have task_ctx != NULL we only notify the task context itself.
   7616	 * The task_ctx is set only for EXIT events before releasing task
   7617	 * context.
   7618	 */
   7619	if (task_ctx) {
   7620		perf_iterate_ctx(task_ctx, output, data, false);
   7621		goto done;
   7622	}
   7623
   7624	perf_iterate_sb_cpu(output, data);
   7625
   7626	for_each_task_context_nr(ctxn) {
   7627		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
   7628		if (ctx)
   7629			perf_iterate_ctx(ctx, output, data, false);
   7630	}
   7631done:
   7632	preempt_enable();
   7633	rcu_read_unlock();
   7634}
   7635
   7636/*
   7637 * Clear all file-based filters at exec, they'll have to be
   7638 * re-instated when/if these objects are mmapped again.
   7639 */
   7640static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
   7641{
   7642	struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
   7643	struct perf_addr_filter *filter;
   7644	unsigned int restart = 0, count = 0;
   7645	unsigned long flags;
   7646
   7647	if (!has_addr_filter(event))
   7648		return;
   7649
   7650	raw_spin_lock_irqsave(&ifh->lock, flags);
   7651	list_for_each_entry(filter, &ifh->list, entry) {
   7652		if (filter->path.dentry) {
   7653			event->addr_filter_ranges[count].start = 0;
   7654			event->addr_filter_ranges[count].size = 0;
   7655			restart++;
   7656		}
   7657
   7658		count++;
   7659	}
   7660
   7661	if (restart)
   7662		event->addr_filters_gen++;
   7663	raw_spin_unlock_irqrestore(&ifh->lock, flags);
   7664
   7665	if (restart)
   7666		perf_event_stop(event, 1);
   7667}
   7668
   7669void perf_event_exec(void)
   7670{
   7671	struct perf_event_context *ctx;
   7672	int ctxn;
   7673
   7674	for_each_task_context_nr(ctxn) {
   7675		perf_event_enable_on_exec(ctxn);
   7676		perf_event_remove_on_exec(ctxn);
   7677
   7678		rcu_read_lock();
   7679		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
   7680		if (ctx) {
   7681			perf_iterate_ctx(ctx, perf_event_addr_filters_exec,
   7682					 NULL, true);
   7683		}
   7684		rcu_read_unlock();
   7685	}
   7686}
   7687
/*
 * Argument passed to __perf_event_output_stop() while iterating a context:
 * @rb is the ring buffer whose users should be stopped, @err receives the
 * result of the last __perf_event_stop() call made for a matching event.
 */
struct remote_output {
	struct perf_buffer	*rb;
	int			err;
};
   7692
   7693static void __perf_event_output_stop(struct perf_event *event, void *data)
   7694{
   7695	struct perf_event *parent = event->parent;
   7696	struct remote_output *ro = data;
   7697	struct perf_buffer *rb = ro->rb;
   7698	struct stop_event_data sd = {
   7699		.event	= event,
   7700	};
   7701
   7702	if (!has_aux(event))
   7703		return;
   7704
   7705	if (!parent)
   7706		parent = event;
   7707
   7708	/*
   7709	 * In case of inheritance, it will be the parent that links to the
   7710	 * ring-buffer, but it will be the child that's actually using it.
   7711	 *
   7712	 * We are using event::rb to determine if the event should be stopped,
   7713	 * however this may race with ring_buffer_attach() (through set_output),
   7714	 * which will make us skip the event that actually needs to be stopped.
   7715	 * So ring_buffer_attach() has to stop an aux event before re-assigning
   7716	 * its rb pointer.
   7717	 */
   7718	if (rcu_dereference(parent->rb) == rb)
   7719		ro->err = __perf_event_stop(&sd);
   7720}
   7721
   7722static int __perf_pmu_output_stop(void *info)
   7723{
   7724	struct perf_event *event = info;
   7725	struct pmu *pmu = event->ctx->pmu;
   7726	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
   7727	struct remote_output ro = {
   7728		.rb	= event->rb,
   7729	};
   7730
   7731	rcu_read_lock();
   7732	perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
   7733	if (cpuctx->task_ctx)
   7734		perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
   7735				   &ro, false);
   7736	rcu_read_unlock();
   7737
   7738	return ro.err;
   7739}
   7740
   7741static void perf_pmu_output_stop(struct perf_event *event)
   7742{
   7743	struct perf_event *iter;
   7744	int err, cpu;
   7745
   7746restart:
   7747	rcu_read_lock();
   7748	list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
   7749		/*
   7750		 * For per-CPU events, we need to make sure that neither they
   7751		 * nor their children are running; for cpu==-1 events it's
   7752		 * sufficient to stop the event itself if it's active, since
   7753		 * it can't have children.
   7754		 */
   7755		cpu = iter->cpu;
   7756		if (cpu == -1)
   7757			cpu = READ_ONCE(iter->oncpu);
   7758
   7759		if (cpu == -1)
   7760			continue;
   7761
   7762		err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
   7763		if (err == -EAGAIN) {
   7764			rcu_read_unlock();
   7765			goto restart;
   7766		}
   7767	}
   7768	rcu_read_unlock();
   7769}
   7770
/*
 * task tracking -- fork/exit
 *
 * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
 */

/*
 * State shared by perf_event_task() and perf_event_task_output():
 * @task is the task being reported, @task_ctx the context handed to
 * perf_event_task(), and @event_id the PERF_RECORD_FORK/PERF_RECORD_EXIT
 * record that is copied to the buffer as is.
 */
struct perf_task_event {
	struct task_struct		*task;
	struct perf_event_context	*task_ctx;

	struct {
		struct perf_event_header	header;

		u32				pid;
		u32				ppid;
		u32				tid;
		u32				ptid;
		u64				time;
	} event_id;
};
   7791
   7792static int perf_event_task_match(struct perf_event *event)
   7793{
   7794	return event->attr.comm  || event->attr.mmap ||
   7795	       event->attr.mmap2 || event->attr.mmap_data ||
   7796	       event->attr.task;
   7797}
   7798
   7799static void perf_event_task_output(struct perf_event *event,
   7800				   void *data)
   7801{
   7802	struct perf_task_event *task_event = data;
   7803	struct perf_output_handle handle;
   7804	struct perf_sample_data	sample;
   7805	struct task_struct *task = task_event->task;
   7806	int ret, size = task_event->event_id.header.size;
   7807
   7808	if (!perf_event_task_match(event))
   7809		return;
   7810
   7811	perf_event_header__init_id(&task_event->event_id.header, &sample, event);
   7812
   7813	ret = perf_output_begin(&handle, &sample, event,
   7814				task_event->event_id.header.size);
   7815	if (ret)
   7816		goto out;
   7817
   7818	task_event->event_id.pid = perf_event_pid(event, task);
   7819	task_event->event_id.tid = perf_event_tid(event, task);
   7820
   7821	if (task_event->event_id.header.type == PERF_RECORD_EXIT) {
   7822		task_event->event_id.ppid = perf_event_pid(event,
   7823							task->real_parent);
   7824		task_event->event_id.ptid = perf_event_pid(event,
   7825							task->real_parent);
   7826	} else {  /* PERF_RECORD_FORK */
   7827		task_event->event_id.ppid = perf_event_pid(event, current);
   7828		task_event->event_id.ptid = perf_event_tid(event, current);
   7829	}
   7830
   7831	task_event->event_id.time = perf_event_clock(event);
   7832
   7833	perf_output_put(&handle, task_event->event_id);
   7834
   7835	perf_event__output_id_sample(event, &handle, &sample);
   7836
   7837	perf_output_end(&handle);
   7838out:
   7839	task_event->event_id.header.size = size;
   7840}
   7841
   7842static void perf_event_task(struct task_struct *task,
   7843			      struct perf_event_context *task_ctx,
   7844			      int new)
   7845{
   7846	struct perf_task_event task_event;
   7847
   7848	if (!atomic_read(&nr_comm_events) &&
   7849	    !atomic_read(&nr_mmap_events) &&
   7850	    !atomic_read(&nr_task_events))
   7851		return;
   7852
   7853	task_event = (struct perf_task_event){
   7854		.task	  = task,
   7855		.task_ctx = task_ctx,
   7856		.event_id    = {
   7857			.header = {
   7858				.type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
   7859				.misc = 0,
   7860				.size = sizeof(task_event.event_id),
   7861			},
   7862			/* .pid  */
   7863			/* .ppid */
   7864			/* .tid  */
   7865			/* .ptid */
   7866			/* .time */
   7867		},
   7868	};
   7869
   7870	perf_iterate_sb(perf_event_task_output,
   7871		       &task_event,
   7872		       task_ctx);
   7873}
   7874
/*
 * Fork hook: report @task as a newly forked task (no task context given,
 * new = 1) and then emit its namespaces record.
 */
void perf_event_fork(struct task_struct *task)
{
	perf_event_task(task, NULL, 1);
	perf_event_namespaces(task);
}
   7880
/*
 * comm tracking
 */

/*
 * State shared by perf_event_comm() and its output callback: @task is the
 * task whose comm is reported, @comm/@comm_size describe the buffer that is
 * appended to the record, and @event_id is the fixed leading part of the
 * PERF_RECORD_COMM record.
 */
struct perf_comm_event {
	struct task_struct	*task;
	char			*comm;
	int			comm_size;

	struct {
		struct perf_event_header	header;

		u32				pid;
		u32				tid;
	} event_id;
};
   7897
/* An event receives PERF_RECORD_COMM records iff attr.comm is set. */
static int perf_event_comm_match(struct perf_event *event)
{
	return event->attr.comm;
}
   7902
   7903static void perf_event_comm_output(struct perf_event *event,
   7904				   void *data)
   7905{
   7906	struct perf_comm_event *comm_event = data;
   7907	struct perf_output_handle handle;
   7908	struct perf_sample_data sample;
   7909	int size = comm_event->event_id.header.size;
   7910	int ret;
   7911
   7912	if (!perf_event_comm_match(event))
   7913		return;
   7914
   7915	perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
   7916	ret = perf_output_begin(&handle, &sample, event,
   7917				comm_event->event_id.header.size);
   7918
   7919	if (ret)
   7920		goto out;
   7921
   7922	comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
   7923	comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
   7924
   7925	perf_output_put(&handle, comm_event->event_id);
   7926	__output_copy(&handle, comm_event->comm,
   7927				   comm_event->comm_size);
   7928
   7929	perf_event__output_id_sample(event, &handle, &sample);
   7930
   7931	perf_output_end(&handle);
   7932out:
   7933	comm_event->event_id.header.size = size;
   7934}
   7935
   7936static void perf_event_comm_event(struct perf_comm_event *comm_event)
   7937{
   7938	char comm[TASK_COMM_LEN];
   7939	unsigned int size;
   7940
   7941	memset(comm, 0, sizeof(comm));
   7942	strlcpy(comm, comm_event->task->comm, sizeof(comm));
   7943	size = ALIGN(strlen(comm)+1, sizeof(u64));
   7944
   7945	comm_event->comm = comm;
   7946	comm_event->comm_size = size;
   7947
   7948	comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
   7949
   7950	perf_iterate_sb(perf_event_comm_output,
   7951		       comm_event,
   7952		       NULL);
   7953}
   7954
   7955void perf_event_comm(struct task_struct *task, bool exec)
   7956{
   7957	struct perf_comm_event comm_event;
   7958
   7959	if (!atomic_read(&nr_comm_events))
   7960		return;
   7961
   7962	comm_event = (struct perf_comm_event){
   7963		.task	= task,
   7964		/* .comm      */
   7965		/* .comm_size */
   7966		.event_id  = {
   7967			.header = {
   7968				.type = PERF_RECORD_COMM,
   7969				.misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
   7970				/* .size */
   7971			},
   7972			/* .pid */
   7973			/* .tid */
   7974		},
   7975	};
   7976
   7977	perf_event_comm_event(&comm_event);
   7978}
   7979
/*
 * namespaces tracking
 */

/*
 * State shared by perf_event_namespaces() and its output callback: @task
 * is the task being reported and @event_id is the complete
 * PERF_RECORD_NAMESPACES record, holding one link_info slot per namespace
 * type (NR_NAMESPACES in total).
 */
struct perf_namespaces_event {
	struct task_struct		*task;

	struct {
		struct perf_event_header	header;

		u32				pid;
		u32				tid;
		u64				nr_namespaces;
		struct perf_ns_link_info	link_info[NR_NAMESPACES];
	} event_id;
};
   7996
/* An event receives PERF_RECORD_NAMESPACES records iff attr.namespaces is set. */
static int perf_event_namespaces_match(struct perf_event *event)
{
	return event->attr.namespaces;
}
   8001
   8002static void perf_event_namespaces_output(struct perf_event *event,
   8003					 void *data)
   8004{
   8005	struct perf_namespaces_event *namespaces_event = data;
   8006	struct perf_output_handle handle;
   8007	struct perf_sample_data sample;
   8008	u16 header_size = namespaces_event->event_id.header.size;
   8009	int ret;
   8010
   8011	if (!perf_event_namespaces_match(event))
   8012		return;
   8013
   8014	perf_event_header__init_id(&namespaces_event->event_id.header,
   8015				   &sample, event);
   8016	ret = perf_output_begin(&handle, &sample, event,
   8017				namespaces_event->event_id.header.size);
   8018	if (ret)
   8019		goto out;
   8020
   8021	namespaces_event->event_id.pid = perf_event_pid(event,
   8022							namespaces_event->task);
   8023	namespaces_event->event_id.tid = perf_event_tid(event,
   8024							namespaces_event->task);
   8025
   8026	perf_output_put(&handle, namespaces_event->event_id);
   8027
   8028	perf_event__output_id_sample(event, &handle, &sample);
   8029
   8030	perf_output_end(&handle);
   8031out:
   8032	namespaces_event->event_id.header.size = header_size;
   8033}
   8034
   8035static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
   8036				   struct task_struct *task,
   8037				   const struct proc_ns_operations *ns_ops)
   8038{
   8039	struct path ns_path;
   8040	struct inode *ns_inode;
   8041	int error;
   8042
   8043	error = ns_get_path(&ns_path, task, ns_ops);
   8044	if (!error) {
   8045		ns_inode = ns_path.dentry->d_inode;
   8046		ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
   8047		ns_link_info->ino = ns_inode->i_ino;
   8048		path_put(&ns_path);
   8049	}
   8050}
   8051
   8052void perf_event_namespaces(struct task_struct *task)
   8053{
   8054	struct perf_namespaces_event namespaces_event;
   8055	struct perf_ns_link_info *ns_link_info;
   8056
   8057	if (!atomic_read(&nr_namespaces_events))
   8058		return;
   8059
   8060	namespaces_event = (struct perf_namespaces_event){
   8061		.task	= task,
   8062		.event_id  = {
   8063			.header = {
   8064				.type = PERF_RECORD_NAMESPACES,
   8065				.misc = 0,
   8066				.size = sizeof(namespaces_event.event_id),
   8067			},
   8068			/* .pid */
   8069			/* .tid */
   8070			.nr_namespaces = NR_NAMESPACES,
   8071			/* .link_info[NR_NAMESPACES] */
   8072		},
   8073	};
   8074
   8075	ns_link_info = namespaces_event.event_id.link_info;
   8076
   8077	perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
   8078			       task, &mntns_operations);
   8079
   8080#ifdef CONFIG_USER_NS
   8081	perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
   8082			       task, &userns_operations);
   8083#endif
   8084#ifdef CONFIG_NET_NS
   8085	perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
   8086			       task, &netns_operations);
   8087#endif
   8088#ifdef CONFIG_UTS_NS
   8089	perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
   8090			       task, &utsns_operations);
   8091#endif
   8092#ifdef CONFIG_IPC_NS
   8093	perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
   8094			       task, &ipcns_operations);
   8095#endif
   8096#ifdef CONFIG_PID_NS
   8097	perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
   8098			       task, &pidns_operations);
   8099#endif
   8100#ifdef CONFIG_CGROUPS
   8101	perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
   8102			       task, &cgroupns_operations);
   8103#endif
   8104
   8105	perf_iterate_sb(perf_event_namespaces_output,
   8106			&namespaces_event,
   8107			NULL);
   8108}
   8109
   8110/*
   8111 * cgroup tracking
   8112 */
   8113#ifdef CONFIG_CGROUP_PERF
   8114
/*
 * State shared by every perf_event_cgroup_output() call made while emitting
 * one cgroup record.
 */
struct perf_cgroup_event {
	/* NUL-padded path string, copied to the buffer right after event_id */
	char				*path;
	/* number of bytes of @path to copy (string plus padding) */
	int				path_size;
	/* fixed-size head of the record as written to the output buffer */
	struct {
		struct perf_event_header	header;
		u64				id;
		char				path[];
	} event_id;
};
   8124
   8125static int perf_event_cgroup_match(struct perf_event *event)
   8126{
   8127	return event->attr.cgroup;
   8128}
   8129
   8130static void perf_event_cgroup_output(struct perf_event *event, void *data)
   8131{
   8132	struct perf_cgroup_event *cgroup_event = data;
   8133	struct perf_output_handle handle;
   8134	struct perf_sample_data sample;
   8135	u16 header_size = cgroup_event->event_id.header.size;
   8136	int ret;
   8137
   8138	if (!perf_event_cgroup_match(event))
   8139		return;
   8140
   8141	perf_event_header__init_id(&cgroup_event->event_id.header,
   8142				   &sample, event);
   8143	ret = perf_output_begin(&handle, &sample, event,
   8144				cgroup_event->event_id.header.size);
   8145	if (ret)
   8146		goto out;
   8147
   8148	perf_output_put(&handle, cgroup_event->event_id);
   8149	__output_copy(&handle, cgroup_event->path, cgroup_event->path_size);
   8150
   8151	perf_event__output_id_sample(event, &handle, &sample);
   8152
   8153	perf_output_end(&handle);
   8154out:
   8155	cgroup_event->event_id.header.size = header_size;
   8156}
   8157
   8158static void perf_event_cgroup(struct cgroup *cgrp)
   8159{
   8160	struct perf_cgroup_event cgroup_event;
   8161	char path_enomem[16] = "//enomem";
   8162	char *pathname;
   8163	size_t size;
   8164
   8165	if (!atomic_read(&nr_cgroup_events))
   8166		return;
   8167
   8168	cgroup_event = (struct perf_cgroup_event){
   8169		.event_id  = {
   8170			.header = {
   8171				.type = PERF_RECORD_CGROUP,
   8172				.misc = 0,
   8173				.size = sizeof(cgroup_event.event_id),
   8174			},
   8175			.id = cgroup_id(cgrp),
   8176		},
   8177	};
   8178
   8179	pathname = kmalloc(PATH_MAX, GFP_KERNEL);
   8180	if (pathname == NULL) {
   8181		cgroup_event.path = path_enomem;
   8182	} else {
   8183		/* just to be sure to have enough space for alignment */
   8184		cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64));
   8185		cgroup_event.path = pathname;
   8186	}
   8187
   8188	/*
   8189	 * Since our buffer works in 8 byte units we need to align our string
   8190	 * size to a multiple of 8. However, we must guarantee the tail end is
   8191	 * zero'd out to avoid leaking random bits to userspace.
   8192	 */
   8193	size = strlen(cgroup_event.path) + 1;
   8194	while (!IS_ALIGNED(size, sizeof(u64)))
   8195		cgroup_event.path[size++] = '\0';
   8196
   8197	cgroup_event.event_id.header.size += size;
   8198	cgroup_event.path_size = size;
   8199
   8200	perf_iterate_sb(perf_event_cgroup_output,
   8201			&cgroup_event,
   8202			NULL);
   8203
   8204	kfree(pathname);
   8205}
   8206
   8207#endif
   8208
   8209/*
   8210 * mmap tracking
   8211 */
   8212
/*
 * State shared by every perf_event_mmap_output() call made while emitting
 * the record(s) for one mapping.
 */
struct perf_mmap_event {
	/* the mapping being reported */
	struct vm_area_struct	*vma;

	/* NUL-padded name and its padded length; copied near the record's end */
	const char		*file_name;
	int			file_size;
	/* the fields below are written only for events with attr.mmap2 set */
	int			maj, min;
	u64			ino;
	u64			ino_generation;
	u32			prot, flags;
	/*
	 * Filled by build_id_parse() when nr_build_id_events is non-zero;
	 * written in place of maj/min/ino/ino_generation when the event has
	 * attr.build_id set and build_id_size is non-zero.
	 */
	u8			build_id[BUILD_ID_SIZE_MAX];
	u32			build_id_size;

	/* fixed-size head of the record as written to the output buffer */
	struct {
		struct perf_event_header	header;

		u32				pid;
		u32				tid;
		u64				start;
		u64				len;
		u64				pgoff;
	} event_id;
};
   8235
   8236static int perf_event_mmap_match(struct perf_event *event,
   8237				 void *data)
   8238{
   8239	struct perf_mmap_event *mmap_event = data;
   8240	struct vm_area_struct *vma = mmap_event->vma;
   8241	int executable = vma->vm_flags & VM_EXEC;
   8242
   8243	return (!executable && event->attr.mmap_data) ||
   8244	       (executable && (event->attr.mmap || event->attr.mmap2));
   8245}
   8246
   8247static void perf_event_mmap_output(struct perf_event *event,
   8248				   void *data)
   8249{
   8250	struct perf_mmap_event *mmap_event = data;
   8251	struct perf_output_handle handle;
   8252	struct perf_sample_data sample;
   8253	int size = mmap_event->event_id.header.size;
   8254	u32 type = mmap_event->event_id.header.type;
   8255	bool use_build_id;
   8256	int ret;
   8257
   8258	if (!perf_event_mmap_match(event, data))
   8259		return;
   8260
   8261	if (event->attr.mmap2) {
   8262		mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
   8263		mmap_event->event_id.header.size += sizeof(mmap_event->maj);
   8264		mmap_event->event_id.header.size += sizeof(mmap_event->min);
   8265		mmap_event->event_id.header.size += sizeof(mmap_event->ino);
   8266		mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
   8267		mmap_event->event_id.header.size += sizeof(mmap_event->prot);
   8268		mmap_event->event_id.header.size += sizeof(mmap_event->flags);
   8269	}
   8270
   8271	perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
   8272	ret = perf_output_begin(&handle, &sample, event,
   8273				mmap_event->event_id.header.size);
   8274	if (ret)
   8275		goto out;
   8276
   8277	mmap_event->event_id.pid = perf_event_pid(event, current);
   8278	mmap_event->event_id.tid = perf_event_tid(event, current);
   8279
   8280	use_build_id = event->attr.build_id && mmap_event->build_id_size;
   8281
   8282	if (event->attr.mmap2 && use_build_id)
   8283		mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_BUILD_ID;
   8284
   8285	perf_output_put(&handle, mmap_event->event_id);
   8286
   8287	if (event->attr.mmap2) {
   8288		if (use_build_id) {
   8289			u8 size[4] = { (u8) mmap_event->build_id_size, 0, 0, 0 };
   8290
   8291			__output_copy(&handle, size, 4);
   8292			__output_copy(&handle, mmap_event->build_id, BUILD_ID_SIZE_MAX);
   8293		} else {
   8294			perf_output_put(&handle, mmap_event->maj);
   8295			perf_output_put(&handle, mmap_event->min);
   8296			perf_output_put(&handle, mmap_event->ino);
   8297			perf_output_put(&handle, mmap_event->ino_generation);
   8298		}
   8299		perf_output_put(&handle, mmap_event->prot);
   8300		perf_output_put(&handle, mmap_event->flags);
   8301	}
   8302
   8303	__output_copy(&handle, mmap_event->file_name,
   8304				   mmap_event->file_size);
   8305
   8306	perf_event__output_id_sample(event, &handle, &sample);
   8307
   8308	perf_output_end(&handle);
   8309out:
   8310	mmap_event->event_id.header.size = size;
   8311	mmap_event->event_id.header.type = type;
   8312}
   8313
/*
 * Complete @mmap_event (name, device/inode identity, prot/flags, optional
 * build id, header size) for its vma and hand it to every interested event
 * through perf_iterate_sb().
 *
 * The reported name is the file path for file-backed mappings, otherwise the
 * first of: vm_ops->name(), arch_vma_name(), "[heap]", "[stack]", "//anon".
 * "//enomem" and "//toolong" are reported when the path buffer cannot be
 * allocated or file_path() fails.
 */
static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
{
	struct vm_area_struct *vma = mmap_event->vma;
	struct file *file = vma->vm_file;
	int maj = 0, min = 0;
	u64 ino = 0, gen = 0;
	u32 prot = 0, flags = 0;
	unsigned int size;
	char tmp[16];
	char *buf = NULL;
	char *name;

	/* Translate vm_flags into the PROT_* / MAP_* bits that get reported. */
	if (vma->vm_flags & VM_READ)
		prot |= PROT_READ;
	if (vma->vm_flags & VM_WRITE)
		prot |= PROT_WRITE;
	if (vma->vm_flags & VM_EXEC)
		prot |= PROT_EXEC;

	if (vma->vm_flags & VM_MAYSHARE)
		flags = MAP_SHARED;
	else
		flags = MAP_PRIVATE;

	if (vma->vm_flags & VM_LOCKED)
		flags |= MAP_LOCKED;
	if (is_vm_hugetlb_page(vma))
		flags |= MAP_HUGETLB;

	if (file) {
		struct inode *inode;
		dev_t dev;

		buf = kmalloc(PATH_MAX, GFP_KERNEL);
		if (!buf) {
			name = "//enomem";
			goto cpy_name;
		}
		/*
		 * d_path() works from the end of the rb backwards, so we
		 * need to add enough zero bytes after the string to handle
		 * the 64bit alignment we do later.
		 */
		name = file_path(file, buf, PATH_MAX - sizeof(u64));
		if (IS_ERR(name)) {
			name = "//toolong";
			goto cpy_name;
		}
		inode = file_inode(vma->vm_file);
		dev = inode->i_sb->s_dev;
		ino = inode->i_ino;
		gen = inode->i_generation;
		maj = MAJOR(dev);
		min = MINOR(dev);

		/* name points into buf, which is writable: skip the copy */
		goto got_name;
	} else {
		if (vma->vm_ops && vma->vm_ops->name) {
			name = (char *) vma->vm_ops->name(vma);
			if (name)
				goto cpy_name;
		}

		name = (char *)arch_vma_name(vma);
		if (name)
			goto cpy_name;

		if (vma->vm_start <= vma->vm_mm->start_brk &&
				vma->vm_end >= vma->vm_mm->brk) {
			name = "[heap]";
			goto cpy_name;
		}
		if (vma->vm_start <= vma->vm_mm->start_stack &&
				vma->vm_end >= vma->vm_mm->start_stack) {
			name = "[stack]";
			goto cpy_name;
		}

		name = "//anon";
		goto cpy_name;
	}

cpy_name:
	/*
	 * Names that reach here are string literals or come from vm_ops /
	 * arch code; copy them into tmp so the padding loop below writes to
	 * local storage. strlcpy() truncates anything longer than
	 * sizeof(tmp) - 1 characters.
	 */
	strlcpy(tmp, name, sizeof(tmp));
	name = tmp;
got_name:
	/*
	 * Since our buffer works in 8 byte units we need to align our string
	 * size to a multiple of 8. However, we must guarantee the tail end is
	 * zero'd out to avoid leaking random bits to userspace.
	 */
	size = strlen(name)+1;
	while (!IS_ALIGNED(size, sizeof(u64)))
		name[size++] = '\0';

	mmap_event->file_name = name;
	mmap_event->file_size = size;
	mmap_event->maj = maj;
	mmap_event->min = min;
	mmap_event->ino = ino;
	mmap_event->ino_generation = gen;
	mmap_event->prot = prot;
	mmap_event->flags = flags;

	if (!(vma->vm_flags & VM_EXEC))
		mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;

	/* Base size; perf_event_mmap_output() adds the MMAP2-only fields. */
	mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;

	if (atomic_read(&nr_build_id_events))
		build_id_parse(vma, mmap_event->build_id, &mmap_event->build_id_size);

	perf_iterate_sb(perf_event_mmap_output,
		       mmap_event,
		       NULL);

	/* buf is NULL for mappings without a file; name may point into it */
	kfree(buf);
}
   8432
   8433/*
   8434 * Check whether inode and address range match filter criteria.
   8435 */
   8436static bool perf_addr_filter_match(struct perf_addr_filter *filter,
   8437				     struct file *file, unsigned long offset,
   8438				     unsigned long size)
   8439{
   8440	/* d_inode(NULL) won't be equal to any mapped user-space file */
   8441	if (!filter->path.dentry)
   8442		return false;
   8443
   8444	if (d_inode(filter->path.dentry) != file_inode(file))
   8445		return false;
   8446
   8447	if (filter->offset > offset + size)
   8448		return false;
   8449
   8450	if (filter->offset + filter->size < offset)
   8451		return false;
   8452
   8453	return true;
   8454}
   8455
   8456static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter,
   8457					struct vm_area_struct *vma,
   8458					struct perf_addr_filter_range *fr)
   8459{
   8460	unsigned long vma_size = vma->vm_end - vma->vm_start;
   8461	unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
   8462	struct file *file = vma->vm_file;
   8463
   8464	if (!perf_addr_filter_match(filter, file, off, vma_size))
   8465		return false;
   8466
   8467	if (filter->offset < off) {
   8468		fr->start = vma->vm_start;
   8469		fr->size = min(vma_size, filter->size - (off - filter->offset));
   8470	} else {
   8471		fr->start = vma->vm_start + filter->offset - off;
   8472		fr->size = min(vma->vm_end - fr->start, filter->size);
   8473	}
   8474
   8475	return true;
   8476}
   8477
   8478static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
   8479{
   8480	struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
   8481	struct vm_area_struct *vma = data;
   8482	struct perf_addr_filter *filter;
   8483	unsigned int restart = 0, count = 0;
   8484	unsigned long flags;
   8485
   8486	if (!has_addr_filter(event))
   8487		return;
   8488
   8489	if (!vma->vm_file)
   8490		return;
   8491
   8492	raw_spin_lock_irqsave(&ifh->lock, flags);
   8493	list_for_each_entry(filter, &ifh->list, entry) {
   8494		if (perf_addr_filter_vma_adjust(filter, vma,
   8495						&event->addr_filter_ranges[count]))
   8496			restart++;
   8497
   8498		count++;
   8499	}
   8500
   8501	if (restart)
   8502		event->addr_filters_gen++;
   8503	raw_spin_unlock_irqrestore(&ifh->lock, flags);
   8504
   8505	if (restart)
   8506		perf_event_stop(event, 1);
   8507}
   8508
   8509/*
   8510 * Adjust all task's events' filters to the new vma
   8511 */
   8512static void perf_addr_filters_adjust(struct vm_area_struct *vma)
   8513{
   8514	struct perf_event_context *ctx;
   8515	int ctxn;
   8516
   8517	/*
   8518	 * Data tracing isn't supported yet and as such there is no need
   8519	 * to keep track of anything that isn't related to executable code:
   8520	 */
   8521	if (!(vma->vm_flags & VM_EXEC))
   8522		return;
   8523
   8524	rcu_read_lock();
   8525	for_each_task_context_nr(ctxn) {
   8526		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
   8527		if (!ctx)
   8528			continue;
   8529
   8530		perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
   8531	}
   8532	rcu_read_unlock();
   8533}
   8534
   8535void perf_event_mmap(struct vm_area_struct *vma)
   8536{
   8537	struct perf_mmap_event mmap_event;
   8538
   8539	if (!atomic_read(&nr_mmap_events))
   8540		return;
   8541
   8542	mmap_event = (struct perf_mmap_event){
   8543		.vma	= vma,
   8544		/* .file_name */
   8545		/* .file_size */
   8546		.event_id  = {
   8547			.header = {
   8548				.type = PERF_RECORD_MMAP,
   8549				.misc = PERF_RECORD_MISC_USER,
   8550				/* .size */
   8551			},
   8552			/* .pid */
   8553			/* .tid */
   8554			.start  = vma->vm_start,
   8555			.len    = vma->vm_end - vma->vm_start,
   8556			.pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
   8557		},
   8558		/* .maj (attr_mmap2 only) */
   8559		/* .min (attr_mmap2 only) */
   8560		/* .ino (attr_mmap2 only) */
   8561		/* .ino_generation (attr_mmap2 only) */
   8562		/* .prot (attr_mmap2 only) */
   8563		/* .flags (attr_mmap2 only) */
   8564	};
   8565
   8566	perf_addr_filters_adjust(vma);
   8567	perf_event_mmap_event(&mmap_event);
   8568}
   8569
   8570void perf_event_aux_event(struct perf_event *event, unsigned long head,
   8571			  unsigned long size, u64 flags)
   8572{
   8573	struct perf_output_handle handle;
   8574	struct perf_sample_data sample;
   8575	struct perf_aux_event {
   8576		struct perf_event_header	header;
   8577		u64				offset;
   8578		u64				size;
   8579		u64				flags;
   8580	} rec = {
   8581		.header = {
   8582			.type = PERF_RECORD_AUX,
   8583			.misc = 0,
   8584			.size = sizeof(rec),
   8585		},
   8586		.offset		= head,
   8587		.size		= size,
   8588		.flags		= flags,
   8589	};
   8590	int ret;
   8591
   8592	perf_event_header__init_id(&rec.header, &sample, event);
   8593	ret = perf_output_begin(&handle, &sample, event, rec.header.size);
   8594
   8595	if (ret)
   8596		return;
   8597
   8598	perf_output_put(&handle, rec);
   8599	perf_event__output_id_sample(event, &handle, &sample);
   8600
   8601	perf_output_end(&handle);
   8602}
   8603
   8604/*
   8605 * Lost/dropped samples logging
   8606 */
   8607void perf_log_lost_samples(struct perf_event *event, u64 lost)
   8608{
   8609	struct perf_output_handle handle;
   8610	struct perf_sample_data sample;
   8611	int ret;
   8612
   8613	struct {
   8614		struct perf_event_header	header;
   8615		u64				lost;
   8616	} lost_samples_event = {
   8617		.header = {
   8618			.type = PERF_RECORD_LOST_SAMPLES,
   8619			.misc = 0,
   8620			.size = sizeof(lost_samples_event),
   8621		},
   8622		.lost		= lost,
   8623	};
   8624
   8625	perf_event_header__init_id(&lost_samples_event.header, &sample, event);
   8626
   8627	ret = perf_output_begin(&handle, &sample, event,
   8628				lost_samples_event.header.size);
   8629	if (ret)
   8630		return;
   8631
   8632	perf_output_put(&handle, lost_samples_event);
   8633	perf_event__output_id_sample(event, &handle, &sample);
   8634	perf_output_end(&handle);
   8635}
   8636
   8637/*
   8638 * context_switch tracking
   8639 */
   8640
/*
 * State shared by every perf_event_switch_output() call made for one
 * context switch.
 */
struct perf_switch_event {
	/* first argument of perf_event_switch() */
	struct task_struct	*task;
	/* the other task; source of next_prev_pid/next_prev_tid */
	struct task_struct	*next_prev;

	/*
	 * Record as written to the output buffer. Only the header is written
	 * for events with a task context; events without one get the whole
	 * struct.
	 */
	struct {
		struct perf_event_header	header;
		u32				next_prev_pid;
		u32				next_prev_tid;
	} event_id;
};
   8651
   8652static int perf_event_switch_match(struct perf_event *event)
   8653{
   8654	return event->attr.context_switch;
   8655}
   8656
   8657static void perf_event_switch_output(struct perf_event *event, void *data)
   8658{
   8659	struct perf_switch_event *se = data;
   8660	struct perf_output_handle handle;
   8661	struct perf_sample_data sample;
   8662	int ret;
   8663
   8664	if (!perf_event_switch_match(event))
   8665		return;
   8666
   8667	/* Only CPU-wide events are allowed to see next/prev pid/tid */
   8668	if (event->ctx->task) {
   8669		se->event_id.header.type = PERF_RECORD_SWITCH;
   8670		se->event_id.header.size = sizeof(se->event_id.header);
   8671	} else {
   8672		se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
   8673		se->event_id.header.size = sizeof(se->event_id);
   8674		se->event_id.next_prev_pid =
   8675					perf_event_pid(event, se->next_prev);
   8676		se->event_id.next_prev_tid =
   8677					perf_event_tid(event, se->next_prev);
   8678	}
   8679
   8680	perf_event_header__init_id(&se->event_id.header, &sample, event);
   8681
   8682	ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size);
   8683	if (ret)
   8684		return;
   8685
   8686	if (event->ctx->task)
   8687		perf_output_put(&handle, se->event_id.header);
   8688	else
   8689		perf_output_put(&handle, se->event_id);
   8690
   8691	perf_event__output_id_sample(event, &handle, &sample);
   8692
   8693	perf_output_end(&handle);
   8694}
   8695
   8696static void perf_event_switch(struct task_struct *task,
   8697			      struct task_struct *next_prev, bool sched_in)
   8698{
   8699	struct perf_switch_event switch_event;
   8700
   8701	/* N.B. caller checks nr_switch_events != 0 */
   8702
   8703	switch_event = (struct perf_switch_event){
   8704		.task		= task,
   8705		.next_prev	= next_prev,
   8706		.event_id	= {
   8707			.header = {
   8708				/* .type */
   8709				.misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
   8710				/* .size */
   8711			},
   8712			/* .next_prev_pid */
   8713			/* .next_prev_tid */
   8714		},
   8715	};
   8716
   8717	if (!sched_in && task->on_rq) {
   8718		switch_event.event_id.header.misc |=
   8719				PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
   8720	}
   8721
   8722	perf_iterate_sb(perf_event_switch_output, &switch_event, NULL);
   8723}
   8724
   8725/*
   8726 * IRQ throttle logging
   8727 */
   8728
   8729static void perf_log_throttle(struct perf_event *event, int enable)
   8730{
   8731	struct perf_output_handle handle;
   8732	struct perf_sample_data sample;
   8733	int ret;
   8734
   8735	struct {
   8736		struct perf_event_header	header;
   8737		u64				time;
   8738		u64				id;
   8739		u64				stream_id;
   8740	} throttle_event = {
   8741		.header = {
   8742			.type = PERF_RECORD_THROTTLE,
   8743			.misc = 0,
   8744			.size = sizeof(throttle_event),
   8745		},
   8746		.time		= perf_event_clock(event),
   8747		.id		= primary_event_id(event),
   8748		.stream_id	= event->id,
   8749	};
   8750
   8751	if (enable)
   8752		throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
   8753
   8754	perf_event_header__init_id(&throttle_event.header, &sample, event);
   8755
   8756	ret = perf_output_begin(&handle, &sample, event,
   8757				throttle_event.header.size);
   8758	if (ret)
   8759		return;
   8760
   8761	perf_output_put(&handle, throttle_event);
   8762	perf_event__output_id_sample(event, &handle, &sample);
   8763	perf_output_end(&handle);
   8764}
   8765
   8766/*
   8767 * ksymbol register/unregister tracking
   8768 */
   8769
/*
 * State shared by every perf_event_ksymbol_output() call made for one
 * ksymbol record.
 */
struct perf_ksymbol_event {
	/* NUL-padded symbol name, copied to the buffer right after event_id */
	const char	*name;
	/* number of bytes of @name to copy (string plus padding) */
	int		name_len;
	/* fixed-size head of the record as written to the output buffer */
	struct {
		struct perf_event_header        header;
		u64				addr;
		u32				len;
		u16				ksym_type;
		u16				flags;
	} event_id;
};
   8781
   8782static int perf_event_ksymbol_match(struct perf_event *event)
   8783{
   8784	return event->attr.ksymbol;
   8785}
   8786
   8787static void perf_event_ksymbol_output(struct perf_event *event, void *data)
   8788{
   8789	struct perf_ksymbol_event *ksymbol_event = data;
   8790	struct perf_output_handle handle;
   8791	struct perf_sample_data sample;
   8792	int ret;
   8793
   8794	if (!perf_event_ksymbol_match(event))
   8795		return;
   8796
   8797	perf_event_header__init_id(&ksymbol_event->event_id.header,
   8798				   &sample, event);
   8799	ret = perf_output_begin(&handle, &sample, event,
   8800				ksymbol_event->event_id.header.size);
   8801	if (ret)
   8802		return;
   8803
   8804	perf_output_put(&handle, ksymbol_event->event_id);
   8805	__output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len);
   8806	perf_event__output_id_sample(event, &handle, &sample);
   8807
   8808	perf_output_end(&handle);
   8809}
   8810
   8811void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
   8812			const char *sym)
   8813{
   8814	struct perf_ksymbol_event ksymbol_event;
   8815	char name[KSYM_NAME_LEN];
   8816	u16 flags = 0;
   8817	int name_len;
   8818
   8819	if (!atomic_read(&nr_ksymbol_events))
   8820		return;
   8821
   8822	if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX ||
   8823	    ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
   8824		goto err;
   8825
   8826	strlcpy(name, sym, KSYM_NAME_LEN);
   8827	name_len = strlen(name) + 1;
   8828	while (!IS_ALIGNED(name_len, sizeof(u64)))
   8829		name[name_len++] = '\0';
   8830	BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));
   8831
   8832	if (unregister)
   8833		flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER;
   8834
   8835	ksymbol_event = (struct perf_ksymbol_event){
   8836		.name = name,
   8837		.name_len = name_len,
   8838		.event_id = {
   8839			.header = {
   8840				.type = PERF_RECORD_KSYMBOL,
   8841				.size = sizeof(ksymbol_event.event_id) +
   8842					name_len,
   8843			},
   8844			.addr = addr,
   8845			.len = len,
   8846			.ksym_type = ksym_type,
   8847			.flags = flags,
   8848		},
   8849	};
   8850
   8851	perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL);
   8852	return;
   8853err:
   8854	WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
   8855}
   8856
   8857/*
   8858 * bpf program load/unload tracking
   8859 */
   8860
/*
 * State shared by every perf_event_bpf_output() call made for one BPF
 * program event.
 */
struct perf_bpf_event {
	/* the program the record is about */
	struct bpf_prog	*prog;
	/* complete record as written to the output buffer */
	struct {
		struct perf_event_header        header;
		u16				type;
		u16				flags;
		u32				id;
		/* copy of prog->tag */
		u8				tag[BPF_TAG_SIZE];
	} event_id;
};
   8871
   8872static int perf_event_bpf_match(struct perf_event *event)
   8873{
   8874	return event->attr.bpf_event;
   8875}
   8876
   8877static void perf_event_bpf_output(struct perf_event *event, void *data)
   8878{
   8879	struct perf_bpf_event *bpf_event = data;
   8880	struct perf_output_handle handle;
   8881	struct perf_sample_data sample;
   8882	int ret;
   8883
   8884	if (!perf_event_bpf_match(event))
   8885		return;
   8886
   8887	perf_event_header__init_id(&bpf_event->event_id.header,
   8888				   &sample, event);
   8889	ret = perf_output_begin(&handle, data, event,
   8890				bpf_event->event_id.header.size);
   8891	if (ret)
   8892		return;
   8893
   8894	perf_output_put(&handle, bpf_event->event_id);
   8895	perf_event__output_id_sample(event, &handle, &sample);
   8896
   8897	perf_output_end(&handle);
   8898}
   8899
   8900static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
   8901					 enum perf_bpf_event_type type)
   8902{
   8903	bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
   8904	int i;
   8905
   8906	if (prog->aux->func_cnt == 0) {
   8907		perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
   8908				   (u64)(unsigned long)prog->bpf_func,
   8909				   prog->jited_len, unregister,
   8910				   prog->aux->ksym.name);
   8911	} else {
   8912		for (i = 0; i < prog->aux->func_cnt; i++) {
   8913			struct bpf_prog *subprog = prog->aux->func[i];
   8914
   8915			perf_event_ksymbol(
   8916				PERF_RECORD_KSYMBOL_TYPE_BPF,
   8917				(u64)(unsigned long)subprog->bpf_func,
   8918				subprog->jited_len, unregister,
   8919				prog->aux->ksym.name);
   8920		}
   8921	}
   8922}
   8923
   8924void perf_event_bpf_event(struct bpf_prog *prog,
   8925			  enum perf_bpf_event_type type,
   8926			  u16 flags)
   8927{
   8928	struct perf_bpf_event bpf_event;
   8929
   8930	if (type <= PERF_BPF_EVENT_UNKNOWN ||
   8931	    type >= PERF_BPF_EVENT_MAX)
   8932		return;
   8933
   8934	switch (type) {
   8935	case PERF_BPF_EVENT_PROG_LOAD:
   8936	case PERF_BPF_EVENT_PROG_UNLOAD:
   8937		if (atomic_read(&nr_ksymbol_events))
   8938			perf_event_bpf_emit_ksymbols(prog, type);
   8939		break;
   8940	default:
   8941		break;
   8942	}
   8943
   8944	if (!atomic_read(&nr_bpf_events))
   8945		return;
   8946
   8947	bpf_event = (struct perf_bpf_event){
   8948		.prog = prog,
   8949		.event_id = {
   8950			.header = {
   8951				.type = PERF_RECORD_BPF_EVENT,
   8952				.size = sizeof(bpf_event.event_id),
   8953			},
   8954			.type = type,
   8955			.flags = flags,
   8956			.id = prog->aux->id,
   8957		},
   8958	};
   8959
   8960	BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64));
   8961
   8962	memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE);
   8963	perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
   8964}
   8965
/*
 * State shared by every perf_event_text_poke_output() call made for one
 * text poke record.
 */
struct perf_text_poke_event {
	/* old and new instruction bytes, copied after the two length fields */
	const void		*old_bytes;
	const void		*new_bytes;
	/* number of zero bytes appended so the payload ends u64-aligned */
	size_t			pad;
	/* written to the buffer right after event_id, in this order */
	u16			old_len;
	u16			new_len;

	/* fixed-size head of the record as written to the output buffer */
	struct {
		struct perf_event_header	header;

		u64				addr;
	} event_id;
};
   8979
   8980static int perf_event_text_poke_match(struct perf_event *event)
   8981{
   8982	return event->attr.text_poke;
   8983}
   8984
   8985static void perf_event_text_poke_output(struct perf_event *event, void *data)
   8986{
   8987	struct perf_text_poke_event *text_poke_event = data;
   8988	struct perf_output_handle handle;
   8989	struct perf_sample_data sample;
   8990	u64 padding = 0;
   8991	int ret;
   8992
   8993	if (!perf_event_text_poke_match(event))
   8994		return;
   8995
   8996	perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event);
   8997
   8998	ret = perf_output_begin(&handle, &sample, event,
   8999				text_poke_event->event_id.header.size);
   9000	if (ret)
   9001		return;
   9002
   9003	perf_output_put(&handle, text_poke_event->event_id);
   9004	perf_output_put(&handle, text_poke_event->old_len);
   9005	perf_output_put(&handle, text_poke_event->new_len);
   9006
   9007	__output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len);
   9008	__output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len);
   9009
   9010	if (text_poke_event->pad)
   9011		__output_copy(&handle, &padding, text_poke_event->pad);
   9012
   9013	perf_event__output_id_sample(event, &handle, &sample);
   9014
   9015	perf_output_end(&handle);
   9016}
   9017
   9018void perf_event_text_poke(const void *addr, const void *old_bytes,
   9019			  size_t old_len, const void *new_bytes, size_t new_len)
   9020{
   9021	struct perf_text_poke_event text_poke_event;
   9022	size_t tot, pad;
   9023
   9024	if (!atomic_read(&nr_text_poke_events))
   9025		return;
   9026
   9027	tot  = sizeof(text_poke_event.old_len) + old_len;
   9028	tot += sizeof(text_poke_event.new_len) + new_len;
   9029	pad  = ALIGN(tot, sizeof(u64)) - tot;
   9030
   9031	text_poke_event = (struct perf_text_poke_event){
   9032		.old_bytes    = old_bytes,
   9033		.new_bytes    = new_bytes,
   9034		.pad          = pad,
   9035		.old_len      = old_len,
   9036		.new_len      = new_len,
   9037		.event_id  = {
   9038			.header = {
   9039				.type = PERF_RECORD_TEXT_POKE,
   9040				.misc = PERF_RECORD_MISC_KERNEL,
   9041				.size = sizeof(text_poke_event.event_id) + tot + pad,
   9042			},
   9043			.addr = (unsigned long)addr,
   9044		},
   9045	};
   9046
   9047	perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL);
   9048}
   9049
   9050void perf_event_itrace_started(struct perf_event *event)
   9051{
   9052	event->attach_state |= PERF_ATTACH_ITRACE;
   9053}
   9054
   9055static void perf_log_itrace_start(struct perf_event *event)
   9056{
   9057	struct perf_output_handle handle;
   9058	struct perf_sample_data sample;
   9059	struct perf_aux_event {
   9060		struct perf_event_header        header;
   9061		u32				pid;
   9062		u32				tid;
   9063	} rec;
   9064	int ret;
   9065
   9066	if (event->parent)
   9067		event = event->parent;
   9068
   9069	if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
   9070	    event->attach_state & PERF_ATTACH_ITRACE)
   9071		return;
   9072
   9073	rec.header.type	= PERF_RECORD_ITRACE_START;
   9074	rec.header.misc	= 0;
   9075	rec.header.size	= sizeof(rec);
   9076	rec.pid	= perf_event_pid(event, current);
   9077	rec.tid	= perf_event_tid(event, current);
   9078
   9079	perf_event_header__init_id(&rec.header, &sample, event);
   9080	ret = perf_output_begin(&handle, &sample, event, rec.header.size);
   9081
   9082	if (ret)
   9083		return;
   9084
   9085	perf_output_put(&handle, rec);
   9086	perf_event__output_id_sample(event, &handle, &sample);
   9087
   9088	perf_output_end(&handle);
   9089}
   9090
   9091void perf_report_aux_output_id(struct perf_event *event, u64 hw_id)
   9092{
   9093	struct perf_output_handle handle;
   9094	struct perf_sample_data sample;
   9095	struct perf_aux_event {
   9096		struct perf_event_header        header;
   9097		u64				hw_id;
   9098	} rec;
   9099	int ret;
   9100
   9101	if (event->parent)
   9102		event = event->parent;
   9103
   9104	rec.header.type	= PERF_RECORD_AUX_OUTPUT_HW_ID;
   9105	rec.header.misc	= 0;
   9106	rec.header.size	= sizeof(rec);
   9107	rec.hw_id	= hw_id;
   9108
   9109	perf_event_header__init_id(&rec.header, &sample, event);
   9110	ret = perf_output_begin(&handle, &sample, event, rec.header.size);
   9111
   9112	if (ret)
   9113		return;
   9114
   9115	perf_output_put(&handle, rec);
   9116	perf_event__output_id_sample(event, &handle, &sample);
   9117
   9118	perf_output_end(&handle);
   9119}
   9120
   9121static int
   9122__perf_event_account_interrupt(struct perf_event *event, int throttle)
   9123{
   9124	struct hw_perf_event *hwc = &event->hw;
   9125	int ret = 0;
   9126	u64 seq;
   9127
   9128	seq = __this_cpu_read(perf_throttled_seq);
   9129	if (seq != hwc->interrupts_seq) {
   9130		hwc->interrupts_seq = seq;
   9131		hwc->interrupts = 1;
   9132	} else {
   9133		hwc->interrupts++;
   9134		if (unlikely(throttle
   9135			     && hwc->interrupts >= max_samples_per_tick)) {
   9136			__this_cpu_inc(perf_throttled_count);
   9137			tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
   9138			hwc->interrupts = MAX_INTERRUPTS;
   9139			perf_log_throttle(event, 0);
   9140			ret = 1;
   9141		}
   9142	}
   9143
   9144	if (event->attr.freq) {
   9145		u64 now = perf_clock();
   9146		s64 delta = now - hwc->freq_time_stamp;
   9147
   9148		hwc->freq_time_stamp = now;
   9149
   9150		if (delta > 0 && delta < 2*TICK_NSEC)
   9151			perf_adjust_period(event, delta, hwc->last_period, true);
   9152	}
   9153
   9154	return ret;
   9155}
   9156
   9157int perf_event_account_interrupt(struct perf_event *event)
   9158{
   9159	return __perf_event_account_interrupt(event, 1);
   9160}
   9161
   9162/*
   9163 * Generic event overflow handling, sampling.
   9164 */
   9165
/*
 * Common overflow path for a sampling event.
 *
 * @event:    event that overflowed
 * @throttle: forwarded to __perf_event_account_interrupt()
 * @data:     sample data passed on to the overflow handler
 * @regs:     register state passed on to the overflow handler
 *
 * Returns 0 immediately for non-sampling events. Otherwise returns the
 * value of __perf_event_account_interrupt(), or 1 when this overflow
 * brought event_limit down to zero.
 */
static int __perf_event_overflow(struct perf_event *event,
				   int throttle, struct perf_sample_data *data,
				   struct pt_regs *regs)
{
	/* Snapshot of the limit; a zero value skips the limit handling below. */
	int events = atomic_read(&event->event_limit);
	int ret = 0;

	/*
	 * Non-sampling counters might still use the PMI to fold short
	 * hardware counters, ignore those.
	 */
	if (unlikely(!is_sampling_event(event)))
		return 0;

	ret = __perf_event_account_interrupt(event, throttle);

	/*
	 * XXX event_limit might not quite work as expected on inherited
	 * events
	 */

	event->pending_kill = POLL_IN;
	if (events && atomic_dec_and_test(&event->event_limit)) {
		/* Limit reached: report POLL_HUP and disable the event. */
		ret = 1;
		event->pending_kill = POLL_HUP;
		event->pending_addr = data->addr;

		perf_event_disable_inatomic(event);
	}

	/* Invoke whichever overflow handler is currently installed. */
	READ_ONCE(event->overflow_handler)(event, data, regs);

	/* Queue the pending irq_work when fasync is set up and a kill is pending. */
	if (*perf_event_fasync(event) && event->pending_kill) {
		event->pending_wakeup = 1;
		irq_work_queue(&event->pending);
	}

	return ret;
}
   9205
   9206int perf_event_overflow(struct perf_event *event,
   9207			  struct perf_sample_data *data,
   9208			  struct pt_regs *regs)
   9209{
   9210	return __perf_event_overflow(event, 1, data, regs);
   9211}
   9212
   9213/*
   9214 * Generic software event infrastructure
   9215 */
   9216
/*
 * Per-CPU bookkeeping for software events: the RCU-managed hash list
 * of events, the mutex and reference count guarding its allocation and
 * release, and the recursion counters.
 */
struct swevent_htable {
	struct swevent_hlist		*swevent_hlist;
	struct mutex			hlist_mutex;
	int				hlist_refcount;

	/* Recursion avoidance in each context */
	int				recursion[PERF_NR_CONTEXTS];
};

/* One swevent_htable instance per CPU. */
static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
   9227
   9228/*
   9229 * We directly increment event->count and keep a second value in
   9230 * event->hw.period_left to count intervals. This period event
   9231 * is kept in the range [-sample_period, 0] so that we can use the
   9232 * sign as trigger.
   9233 */
   9234
   9235u64 perf_swevent_set_period(struct perf_event *event)
   9236{
   9237	struct hw_perf_event *hwc = &event->hw;
   9238	u64 period = hwc->last_period;
   9239	u64 nr, offset;
   9240	s64 old, val;
   9241
   9242	hwc->last_period = hwc->sample_period;
   9243
   9244again:
   9245	old = val = local64_read(&hwc->period_left);
   9246	if (val < 0)
   9247		return 0;
   9248
   9249	nr = div64_u64(period + val, period);
   9250	offset = nr * period;
   9251	val -= offset;
   9252	if (local64_cmpxchg(&hwc->period_left, old, val) != old)
   9253		goto again;
   9254
   9255	return nr;
   9256}
   9257
   9258static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
   9259				    struct perf_sample_data *data,
   9260				    struct pt_regs *regs)
   9261{
   9262	struct hw_perf_event *hwc = &event->hw;
   9263	int throttle = 0;
   9264
   9265	if (!overflow)
   9266		overflow = perf_swevent_set_period(event);
   9267
   9268	if (hwc->interrupts == MAX_INTERRUPTS)
   9269		return;
   9270
   9271	for (; overflow; overflow--) {
   9272		if (__perf_event_overflow(event, throttle,
   9273					    data, regs)) {
   9274			/*
   9275			 * We inhibit the overflow from happening when
   9276			 * hwc->interrupts == MAX_INTERRUPTS.
   9277			 */
   9278			break;
   9279		}
   9280		throttle = 1;
   9281	}
   9282}
   9283
   9284static void perf_swevent_event(struct perf_event *event, u64 nr,
   9285			       struct perf_sample_data *data,
   9286			       struct pt_regs *regs)
   9287{
   9288	struct hw_perf_event *hwc = &event->hw;
   9289
   9290	local64_add(nr, &event->count);
   9291
   9292	if (!regs)
   9293		return;
   9294
   9295	if (!is_sampling_event(event))
   9296		return;
   9297
   9298	if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
   9299		data->period = nr;
   9300		return perf_swevent_overflow(event, 1, data, regs);
   9301	} else
   9302		data->period = event->hw.last_period;
   9303
   9304	if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
   9305		return perf_swevent_overflow(event, 1, data, regs);
   9306
   9307	if (local64_add_negative(nr, &hwc->period_left))
   9308		return;
   9309
   9310	perf_swevent_overflow(event, 0, data, regs);
   9311}
   9312
   9313static int perf_exclude_event(struct perf_event *event,
   9314			      struct pt_regs *regs)
   9315{
   9316	if (event->hw.state & PERF_HES_STOPPED)
   9317		return 1;
   9318
   9319	if (regs) {
   9320		if (event->attr.exclude_user && user_mode(regs))
   9321			return 1;
   9322
   9323		if (event->attr.exclude_kernel && !user_mode(regs))
   9324			return 1;
   9325	}
   9326
   9327	return 0;
   9328}
   9329
   9330static int perf_swevent_match(struct perf_event *event,
   9331				enum perf_type_id type,
   9332				u32 event_id,
   9333				struct perf_sample_data *data,
   9334				struct pt_regs *regs)
   9335{
   9336	if (event->attr.type != type)
   9337		return 0;
   9338
   9339	if (event->attr.config != event_id)
   9340		return 0;
   9341
   9342	if (perf_exclude_event(event, regs))
   9343		return 0;
   9344
   9345	return 1;
   9346}
   9347
   9348static inline u64 swevent_hash(u64 type, u32 event_id)
   9349{
   9350	u64 val = event_id | (type << 32);
   9351
   9352	return hash_64(val, SWEVENT_HLIST_BITS);
   9353}
   9354
   9355static inline struct hlist_head *
   9356__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
   9357{
   9358	u64 hash = swevent_hash(type, event_id);
   9359
   9360	return &hlist->heads[hash];
   9361}
   9362
   9363/* For the read side: events when they trigger */
   9364static inline struct hlist_head *
   9365find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
   9366{
   9367	struct swevent_hlist *hlist;
   9368
   9369	hlist = rcu_dereference(swhash->swevent_hlist);
   9370	if (!hlist)
   9371		return NULL;
   9372
   9373	return __find_swevent_head(hlist, type, event_id);
   9374}
   9375
   9376/* For the event head insertion and removal in the hlist */
   9377static inline struct hlist_head *
   9378find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
   9379{
   9380	struct swevent_hlist *hlist;
   9381	u32 event_id = event->attr.config;
   9382	u64 type = event->attr.type;
   9383
   9384	/*
   9385	 * Event scheduling is always serialized against hlist allocation
   9386	 * and release. Which makes the protected version suitable here.
   9387	 * The context lock guarantees that.
   9388	 */
   9389	hlist = rcu_dereference_protected(swhash->swevent_hlist,
   9390					  lockdep_is_held(&event->ctx->lock));
   9391	if (!hlist)
   9392		return NULL;
   9393
   9394	return __find_swevent_head(hlist, type, event_id);
   9395}
   9396
   9397static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
   9398				    u64 nr,
   9399				    struct perf_sample_data *data,
   9400				    struct pt_regs *regs)
   9401{
   9402	struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
   9403	struct perf_event *event;
   9404	struct hlist_head *head;
   9405
   9406	rcu_read_lock();
   9407	head = find_swevent_head_rcu(swhash, type, event_id);
   9408	if (!head)
   9409		goto end;
   9410
   9411	hlist_for_each_entry_rcu(event, head, hlist_entry) {
   9412		if (perf_swevent_match(event, type, event_id, data, regs))
   9413			perf_swevent_event(event, nr, data, regs);
   9414	}
   9415end:
   9416	rcu_read_unlock();
   9417}
   9418
/*
 * Per-CPU array of four struct pt_regs slots.
 * NOTE(review): presumably one slot per recursion context (compare
 * PERF_NR_CONTEXTS) - confirm against the matching declaration.
 */
DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
   9420
   9421int perf_swevent_get_recursion_context(void)
   9422{
   9423	struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
   9424
   9425	return get_recursion_context(swhash->recursion);
   9426}
   9427EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
   9428
   9429void perf_swevent_put_recursion_context(int rctx)
   9430{
   9431	struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
   9432
   9433	put_recursion_context(swhash->recursion, rctx);
   9434}
   9435
   9436void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
   9437{
   9438	struct perf_sample_data data;
   9439
   9440	if (WARN_ON_ONCE(!regs))
   9441		return;
   9442
   9443	perf_sample_data_init(&data, addr, 0);
   9444	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
   9445}
   9446
   9447void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
   9448{
   9449	int rctx;
   9450
   9451	preempt_disable_notrace();
   9452	rctx = perf_swevent_get_recursion_context();
   9453	if (unlikely(rctx < 0))
   9454		goto fail;
   9455
   9456	___perf_sw_event(event_id, nr, regs, addr);
   9457
   9458	perf_swevent_put_recursion_context(rctx);
   9459fail:
   9460	preempt_enable_notrace();
   9461}
   9462
/*
 * ->read() callback shared by the software style PMUs in this file.
 * Intentionally empty: there is nothing to do here on read.
 */
static void perf_swevent_read(struct perf_event *event)
{
}
   9466
   9467static int perf_swevent_add(struct perf_event *event, int flags)
   9468{
   9469	struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
   9470	struct hw_perf_event *hwc = &event->hw;
   9471	struct hlist_head *head;
   9472
   9473	if (is_sampling_event(event)) {
   9474		hwc->last_period = hwc->sample_period;
   9475		perf_swevent_set_period(event);
   9476	}
   9477
   9478	hwc->state = !(flags & PERF_EF_START);
   9479
   9480	head = find_swevent_head(swhash, event);
   9481	if (WARN_ON_ONCE(!head))
   9482		return -EINVAL;
   9483
   9484	hlist_add_head_rcu(&event->hlist_entry, head);
   9485	perf_event_update_userpage(event);
   9486
   9487	return 0;
   9488}
   9489
/*
 * ->del() callback: unlink @event from its hash bucket with
 * hlist_del_rcu(). @flags is not used.
 */
static void perf_swevent_del(struct perf_event *event, int flags)
{
	hlist_del_rcu(&event->hlist_entry);
}
   9494
   9495static void perf_swevent_start(struct perf_event *event, int flags)
   9496{
   9497	event->hw.state = 0;
   9498}
   9499
   9500static void perf_swevent_stop(struct perf_event *event, int flags)
   9501{
   9502	event->hw.state = PERF_HES_STOPPED;
   9503}
   9504
/*
 * Deref the hlist from the update side.
 *
 * The lockdep condition expects swhash->hlist_mutex to be held by the
 * caller.
 */
static inline struct swevent_hlist *
swevent_hlist_deref(struct swevent_htable *swhash)
{
	return rcu_dereference_protected(swhash->swevent_hlist,
					 lockdep_is_held(&swhash->hlist_mutex));
}
   9512
   9513static void swevent_hlist_release(struct swevent_htable *swhash)
   9514{
   9515	struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
   9516
   9517	if (!hlist)
   9518		return;
   9519
   9520	RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
   9521	kfree_rcu(hlist, rcu_head);
   9522}
   9523
   9524static void swevent_hlist_put_cpu(int cpu)
   9525{
   9526	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
   9527
   9528	mutex_lock(&swhash->hlist_mutex);
   9529
   9530	if (!--swhash->hlist_refcount)
   9531		swevent_hlist_release(swhash);
   9532
   9533	mutex_unlock(&swhash->hlist_mutex);
   9534}
   9535
   9536static void swevent_hlist_put(void)
   9537{
   9538	int cpu;
   9539
   9540	for_each_possible_cpu(cpu)
   9541		swevent_hlist_put_cpu(cpu);
   9542}
   9543
   9544static int swevent_hlist_get_cpu(int cpu)
   9545{
   9546	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
   9547	int err = 0;
   9548
   9549	mutex_lock(&swhash->hlist_mutex);
   9550	if (!swevent_hlist_deref(swhash) &&
   9551	    cpumask_test_cpu(cpu, perf_online_mask)) {
   9552		struct swevent_hlist *hlist;
   9553
   9554		hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
   9555		if (!hlist) {
   9556			err = -ENOMEM;
   9557			goto exit;
   9558		}
   9559		rcu_assign_pointer(swhash->swevent_hlist, hlist);
   9560	}
   9561	swhash->hlist_refcount++;
   9562exit:
   9563	mutex_unlock(&swhash->hlist_mutex);
   9564
   9565	return err;
   9566}
   9567
   9568static int swevent_hlist_get(void)
   9569{
   9570	int err, cpu, failed_cpu;
   9571
   9572	mutex_lock(&pmus_lock);
   9573	for_each_possible_cpu(cpu) {
   9574		err = swevent_hlist_get_cpu(cpu);
   9575		if (err) {
   9576			failed_cpu = cpu;
   9577			goto fail;
   9578		}
   9579	}
   9580	mutex_unlock(&pmus_lock);
   9581	return 0;
   9582fail:
   9583	for_each_possible_cpu(cpu) {
   9584		if (cpu == failed_cpu)
   9585			break;
   9586		swevent_hlist_put_cpu(cpu);
   9587	}
   9588	mutex_unlock(&pmus_lock);
   9589	return err;
   9590}
   9591
/*
 * One static key per software event id, incremented by
 * perf_swevent_init() and decremented by sw_perf_event_destroy().
 */
struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
   9593
   9594static void sw_perf_event_destroy(struct perf_event *event)
   9595{
   9596	u64 event_id = event->attr.config;
   9597
   9598	WARN_ON(event->parent);
   9599
   9600	static_key_slow_dec(&perf_swevent_enabled[event_id]);
   9601	swevent_hlist_put();
   9602}
   9603
   9604static int perf_swevent_init(struct perf_event *event)
   9605{
   9606	u64 event_id = event->attr.config;
   9607
   9608	if (event->attr.type != PERF_TYPE_SOFTWARE)
   9609		return -ENOENT;
   9610
   9611	/*
   9612	 * no branch sampling for software events
   9613	 */
   9614	if (has_branch_stack(event))
   9615		return -EOPNOTSUPP;
   9616
   9617	switch (event_id) {
   9618	case PERF_COUNT_SW_CPU_CLOCK:
   9619	case PERF_COUNT_SW_TASK_CLOCK:
   9620		return -ENOENT;
   9621
   9622	default:
   9623		break;
   9624	}
   9625
   9626	if (event_id >= PERF_COUNT_SW_MAX)
   9627		return -ENOENT;
   9628
   9629	if (!event->parent) {
   9630		int err;
   9631
   9632		err = swevent_hlist_get();
   9633		if (err)
   9634			return err;
   9635
   9636		static_key_slow_inc(&perf_swevent_enabled[event_id]);
   9637		event->destroy = sw_perf_event_destroy;
   9638	}
   9639
   9640	return 0;
   9641}
   9642
/*
 * Generic software event PMU: lives in the software task context,
 * is flagged PERF_PMU_CAP_NO_NMI and is served by the perf_swevent_*()
 * callbacks above.
 */
static struct pmu perf_swevent = {
	.task_ctx_nr	= perf_sw_context,

	.capabilities	= PERF_PMU_CAP_NO_NMI,

	.event_init	= perf_swevent_init,
	.add		= perf_swevent_add,
	.del		= perf_swevent_del,
	.start		= perf_swevent_start,
	.stop		= perf_swevent_stop,
	.read		= perf_swevent_read,
};
   9655
   9656#ifdef CONFIG_EVENT_TRACING
   9657
   9658static int perf_tp_filter_match(struct perf_event *event,
   9659				struct perf_sample_data *data)
   9660{
   9661	void *record = data->raw->frag.data;
   9662
   9663	/* only top level events have filters set */
   9664	if (event->parent)
   9665		event = event->parent;
   9666
   9667	if (likely(!event->filter) || filter_match_preds(event->filter, record))
   9668		return 1;
   9669	return 0;
   9670}
   9671
   9672static int perf_tp_event_match(struct perf_event *event,
   9673				struct perf_sample_data *data,
   9674				struct pt_regs *regs)
   9675{
   9676	if (event->hw.state & PERF_HES_STOPPED)
   9677		return 0;
   9678	/*
   9679	 * If exclude_kernel, only trace user-space tracepoints (uprobes)
   9680	 */
   9681	if (event->attr.exclude_kernel && !user_mode(regs))
   9682		return 0;
   9683
   9684	if (!perf_tp_filter_match(event, data))
   9685		return 0;
   9686
   9687	return 1;
   9688}
   9689
/*
 * Run the BPF programs attached to @call, if any, and then submit the
 * trace record to perf through perf_tp_event().
 *
 * When a valid BPF program array exists, @regs is stored at the start
 * of @raw_data before the programs run. If trace_call_bpf() returns 0
 * or @head is empty, the recursion context @rctx is released here and
 * nothing is submitted.
 */
void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
			       struct trace_event_call *call, u64 count,
			       struct pt_regs *regs, struct hlist_head *head,
			       struct task_struct *task)
{
	if (bpf_prog_array_valid(call)) {
		/* The first pointer-sized slot of the record carries regs. */
		*(struct pt_regs **)raw_data = regs;
		if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) {
			perf_swevent_put_recursion_context(rctx);
			return;
		}
	}
	perf_tp_event(call->event.type, count, raw_data, size, regs, head,
		      rctx, task);
}
EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
   9706
/*
 * Deliver a tracepoint hit to perf.
 *
 * @event_type: tracepoint type written into the record
 * @count:      number of occurrences to account
 * @record:     raw trace record
 * @entry_size: size of @record in bytes
 * @regs:       register state for the sample
 * @head:       list of events attached to this tracepoint
 * @rctx:       recursion context held by the caller; released here
 * @task:       optional target task whose software context is
 *              searched for additional matching events
 */
void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
		   struct pt_regs *regs, struct hlist_head *head, int rctx,
		   struct task_struct *task)
{
	struct perf_sample_data data;
	struct perf_event *event;

	/* Wrap the raw record as a single fragment for the sample data. */
	struct perf_raw_record raw = {
		.frag = {
			.size = entry_size,
			.data = record,
		},
	};

	perf_sample_data_init(&data, 0, 0);
	data.raw = &raw;

	perf_trace_buf_update(record, event_type);

	/* Deliver to every matching event on the tracepoint's own list. */
	hlist_for_each_entry_rcu(event, head, hlist_entry) {
		if (perf_tp_event_match(event, &data, regs))
			perf_swevent_event(event, count, &data, regs);
	}

	/*
	 * If a target task was specified, also iterate its context and
	 * deliver this event there too.
	 */
	if (task && task != current) {
		struct perf_event_context *ctx;
		struct trace_entry *entry = record;

		rcu_read_lock();
		ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
		if (!ctx)
			goto unlock;

		list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
			/* Only events bound to the current CPU are considered. */
			if (event->cpu != smp_processor_id())
				continue;
			if (event->attr.type != PERF_TYPE_TRACEPOINT)
				continue;
			if (event->attr.config != entry->type)
				continue;
			/* Cannot deliver synchronous signal to other task. */
			if (event->attr.sigtrap)
				continue;
			if (perf_tp_event_match(event, &data, regs))
				perf_swevent_event(event, count, &data, regs);
		}
unlock:
		rcu_read_unlock();
	}

	perf_swevent_put_recursion_context(rctx);
}
EXPORT_SYMBOL_GPL(perf_tp_event);
   9764
/*
 * ->destroy() callback for tracepoint events; forwards to
 * perf_trace_destroy().
 */
static void tp_perf_event_destroy(struct perf_event *event)
{
	perf_trace_destroy(event);
}
   9769
   9770static int perf_tp_event_init(struct perf_event *event)
   9771{
   9772	int err;
   9773
   9774	if (event->attr.type != PERF_TYPE_TRACEPOINT)
   9775		return -ENOENT;
   9776
   9777	/*
   9778	 * no branch sampling for tracepoint events
   9779	 */
   9780	if (has_branch_stack(event))
   9781		return -EOPNOTSUPP;
   9782
   9783	err = perf_trace_init(event);
   9784	if (err)
   9785		return err;
   9786
   9787	event->destroy = tp_perf_event_destroy;
   9788
   9789	return 0;
   9790}
   9791
/*
 * Tracepoint PMU: lives in the software task context; add/del go
 * through perf_trace_add()/perf_trace_del() while start/stop/read
 * reuse the software event callbacks.
 */
static struct pmu perf_tracepoint = {
	.task_ctx_nr	= perf_sw_context,

	.event_init	= perf_tp_event_init,
	.add		= perf_trace_add,
	.del		= perf_trace_del,
	.start		= perf_swevent_start,
	.stop		= perf_swevent_stop,
	.read		= perf_swevent_read,
};
   9802
   9803#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
/*
 * Flags in config, used by dynamic PMU kprobe and uprobe
 * The flags should match following PMU_FORMAT_ATTR().
 *
 * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
 *                               if not set, create kprobe/uprobe
 *
 * The following values specify a reference counter (or semaphore in the
 * terminology of tools like dtrace, systemtap, etc.) Userspace Statically
 * Defined Tracepoints (USDT). Currently, 32 bits are used for the offset
 * (the upper half of the 64-bit config).
 *
 * PERF_UPROBE_REF_CTR_OFFSET_BITS	# of bits in config as the offset
 * PERF_UPROBE_REF_CTR_OFFSET_SHIFT	# of bits to shift left
 */
enum perf_probe_config {
	PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0,  /* [k,u]retprobe */
	PERF_UPROBE_REF_CTR_OFFSET_BITS = 32,
	PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS,
};

/* Format attribute "retprobe": bit 0 of config. */
PMU_FORMAT_ATTR(retprobe, "config:0");
   9825#endif
   9826
   9827#ifdef CONFIG_KPROBE_EVENTS
/* Format attributes exposed by the kprobe PMU: only "retprobe". */
static struct attribute *kprobe_attrs[] = {
	&format_attr_retprobe.attr,
	NULL,
};

/* The "format" attribute group of the kprobe PMU. */
static struct attribute_group kprobe_format_group = {
	.name = "format",
	.attrs = kprobe_attrs,
};

/* NULL-terminated list of attribute groups for the kprobe PMU. */
static const struct attribute_group *kprobe_attr_groups[] = {
	&kprobe_format_group,
	NULL,
};

/* Declared ahead of the PMU definition, which references it. */
static int perf_kprobe_event_init(struct perf_event *event);

/*
 * kprobe PMU: lives in the software task context; add/del go through
 * perf_trace_add()/perf_trace_del() while start/stop/read reuse the
 * software event callbacks.
 */
static struct pmu perf_kprobe = {
	.task_ctx_nr	= perf_sw_context,
	.event_init	= perf_kprobe_event_init,
	.add		= perf_trace_add,
	.del		= perf_trace_del,
	.start		= perf_swevent_start,
	.stop		= perf_swevent_stop,
	.read		= perf_swevent_read,
	.attr_groups	= kprobe_attr_groups,
};
   9854
   9855static int perf_kprobe_event_init(struct perf_event *event)
   9856{
   9857	int err;
   9858	bool is_retprobe;
   9859
   9860	if (event->attr.type != perf_kprobe.type)
   9861		return -ENOENT;
   9862
   9863	if (!perfmon_capable())
   9864		return -EACCES;
   9865
   9866	/*
   9867	 * no branch sampling for probe events
   9868	 */
   9869	if (has_branch_stack(event))
   9870		return -EOPNOTSUPP;
   9871
   9872	is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
   9873	err = perf_kprobe_init(event, is_retprobe);
   9874	if (err)
   9875		return err;
   9876
   9877	event->destroy = perf_kprobe_destroy;
   9878
   9879	return 0;
   9880}
   9881#endif /* CONFIG_KPROBE_EVENTS */
   9882
   9883#ifdef CONFIG_UPROBE_EVENTS
/* Format attribute "ref_ctr_offset": bits 32-63 of config. */
PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63");

/* Format attributes exposed by the uprobe PMU. */
static struct attribute *uprobe_attrs[] = {
	&format_attr_retprobe.attr,
	&format_attr_ref_ctr_offset.attr,
	NULL,
};

/* The "format" attribute group of the uprobe PMU. */
static struct attribute_group uprobe_format_group = {
	.name = "format",
	.attrs = uprobe_attrs,
};

/* NULL-terminated list of attribute groups for the uprobe PMU. */
static const struct attribute_group *uprobe_attr_groups[] = {
	&uprobe_format_group,
	NULL,
};

/* Declared ahead of the PMU definition, which references it. */
static int perf_uprobe_event_init(struct perf_event *event);

/*
 * uprobe PMU: lives in the software task context; add/del go through
 * perf_trace_add()/perf_trace_del() while start/stop/read reuse the
 * software event callbacks.
 */
static struct pmu perf_uprobe = {
	.task_ctx_nr	= perf_sw_context,
	.event_init	= perf_uprobe_event_init,
	.add		= perf_trace_add,
	.del		= perf_trace_del,
	.start		= perf_swevent_start,
	.stop		= perf_swevent_stop,
	.read		= perf_swevent_read,
	.attr_groups	= uprobe_attr_groups,
};
   9913
   9914static int perf_uprobe_event_init(struct perf_event *event)
   9915{
   9916	int err;
   9917	unsigned long ref_ctr_offset;
   9918	bool is_retprobe;
   9919
   9920	if (event->attr.type != perf_uprobe.type)
   9921		return -ENOENT;
   9922
   9923	if (!perfmon_capable())
   9924		return -EACCES;
   9925
   9926	/*
   9927	 * no branch sampling for probe events
   9928	 */
   9929	if (has_branch_stack(event))
   9930		return -EOPNOTSUPP;
   9931
   9932	is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
   9933	ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT;
   9934	err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe);
   9935	if (err)
   9936		return err;
   9937
   9938	event->destroy = perf_uprobe_destroy;
   9939
   9940	return 0;
   9941}
   9942#endif /* CONFIG_UPROBE_EVENTS */
   9943
/*
 * Register the tracepoint PMU under the fixed PERF_TYPE_TRACEPOINT
 * type and, when configured in, the kprobe and uprobe PMUs with a
 * type argument of -1. Return values of perf_pmu_register() are not
 * checked.
 */
static inline void perf_tp_register(void)
{
	perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
#ifdef CONFIG_KPROBE_EVENTS
	perf_pmu_register(&perf_kprobe, "kprobe", -1);
#endif
#ifdef CONFIG_UPROBE_EVENTS
	perf_pmu_register(&perf_uprobe, "uprobe", -1);
#endif
}
   9954
/* Free the filter of @event via ftrace_profile_free_filter(). */
static void perf_event_free_filter(struct perf_event *event)
{
	ftrace_profile_free_filter(event);
}
   9959
   9960#ifdef CONFIG_BPF_SYSCALL
/*
 * Overflow handler installed by perf_event_set_bpf_handler().
 *
 * Runs the BPF program attached to @event, if any, and calls the
 * original overflow handler only when the program returns non-zero.
 * The program is not run when bpf_prog_active on this CPU is already
 * non-zero; in that case, and when no program is attached, ret stays 0
 * and the original handler is skipped.
 */
static void bpf_overflow_handler(struct perf_event *event,
				 struct perf_sample_data *data,
				 struct pt_regs *regs)
{
	struct bpf_perf_event_data_kern ctx = {
		.data = data,
		.event = event,
	};
	struct bpf_prog *prog;
	int ret = 0;

	ctx.regs = perf_arch_bpf_user_pt_regs(regs);
	/* Only the outermost BPF invocation on this CPU may run the program. */
	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
		goto out;
	rcu_read_lock();
	prog = READ_ONCE(event->prog);
	if (prog)
		ret = bpf_prog_run(prog, &ctx);
	rcu_read_unlock();
out:
	/* Balance the increment above on every path. */
	__this_cpu_dec(bpf_prog_active);
	if (!ret)
		return;

	event->orig_overflow_handler(event, data, regs);
}
   9987
   9988static int perf_event_set_bpf_handler(struct perf_event *event,
   9989				      struct bpf_prog *prog,
   9990				      u64 bpf_cookie)
   9991{
   9992	if (event->overflow_handler_context)
   9993		/* hw breakpoint or kernel counter */
   9994		return -EINVAL;
   9995
   9996	if (event->prog)
   9997		return -EEXIST;
   9998
   9999	if (prog->type != BPF_PROG_TYPE_PERF_EVENT)
  10000		return -EINVAL;
  10001
  10002	if (event->attr.precise_ip &&
  10003	    prog->call_get_stack &&
  10004	    (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) ||
  10005	     event->attr.exclude_callchain_kernel ||
  10006	     event->attr.exclude_callchain_user)) {
  10007		/*
  10008		 * On perf_event with precise_ip, calling bpf_get_stack()
  10009		 * may trigger unwinder warnings and occasional crashes.
  10010		 * bpf_get_[stack|stackid] works around this issue by using
  10011		 * callchain attached to perf_sample_data. If the
  10012		 * perf_event does not full (kernel and user) callchain
  10013		 * attached to perf_sample_data, do not allow attaching BPF
  10014		 * program that calls bpf_get_[stack|stackid].
  10015		 */
  10016		return -EPROTO;
  10017	}
  10018
  10019	event->prog = prog;
  10020	event->bpf_cookie = bpf_cookie;
  10021	event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
  10022	WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
  10023	return 0;
  10024}
  10025
  10026static void perf_event_free_bpf_handler(struct perf_event *event)
  10027{
  10028	struct bpf_prog *prog = event->prog;
  10029
  10030	if (!prog)
  10031		return;
  10032
  10033	WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
  10034	event->prog = NULL;
  10035	bpf_prog_put(prog);
  10036}
  10037#else
/* Stub for builds without CONFIG_BPF_SYSCALL: always -EOPNOTSUPP. */
static int perf_event_set_bpf_handler(struct perf_event *event,
				      struct bpf_prog *prog,
				      u64 bpf_cookie)
{
	return -EOPNOTSUPP;
}
/* Stub for builds without CONFIG_BPF_SYSCALL: nothing to free. */
static void perf_event_free_bpf_handler(struct perf_event *event)
{
}
  10047#endif
  10048
/*
 * Returns true if the event is a tracepoint, or a kprobe/uprobe created
 * with perf_event_open(), i.e. if its PMU is perf_tracepoint,
 * perf_kprobe or perf_uprobe (the latter two only when configured in).
 */
static inline bool perf_event_is_tracing(struct perf_event *event)
{
	if (event->pmu == &perf_tracepoint)
		return true;
#ifdef CONFIG_KPROBE_EVENTS
	if (event->pmu == &perf_kprobe)
		return true;
#endif
#ifdef CONFIG_UPROBE_EVENTS
	if (event->pmu == &perf_uprobe)
		return true;
#endif
	return false;
}
  10067
  10068int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
  10069			    u64 bpf_cookie)
  10070{
  10071	bool is_kprobe, is_tracepoint, is_syscall_tp;
  10072
  10073	if (!perf_event_is_tracing(event))
  10074		return perf_event_set_bpf_handler(event, prog, bpf_cookie);
  10075
  10076	is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
  10077	is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
  10078	is_syscall_tp = is_syscall_trace_event(event->tp_event);
  10079	if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
  10080		/* bpf programs can only be attached to u/kprobe or tracepoint */
  10081		return -EINVAL;
  10082
  10083	if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
  10084	    (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
  10085	    (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT))
  10086		return -EINVAL;
  10087
  10088	/* Kprobe override only works for kprobes, not uprobes. */
  10089	if (prog->kprobe_override &&
  10090	    !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
  10091		return -EINVAL;
  10092
  10093	if (is_tracepoint || is_syscall_tp) {
  10094		int off = trace_event_get_offsets(event->tp_event);
  10095
  10096		if (prog->aux->max_ctx_offset > off)
  10097			return -EACCES;
  10098	}
  10099
  10100	return perf_event_attach_bpf_prog(event, prog, bpf_cookie);
  10101}
  10102
  10103void perf_event_free_bpf_prog(struct perf_event *event)
  10104{
  10105	if (!perf_event_is_tracing(event)) {
  10106		perf_event_free_bpf_handler(event);
  10107		return;
  10108	}
  10109	perf_event_detach_bpf_prog(event);
  10110}
  10111
  10112#else
  10113
/* Stub for builds without CONFIG_EVENT_TRACING: nothing to register. */
static inline void perf_tp_register(void)
{
}
  10117
/* Stub for builds without CONFIG_EVENT_TRACING: no filter to free. */
static void perf_event_free_filter(struct perf_event *event)
{
}
  10121
/* Stub for builds without CONFIG_EVENT_TRACING: always -ENOENT. */
int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
			    u64 bpf_cookie)
{
	return -ENOENT;
}
  10127
/* Stub for builds without CONFIG_EVENT_TRACING: nothing to detach. */
void perf_event_free_bpf_prog(struct perf_event *event)
{
}
  10131#endif /* CONFIG_EVENT_TRACING */
  10132
  10133#ifdef CONFIG_HAVE_HW_BREAKPOINT
  10134void perf_bp_event(struct perf_event *bp, void *data)
  10135{
  10136	struct perf_sample_data sample;
  10137	struct pt_regs *regs = data;
  10138
  10139	perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
  10140
  10141	if (!bp->hw.state && !perf_exclude_event(bp, regs))
  10142		perf_swevent_event(bp, 1, &sample, regs);
  10143}
  10144#endif
  10145
  10146/*
  10147 * Allocate a new address filter
  10148 */
  10149static struct perf_addr_filter *
  10150perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
  10151{
  10152	int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu);
  10153	struct perf_addr_filter *filter;
  10154
  10155	filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
  10156	if (!filter)
  10157		return NULL;
  10158
  10159	INIT_LIST_HEAD(&filter->entry);
  10160	list_add_tail(&filter->entry, filters);
  10161
  10162	return filter;
  10163}
  10164
  10165static void free_filters_list(struct list_head *filters)
  10166{
  10167	struct perf_addr_filter *filter, *iter;
  10168
  10169	list_for_each_entry_safe(filter, iter, filters, entry) {
  10170		path_put(&filter->path);
  10171		list_del(&filter->entry);
  10172		kfree(filter);
  10173	}
  10174}
  10175
  10176/*
  10177 * Free existing address filters and optionally install new ones
  10178 */
  10179static void perf_addr_filters_splice(struct perf_event *event,
  10180				     struct list_head *head)
  10181{
  10182	unsigned long flags;
  10183	LIST_HEAD(list);
  10184
  10185	if (!has_addr_filter(event))
  10186		return;
  10187
  10188	/* don't bother with children, they don't have their own filters */
  10189	if (event->parent)
  10190		return;
  10191
  10192	raw_spin_lock_irqsave(&event->addr_filters.lock, flags);
  10193
  10194	list_splice_init(&event->addr_filters.list, &list);
  10195	if (head)
  10196		list_splice(head, &event->addr_filters.list);
  10197
  10198	raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);
  10199
  10200	free_filters_list(&list);
  10201}
  10202
  10203/*
  10204 * Scan through mm's vmas and see if one of them matches the
  10205 * @filter; if so, adjust filter's address range.
  10206 * Called with mm::mmap_lock down for reading.
  10207 */
  10208static void perf_addr_filter_apply(struct perf_addr_filter *filter,
  10209				   struct mm_struct *mm,
  10210				   struct perf_addr_filter_range *fr)
  10211{
  10212	struct vm_area_struct *vma;
  10213
  10214	for (vma = mm->mmap; vma; vma = vma->vm_next) {
  10215		if (!vma->vm_file)
  10216			continue;
  10217
  10218		if (perf_addr_filter_vma_adjust(filter, vma, fr))
  10219			return;
  10220	}
  10221}
  10222
/*
 * Update event's address range filters based on the
 * task's existing mappings, if any.
 *
 * Under ifh->lock, every filter on the list gets its slot in
 * event->addr_filter_ranges[] rewritten: filters with a path dentry are
 * reset to {0, 0} and then resolved against the task's vmas, all others
 * copy filter->offset / filter->size verbatim. addr_filters_gen is bumped
 * afterwards and the function always finishes with
 * perf_event_stop(event, 1), unless the context's task is TASK_TOMBSTONE.
 */
static void perf_event_addr_filters_apply(struct perf_event *event)
{
	struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
	struct task_struct *task = READ_ONCE(event->ctx->task);
	struct perf_addr_filter *filter;
	struct mm_struct *mm = NULL;
	unsigned int count = 0;
	unsigned long flags;

	/*
	 * We may observe TASK_TOMBSTONE, which means that the event tear-down
	 * will stop on the parent's child_mutex that our caller is also holding
	 */
	if (task == TASK_TOMBSTONE)
		return;

	/* The mm is only needed (and only pinned) when file filters exist. */
	if (ifh->nr_file_filters) {
		mm = get_task_mm(task);
		/* No mm to scan: skip the range update, still stop the event. */
		if (!mm)
			goto restart;

		mmap_read_lock(mm);
	}

	raw_spin_lock_irqsave(&ifh->lock, flags);
	list_for_each_entry(filter, &ifh->list, entry) {
		if (filter->path.dentry) {
			/*
			 * Adjust base offset if the filter is associated to a
			 * binary that needs to be mapped:
			 */
			event->addr_filter_ranges[count].start = 0;
			event->addr_filter_ranges[count].size = 0;

			perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]);
		} else {
			event->addr_filter_ranges[count].start = filter->offset;
			event->addr_filter_ranges[count].size  = filter->size;
		}

		/* ranges[] is indexed by the filter's position on the list */
		count++;
	}

	event->addr_filters_gen++;
	raw_spin_unlock_irqrestore(&ifh->lock, flags);

	/* Undo the mmap_read_lock()/get_task_mm() taken above. */
	if (ifh->nr_file_filters) {
		mmap_read_unlock(mm);

		mmput(mm);
	}

restart:
	perf_event_stop(event, 1);
}
  10282
/*
 * Address range filtering: limiting the data to certain
 * instruction address ranges. Filters are ioctl()ed to us from
 * userspace as ascii strings.
 *
 * Filter string format:
 *
 * ACTION RANGE_SPEC
 * where ACTION is one of the
 *  * "filter": limit the trace to this region
 *  * "start": start tracing from this address
 *  * "stop": stop tracing at this address/region;
 * RANGE_SPEC is
 *  * for kernel addresses: <start address>[/<size>]
 *  * for object files:     <start address>[/<size>]@</path/to/object/file>
 *
 * if <size> is not specified or is zero, the range is treated as a single
 * address; not valid for ACTION=="filter".
 *
 * Token ids returned by match_token() for the if_tokens table. The
 * IF_ACT_* values double as indices into the parser's actions[] array.
 */
enum {
	IF_ACT_NONE = -1,	/* table terminator / no match */
	IF_ACT_FILTER,		/* "filter" */
	IF_ACT_START,		/* "start" */
	IF_ACT_STOP,		/* "stop" */
	IF_SRC_FILE,		/* <addr>/<size>@<path> */
	IF_SRC_KERNEL,		/* <addr>/<size> */
	IF_SRC_FILEADDR,	/* <addr>@<path> */
	IF_SRC_KERNELADDR,	/* <addr> */
};
  10312
/* States of the address filter string parser. */
enum {
	IF_STATE_ACTION = 0,	/* expecting an ACTION token; a new filter begins */
	IF_STATE_SOURCE,	/* ACTION seen, expecting a RANGE_SPEC token */
	IF_STATE_END,		/* RANGE_SPEC parsed, filter ready for validation */
};
  10318
/*
 * match_token() patterns for the filter string: the three ACTION keywords
 * followed by the four RANGE_SPEC shapes. The table is terminated by the
 * IF_ACT_NONE / NULL entry.
 */
static const match_table_t if_tokens = {
	{ IF_ACT_FILTER,	"filter" },
	{ IF_ACT_START,		"start" },
	{ IF_ACT_STOP,		"stop" },
	{ IF_SRC_FILE,		"%u/%u@%s" },
	{ IF_SRC_KERNEL,	"%u/%u" },
	{ IF_SRC_FILEADDR,	"%u@%s" },
	{ IF_SRC_KERNELADDR,	"%u" },
	{ IF_ACT_NONE,		NULL },
};
  10329
  10330/*
  10331 * Address filter string parser
  10332 */
  10333static int
  10334perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
  10335			     struct list_head *filters)
  10336{
  10337	struct perf_addr_filter *filter = NULL;
  10338	char *start, *orig, *filename = NULL;
  10339	substring_t args[MAX_OPT_ARGS];
  10340	int state = IF_STATE_ACTION, token;
  10341	unsigned int kernel = 0;
  10342	int ret = -EINVAL;
  10343
  10344	orig = fstr = kstrdup(fstr, GFP_KERNEL);
  10345	if (!fstr)
  10346		return -ENOMEM;
  10347
  10348	while ((start = strsep(&fstr, " ,\n")) != NULL) {
  10349		static const enum perf_addr_filter_action_t actions[] = {
  10350			[IF_ACT_FILTER]	= PERF_ADDR_FILTER_ACTION_FILTER,
  10351			[IF_ACT_START]	= PERF_ADDR_FILTER_ACTION_START,
  10352			[IF_ACT_STOP]	= PERF_ADDR_FILTER_ACTION_STOP,
  10353		};
  10354		ret = -EINVAL;
  10355
  10356		if (!*start)
  10357			continue;
  10358
  10359		/* filter definition begins */
  10360		if (state == IF_STATE_ACTION) {
  10361			filter = perf_addr_filter_new(event, filters);
  10362			if (!filter)
  10363				goto fail;
  10364		}
  10365
  10366		token = match_token(start, if_tokens, args);
  10367		switch (token) {
  10368		case IF_ACT_FILTER:
  10369		case IF_ACT_START:
  10370		case IF_ACT_STOP:
  10371			if (state != IF_STATE_ACTION)
  10372				goto fail;
  10373
  10374			filter->action = actions[token];
  10375			state = IF_STATE_SOURCE;
  10376			break;
  10377
  10378		case IF_SRC_KERNELADDR:
  10379		case IF_SRC_KERNEL:
  10380			kernel = 1;
  10381			fallthrough;
  10382
  10383		case IF_SRC_FILEADDR:
  10384		case IF_SRC_FILE:
  10385			if (state != IF_STATE_SOURCE)
  10386				goto fail;
  10387
  10388			*args[0].to = 0;
  10389			ret = kstrtoul(args[0].from, 0, &filter->offset);
  10390			if (ret)
  10391				goto fail;
  10392
  10393			if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) {
  10394				*args[1].to = 0;
  10395				ret = kstrtoul(args[1].from, 0, &filter->size);
  10396				if (ret)
  10397					goto fail;
  10398			}
  10399
  10400			if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
  10401				int fpos = token == IF_SRC_FILE ? 2 : 1;
  10402
  10403				kfree(filename);
  10404				filename = match_strdup(&args[fpos]);
  10405				if (!filename) {
  10406					ret = -ENOMEM;
  10407					goto fail;
  10408				}
  10409			}
  10410
  10411			state = IF_STATE_END;
  10412			break;
  10413
  10414		default:
  10415			goto fail;
  10416		}
  10417
  10418		/*
  10419		 * Filter definition is fully parsed, validate and install it.
  10420		 * Make sure that it doesn't contradict itself or the event's
  10421		 * attribute.
  10422		 */
  10423		if (state == IF_STATE_END) {
  10424			ret = -EINVAL;
  10425
  10426			/*
  10427			 * ACTION "filter" must have a non-zero length region
  10428			 * specified.
  10429			 */
  10430			if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER &&
  10431			    !filter->size)
  10432				goto fail;
  10433
  10434			if (!kernel) {
  10435				if (!filename)
  10436					goto fail;
  10437
  10438				/*
  10439				 * For now, we only support file-based filters
  10440				 * in per-task events; doing so for CPU-wide
  10441				 * events requires additional context switching
  10442				 * trickery, since same object code will be
  10443				 * mapped at different virtual addresses in
  10444				 * different processes.
  10445				 */
  10446				ret = -EOPNOTSUPP;
  10447				if (!event->ctx->task)
  10448					goto fail;
  10449
  10450				/* look up the path and grab its inode */
  10451				ret = kern_path(filename, LOOKUP_FOLLOW,
  10452						&filter->path);
  10453				if (ret)
  10454					goto fail;
  10455
  10456				ret = -EINVAL;
  10457				if (!filter->path.dentry ||
  10458				    !S_ISREG(d_inode(filter->path.dentry)
  10459					     ->i_mode))
  10460					goto fail;
  10461
  10462				event->addr_filters.nr_file_filters++;
  10463			}
  10464
  10465			/* ready to consume more filters */
  10466			kfree(filename);
  10467			filename = NULL;
  10468			state = IF_STATE_ACTION;
  10469			filter = NULL;
  10470			kernel = 0;
  10471		}
  10472	}
  10473
  10474	if (state != IF_STATE_ACTION)
  10475		goto fail;
  10476
  10477	kfree(filename);
  10478	kfree(orig);
  10479
  10480	return 0;
  10481
  10482fail:
  10483	kfree(filename);
  10484	free_filters_list(filters);
  10485	kfree(orig);
  10486
  10487	return ret;
  10488}
  10489
  10490static int
  10491perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
  10492{
  10493	LIST_HEAD(filters);
  10494	int ret;
  10495
  10496	/*
  10497	 * Since this is called in perf_ioctl() path, we're already holding
  10498	 * ctx::mutex.
  10499	 */
  10500	lockdep_assert_held(&event->ctx->mutex);
  10501
  10502	if (WARN_ON_ONCE(event->parent))
  10503		return -EINVAL;
  10504
  10505	ret = perf_event_parse_addr_filter(event, filter_str, &filters);
  10506	if (ret)
  10507		goto fail_clear_files;
  10508
  10509	ret = event->pmu->addr_filters_validate(&filters);
  10510	if (ret)
  10511		goto fail_free_filters;
  10512
  10513	/* remove existing filters, if any */
  10514	perf_addr_filters_splice(event, &filters);
  10515
  10516	/* install new filters */
  10517	perf_event_for_each_child(event, perf_event_addr_filters_apply);
  10518
  10519	return ret;
  10520
  10521fail_free_filters:
  10522	free_filters_list(&filters);
  10523
  10524fail_clear_files:
  10525	event->addr_filters.nr_file_filters = 0;
  10526
  10527	return ret;
  10528}
  10529
/*
 * Install the filter string at user pointer @arg on @event.
 *
 * At most PAGE_SIZE bytes are copied from userspace. Tracing events
 * (CONFIG_EVENT_TRACING only) are handed to ftrace_profile_set_filter();
 * otherwise events with address filters go through
 * perf_event_set_addr_filter(). Any other event yields -EINVAL.
 */
static int perf_event_set_filter(struct perf_event *event, void __user *arg)
{
	int ret = -EINVAL;
	char *filter_str;

	filter_str = strndup_user(arg, PAGE_SIZE);
	if (IS_ERR(filter_str))
		return PTR_ERR(filter_str);

#ifdef CONFIG_EVENT_TRACING
	if (perf_event_is_tracing(event)) {
		struct perf_event_context *ctx = event->ctx;

		/*
		 * Beware, here be dragons!!
		 *
		 * the tracepoint muck will deadlock against ctx->mutex, but
		 * the tracepoint stuff does not actually need it. So
		 * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we
		 * already have a reference on ctx.
		 *
		 * This can result in event getting moved to a different ctx,
		 * but that does not affect the tracepoint state.
		 */
		mutex_unlock(&ctx->mutex);
		ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
		mutex_lock(&ctx->mutex);
	} else
#endif
	/* NOTE: with CONFIG_EVENT_TRACING this 'if' is the body of the 'else' above */
	if (has_addr_filter(event))
		ret = perf_event_set_addr_filter(event, filter_str);

	kfree(filter_str);
	return ret;
}
  10565
  10566/*
  10567 * hrtimer based swevent callback
  10568 */
  10569
  10570static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
  10571{
  10572	enum hrtimer_restart ret = HRTIMER_RESTART;
  10573	struct perf_sample_data data;
  10574	struct pt_regs *regs;
  10575	struct perf_event *event;
  10576	u64 period;
  10577
  10578	event = container_of(hrtimer, struct perf_event, hw.hrtimer);
  10579
  10580	if (event->state != PERF_EVENT_STATE_ACTIVE)
  10581		return HRTIMER_NORESTART;
  10582
  10583	event->pmu->read(event);
  10584
  10585	perf_sample_data_init(&data, 0, event->hw.last_period);
  10586	regs = get_irq_regs();
  10587
  10588	if (regs && !perf_exclude_event(event, regs)) {
  10589		if (!(event->attr.exclude_idle && is_idle_task(current)))
  10590			if (__perf_event_overflow(event, 1, &data, regs))
  10591				ret = HRTIMER_NORESTART;
  10592	}
  10593
  10594	period = max_t(u64, 10000, event->hw.sample_period);
  10595	hrtimer_forward_now(hrtimer, ns_to_ktime(period));
  10596
  10597	return ret;
  10598}
  10599
  10600static void perf_swevent_start_hrtimer(struct perf_event *event)
  10601{
  10602	struct hw_perf_event *hwc = &event->hw;
  10603	s64 period;
  10604
  10605	if (!is_sampling_event(event))
  10606		return;
  10607
  10608	period = local64_read(&hwc->period_left);
  10609	if (period) {
  10610		if (period < 0)
  10611			period = 10000;
  10612
  10613		local64_set(&hwc->period_left, 0);
  10614	} else {
  10615		period = max_t(u64, 10000, hwc->sample_period);
  10616	}
  10617	hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
  10618		      HRTIMER_MODE_REL_PINNED_HARD);
  10619}
  10620
  10621static void perf_swevent_cancel_hrtimer(struct perf_event *event)
  10622{
  10623	struct hw_perf_event *hwc = &event->hw;
  10624
  10625	if (is_sampling_event(event)) {
  10626		ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
  10627		local64_set(&hwc->period_left, ktime_to_ns(remaining));
  10628
  10629		hrtimer_cancel(&hwc->hrtimer);
  10630	}
  10631}
  10632
  10633static void perf_swevent_init_hrtimer(struct perf_event *event)
  10634{
  10635	struct hw_perf_event *hwc = &event->hw;
  10636
  10637	if (!is_sampling_event(event))
  10638		return;
  10639
  10640	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
  10641	hwc->hrtimer.function = perf_swevent_hrtimer;
  10642
  10643	/*
  10644	 * Since hrtimers have a fixed rate, we can do a static freq->period
  10645	 * mapping and avoid the whole period adjust feedback stuff.
  10646	 */
  10647	if (event->attr.freq) {
  10648		long freq = event->attr.sample_freq;
  10649
  10650		event->attr.sample_period = NSEC_PER_SEC / freq;
  10651		hwc->sample_period = event->attr.sample_period;
  10652		local64_set(&hwc->period_left, hwc->sample_period);
  10653		hwc->last_period = hwc->sample_period;
  10654		event->attr.freq = 0;
  10655	}
  10656}
  10657
  10658/*
  10659 * Software event: cpu wall time clock
  10660 */
  10661
  10662static void cpu_clock_event_update(struct perf_event *event)
  10663{
  10664	s64 prev;
  10665	u64 now;
  10666
  10667	now = local_clock();
  10668	prev = local64_xchg(&event->hw.prev_count, now);
  10669	local64_add(now - prev, &event->count);
  10670}
  10671
  10672static void cpu_clock_event_start(struct perf_event *event, int flags)
  10673{
  10674	local64_set(&event->hw.prev_count, local_clock());
  10675	perf_swevent_start_hrtimer(event);
  10676}
  10677
/* Cancel the sampling hrtimer, then account the time elapsed so far. */
static void cpu_clock_event_stop(struct perf_event *event, int flags)
{
	perf_swevent_cancel_hrtimer(event);
	cpu_clock_event_update(event);
}
  10683
  10684static int cpu_clock_event_add(struct perf_event *event, int flags)
  10685{
  10686	if (flags & PERF_EF_START)
  10687		cpu_clock_event_start(event, flags);
  10688	perf_event_update_userpage(event);
  10689
  10690	return 0;
  10691}
  10692
/* pmu::del callback: removing the event simply stops it. */
static void cpu_clock_event_del(struct perf_event *event, int flags)
{
	cpu_clock_event_stop(event, flags);
}
  10697
/* pmu::read callback: bring event->count up to the current local_clock(). */
static void cpu_clock_event_read(struct perf_event *event)
{
	cpu_clock_event_update(event);
}
  10702
  10703static int cpu_clock_event_init(struct perf_event *event)
  10704{
  10705	if (event->attr.type != PERF_TYPE_SOFTWARE)
  10706		return -ENOENT;
  10707
  10708	if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
  10709		return -ENOENT;
  10710
  10711	/*
  10712	 * no branch sampling for software events
  10713	 */
  10714	if (has_branch_stack(event))
  10715		return -EOPNOTSUPP;
  10716
  10717	perf_swevent_init_hrtimer(event);
  10718
  10719	return 0;
  10720}
  10721
/*
 * PMU descriptor for the cpu clock software event; it lives in the
 * software task context and wires up the cpu_clock_event_* callbacks.
 */
static struct pmu perf_cpu_clock = {
	.task_ctx_nr	= perf_sw_context,

	.capabilities	= PERF_PMU_CAP_NO_NMI,

	.event_init	= cpu_clock_event_init,
	.add		= cpu_clock_event_add,
	.del		= cpu_clock_event_del,
	.start		= cpu_clock_event_start,
	.stop		= cpu_clock_event_stop,
	.read		= cpu_clock_event_read,
};
  10734
  10735/*
  10736 * Software event: task time clock
  10737 */
  10738
  10739static void task_clock_event_update(struct perf_event *event, u64 now)
  10740{
  10741	u64 prev;
  10742	s64 delta;
  10743
  10744	prev = local64_xchg(&event->hw.prev_count, now);
  10745	delta = now - prev;
  10746	local64_add(delta, &event->count);
  10747}
  10748
/* Snapshot the context time and arm the sampling hrtimer. */
static void task_clock_event_start(struct perf_event *event, int flags)
{
	local64_set(&event->hw.prev_count, event->ctx->time);
	perf_swevent_start_hrtimer(event);
}
  10754
/* Cancel the sampling hrtimer, then account time up to the context time. */
static void task_clock_event_stop(struct perf_event *event, int flags)
{
	perf_swevent_cancel_hrtimer(event);
	task_clock_event_update(event, event->ctx->time);
}
  10760
  10761static int task_clock_event_add(struct perf_event *event, int flags)
  10762{
  10763	if (flags & PERF_EF_START)
  10764		task_clock_event_start(event, flags);
  10765	perf_event_update_userpage(event);
  10766
  10767	return 0;
  10768}
  10769
/*
 * pmu::del callback: stop the event. Note that PERF_EF_UPDATE is passed
 * rather than the caller's @flags.
 */
static void task_clock_event_del(struct perf_event *event, int flags)
{
	task_clock_event_stop(event, PERF_EF_UPDATE);
}
  10774
  10775static void task_clock_event_read(struct perf_event *event)
  10776{
  10777	u64 now = perf_clock();
  10778	u64 delta = now - event->ctx->timestamp;
  10779	u64 time = event->ctx->time + delta;
  10780
  10781	task_clock_event_update(event, time);
  10782}
  10783
  10784static int task_clock_event_init(struct perf_event *event)
  10785{
  10786	if (event->attr.type != PERF_TYPE_SOFTWARE)
  10787		return -ENOENT;
  10788
  10789	if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
  10790		return -ENOENT;
  10791
  10792	/*
  10793	 * no branch sampling for software events
  10794	 */
  10795	if (has_branch_stack(event))
  10796		return -EOPNOTSUPP;
  10797
  10798	perf_swevent_init_hrtimer(event);
  10799
  10800	return 0;
  10801}
  10802
/*
 * PMU descriptor for the task clock software event; it lives in the
 * software task context and wires up the task_clock_event_* callbacks.
 */
static struct pmu perf_task_clock = {
	.task_ctx_nr	= perf_sw_context,

	.capabilities	= PERF_PMU_CAP_NO_NMI,

	.event_init	= task_clock_event_init,
	.add		= task_clock_event_add,
	.del		= task_clock_event_del,
	.start		= task_clock_event_start,
	.stop		= task_clock_event_stop,
	.read		= task_clock_event_read,
};
  10815
/*
 * Do-nothing default; perf_pmu_register() installs it for missing
 * pmu_enable/pmu_disable and cancel_txn callbacks.
 */
static void perf_pmu_nop_void(struct pmu *pmu)
{
}
  10819
/*
 * Do-nothing start_txn; installed by perf_pmu_register() for PMUs that
 * provide neither start_txn nor pmu_enable.
 */
static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
{
}
  10823
/*
 * commit_txn default that always reports success; installed by
 * perf_pmu_register() for PMUs that provide neither start_txn nor pmu_enable.
 */
static int perf_pmu_nop_int(struct pmu *pmu)
{
	return 0;
}
  10828
/*
 * Default check_period callback installed by perf_pmu_register(); it
 * ignores @value and always returns 0.
 */
static int perf_event_nop_int(struct perf_event *event, u64 value)
{
	return 0;
}
  10833
/*
 * Per-CPU copy of the flags handed to perf_pmu_start_txn(); read back and
 * cleared by perf_pmu_commit_txn() / perf_pmu_cancel_txn().
 */
static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
  10835
  10836static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
  10837{
  10838	__this_cpu_write(nop_txn_flags, flags);
  10839
  10840	if (flags & ~PERF_PMU_TXN_ADD)
  10841		return;
  10842
  10843	perf_pmu_disable(pmu);
  10844}
  10845
  10846static int perf_pmu_commit_txn(struct pmu *pmu)
  10847{
  10848	unsigned int flags = __this_cpu_read(nop_txn_flags);
  10849
  10850	__this_cpu_write(nop_txn_flags, 0);
  10851
  10852	if (flags & ~PERF_PMU_TXN_ADD)
  10853		return 0;
  10854
  10855	perf_pmu_enable(pmu);
  10856	return 0;
  10857}
  10858
  10859static void perf_pmu_cancel_txn(struct pmu *pmu)
  10860{
  10861	unsigned int flags =  __this_cpu_read(nop_txn_flags);
  10862
  10863	__this_cpu_write(nop_txn_flags, 0);
  10864
  10865	if (flags & ~PERF_PMU_TXN_ADD)
  10866		return;
  10867
  10868	perf_pmu_enable(pmu);
  10869}
  10870
/*
 * Default event_idx callback installed by perf_pmu_register() for PMUs
 * that do not provide one; always returns 0.
 */
static int perf_event_idx_default(struct perf_event *event)
{
	return 0;
}
  10875
  10876/*
  10877 * Ensures all contexts with the same task_ctx_nr have the same
  10878 * pmu_cpu_context too.
  10879 */
  10880static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
  10881{
  10882	struct pmu *pmu;
  10883
  10884	if (ctxn < 0)
  10885		return NULL;
  10886
  10887	list_for_each_entry(pmu, &pmus, entry) {
  10888		if (pmu->task_ctx_nr == ctxn)
  10889			return pmu->pmu_cpu_context;
  10890	}
  10891
  10892	return NULL;
  10893}
  10894
  10895static void free_pmu_context(struct pmu *pmu)
  10896{
  10897	/*
  10898	 * Static contexts such as perf_sw_context have a global lifetime
  10899	 * and may be shared between different PMUs. Avoid freeing them
  10900	 * when a single PMU is going away.
  10901	 */
  10902	if (pmu->task_ctx_nr > perf_invalid_context)
  10903		return;
  10904
  10905	free_percpu(pmu->pmu_cpu_context);
  10906}
  10907
  10908/*
  10909 * Let userspace know that this PMU supports address range filtering:
  10910 */
  10911static ssize_t nr_addr_filters_show(struct device *dev,
  10912				    struct device_attribute *attr,
  10913				    char *page)
  10914{
  10915	struct pmu *pmu = dev_get_drvdata(dev);
  10916
  10917	return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
  10918}
  10919DEVICE_ATTR_RO(nr_addr_filters);
  10920
/*
 * Maps PMU type ids to their struct pmu: filled in by perf_pmu_register()
 * via idr_alloc(), looked up by perf_init_event() via idr_find().
 */
static struct idr pmu_idr;
  10922
  10923static ssize_t
  10924type_show(struct device *dev, struct device_attribute *attr, char *page)
  10925{
  10926	struct pmu *pmu = dev_get_drvdata(dev);
  10927
  10928	return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
  10929}
  10930static DEVICE_ATTR_RO(type);
  10931
  10932static ssize_t
  10933perf_event_mux_interval_ms_show(struct device *dev,
  10934				struct device_attribute *attr,
  10935				char *page)
  10936{
  10937	struct pmu *pmu = dev_get_drvdata(dev);
  10938
  10939	return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
  10940}
  10941
/*
 * Serializes updates of a PMU's multiplexing interval in
 * perf_event_mux_interval_ms_store().
 */
static DEFINE_MUTEX(mux_interval_mutex);
  10943
  10944static ssize_t
  10945perf_event_mux_interval_ms_store(struct device *dev,
  10946				 struct device_attribute *attr,
  10947				 const char *buf, size_t count)
  10948{
  10949	struct pmu *pmu = dev_get_drvdata(dev);
  10950	int timer, cpu, ret;
  10951
  10952	ret = kstrtoint(buf, 0, &timer);
  10953	if (ret)
  10954		return ret;
  10955
  10956	if (timer < 1)
  10957		return -EINVAL;
  10958
  10959	/* same value, noting to do */
  10960	if (timer == pmu->hrtimer_interval_ms)
  10961		return count;
  10962
  10963	mutex_lock(&mux_interval_mutex);
  10964	pmu->hrtimer_interval_ms = timer;
  10965
  10966	/* update all cpuctx for this PMU */
  10967	cpus_read_lock();
  10968	for_each_online_cpu(cpu) {
  10969		struct perf_cpu_context *cpuctx;
  10970		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
  10971		cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
  10972
  10973		cpu_function_call(cpu,
  10974			(remote_function_f)perf_mux_hrtimer_restart, cpuctx);
  10975	}
  10976	cpus_read_unlock();
  10977	mutex_unlock(&mux_interval_mutex);
  10978
  10979	return count;
  10980}
  10981static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
  10982
/*
 * Attributes attached to every device on the PMU bus (through
 * pmu_bus.dev_groups): "type" and "perf_event_mux_interval_ms".
 */
static struct attribute *pmu_dev_attrs[] = {
	&dev_attr_type.attr,
	&dev_attr_perf_event_mux_interval_ms.attr,
	NULL,
};
ATTRIBUTE_GROUPS(pmu_dev);
  10989
/*
 * pmu_bus_running gates all struct device handling in
 * perf_pmu_register() / perf_pmu_unregister(); it is set elsewhere in the
 * file (presumably once the bus has been registered - TODO confirm).
 */
static int pmu_bus_running;
/* The "event_source" bus on which every named PMU gets a device. */
static struct bus_type pmu_bus = {
	.name		= "event_source",
	.dev_groups	= pmu_dev_groups,
};
  10995
/*
 * Device release callback installed by pmu_dev_alloc(): frees the
 * struct device that pmu_dev_alloc() kzalloc()ed.
 */
static void pmu_dev_release(struct device *dev)
{
	kfree(dev);
}
  11000
  11001static int pmu_dev_alloc(struct pmu *pmu)
  11002{
  11003	int ret = -ENOMEM;
  11004
  11005	pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
  11006	if (!pmu->dev)
  11007		goto out;
  11008
  11009	pmu->dev->groups = pmu->attr_groups;
  11010	device_initialize(pmu->dev);
  11011	ret = dev_set_name(pmu->dev, "%s", pmu->name);
  11012	if (ret)
  11013		goto free_dev;
  11014
  11015	dev_set_drvdata(pmu->dev, pmu);
  11016	pmu->dev->bus = &pmu_bus;
  11017	pmu->dev->release = pmu_dev_release;
  11018	ret = device_add(pmu->dev);
  11019	if (ret)
  11020		goto free_dev;
  11021
  11022	/* For PMUs with address filters, throw in an extra attribute: */
  11023	if (pmu->nr_addr_filters)
  11024		ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
  11025
  11026	if (ret)
  11027		goto del_dev;
  11028
  11029	if (pmu->attr_update)
  11030		ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
  11031
  11032	if (ret)
  11033		goto del_dev;
  11034
  11035out:
  11036	return ret;
  11037
  11038del_dev:
  11039	device_del(pmu->dev);
  11040
  11041free_dev:
  11042	put_device(pmu->dev);
  11043	goto out;
  11044}
  11045
/*
 * Lockdep classes assigned by perf_pmu_register() to the mutex and the
 * lock of every per-CPU context.
 */
static struct lock_class_key cpuctx_mutex;
static struct lock_class_key cpuctx_lock;
  11048
  11049int perf_pmu_register(struct pmu *pmu, const char *name, int type)
  11050{
  11051	int cpu, ret, max = PERF_TYPE_MAX;
  11052
  11053	mutex_lock(&pmus_lock);
  11054	ret = -ENOMEM;
  11055	pmu->pmu_disable_count = alloc_percpu(int);
  11056	if (!pmu->pmu_disable_count)
  11057		goto unlock;
  11058
  11059	pmu->type = -1;
  11060	if (!name)
  11061		goto skip_type;
  11062	pmu->name = name;
  11063
  11064	if (type != PERF_TYPE_SOFTWARE) {
  11065		if (type >= 0)
  11066			max = type;
  11067
  11068		ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
  11069		if (ret < 0)
  11070			goto free_pdc;
  11071
  11072		WARN_ON(type >= 0 && ret != type);
  11073
  11074		type = ret;
  11075	}
  11076	pmu->type = type;
  11077
  11078	if (pmu_bus_running) {
  11079		ret = pmu_dev_alloc(pmu);
  11080		if (ret)
  11081			goto free_idr;
  11082	}
  11083
  11084skip_type:
  11085	if (pmu->task_ctx_nr == perf_hw_context) {
  11086		static int hw_context_taken = 0;
  11087
  11088		/*
  11089		 * Other than systems with heterogeneous CPUs, it never makes
  11090		 * sense for two PMUs to share perf_hw_context. PMUs which are
  11091		 * uncore must use perf_invalid_context.
  11092		 */
  11093		if (WARN_ON_ONCE(hw_context_taken &&
  11094		    !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
  11095			pmu->task_ctx_nr = perf_invalid_context;
  11096
  11097		hw_context_taken = 1;
  11098	}
  11099
  11100	pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
  11101	if (pmu->pmu_cpu_context)
  11102		goto got_cpu_context;
  11103
  11104	ret = -ENOMEM;
  11105	pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
  11106	if (!pmu->pmu_cpu_context)
  11107		goto free_dev;
  11108
  11109	for_each_possible_cpu(cpu) {
  11110		struct perf_cpu_context *cpuctx;
  11111
  11112		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
  11113		__perf_event_init_context(&cpuctx->ctx);
  11114		lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
  11115		lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
  11116		cpuctx->ctx.pmu = pmu;
  11117		cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
  11118
  11119		__perf_mux_hrtimer_init(cpuctx, cpu);
  11120
  11121		cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
  11122		cpuctx->heap = cpuctx->heap_default;
  11123	}
  11124
  11125got_cpu_context:
  11126	if (!pmu->start_txn) {
  11127		if (pmu->pmu_enable) {
  11128			/*
  11129			 * If we have pmu_enable/pmu_disable calls, install
  11130			 * transaction stubs that use that to try and batch
  11131			 * hardware accesses.
  11132			 */
  11133			pmu->start_txn  = perf_pmu_start_txn;
  11134			pmu->commit_txn = perf_pmu_commit_txn;
  11135			pmu->cancel_txn = perf_pmu_cancel_txn;
  11136		} else {
  11137			pmu->start_txn  = perf_pmu_nop_txn;
  11138			pmu->commit_txn = perf_pmu_nop_int;
  11139			pmu->cancel_txn = perf_pmu_nop_void;
  11140		}
  11141	}
  11142
  11143	if (!pmu->pmu_enable) {
  11144		pmu->pmu_enable  = perf_pmu_nop_void;
  11145		pmu->pmu_disable = perf_pmu_nop_void;
  11146	}
  11147
  11148	if (!pmu->check_period)
  11149		pmu->check_period = perf_event_nop_int;
  11150
  11151	if (!pmu->event_idx)
  11152		pmu->event_idx = perf_event_idx_default;
  11153
  11154	/*
  11155	 * Ensure the TYPE_SOFTWARE PMUs are at the head of the list,
  11156	 * since these cannot be in the IDR. This way the linear search
  11157	 * is fast, provided a valid software event is provided.
  11158	 */
  11159	if (type == PERF_TYPE_SOFTWARE || !name)
  11160		list_add_rcu(&pmu->entry, &pmus);
  11161	else
  11162		list_add_tail_rcu(&pmu->entry, &pmus);
  11163
  11164	atomic_set(&pmu->exclusive_cnt, 0);
  11165	ret = 0;
  11166unlock:
  11167	mutex_unlock(&pmus_lock);
  11168
  11169	return ret;
  11170
  11171free_dev:
  11172	device_del(pmu->dev);
  11173	put_device(pmu->dev);
  11174
  11175free_idr:
  11176	if (pmu->type != PERF_TYPE_SOFTWARE)
  11177		idr_remove(&pmu_idr, pmu->type);
  11178
  11179free_pdc:
  11180	free_percpu(pmu->pmu_disable_count);
  11181	goto unlock;
  11182}
  11183EXPORT_SYMBOL_GPL(perf_pmu_register);
  11184
  11185void perf_pmu_unregister(struct pmu *pmu)
  11186{
  11187	mutex_lock(&pmus_lock);
  11188	list_del_rcu(&pmu->entry);
  11189
  11190	/*
  11191	 * We dereference the pmu list under both SRCU and regular RCU, so
  11192	 * synchronize against both of those.
  11193	 */
  11194	synchronize_srcu(&pmus_srcu);
  11195	synchronize_rcu();
  11196
  11197	free_percpu(pmu->pmu_disable_count);
  11198	if (pmu->type != PERF_TYPE_SOFTWARE)
  11199		idr_remove(&pmu_idr, pmu->type);
  11200	if (pmu_bus_running) {
  11201		if (pmu->nr_addr_filters)
  11202			device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
  11203		device_del(pmu->dev);
  11204		put_device(pmu->dev);
  11205	}
  11206	free_pmu_context(pmu);
  11207	mutex_unlock(&pmus_lock);
  11208}
  11209EXPORT_SYMBOL_GPL(perf_pmu_unregister);
  11210
  11211static inline bool has_extended_regs(struct perf_event *event)
  11212{
  11213	return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) ||
  11214	       (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
  11215}
  11216
/*
 * Try to initialize @event on @pmu.
 *
 * Pins the PMU's module, runs pmu->event_init() and then checks the event
 * against the PMU's capabilities. Returns 0 on success, with the module
 * reference kept. On failure the module reference is dropped and the
 * result is -ENODEV (module going away), the event_init() error,
 * -EOPNOTSUPP (extended registers requested but not supported) or
 * -EINVAL (exclude flags on a PERF_PMU_CAP_NO_EXCLUDE PMU).
 */
static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
{
	struct perf_event_context *ctx = NULL;
	int ret;

	if (!try_module_get(pmu->module))
		return -ENODEV;

	/*
	 * A number of pmu->event_init() methods iterate the sibling_list to,
	 * for example, validate if the group fits on the PMU. Therefore,
	 * if this is a sibling event, acquire the ctx->mutex to protect
	 * the sibling_list.
	 */
	if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
		/*
		 * This ctx->mutex can nest when we're called through
		 * inheritance. See the perf_event_ctx_lock_nested() comment.
		 */
		ctx = perf_event_ctx_lock_nested(event->group_leader,
						 SINGLE_DEPTH_NESTING);
		BUG_ON(!ctx);
	}

	event->pmu = pmu;
	ret = pmu->event_init(event);

	if (ctx)
		perf_event_ctx_unlock(event->group_leader, ctx);

	/* event_init() accepted the event; now apply the capability checks */
	if (!ret) {
		if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
		    has_extended_regs(event))
			ret = -EOPNOTSUPP;

		if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
		    event_has_any_exclude_flag(event))
			ret = -EINVAL;

		/* undo a successful event_init() that was rejected above */
		if (ret && event->destroy)
			event->destroy(event);
	}

	if (ret)
		module_put(pmu->module);

	return ret;
}
  11265
  11266static struct pmu *perf_init_event(struct perf_event *event)
  11267{
  11268	bool extended_type = false;
  11269	int idx, type, ret;
  11270	struct pmu *pmu;
  11271
  11272	idx = srcu_read_lock(&pmus_srcu);
  11273
  11274	/* Try parent's PMU first: */
  11275	if (event->parent && event->parent->pmu) {
  11276		pmu = event->parent->pmu;
  11277		ret = perf_try_init_event(pmu, event);
  11278		if (!ret)
  11279			goto unlock;
  11280	}
  11281
  11282	/*
  11283	 * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
  11284	 * are often aliases for PERF_TYPE_RAW.
  11285	 */
  11286	type = event->attr.type;
  11287	if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) {
  11288		type = event->attr.config >> PERF_PMU_TYPE_SHIFT;
  11289		if (!type) {
  11290			type = PERF_TYPE_RAW;
  11291		} else {
  11292			extended_type = true;
  11293			event->attr.config &= PERF_HW_EVENT_MASK;
  11294		}
  11295	}
  11296
  11297again:
  11298	rcu_read_lock();
  11299	pmu = idr_find(&pmu_idr, type);
  11300	rcu_read_unlock();
  11301	if (pmu) {
  11302		if (event->attr.type != type && type != PERF_TYPE_RAW &&
  11303		    !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE))
  11304			goto fail;
  11305
  11306		ret = perf_try_init_event(pmu, event);
  11307		if (ret == -ENOENT && event->attr.type != type && !extended_type) {
  11308			type = event->attr.type;
  11309			goto again;
  11310		}
  11311
  11312		if (ret)
  11313			pmu = ERR_PTR(ret);
  11314
  11315		goto unlock;
  11316	}
  11317
  11318	list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
  11319		ret = perf_try_init_event(pmu, event);
  11320		if (!ret)
  11321			goto unlock;
  11322
  11323		if (ret != -ENOENT) {
  11324			pmu = ERR_PTR(ret);
  11325			goto unlock;
  11326		}
  11327	}
  11328fail:
  11329	pmu = ERR_PTR(-ENOENT);
  11330unlock:
  11331	srcu_read_unlock(&pmus_srcu, idx);
  11332
  11333	return pmu;
  11334}
  11335
  11336static void attach_sb_event(struct perf_event *event)
  11337{
  11338	struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
  11339
  11340	raw_spin_lock(&pel->lock);
  11341	list_add_rcu(&event->sb_list, &pel->list);
  11342	raw_spin_unlock(&pel->lock);
  11343}
  11344
  11345/*
  11346 * We keep a list of all !task (and therefore per-cpu) events
  11347 * that need to receive side-band records.
  11348 *
  11349 * This avoids having to scan all the various PMU per-cpu contexts
  11350 * looking for them.
  11351 */
  11352static void account_pmu_sb_event(struct perf_event *event)
  11353{
  11354	if (is_sb_event(event))
  11355		attach_sb_event(event);
  11356}
  11357
  11358static void account_event_cpu(struct perf_event *event, int cpu)
  11359{
  11360	if (event->parent)
  11361		return;
  11362
  11363	if (is_cgroup_event(event))
  11364		atomic_inc(&per_cpu(perf_cgroup_events, cpu));
  11365}
  11366
  11367/* Freq events need the tick to stay alive (see perf_event_task_tick). */
  11368static void account_freq_event_nohz(void)
  11369{
  11370#ifdef CONFIG_NO_HZ_FULL
  11371	/* Lock so we don't race with concurrent unaccount */
  11372	spin_lock(&nr_freq_lock);
  11373	if (atomic_inc_return(&nr_freq_events) == 1)
  11374		tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
  11375	spin_unlock(&nr_freq_lock);
  11376#endif
  11377}
  11378
  11379static void account_freq_event(void)
  11380{
  11381	if (tick_nohz_full_enabled())
  11382		account_freq_event_nohz();
  11383	else
  11384		atomic_inc(&nr_freq_events);
  11385}
  11386
  11387
  11388static void account_event(struct perf_event *event)
  11389{
  11390	bool inc = false;
  11391
  11392	if (event->parent)
  11393		return;
  11394
  11395	if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
  11396		inc = true;
  11397	if (event->attr.mmap || event->attr.mmap_data)
  11398		atomic_inc(&nr_mmap_events);
  11399	if (event->attr.build_id)
  11400		atomic_inc(&nr_build_id_events);
  11401	if (event->attr.comm)
  11402		atomic_inc(&nr_comm_events);
  11403	if (event->attr.namespaces)
  11404		atomic_inc(&nr_namespaces_events);
  11405	if (event->attr.cgroup)
  11406		atomic_inc(&nr_cgroup_events);
  11407	if (event->attr.task)
  11408		atomic_inc(&nr_task_events);
  11409	if (event->attr.freq)
  11410		account_freq_event();
  11411	if (event->attr.context_switch) {
  11412		atomic_inc(&nr_switch_events);
  11413		inc = true;
  11414	}
  11415	if (has_branch_stack(event))
  11416		inc = true;
  11417	if (is_cgroup_event(event))
  11418		inc = true;
  11419	if (event->attr.ksymbol)
  11420		atomic_inc(&nr_ksymbol_events);
  11421	if (event->attr.bpf_event)
  11422		atomic_inc(&nr_bpf_events);
  11423	if (event->attr.text_poke)
  11424		atomic_inc(&nr_text_poke_events);
  11425
  11426	if (inc) {
  11427		/*
  11428		 * We need the mutex here because static_branch_enable()
  11429		 * must complete *before* the perf_sched_count increment
  11430		 * becomes visible.
  11431		 */
  11432		if (atomic_inc_not_zero(&perf_sched_count))
  11433			goto enabled;
  11434
  11435		mutex_lock(&perf_sched_mutex);
  11436		if (!atomic_read(&perf_sched_count)) {
  11437			static_branch_enable(&perf_sched_events);
  11438			/*
  11439			 * Guarantee that all CPUs observe they key change and
  11440			 * call the perf scheduling hooks before proceeding to
  11441			 * install events that need them.
  11442			 */
  11443			synchronize_rcu();
  11444		}
  11445		/*
  11446		 * Now that we have waited for the sync_sched(), allow further
  11447		 * increments to by-pass the mutex.
  11448		 */
  11449		atomic_inc(&perf_sched_count);
  11450		mutex_unlock(&perf_sched_mutex);
  11451	}
  11452enabled:
  11453
  11454	account_event_cpu(event, event->cpu);
  11455
  11456	account_pmu_sb_event(event);
  11457}
  11458
  11459/*
  11460 * Allocate and initialize an event structure
  11461 */
  11462static struct perf_event *
  11463perf_event_alloc(struct perf_event_attr *attr, int cpu,
  11464		 struct task_struct *task,
  11465		 struct perf_event *group_leader,
  11466		 struct perf_event *parent_event,
  11467		 perf_overflow_handler_t overflow_handler,
  11468		 void *context, int cgroup_fd)
  11469{
  11470	struct pmu *pmu;
  11471	struct perf_event *event;
  11472	struct hw_perf_event *hwc;
  11473	long err = -EINVAL;
  11474	int node;
  11475
  11476	if ((unsigned)cpu >= nr_cpu_ids) {
  11477		if (!task || cpu != -1)
  11478			return ERR_PTR(-EINVAL);
  11479	}
  11480	if (attr->sigtrap && !task) {
  11481		/* Requires a task: avoid signalling random tasks. */
  11482		return ERR_PTR(-EINVAL);
  11483	}
  11484
  11485	node = (cpu >= 0) ? cpu_to_node(cpu) : -1;
  11486	event = kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO,
  11487				      node);
  11488	if (!event)
  11489		return ERR_PTR(-ENOMEM);
  11490
  11491	/*
  11492	 * Single events are their own group leaders, with an
  11493	 * empty sibling list:
  11494	 */
  11495	if (!group_leader)
  11496		group_leader = event;
  11497
  11498	mutex_init(&event->child_mutex);
  11499	INIT_LIST_HEAD(&event->child_list);
  11500
  11501	INIT_LIST_HEAD(&event->event_entry);
  11502	INIT_LIST_HEAD(&event->sibling_list);
  11503	INIT_LIST_HEAD(&event->active_list);
  11504	init_event_group(event);
  11505	INIT_LIST_HEAD(&event->rb_entry);
  11506	INIT_LIST_HEAD(&event->active_entry);
  11507	INIT_LIST_HEAD(&event->addr_filters.list);
  11508	INIT_HLIST_NODE(&event->hlist_entry);
  11509
  11510
  11511	init_waitqueue_head(&event->waitq);
  11512	event->pending_disable = -1;
  11513	init_irq_work(&event->pending, perf_pending_event);
  11514
  11515	mutex_init(&event->mmap_mutex);
  11516	raw_spin_lock_init(&event->addr_filters.lock);
  11517
  11518	atomic_long_set(&event->refcount, 1);
  11519	event->cpu		= cpu;
  11520	event->attr		= *attr;
  11521	event->group_leader	= group_leader;
  11522	event->pmu		= NULL;
  11523	event->oncpu		= -1;
  11524
  11525	event->parent		= parent_event;
  11526
  11527	event->ns		= get_pid_ns(task_active_pid_ns(current));
  11528	event->id		= atomic64_inc_return(&perf_event_id);
  11529
  11530	event->state		= PERF_EVENT_STATE_INACTIVE;
  11531
  11532	if (parent_event)
  11533		event->event_caps = parent_event->event_caps;
  11534
  11535	if (event->attr.sigtrap)
  11536		atomic_set(&event->event_limit, 1);
  11537
  11538	if (task) {
  11539		event->attach_state = PERF_ATTACH_TASK;
  11540		/*
  11541		 * XXX pmu::event_init needs to know what task to account to
  11542		 * and we cannot use the ctx information because we need the
  11543		 * pmu before we get a ctx.
  11544		 */
  11545		event->hw.target = get_task_struct(task);
  11546	}
  11547
  11548	event->clock = &local_clock;
  11549	if (parent_event)
  11550		event->clock = parent_event->clock;
  11551
  11552	if (!overflow_handler && parent_event) {
  11553		overflow_handler = parent_event->overflow_handler;
  11554		context = parent_event->overflow_handler_context;
  11555#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
  11556		if (overflow_handler == bpf_overflow_handler) {
  11557			struct bpf_prog *prog = parent_event->prog;
  11558
  11559			bpf_prog_inc(prog);
  11560			event->prog = prog;
  11561			event->orig_overflow_handler =
  11562				parent_event->orig_overflow_handler;
  11563		}
  11564#endif
  11565	}
  11566
  11567	if (overflow_handler) {
  11568		event->overflow_handler	= overflow_handler;
  11569		event->overflow_handler_context = context;
  11570	} else if (is_write_backward(event)){
  11571		event->overflow_handler = perf_event_output_backward;
  11572		event->overflow_handler_context = NULL;
  11573	} else {
  11574		event->overflow_handler = perf_event_output_forward;
  11575		event->overflow_handler_context = NULL;
  11576	}
  11577
  11578	perf_event__state_init(event);
  11579
  11580	pmu = NULL;
  11581
  11582	hwc = &event->hw;
  11583	hwc->sample_period = attr->sample_period;
  11584	if (attr->freq && attr->sample_freq)
  11585		hwc->sample_period = 1;
  11586	hwc->last_period = hwc->sample_period;
  11587
  11588	local64_set(&hwc->period_left, hwc->sample_period);
  11589
  11590	/*
  11591	 * We currently do not support PERF_SAMPLE_READ on inherited events.
  11592	 * See perf_output_read().
  11593	 */
  11594	if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
  11595		goto err_ns;
  11596
  11597	if (!has_branch_stack(event))
  11598		event->attr.branch_sample_type = 0;
  11599
  11600	pmu = perf_init_event(event);
  11601	if (IS_ERR(pmu)) {
  11602		err = PTR_ERR(pmu);
  11603		goto err_ns;
  11604	}
  11605
  11606	/*
  11607	 * Disallow uncore-cgroup events, they don't make sense as the cgroup will
  11608	 * be different on other CPUs in the uncore mask.
  11609	 */
  11610	if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) {
  11611		err = -EINVAL;
  11612		goto err_pmu;
  11613	}
  11614
  11615	if (event->attr.aux_output &&
  11616	    !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
  11617		err = -EOPNOTSUPP;
  11618		goto err_pmu;
  11619	}
  11620
  11621	if (cgroup_fd != -1) {
  11622		err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
  11623		if (err)
  11624			goto err_pmu;
  11625	}
  11626
  11627	err = exclusive_event_init(event);
  11628	if (err)
  11629		goto err_pmu;
  11630
  11631	if (has_addr_filter(event)) {
  11632		event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
  11633						    sizeof(struct perf_addr_filter_range),
  11634						    GFP_KERNEL);
  11635		if (!event->addr_filter_ranges) {
  11636			err = -ENOMEM;
  11637			goto err_per_task;
  11638		}
  11639
  11640		/*
  11641		 * Clone the parent's vma offsets: they are valid until exec()
  11642		 * even if the mm is not shared with the parent.
  11643		 */
  11644		if (event->parent) {
  11645			struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
  11646
  11647			raw_spin_lock_irq(&ifh->lock);
  11648			memcpy(event->addr_filter_ranges,
  11649			       event->parent->addr_filter_ranges,
  11650			       pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range));
  11651			raw_spin_unlock_irq(&ifh->lock);
  11652		}
  11653
  11654		/* force hw sync on the address filters */
  11655		event->addr_filters_gen = 1;
  11656	}
  11657
  11658	if (!event->parent) {
  11659		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
  11660			err = get_callchain_buffers(attr->sample_max_stack);
  11661			if (err)
  11662				goto err_addr_filters;
  11663		}
  11664	}
  11665
  11666	err = security_perf_event_alloc(event);
  11667	if (err)
  11668		goto err_callchain_buffer;
  11669
  11670	/* symmetric to unaccount_event() in _free_event() */
  11671	account_event(event);
  11672
  11673	return event;
  11674
  11675err_callchain_buffer:
  11676	if (!event->parent) {
  11677		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
  11678			put_callchain_buffers();
  11679	}
  11680err_addr_filters:
  11681	kfree(event->addr_filter_ranges);
  11682
  11683err_per_task:
  11684	exclusive_event_destroy(event);
  11685
  11686err_pmu:
  11687	if (is_cgroup_event(event))
  11688		perf_detach_cgroup(event);
  11689	if (event->destroy)
  11690		event->destroy(event);
  11691	module_put(pmu->module);
  11692err_ns:
  11693	if (event->ns)
  11694		put_pid_ns(event->ns);
  11695	if (event->hw.target)
  11696		put_task_struct(event->hw.target);
  11697	kmem_cache_free(perf_event_cache, event);
  11698
  11699	return ERR_PTR(err);
  11700}
  11701
  11702static int perf_copy_attr(struct perf_event_attr __user *uattr,
  11703			  struct perf_event_attr *attr)
  11704{
  11705	u32 size;
  11706	int ret;
  11707
  11708	/* Zero the full structure, so that a short copy will be nice. */
  11709	memset(attr, 0, sizeof(*attr));
  11710
  11711	ret = get_user(size, &uattr->size);
  11712	if (ret)
  11713		return ret;
  11714
  11715	/* ABI compatibility quirk: */
  11716	if (!size)
  11717		size = PERF_ATTR_SIZE_VER0;
  11718	if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE)
  11719		goto err_size;
  11720
  11721	ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
  11722	if (ret) {
  11723		if (ret == -E2BIG)
  11724			goto err_size;
  11725		return ret;
  11726	}
  11727
  11728	attr->size = size;
  11729
  11730	if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
  11731		return -EINVAL;
  11732
  11733	if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
  11734		return -EINVAL;
  11735
  11736	if (attr->read_format & ~(PERF_FORMAT_MAX-1))
  11737		return -EINVAL;
  11738
  11739	if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
  11740		u64 mask = attr->branch_sample_type;
  11741
  11742		/* only using defined bits */
  11743		if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
  11744			return -EINVAL;
  11745
  11746		/* at least one branch bit must be set */
  11747		if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
  11748			return -EINVAL;
  11749
  11750		/* propagate priv level, when not set for branch */
  11751		if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
  11752
  11753			/* exclude_kernel checked on syscall entry */
  11754			if (!attr->exclude_kernel)
  11755				mask |= PERF_SAMPLE_BRANCH_KERNEL;
  11756
  11757			if (!attr->exclude_user)
  11758				mask |= PERF_SAMPLE_BRANCH_USER;
  11759
  11760			if (!attr->exclude_hv)
  11761				mask |= PERF_SAMPLE_BRANCH_HV;
  11762			/*
  11763			 * adjust user setting (for HW filter setup)
  11764			 */
  11765			attr->branch_sample_type = mask;
  11766		}
  11767		/* privileged levels capture (kernel, hv): check permissions */
  11768		if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) {
  11769			ret = perf_allow_kernel(attr);
  11770			if (ret)
  11771				return ret;
  11772		}
  11773	}
  11774
  11775	if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
  11776		ret = perf_reg_validate(attr->sample_regs_user);
  11777		if (ret)
  11778			return ret;
  11779	}
  11780
  11781	if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
  11782		if (!arch_perf_have_user_stack_dump())
  11783			return -ENOSYS;
  11784
  11785		/*
  11786		 * We have __u32 type for the size, but so far
  11787		 * we can only use __u16 as maximum due to the
  11788		 * __u16 sample size limit.
  11789		 */
  11790		if (attr->sample_stack_user >= USHRT_MAX)
  11791			return -EINVAL;
  11792		else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
  11793			return -EINVAL;
  11794	}
  11795
  11796	if (!attr->sample_max_stack)
  11797		attr->sample_max_stack = sysctl_perf_event_max_stack;
  11798
  11799	if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
  11800		ret = perf_reg_validate(attr->sample_regs_intr);
  11801
  11802#ifndef CONFIG_CGROUP_PERF
  11803	if (attr->sample_type & PERF_SAMPLE_CGROUP)
  11804		return -EINVAL;
  11805#endif
  11806	if ((attr->sample_type & PERF_SAMPLE_WEIGHT) &&
  11807	    (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT))
  11808		return -EINVAL;
  11809
  11810	if (!attr->inherit && attr->inherit_thread)
  11811		return -EINVAL;
  11812
  11813	if (attr->remove_on_exec && attr->enable_on_exec)
  11814		return -EINVAL;
  11815
  11816	if (attr->sigtrap && !attr->remove_on_exec)
  11817		return -EINVAL;
  11818
  11819out:
  11820	return ret;
  11821
  11822err_size:
  11823	put_user(sizeof(*attr), &uattr->size);
  11824	ret = -E2BIG;
  11825	goto out;
  11826}
  11827
  11828static int
  11829perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
  11830{
  11831	struct perf_buffer *rb = NULL;
  11832	int ret = -EINVAL;
  11833
  11834	if (!output_event)
  11835		goto set;
  11836
  11837	/* don't allow circular references */
  11838	if (event == output_event)
  11839		goto out;
  11840
  11841	/*
  11842	 * Don't allow cross-cpu buffers
  11843	 */
  11844	if (output_event->cpu != event->cpu)
  11845		goto out;
  11846
  11847	/*
  11848	 * If its not a per-cpu rb, it must be the same task.
  11849	 */
  11850	if (output_event->cpu == -1 && output_event->ctx != event->ctx)
  11851		goto out;
  11852
  11853	/*
  11854	 * Mixing clocks in the same buffer is trouble you don't need.
  11855	 */
  11856	if (output_event->clock != event->clock)
  11857		goto out;
  11858
  11859	/*
  11860	 * Either writing ring buffer from beginning or from end.
  11861	 * Mixing is not allowed.
  11862	 */
  11863	if (is_write_backward(output_event) != is_write_backward(event))
  11864		goto out;
  11865
  11866	/*
  11867	 * If both events generate aux data, they must be on the same PMU
  11868	 */
  11869	if (has_aux(event) && has_aux(output_event) &&
  11870	    event->pmu != output_event->pmu)
  11871		goto out;
  11872
  11873set:
  11874	mutex_lock(&event->mmap_mutex);
  11875	/* Can't redirect output if we've got an active mmap() */
  11876	if (atomic_read(&event->mmap_count))
  11877		goto unlock;
  11878
  11879	if (output_event) {
  11880		/* get the rb we want to redirect to */
  11881		rb = ring_buffer_get(output_event);
  11882		if (!rb)
  11883			goto unlock;
  11884	}
  11885
  11886	ring_buffer_attach(event, rb);
  11887
  11888	ret = 0;
  11889unlock:
  11890	mutex_unlock(&event->mmap_mutex);
  11891
  11892out:
  11893	return ret;
  11894}
  11895
  11896static void mutex_lock_double(struct mutex *a, struct mutex *b)
  11897{
  11898	if (b < a)
  11899		swap(a, b);
  11900
  11901	mutex_lock(a);
  11902	mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
  11903}
  11904
  11905static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
  11906{
  11907	bool nmi_safe = false;
  11908
  11909	switch (clk_id) {
  11910	case CLOCK_MONOTONIC:
  11911		event->clock = &ktime_get_mono_fast_ns;
  11912		nmi_safe = true;
  11913		break;
  11914
  11915	case CLOCK_MONOTONIC_RAW:
  11916		event->clock = &ktime_get_raw_fast_ns;
  11917		nmi_safe = true;
  11918		break;
  11919
  11920	case CLOCK_REALTIME:
  11921		event->clock = &ktime_get_real_ns;
  11922		break;
  11923
  11924	case CLOCK_BOOTTIME:
  11925		event->clock = &ktime_get_boottime_ns;
  11926		break;
  11927
  11928	case CLOCK_TAI:
  11929		event->clock = &ktime_get_clocktai_ns;
  11930		break;
  11931
  11932	default:
  11933		return -EINVAL;
  11934	}
  11935
  11936	if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
  11937		return -EINVAL;
  11938
  11939	return 0;
  11940}
  11941
  11942/*
  11943 * Variation on perf_event_ctx_lock_nested(), except we take two context
  11944 * mutexes.
  11945 */
  11946static struct perf_event_context *
  11947__perf_event_ctx_lock_double(struct perf_event *group_leader,
  11948			     struct perf_event_context *ctx)
  11949{
  11950	struct perf_event_context *gctx;
  11951
  11952again:
  11953	rcu_read_lock();
  11954	gctx = READ_ONCE(group_leader->ctx);
  11955	if (!refcount_inc_not_zero(&gctx->refcount)) {
  11956		rcu_read_unlock();
  11957		goto again;
  11958	}
  11959	rcu_read_unlock();
  11960
  11961	mutex_lock_double(&gctx->mutex, &ctx->mutex);
  11962
  11963	if (group_leader->ctx != gctx) {
  11964		mutex_unlock(&ctx->mutex);
  11965		mutex_unlock(&gctx->mutex);
  11966		put_ctx(gctx);
  11967		goto again;
  11968	}
  11969
  11970	return gctx;
  11971}
  11972
  11973static bool
  11974perf_check_permission(struct perf_event_attr *attr, struct task_struct *task)
  11975{
  11976	unsigned int ptrace_mode = PTRACE_MODE_READ_REALCREDS;
  11977	bool is_capable = perfmon_capable();
  11978
  11979	if (attr->sigtrap) {
  11980		/*
  11981		 * perf_event_attr::sigtrap sends signals to the other task.
  11982		 * Require the current task to also have CAP_KILL.
  11983		 */
  11984		rcu_read_lock();
  11985		is_capable &= ns_capable(__task_cred(task)->user_ns, CAP_KILL);
  11986		rcu_read_unlock();
  11987
  11988		/*
  11989		 * If the required capabilities aren't available, checks for
  11990		 * ptrace permissions: upgrade to ATTACH, since sending signals
  11991		 * can effectively change the target task.
  11992		 */
  11993		ptrace_mode = PTRACE_MODE_ATTACH_REALCREDS;
  11994	}
  11995
  11996	/*
  11997	 * Preserve ptrace permission check for backwards compatibility. The
  11998	 * ptrace check also includes checks that the current task and other
  11999	 * task have matching uids, and is therefore not done here explicitly.
  12000	 */
  12001	return is_capable || ptrace_may_access(task, ptrace_mode);
  12002}
  12003
  12004/**
  12005 * sys_perf_event_open - open a performance event, associate it to a task/cpu
  12006 *
  12007 * @attr_uptr:	event_id type attributes for monitoring/sampling
  12008 * @pid:		target pid
  12009 * @cpu:		target cpu
  12010 * @group_fd:		group leader event fd
  12011 * @flags:		perf event open flags
  12012 */
  12013SYSCALL_DEFINE5(perf_event_open,
  12014		struct perf_event_attr __user *, attr_uptr,
  12015		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
  12016{
  12017	struct perf_event *group_leader = NULL, *output_event = NULL;
  12018	struct perf_event *event, *sibling;
  12019	struct perf_event_attr attr;
  12020	struct perf_event_context *ctx, *gctx;
  12021	struct file *event_file = NULL;
  12022	struct fd group = {NULL, 0};
  12023	struct task_struct *task = NULL;
  12024	struct pmu *pmu;
  12025	int event_fd;
  12026	int move_group = 0;
  12027	int err;
  12028	int f_flags = O_RDWR;
  12029	int cgroup_fd = -1;
  12030
  12031	/* for future expandability... */
  12032	if (flags & ~PERF_FLAG_ALL)
  12033		return -EINVAL;
  12034
  12035	/* Do we allow access to perf_event_open(2) ? */
  12036	err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
  12037	if (err)
  12038		return err;
  12039
  12040	err = perf_copy_attr(attr_uptr, &attr);
  12041	if (err)
  12042		return err;
  12043
  12044	if (!attr.exclude_kernel) {
  12045		err = perf_allow_kernel(&attr);
  12046		if (err)
  12047			return err;
  12048	}
  12049
  12050	if (attr.namespaces) {
  12051		if (!perfmon_capable())
  12052			return -EACCES;
  12053	}
  12054
  12055	if (attr.freq) {
  12056		if (attr.sample_freq > sysctl_perf_event_sample_rate)
  12057			return -EINVAL;
  12058	} else {
  12059		if (attr.sample_period & (1ULL << 63))
  12060			return -EINVAL;
  12061	}
  12062
  12063	/* Only privileged users can get physical addresses */
  12064	if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
  12065		err = perf_allow_kernel(&attr);
  12066		if (err)
  12067			return err;
  12068	}
  12069
  12070	/* REGS_INTR can leak data, lockdown must prevent this */
  12071	if (attr.sample_type & PERF_SAMPLE_REGS_INTR) {
  12072		err = security_locked_down(LOCKDOWN_PERF);
  12073		if (err)
  12074			return err;
  12075	}
  12076
  12077	/*
  12078	 * In cgroup mode, the pid argument is used to pass the fd
  12079	 * opened to the cgroup directory in cgroupfs. The cpu argument
  12080	 * designates the cpu on which to monitor threads from that
  12081	 * cgroup.
  12082	 */
  12083	if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
  12084		return -EINVAL;
  12085
  12086	if (flags & PERF_FLAG_FD_CLOEXEC)
  12087		f_flags |= O_CLOEXEC;
  12088
  12089	event_fd = get_unused_fd_flags(f_flags);
  12090	if (event_fd < 0)
  12091		return event_fd;
  12092
  12093	if (group_fd != -1) {
  12094		err = perf_fget_light(group_fd, &group);
  12095		if (err)
  12096			goto err_fd;
  12097		group_leader = group.file->private_data;
  12098		if (flags & PERF_FLAG_FD_OUTPUT)
  12099			output_event = group_leader;
  12100		if (flags & PERF_FLAG_FD_NO_GROUP)
  12101			group_leader = NULL;
  12102	}
  12103
  12104	if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
  12105		task = find_lively_task_by_vpid(pid);
  12106		if (IS_ERR(task)) {
  12107			err = PTR_ERR(task);
  12108			goto err_group_fd;
  12109		}
  12110	}
  12111
  12112	if (task && group_leader &&
  12113	    group_leader->attr.inherit != attr.inherit) {
  12114		err = -EINVAL;
  12115		goto err_task;
  12116	}
  12117
  12118	if (flags & PERF_FLAG_PID_CGROUP)
  12119		cgroup_fd = pid;
  12120
  12121	event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
  12122				 NULL, NULL, cgroup_fd);
  12123	if (IS_ERR(event)) {
  12124		err = PTR_ERR(event);
  12125		goto err_task;
  12126	}
  12127
  12128	if (is_sampling_event(event)) {
  12129		if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
  12130			err = -EOPNOTSUPP;
  12131			goto err_alloc;
  12132		}
  12133	}
  12134
  12135	/*
  12136	 * Special case software events and allow them to be part of
  12137	 * any hardware group.
  12138	 */
  12139	pmu = event->pmu;
  12140
  12141	if (attr.use_clockid) {
  12142		err = perf_event_set_clock(event, attr.clockid);
  12143		if (err)
  12144			goto err_alloc;
  12145	}
  12146
  12147	if (pmu->task_ctx_nr == perf_sw_context)
  12148		event->event_caps |= PERF_EV_CAP_SOFTWARE;
  12149
  12150	if (group_leader) {
  12151		if (is_software_event(event) &&
  12152		    !in_software_context(group_leader)) {
  12153			/*
  12154			 * If the event is a sw event, but the group_leader
  12155			 * is on hw context.
  12156			 *
  12157			 * Allow the addition of software events to hw
  12158			 * groups, this is safe because software events
  12159			 * never fail to schedule.
  12160			 */
  12161			pmu = group_leader->ctx->pmu;
  12162		} else if (!is_software_event(event) &&
  12163			   is_software_event(group_leader) &&
  12164			   (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
  12165			/*
  12166			 * In case the group is a pure software group, and we
  12167			 * try to add a hardware event, move the whole group to
  12168			 * the hardware context.
  12169			 */
  12170			move_group = 1;
  12171		}
  12172	}
  12173
  12174	/*
  12175	 * Get the target context (task or percpu):
  12176	 */
  12177	ctx = find_get_context(pmu, task, event);
  12178	if (IS_ERR(ctx)) {
  12179		err = PTR_ERR(ctx);
  12180		goto err_alloc;
  12181	}
  12182
  12183	/*
  12184	 * Look up the group leader (we will attach this event to it):
  12185	 */
  12186	if (group_leader) {
  12187		err = -EINVAL;
  12188
  12189		/*
  12190		 * Do not allow a recursive hierarchy (this new sibling
  12191		 * becoming part of another group-sibling):
  12192		 */
  12193		if (group_leader->group_leader != group_leader)
  12194			goto err_context;
  12195
  12196		/* All events in a group should have the same clock */
  12197		if (group_leader->clock != event->clock)
  12198			goto err_context;
  12199
  12200		/*
  12201		 * Make sure we're both events for the same CPU;
  12202		 * grouping events for different CPUs is broken; since
  12203		 * you can never concurrently schedule them anyhow.
  12204		 */
  12205		if (group_leader->cpu != event->cpu)
  12206			goto err_context;
  12207
  12208		/*
  12209		 * Make sure we're both on the same task, or both
  12210		 * per-CPU events.
  12211		 */
  12212		if (group_leader->ctx->task != ctx->task)
  12213			goto err_context;
  12214
  12215		/*
  12216		 * Do not allow to attach to a group in a different task
  12217		 * or CPU context. If we're moving SW events, we'll fix
  12218		 * this up later, so allow that.
  12219		 *
  12220		 * Racy, not holding group_leader->ctx->mutex, see comment with
  12221		 * perf_event_ctx_lock().
  12222		 */
  12223		if (!move_group && group_leader->ctx != ctx)
  12224			goto err_context;
  12225
  12226		/*
  12227		 * Only a group leader can be exclusive or pinned
  12228		 */
  12229		if (attr.exclusive || attr.pinned)
  12230			goto err_context;
  12231	}
  12232
  12233	if (output_event) {
  12234		err = perf_event_set_output(event, output_event);
  12235		if (err)
  12236			goto err_context;
  12237	}
  12238
  12239	event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
  12240					f_flags);
  12241	if (IS_ERR(event_file)) {
  12242		err = PTR_ERR(event_file);
  12243		event_file = NULL;
  12244		goto err_context;
  12245	}
  12246
  12247	if (task) {
  12248		err = down_read_interruptible(&task->signal->exec_update_lock);
  12249		if (err)
  12250			goto err_file;
  12251
  12252		/*
  12253		 * We must hold exec_update_lock across this and any potential
  12254		 * perf_install_in_context() call for this new event to
  12255		 * serialize against exec() altering our credentials (and the
  12256		 * perf_event_exit_task() that could imply).
  12257		 */
  12258		err = -EACCES;
  12259		if (!perf_check_permission(&attr, task))
  12260			goto err_cred;
  12261	}
  12262
  12263	if (move_group) {
  12264		gctx = __perf_event_ctx_lock_double(group_leader, ctx);
  12265
  12266		if (gctx->task == TASK_TOMBSTONE) {
  12267			err = -ESRCH;
  12268			goto err_locked;
  12269		}
  12270
  12271		/*
  12272		 * Check if we raced against another sys_perf_event_open() call
  12273		 * moving the software group underneath us.
  12274		 */
  12275		if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
  12276			/*
  12277			 * If someone moved the group out from under us, check
  12278			 * if this new event wound up on the same ctx, if so
  12279			 * its the regular !move_group case, otherwise fail.
  12280			 */
  12281			if (gctx != ctx) {
  12282				err = -EINVAL;
  12283				goto err_locked;
  12284			} else {
  12285				perf_event_ctx_unlock(group_leader, gctx);
  12286				move_group = 0;
  12287				goto not_move_group;
  12288			}
  12289		}
  12290
  12291		/*
  12292		 * Failure to create exclusive events returns -EBUSY.
  12293		 */
  12294		err = -EBUSY;
  12295		if (!exclusive_event_installable(group_leader, ctx))
  12296			goto err_locked;
  12297
  12298		for_each_sibling_event(sibling, group_leader) {
  12299			if (!exclusive_event_installable(sibling, ctx))
  12300				goto err_locked;
  12301		}
  12302	} else {
  12303		mutex_lock(&ctx->mutex);
  12304
  12305		/*
  12306		 * Now that we hold ctx->lock, (re)validate group_leader->ctx == ctx,
  12307		 * see the group_leader && !move_group test earlier.
  12308		 */
  12309		if (group_leader && group_leader->ctx != ctx) {
  12310			err = -EINVAL;
  12311			goto err_locked;
  12312		}
  12313	}
  12314not_move_group:
  12315
  12316	if (ctx->task == TASK_TOMBSTONE) {
  12317		err = -ESRCH;
  12318		goto err_locked;
  12319	}
  12320
  12321	if (!perf_event_validate_size(event)) {
  12322		err = -E2BIG;
  12323		goto err_locked;
  12324	}
  12325
  12326	if (!task) {
  12327		/*
  12328		 * Check if the @cpu we're creating an event for is online.
  12329		 *
  12330		 * We use the perf_cpu_context::ctx::mutex to serialize against
  12331		 * the hotplug notifiers. See perf_event_{init,exit}_cpu().
  12332		 */
  12333		struct perf_cpu_context *cpuctx =
  12334			container_of(ctx, struct perf_cpu_context, ctx);
  12335
  12336		if (!cpuctx->online) {
  12337			err = -ENODEV;
  12338			goto err_locked;
  12339		}
  12340	}
  12341
  12342	if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
  12343		err = -EINVAL;
  12344		goto err_locked;
  12345	}
  12346
  12347	/*
  12348	 * Must be under the same ctx::mutex as perf_install_in_context(),
  12349	 * because we need to serialize with concurrent event creation.
  12350	 */
  12351	if (!exclusive_event_installable(event, ctx)) {
  12352		err = -EBUSY;
  12353		goto err_locked;
  12354	}
  12355
  12356	WARN_ON_ONCE(ctx->parent_ctx);
  12357
  12358	/*
  12359	 * This is the point of no return; we cannot fail hereafter. This is
  12360	 * where we start modifying current state.
  12361	 */
  12362
  12363	if (move_group) {
  12364		/*
  12365		 * See perf_event_ctx_lock() for comments on the details
  12366		 * of swizzling perf_event::ctx.
  12367		 */
  12368		perf_remove_from_context(group_leader, 0);
  12369		put_ctx(gctx);
  12370
  12371		for_each_sibling_event(sibling, group_leader) {
  12372			perf_remove_from_context(sibling, 0);
  12373			put_ctx(gctx);
  12374		}
  12375
  12376		/*
  12377		 * Wait for everybody to stop referencing the events through
  12378		 * the old lists, before installing it on new lists.
  12379		 */
  12380		synchronize_rcu();
  12381
  12382		/*
  12383		 * Install the group siblings before the group leader.
  12384		 *
  12385		 * Because a group leader will try and install the entire group
  12386		 * (through the sibling list, which is still in-tact), we can
  12387		 * end up with siblings installed in the wrong context.
  12388		 *
  12389		 * By installing siblings first we NO-OP because they're not
  12390		 * reachable through the group lists.
  12391		 */
  12392		for_each_sibling_event(sibling, group_leader) {
  12393			perf_event__state_init(sibling);
  12394			perf_install_in_context(ctx, sibling, sibling->cpu);
  12395			get_ctx(ctx);
  12396		}
  12397
  12398		/*
  12399		 * Removing from the context ends up with disabled
  12400		 * event. What we want here is event in the initial
  12401		 * startup state, ready to be added into the new context.
  12402		 */
  12403		perf_event__state_init(group_leader);
  12404		perf_install_in_context(ctx, group_leader, group_leader->cpu);
  12405		get_ctx(ctx);
  12406	}
  12407
  12408	/*
  12409	 * Precalculate sample_data sizes; do while holding ctx::mutex such
  12410	 * that we're serialized against further additions and before
  12411	 * perf_install_in_context() which is the point the event is active and
  12412	 * can use these values.
  12413	 */
  12414	perf_event__header_size(event);
  12415	perf_event__id_header_size(event);
  12416
  12417	event->owner = current;
  12418
  12419	perf_install_in_context(ctx, event, event->cpu);
  12420	perf_unpin_context(ctx);
  12421
  12422	if (move_group)
  12423		perf_event_ctx_unlock(group_leader, gctx);
  12424	mutex_unlock(&ctx->mutex);
  12425
  12426	if (task) {
  12427		up_read(&task->signal->exec_update_lock);
  12428		put_task_struct(task);
  12429	}
  12430
  12431	mutex_lock(&current->perf_event_mutex);
  12432	list_add_tail(&event->owner_entry, &current->perf_event_list);
  12433	mutex_unlock(&current->perf_event_mutex);
  12434
  12435	/*
  12436	 * Drop the reference on the group_event after placing the
  12437	 * new event on the sibling_list. This ensures destruction
  12438	 * of the group leader will find the pointer to itself in
  12439	 * perf_group_detach().
  12440	 */
  12441	fdput(group);
  12442	fd_install(event_fd, event_file);
  12443	return event_fd;
  12444
  12445err_locked:
  12446	if (move_group)
  12447		perf_event_ctx_unlock(group_leader, gctx);
  12448	mutex_unlock(&ctx->mutex);
  12449err_cred:
  12450	if (task)
  12451		up_read(&task->signal->exec_update_lock);
  12452err_file:
  12453	fput(event_file);
  12454err_context:
  12455	perf_unpin_context(ctx);
  12456	put_ctx(ctx);
  12457err_alloc:
  12458	/*
  12459	 * If event_file is set, the fput() above will have called ->release()
  12460	 * and that will take care of freeing the event.
  12461	 */
  12462	if (!event_file)
  12463		free_event(event);
  12464err_task:
  12465	if (task)
  12466		put_task_struct(task);
  12467err_group_fd:
  12468	fdput(group);
  12469err_fd:
  12470	put_unused_fd(event_fd);
  12471	return err;
  12472}
  12473
  12474/**
  12475 * perf_event_create_kernel_counter
  12476 *
  12477 * @attr: attributes of the counter to create
  12478 * @cpu: cpu in which the counter is bound
  12479 * @task: task to profile (NULL for percpu)
  12480 * @overflow_handler: callback to trigger when we hit the event
  12481 * @context: context data could be used in overflow_handler callback
  12482 */
  12483struct perf_event *
  12484perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
  12485				 struct task_struct *task,
  12486				 perf_overflow_handler_t overflow_handler,
  12487				 void *context)
  12488{
  12489	struct perf_event_context *ctx;
  12490	struct perf_event *event;
  12491	int err;
  12492
  12493	/*
  12494	 * Grouping is not supported for kernel events, neither is 'AUX',
  12495	 * make sure the caller's intentions are adjusted.
  12496	 */
  12497	if (attr->aux_output)
  12498		return ERR_PTR(-EINVAL);
  12499
  12500	event = perf_event_alloc(attr, cpu, task, NULL, NULL,
  12501				 overflow_handler, context, -1);
  12502	if (IS_ERR(event)) {
  12503		err = PTR_ERR(event);
  12504		goto err;
  12505	}
  12506
  12507	/* Mark owner so we could distinguish it from user events. */
  12508	event->owner = TASK_TOMBSTONE;
  12509
  12510	/*
  12511	 * Get the target context (task or percpu):
  12512	 */
  12513	ctx = find_get_context(event->pmu, task, event);
  12514	if (IS_ERR(ctx)) {
  12515		err = PTR_ERR(ctx);
  12516		goto err_free;
  12517	}
  12518
  12519	WARN_ON_ONCE(ctx->parent_ctx);
  12520	mutex_lock(&ctx->mutex);
  12521	if (ctx->task == TASK_TOMBSTONE) {
  12522		err = -ESRCH;
  12523		goto err_unlock;
  12524	}
  12525
  12526	if (!task) {
  12527		/*
  12528		 * Check if the @cpu we're creating an event for is online.
  12529		 *
  12530		 * We use the perf_cpu_context::ctx::mutex to serialize against
  12531		 * the hotplug notifiers. See perf_event_{init,exit}_cpu().
  12532		 */
  12533		struct perf_cpu_context *cpuctx =
  12534			container_of(ctx, struct perf_cpu_context, ctx);
  12535		if (!cpuctx->online) {
  12536			err = -ENODEV;
  12537			goto err_unlock;
  12538		}
  12539	}
  12540
  12541	if (!exclusive_event_installable(event, ctx)) {
  12542		err = -EBUSY;
  12543		goto err_unlock;
  12544	}
  12545
  12546	perf_install_in_context(ctx, event, event->cpu);
  12547	perf_unpin_context(ctx);
  12548	mutex_unlock(&ctx->mutex);
  12549
  12550	return event;
  12551
  12552err_unlock:
  12553	mutex_unlock(&ctx->mutex);
  12554	perf_unpin_context(ctx);
  12555	put_ctx(ctx);
  12556err_free:
  12557	free_event(event);
  12558err:
  12559	return ERR_PTR(err);
  12560}
  12561EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
  12562
  12563void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
  12564{
  12565	struct perf_event_context *src_ctx;
  12566	struct perf_event_context *dst_ctx;
  12567	struct perf_event *event, *tmp;
  12568	LIST_HEAD(events);
  12569
  12570	src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
  12571	dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
  12572
  12573	/*
  12574	 * See perf_event_ctx_lock() for comments on the details
  12575	 * of swizzling perf_event::ctx.
  12576	 */
  12577	mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
  12578	list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
  12579				 event_entry) {
  12580		perf_remove_from_context(event, 0);
  12581		unaccount_event_cpu(event, src_cpu);
  12582		put_ctx(src_ctx);
  12583		list_add(&event->migrate_entry, &events);
  12584	}
  12585
  12586	/*
  12587	 * Wait for the events to quiesce before re-instating them.
  12588	 */
  12589	synchronize_rcu();
  12590
  12591	/*
  12592	 * Re-instate events in 2 passes.
  12593	 *
  12594	 * Skip over group leaders and only install siblings on this first
  12595	 * pass, siblings will not get enabled without a leader, however a
  12596	 * leader will enable its siblings, even if those are still on the old
  12597	 * context.
  12598	 */
  12599	list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
  12600		if (event->group_leader == event)
  12601			continue;
  12602
  12603		list_del(&event->migrate_entry);
  12604		if (event->state >= PERF_EVENT_STATE_OFF)
  12605			event->state = PERF_EVENT_STATE_INACTIVE;
  12606		account_event_cpu(event, dst_cpu);
  12607		perf_install_in_context(dst_ctx, event, dst_cpu);
  12608		get_ctx(dst_ctx);
  12609	}
  12610
  12611	/*
  12612	 * Once all the siblings are setup properly, install the group leaders
  12613	 * to make it go.
  12614	 */
  12615	list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
  12616		list_del(&event->migrate_entry);
  12617		if (event->state >= PERF_EVENT_STATE_OFF)
  12618			event->state = PERF_EVENT_STATE_INACTIVE;
  12619		account_event_cpu(event, dst_cpu);
  12620		perf_install_in_context(dst_ctx, event, dst_cpu);
  12621		get_ctx(dst_ctx);
  12622	}
  12623	mutex_unlock(&dst_ctx->mutex);
  12624	mutex_unlock(&src_ctx->mutex);
  12625}
  12626EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
  12627
  12628static void sync_child_event(struct perf_event *child_event)
  12629{
  12630	struct perf_event *parent_event = child_event->parent;
  12631	u64 child_val;
  12632
  12633	if (child_event->attr.inherit_stat) {
  12634		struct task_struct *task = child_event->ctx->task;
  12635
  12636		if (task && task != TASK_TOMBSTONE)
  12637			perf_event_read_event(child_event, task);
  12638	}
  12639
  12640	child_val = perf_event_count(child_event);
  12641
  12642	/*
  12643	 * Add back the child's count to the parent's count:
  12644	 */
  12645	atomic64_add(child_val, &parent_event->child_count);
  12646	atomic64_add(child_event->total_time_enabled,
  12647		     &parent_event->child_total_time_enabled);
  12648	atomic64_add(child_event->total_time_running,
  12649		     &parent_event->child_total_time_running);
  12650}
  12651
  12652static void
  12653perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
  12654{
  12655	struct perf_event *parent_event = event->parent;
  12656	unsigned long detach_flags = 0;
  12657
  12658	if (parent_event) {
  12659		/*
  12660		 * Do not destroy the 'original' grouping; because of the
  12661		 * context switch optimization the original events could've
  12662		 * ended up in a random child task.
  12663		 *
  12664		 * If we were to destroy the original group, all group related
  12665		 * operations would cease to function properly after this
  12666		 * random child dies.
  12667		 *
  12668		 * Do destroy all inherited groups, we don't care about those
  12669		 * and being thorough is better.
  12670		 */
  12671		detach_flags = DETACH_GROUP | DETACH_CHILD;
  12672		mutex_lock(&parent_event->child_mutex);
  12673	}
  12674
  12675	perf_remove_from_context(event, detach_flags);
  12676
  12677	raw_spin_lock_irq(&ctx->lock);
  12678	if (event->state > PERF_EVENT_STATE_EXIT)
  12679		perf_event_set_state(event, PERF_EVENT_STATE_EXIT);
  12680	raw_spin_unlock_irq(&ctx->lock);
  12681
  12682	/*
  12683	 * Child events can be freed.
  12684	 */
  12685	if (parent_event) {
  12686		mutex_unlock(&parent_event->child_mutex);
  12687		/*
  12688		 * Kick perf_poll() for is_event_hup();
  12689		 */
  12690		perf_event_wakeup(parent_event);
  12691		free_event(event);
  12692		put_event(parent_event);
  12693		return;
  12694	}
  12695
  12696	/*
  12697	 * Parent events are governed by their filedesc, retain them.
  12698	 */
  12699	perf_event_wakeup(event);
  12700}
  12701
  12702static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
  12703{
  12704	struct perf_event_context *child_ctx, *clone_ctx = NULL;
  12705	struct perf_event *child_event, *next;
  12706
  12707	WARN_ON_ONCE(child != current);
  12708
  12709	child_ctx = perf_pin_task_context(child, ctxn);
  12710	if (!child_ctx)
  12711		return;
  12712
  12713	/*
  12714	 * In order to reduce the amount of tricky in ctx tear-down, we hold
  12715	 * ctx::mutex over the entire thing. This serializes against almost
  12716	 * everything that wants to access the ctx.
  12717	 *
  12718	 * The exception is sys_perf_event_open() /
  12719	 * perf_event_create_kernel_count() which does find_get_context()
  12720	 * without ctx::mutex (it cannot because of the move_group double mutex
  12721	 * lock thing). See the comments in perf_install_in_context().
  12722	 */
  12723	mutex_lock(&child_ctx->mutex);
  12724
  12725	/*
  12726	 * In a single ctx::lock section, de-schedule the events and detach the
  12727	 * context from the task such that we cannot ever get it scheduled back
  12728	 * in.
  12729	 */
  12730	raw_spin_lock_irq(&child_ctx->lock);
  12731	task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
  12732
  12733	/*
  12734	 * Now that the context is inactive, destroy the task <-> ctx relation
  12735	 * and mark the context dead.
  12736	 */
  12737	RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
  12738	put_ctx(child_ctx); /* cannot be last */
  12739	WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
  12740	put_task_struct(current); /* cannot be last */
  12741
  12742	clone_ctx = unclone_ctx(child_ctx);
  12743	raw_spin_unlock_irq(&child_ctx->lock);
  12744
  12745	if (clone_ctx)
  12746		put_ctx(clone_ctx);
  12747
  12748	/*
  12749	 * Report the task dead after unscheduling the events so that we
  12750	 * won't get any samples after PERF_RECORD_EXIT. We can however still
  12751	 * get a few PERF_RECORD_READ events.
  12752	 */
  12753	perf_event_task(child, child_ctx, 0);
  12754
  12755	list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
  12756		perf_event_exit_event(child_event, child_ctx);
  12757
  12758	mutex_unlock(&child_ctx->mutex);
  12759
  12760	put_ctx(child_ctx);
  12761}
  12762
  12763/*
  12764 * When a child task exits, feed back event values to parent events.
  12765 *
  12766 * Can be called with exec_update_lock held when called from
  12767 * setup_new_exec().
  12768 */
  12769void perf_event_exit_task(struct task_struct *child)
  12770{
  12771	struct perf_event *event, *tmp;
  12772	int ctxn;
  12773
  12774	mutex_lock(&child->perf_event_mutex);
  12775	list_for_each_entry_safe(event, tmp, &child->perf_event_list,
  12776				 owner_entry) {
  12777		list_del_init(&event->owner_entry);
  12778
  12779		/*
  12780		 * Ensure the list deletion is visible before we clear
  12781		 * the owner, closes a race against perf_release() where
  12782		 * we need to serialize on the owner->perf_event_mutex.
  12783		 */
  12784		smp_store_release(&event->owner, NULL);
  12785	}
  12786	mutex_unlock(&child->perf_event_mutex);
  12787
  12788	for_each_task_context_nr(ctxn)
  12789		perf_event_exit_task_context(child, ctxn);
  12790
  12791	/*
  12792	 * The perf_event_exit_task_context calls perf_event_task
  12793	 * with child's task_ctx, which generates EXIT events for
  12794	 * child contexts and sets child->perf_event_ctxp[] to NULL.
  12795	 * At this point we need to send EXIT events to cpu contexts.
  12796	 */
  12797	perf_event_task(child, NULL, 0);
  12798}
  12799
  12800static void perf_free_event(struct perf_event *event,
  12801			    struct perf_event_context *ctx)
  12802{
  12803	struct perf_event *parent = event->parent;
  12804
  12805	if (WARN_ON_ONCE(!parent))
  12806		return;
  12807
  12808	mutex_lock(&parent->child_mutex);
  12809	list_del_init(&event->child_list);
  12810	mutex_unlock(&parent->child_mutex);
  12811
  12812	put_event(parent);
  12813
  12814	raw_spin_lock_irq(&ctx->lock);
  12815	perf_group_detach(event);
  12816	list_del_event(event, ctx);
  12817	raw_spin_unlock_irq(&ctx->lock);
  12818	free_event(event);
  12819}
  12820
  12821/*
  12822 * Free a context as created by inheritance by perf_event_init_task() below,
  12823 * used by fork() in case of fail.
  12824 *
  12825 * Even though the task has never lived, the context and events have been
  12826 * exposed through the child_list, so we must take care tearing it all down.
  12827 */
  12828void perf_event_free_task(struct task_struct *task)
  12829{
  12830	struct perf_event_context *ctx;
  12831	struct perf_event *event, *tmp;
  12832	int ctxn;
  12833
  12834	for_each_task_context_nr(ctxn) {
  12835		ctx = task->perf_event_ctxp[ctxn];
  12836		if (!ctx)
  12837			continue;
  12838
  12839		mutex_lock(&ctx->mutex);
  12840		raw_spin_lock_irq(&ctx->lock);
  12841		/*
  12842		 * Destroy the task <-> ctx relation and mark the context dead.
  12843		 *
  12844		 * This is important because even though the task hasn't been
  12845		 * exposed yet the context has been (through child_list).
  12846		 */
  12847		RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
  12848		WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
  12849		put_task_struct(task); /* cannot be last */
  12850		raw_spin_unlock_irq(&ctx->lock);
  12851
  12852		list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
  12853			perf_free_event(event, ctx);
  12854
  12855		mutex_unlock(&ctx->mutex);
  12856
  12857		/*
  12858		 * perf_event_release_kernel() could've stolen some of our
  12859		 * child events and still have them on its free_list. In that
  12860		 * case we must wait for these events to have been freed (in
  12861		 * particular all their references to this task must've been
  12862		 * dropped).
  12863		 *
  12864		 * Without this copy_process() will unconditionally free this
  12865		 * task (irrespective of its reference count) and
  12866		 * _free_event()'s put_task_struct(event->hw.target) will be a
  12867		 * use-after-free.
  12868		 *
  12869		 * Wait for all events to drop their context reference.
  12870		 */
  12871		wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
  12872		put_ctx(ctx); /* must be last */
  12873	}
  12874}
  12875
  12876void perf_event_delayed_put(struct task_struct *task)
  12877{
  12878	int ctxn;
  12879
  12880	for_each_task_context_nr(ctxn)
  12881		WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
  12882}
  12883
  12884struct file *perf_event_get(unsigned int fd)
  12885{
  12886	struct file *file = fget(fd);
  12887	if (!file)
  12888		return ERR_PTR(-EBADF);
  12889
  12890	if (file->f_op != &perf_fops) {
  12891		fput(file);
  12892		return ERR_PTR(-EBADF);
  12893	}
  12894
  12895	return file;
  12896}
  12897
  12898const struct perf_event *perf_get_event(struct file *file)
  12899{
  12900	if (file->f_op != &perf_fops)
  12901		return ERR_PTR(-EINVAL);
  12902
  12903	return file->private_data;
  12904}
  12905
  12906const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
  12907{
  12908	if (!event)
  12909		return ERR_PTR(-EINVAL);
  12910
  12911	return &event->attr;
  12912}
  12913
  12914/*
  12915 * Inherit an event from parent task to child task.
  12916 *
  12917 * Returns:
  12918 *  - valid pointer on success
  12919 *  - NULL for orphaned events
  12920 *  - IS_ERR() on error
  12921 */
  12922static struct perf_event *
  12923inherit_event(struct perf_event *parent_event,
  12924	      struct task_struct *parent,
  12925	      struct perf_event_context *parent_ctx,
  12926	      struct task_struct *child,
  12927	      struct perf_event *group_leader,
  12928	      struct perf_event_context *child_ctx)
  12929{
  12930	enum perf_event_state parent_state = parent_event->state;
  12931	struct perf_event *child_event;
  12932	unsigned long flags;
  12933
  12934	/*
  12935	 * Instead of creating recursive hierarchies of events,
  12936	 * we link inherited events back to the original parent,
  12937	 * which has a filp for sure, which we use as the reference
  12938	 * count:
  12939	 */
  12940	if (parent_event->parent)
  12941		parent_event = parent_event->parent;
  12942
  12943	child_event = perf_event_alloc(&parent_event->attr,
  12944					   parent_event->cpu,
  12945					   child,
  12946					   group_leader, parent_event,
  12947					   NULL, NULL, -1);
  12948	if (IS_ERR(child_event))
  12949		return child_event;
  12950
  12951
  12952	if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) &&
  12953	    !child_ctx->task_ctx_data) {
  12954		struct pmu *pmu = child_event->pmu;
  12955
  12956		child_ctx->task_ctx_data = alloc_task_ctx_data(pmu);
  12957		if (!child_ctx->task_ctx_data) {
  12958			free_event(child_event);
  12959			return ERR_PTR(-ENOMEM);
  12960		}
  12961	}
  12962
  12963	/*
  12964	 * is_orphaned_event() and list_add_tail(&parent_event->child_list)
  12965	 * must be under the same lock in order to serialize against
  12966	 * perf_event_release_kernel(), such that either we must observe
  12967	 * is_orphaned_event() or they will observe us on the child_list.
  12968	 */
  12969	mutex_lock(&parent_event->child_mutex);
  12970	if (is_orphaned_event(parent_event) ||
  12971	    !atomic_long_inc_not_zero(&parent_event->refcount)) {
  12972		mutex_unlock(&parent_event->child_mutex);
  12973		/* task_ctx_data is freed with child_ctx */
  12974		free_event(child_event);
  12975		return NULL;
  12976	}
  12977
  12978	get_ctx(child_ctx);
  12979
  12980	/*
  12981	 * Make the child state follow the state of the parent event,
  12982	 * not its attr.disabled bit.  We hold the parent's mutex,
  12983	 * so we won't race with perf_event_{en, dis}able_family.
  12984	 */
  12985	if (parent_state >= PERF_EVENT_STATE_INACTIVE)
  12986		child_event->state = PERF_EVENT_STATE_INACTIVE;
  12987	else
  12988		child_event->state = PERF_EVENT_STATE_OFF;
  12989
  12990	if (parent_event->attr.freq) {
  12991		u64 sample_period = parent_event->hw.sample_period;
  12992		struct hw_perf_event *hwc = &child_event->hw;
  12993
  12994		hwc->sample_period = sample_period;
  12995		hwc->last_period   = sample_period;
  12996
  12997		local64_set(&hwc->period_left, sample_period);
  12998	}
  12999
  13000	child_event->ctx = child_ctx;
  13001	child_event->overflow_handler = parent_event->overflow_handler;
  13002	child_event->overflow_handler_context
  13003		= parent_event->overflow_handler_context;
  13004
  13005	/*
  13006	 * Precalculate sample_data sizes
  13007	 */
  13008	perf_event__header_size(child_event);
  13009	perf_event__id_header_size(child_event);
  13010
  13011	/*
  13012	 * Link it up in the child's context:
  13013	 */
  13014	raw_spin_lock_irqsave(&child_ctx->lock, flags);
  13015	add_event_to_ctx(child_event, child_ctx);
  13016	child_event->attach_state |= PERF_ATTACH_CHILD;
  13017	raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
  13018
  13019	/*
  13020	 * Link this into the parent event's child list
  13021	 */
  13022	list_add_tail(&child_event->child_list, &parent_event->child_list);
  13023	mutex_unlock(&parent_event->child_mutex);
  13024
  13025	return child_event;
  13026}
  13027
  13028/*
  13029 * Inherits an event group.
  13030 *
  13031 * This will quietly suppress orphaned events; !inherit_event() is not an error.
  13032 * This matches with perf_event_release_kernel() removing all child events.
  13033 *
  13034 * Returns:
  13035 *  - 0 on success
  13036 *  - <0 on error
  13037 */
  13038static int inherit_group(struct perf_event *parent_event,
  13039	      struct task_struct *parent,
  13040	      struct perf_event_context *parent_ctx,
  13041	      struct task_struct *child,
  13042	      struct perf_event_context *child_ctx)
  13043{
  13044	struct perf_event *leader;
  13045	struct perf_event *sub;
  13046	struct perf_event *child_ctr;
  13047
  13048	leader = inherit_event(parent_event, parent, parent_ctx,
  13049				 child, NULL, child_ctx);
  13050	if (IS_ERR(leader))
  13051		return PTR_ERR(leader);
  13052	/*
  13053	 * @leader can be NULL here because of is_orphaned_event(). In this
  13054	 * case inherit_event() will create individual events, similar to what
  13055	 * perf_group_detach() would do anyway.
  13056	 */
  13057	for_each_sibling_event(sub, parent_event) {
  13058		child_ctr = inherit_event(sub, parent, parent_ctx,
  13059					    child, leader, child_ctx);
  13060		if (IS_ERR(child_ctr))
  13061			return PTR_ERR(child_ctr);
  13062
  13063		if (sub->aux_event == parent_event && child_ctr &&
  13064		    !perf_get_aux_event(child_ctr, leader))
  13065			return -EINVAL;
  13066	}
  13067	return 0;
  13068}
  13069
  13070/*
  13071 * Creates the child task context and tries to inherit the event-group.
  13072 *
  13073 * Clears @inherited_all on !attr.inherited or error. Note that we'll leave
  13074 * inherited_all set when we 'fail' to inherit an orphaned event; this is
  13075 * consistent with perf_event_release_kernel() removing all child events.
  13076 *
  13077 * Returns:
  13078 *  - 0 on success
  13079 *  - <0 on error
  13080 */
  13081static int
  13082inherit_task_group(struct perf_event *event, struct task_struct *parent,
  13083		   struct perf_event_context *parent_ctx,
  13084		   struct task_struct *child, int ctxn,
  13085		   u64 clone_flags, int *inherited_all)
  13086{
  13087	int ret;
  13088	struct perf_event_context *child_ctx;
  13089
  13090	if (!event->attr.inherit ||
  13091	    (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) ||
  13092	    /* Do not inherit if sigtrap and signal handlers were cleared. */
  13093	    (event->attr.sigtrap && (clone_flags & CLONE_CLEAR_SIGHAND))) {
  13094		*inherited_all = 0;
  13095		return 0;
  13096	}
  13097
  13098	child_ctx = child->perf_event_ctxp[ctxn];
  13099	if (!child_ctx) {
  13100		/*
  13101		 * This is executed from the parent task context, so
  13102		 * inherit events that have been marked for cloning.
  13103		 * First allocate and initialize a context for the
  13104		 * child.
  13105		 */
  13106		child_ctx = alloc_perf_context(parent_ctx->pmu, child);
  13107		if (!child_ctx)
  13108			return -ENOMEM;
  13109
  13110		child->perf_event_ctxp[ctxn] = child_ctx;
  13111	}
  13112
  13113	ret = inherit_group(event, parent, parent_ctx,
  13114			    child, child_ctx);
  13115
  13116	if (ret)
  13117		*inherited_all = 0;
  13118
  13119	return ret;
  13120}
  13121
  13122/*
  13123 * Initialize the perf_event context in task_struct
  13124 */
  13125static int perf_event_init_context(struct task_struct *child, int ctxn,
  13126				   u64 clone_flags)
  13127{
  13128	struct perf_event_context *child_ctx, *parent_ctx;
  13129	struct perf_event_context *cloned_ctx;
  13130	struct perf_event *event;
  13131	struct task_struct *parent = current;
  13132	int inherited_all = 1;
  13133	unsigned long flags;
  13134	int ret = 0;
  13135
  13136	if (likely(!parent->perf_event_ctxp[ctxn]))
  13137		return 0;
  13138
  13139	/*
  13140	 * If the parent's context is a clone, pin it so it won't get
  13141	 * swapped under us.
  13142	 */
  13143	parent_ctx = perf_pin_task_context(parent, ctxn);
  13144	if (!parent_ctx)
  13145		return 0;
  13146
  13147	/*
  13148	 * No need to check if parent_ctx != NULL here; since we saw
  13149	 * it non-NULL earlier, the only reason for it to become NULL
  13150	 * is if we exit, and since we're currently in the middle of
  13151	 * a fork we can't be exiting at the same time.
  13152	 */
  13153
  13154	/*
  13155	 * Lock the parent list. No need to lock the child - not PID
  13156	 * hashed yet and not running, so nobody can access it.
  13157	 */
  13158	mutex_lock(&parent_ctx->mutex);
  13159
  13160	/*
  13161	 * We dont have to disable NMIs - we are only looking at
  13162	 * the list, not manipulating it:
  13163	 */
  13164	perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
  13165		ret = inherit_task_group(event, parent, parent_ctx,
  13166					 child, ctxn, clone_flags,
  13167					 &inherited_all);
  13168		if (ret)
  13169			goto out_unlock;
  13170	}
  13171
  13172	/*
  13173	 * We can't hold ctx->lock when iterating the ->flexible_group list due
  13174	 * to allocations, but we need to prevent rotation because
  13175	 * rotate_ctx() will change the list from interrupt context.
  13176	 */
  13177	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
  13178	parent_ctx->rotate_disable = 1;
  13179	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
  13180
  13181	perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
  13182		ret = inherit_task_group(event, parent, parent_ctx,
  13183					 child, ctxn, clone_flags,
  13184					 &inherited_all);
  13185		if (ret)
  13186			goto out_unlock;
  13187	}
  13188
  13189	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
  13190	parent_ctx->rotate_disable = 0;
  13191
  13192	child_ctx = child->perf_event_ctxp[ctxn];
  13193
  13194	if (child_ctx && inherited_all) {
  13195		/*
  13196		 * Mark the child context as a clone of the parent
  13197		 * context, or of whatever the parent is a clone of.
  13198		 *
  13199		 * Note that if the parent is a clone, the holding of
  13200		 * parent_ctx->lock avoids it from being uncloned.
  13201		 */
  13202		cloned_ctx = parent_ctx->parent_ctx;
  13203		if (cloned_ctx) {
  13204			child_ctx->parent_ctx = cloned_ctx;
  13205			child_ctx->parent_gen = parent_ctx->parent_gen;
  13206		} else {
  13207			child_ctx->parent_ctx = parent_ctx;
  13208			child_ctx->parent_gen = parent_ctx->generation;
  13209		}
  13210		get_ctx(child_ctx->parent_ctx);
  13211	}
  13212
  13213	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
  13214out_unlock:
  13215	mutex_unlock(&parent_ctx->mutex);
  13216
  13217	perf_unpin_context(parent_ctx);
  13218	put_ctx(parent_ctx);
  13219
  13220	return ret;
  13221}
  13222
  13223/*
  13224 * Initialize the perf_event context in task_struct
  13225 */
  13226int perf_event_init_task(struct task_struct *child, u64 clone_flags)
  13227{
  13228	int ctxn, ret;
  13229
  13230	memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
  13231	mutex_init(&child->perf_event_mutex);
  13232	INIT_LIST_HEAD(&child->perf_event_list);
  13233
  13234	for_each_task_context_nr(ctxn) {
  13235		ret = perf_event_init_context(child, ctxn, clone_flags);
  13236		if (ret) {
  13237			perf_event_free_task(child);
  13238			return ret;
  13239		}
  13240	}
  13241
  13242	return 0;
  13243}
  13244
  13245static void __init perf_event_init_all_cpus(void)
  13246{
  13247	struct swevent_htable *swhash;
  13248	int cpu;
  13249
  13250	zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
  13251
  13252	for_each_possible_cpu(cpu) {
  13253		swhash = &per_cpu(swevent_htable, cpu);
  13254		mutex_init(&swhash->hlist_mutex);
  13255		INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
  13256
  13257		INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
  13258		raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
  13259
  13260#ifdef CONFIG_CGROUP_PERF
  13261		INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
  13262#endif
  13263		INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
  13264	}
  13265}
  13266
  13267static void perf_swevent_init_cpu(unsigned int cpu)
  13268{
  13269	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
  13270
  13271	mutex_lock(&swhash->hlist_mutex);
  13272	if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
  13273		struct swevent_hlist *hlist;
  13274
  13275		hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
  13276		WARN_ON(!hlist);
  13277		rcu_assign_pointer(swhash->swevent_hlist, hlist);
  13278	}
  13279	mutex_unlock(&swhash->hlist_mutex);
  13280}
  13281
  13282#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
  13283static void __perf_event_exit_context(void *__info)
  13284{
  13285	struct perf_event_context *ctx = __info;
  13286	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
  13287	struct perf_event *event;
  13288
  13289	raw_spin_lock(&ctx->lock);
  13290	ctx_sched_out(ctx, cpuctx, EVENT_TIME);
  13291	list_for_each_entry(event, &ctx->event_list, event_entry)
  13292		__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
  13293	raw_spin_unlock(&ctx->lock);
  13294}
  13295
  13296static void perf_event_exit_cpu_context(int cpu)
  13297{
  13298	struct perf_cpu_context *cpuctx;
  13299	struct perf_event_context *ctx;
  13300	struct pmu *pmu;
  13301
  13302	mutex_lock(&pmus_lock);
  13303	list_for_each_entry(pmu, &pmus, entry) {
  13304		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
  13305		ctx = &cpuctx->ctx;
  13306
  13307		mutex_lock(&ctx->mutex);
  13308		smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
  13309		cpuctx->online = 0;
  13310		mutex_unlock(&ctx->mutex);
  13311	}
  13312	cpumask_clear_cpu(cpu, perf_online_mask);
  13313	mutex_unlock(&pmus_lock);
  13314}
  13315#else
  13316
  13317static void perf_event_exit_cpu_context(int cpu) { }
  13318
  13319#endif
  13320
  13321int perf_event_init_cpu(unsigned int cpu)
  13322{
  13323	struct perf_cpu_context *cpuctx;
  13324	struct perf_event_context *ctx;
  13325	struct pmu *pmu;
  13326
  13327	perf_swevent_init_cpu(cpu);
  13328
  13329	mutex_lock(&pmus_lock);
  13330	cpumask_set_cpu(cpu, perf_online_mask);
  13331	list_for_each_entry(pmu, &pmus, entry) {
  13332		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
  13333		ctx = &cpuctx->ctx;
  13334
  13335		mutex_lock(&ctx->mutex);
  13336		cpuctx->online = 1;
  13337		mutex_unlock(&ctx->mutex);
  13338	}
  13339	mutex_unlock(&pmus_lock);
  13340
  13341	return 0;
  13342}
  13343
  13344int perf_event_exit_cpu(unsigned int cpu)
  13345{
  13346	perf_event_exit_cpu_context(cpu);
  13347	return 0;
  13348}
  13349
  13350static int
  13351perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
  13352{
  13353	int cpu;
  13354
  13355	for_each_online_cpu(cpu)
  13356		perf_event_exit_cpu(cpu);
  13357
  13358	return NOTIFY_OK;
  13359}
  13360
  13361/*
  13362 * Run the perf reboot notifier at the very last possible moment so that
  13363 * the generic watchdog code runs as long as possible.
  13364 */
  13365static struct notifier_block perf_reboot_notifier = {
  13366	.notifier_call = perf_reboot,
  13367	.priority = INT_MIN,
  13368};
  13369
  13370void __init perf_event_init(void)
  13371{
  13372	int ret;
  13373
  13374	idr_init(&pmu_idr);
  13375
  13376	perf_event_init_all_cpus();
  13377	init_srcu_struct(&pmus_srcu);
  13378	perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
  13379	perf_pmu_register(&perf_cpu_clock, NULL, -1);
  13380	perf_pmu_register(&perf_task_clock, NULL, -1);
  13381	perf_tp_register();
  13382	perf_event_init_cpu(smp_processor_id());
  13383	register_reboot_notifier(&perf_reboot_notifier);
  13384
  13385	ret = init_hw_breakpoint();
  13386	WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
  13387
  13388	perf_event_cache = KMEM_CACHE(perf_event, SLAB_PANIC);
  13389
  13390	/*
  13391	 * Build time assertion that we keep the data_head at the intended
  13392	 * location.  IOW, validation we got the __reserved[] size right.
  13393	 */
  13394	BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
  13395		     != 1024);
  13396}
  13397
  13398ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
  13399			      char *page)
  13400{
  13401	struct perf_pmu_events_attr *pmu_attr =
  13402		container_of(attr, struct perf_pmu_events_attr, attr);
  13403
  13404	if (pmu_attr->event_str)
  13405		return sprintf(page, "%s\n", pmu_attr->event_str);
  13406
  13407	return 0;
  13408}
  13409EXPORT_SYMBOL_GPL(perf_event_sysfs_show);
  13410
  13411static int __init perf_event_sysfs_init(void)
  13412{
  13413	struct pmu *pmu;
  13414	int ret;
  13415
  13416	mutex_lock(&pmus_lock);
  13417
  13418	ret = bus_register(&pmu_bus);
  13419	if (ret)
  13420		goto unlock;
  13421
  13422	list_for_each_entry(pmu, &pmus, entry) {
  13423		if (!pmu->name || pmu->type < 0)
  13424			continue;
  13425
  13426		ret = pmu_dev_alloc(pmu);
  13427		WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
  13428	}
  13429	pmu_bus_running = 1;
  13430	ret = 0;
  13431
  13432unlock:
  13433	mutex_unlock(&pmus_lock);
  13434
  13435	return ret;
  13436}
  13437device_initcall(perf_event_sysfs_init);
  13438
  13439#ifdef CONFIG_CGROUP_PERF
  13440static struct cgroup_subsys_state *
  13441perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
  13442{
  13443	struct perf_cgroup *jc;
  13444
  13445	jc = kzalloc(sizeof(*jc), GFP_KERNEL);
  13446	if (!jc)
  13447		return ERR_PTR(-ENOMEM);
  13448
  13449	jc->info = alloc_percpu(struct perf_cgroup_info);
  13450	if (!jc->info) {
  13451		kfree(jc);
  13452		return ERR_PTR(-ENOMEM);
  13453	}
  13454
  13455	return &jc->css;
  13456}
  13457
  13458static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
  13459{
  13460	struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
  13461
  13462	free_percpu(jc->info);
  13463	kfree(jc);
  13464}
  13465
  13466static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
  13467{
  13468	perf_event_cgroup(css->cgroup);
  13469	return 0;
  13470}
  13471
  13472static int __perf_cgroup_move(void *info)
  13473{
  13474	struct task_struct *task = info;
  13475	rcu_read_lock();
  13476	perf_cgroup_switch(task);
  13477	rcu_read_unlock();
  13478	return 0;
  13479}
  13480
  13481static void perf_cgroup_attach(struct cgroup_taskset *tset)
  13482{
  13483	struct task_struct *task;
  13484	struct cgroup_subsys_state *css;
  13485
  13486	cgroup_taskset_for_each(task, css, tset)
  13487		task_function_call(task, __perf_cgroup_move, task);
  13488}
  13489
  13490struct cgroup_subsys perf_event_cgrp_subsys = {
  13491	.css_alloc	= perf_cgroup_css_alloc,
  13492	.css_free	= perf_cgroup_css_free,
  13493	.css_online	= perf_cgroup_css_online,
  13494	.attach		= perf_cgroup_attach,
  13495	/*
  13496	 * Implicitly enable on dfl hierarchy so that perf events can
  13497	 * always be filtered by cgroup2 path as long as perf_event
  13498	 * controller is not mounted on a legacy hierarchy.
  13499	 */
  13500	.implicit_on_dfl = true,
  13501	.threaded	= true,
  13502};
  13503#endif /* CONFIG_CGROUP_PERF */
  13504
/*
 * Static call named perf_snapshot_branch_stack, typed after
 * perf_snapshot_branch_stack_t.
 * NOTE(review): the _RET0 variant presumably starts out pointing at a
 * default target that returns 0 until a real one is installed -
 * confirm against <linux/static_call.h>.
 */
DEFINE_STATIC_CALL_RET0(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t);