ring_buffer.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
ring_buffer.c (23418B)
      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Performance events ring-buffer code:
      4 *
      5 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
      6 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
      7 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
      8 *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
      9 */
     10
     11#include <linux/perf_event.h>
     12#include <linux/vmalloc.h>
     13#include <linux/slab.h>
     14#include <linux/circ_buf.h>
     15#include <linux/poll.h>
     16#include <linux/nospec.h>
     17
     18#include "internal.h"
     19
     20static void perf_output_wakeup(struct perf_output_handle *handle)
     21{
     22	atomic_set(&handle->rb->poll, EPOLLIN);
     23
     24	handle->event->pending_wakeup = 1;
     25	irq_work_queue(&handle->event->pending);
     26}
     27
     28/*
     29 * We need to ensure a later event_id doesn't publish a head when a former
     30 * event isn't done writing. However since we need to deal with NMIs we
     31 * cannot fully serialize things.
     32 *
     33 * We only publish the head (and generate a wakeup) when the outer-most
     34 * event completes.
     35 */
     36static void perf_output_get_handle(struct perf_output_handle *handle)
     37{
     38	struct perf_buffer *rb = handle->rb;
     39
     40	preempt_disable();
     41
     42	/*
     43	 * Avoid an explicit LOAD/STORE such that architectures with memops
     44	 * can use them.
     45	 */
     46	(*(volatile unsigned int *)&rb->nest)++;
     47	handle->wakeup = local_read(&rb->wakeup);
     48}
     49
     50static void perf_output_put_handle(struct perf_output_handle *handle)
     51{
     52	struct perf_buffer *rb = handle->rb;
     53	unsigned long head;
     54	unsigned int nest;
     55
     56	/*
     57	 * If this isn't the outermost nesting, we don't have to update
     58	 * @rb->user_page->data_head.
     59	 */
     60	nest = READ_ONCE(rb->nest);
     61	if (nest > 1) {
     62		WRITE_ONCE(rb->nest, nest - 1);
     63		goto out;
     64	}
     65
     66again:
     67	/*
     68	 * In order to avoid publishing a head value that goes backwards,
     69	 * we must ensure the load of @rb->head happens after we've
     70	 * incremented @rb->nest.
     71	 *
     72	 * Otherwise we can observe a @rb->head value before one published
     73	 * by an IRQ/NMI happening between the load and the increment.
     74	 */
     75	barrier();
     76	head = local_read(&rb->head);
     77
     78	/*
     79	 * IRQ/NMI can happen here and advance @rb->head, causing our
     80	 * load above to be stale.
     81	 */
     82
     83	/*
     84	 * Since the mmap() consumer (userspace) can run on a different CPU:
     85	 *
     86	 *   kernel				user
     87	 *
     88	 *   if (LOAD ->data_tail) {		LOAD ->data_head
     89	 *			(A)		smp_rmb()	(C)
     90	 *	STORE $data			LOAD $data
     91	 *	smp_wmb()	(B)		smp_mb()	(D)
     92	 *	STORE ->data_head		STORE ->data_tail
     93	 *   }
     94	 *
     95	 * Where A pairs with D, and B pairs with C.
     96	 *
     97	 * In our case (A) is a control dependency that separates the load of
     98	 * the ->data_tail and the stores of $data. In case ->data_tail
     99	 * indicates there is no room in the buffer to store $data we do not.
    100	 *
    101	 * D needs to be a full barrier since it separates the data READ
    102	 * from the tail WRITE.
    103	 *
    104	 * For B a WMB is sufficient since it separates two WRITEs, and for C
    105	 * an RMB is sufficient since it separates two READs.
    106	 *
    107	 * See perf_output_begin().
    108	 */
    109	smp_wmb(); /* B, matches C */
    110	WRITE_ONCE(rb->user_page->data_head, head);
    111
    112	/*
    113	 * We must publish the head before decrementing the nest count,
    114	 * otherwise an IRQ/NMI can publish a more recent head value and our
    115	 * write will (temporarily) publish a stale value.
    116	 */
    117	barrier();
    118	WRITE_ONCE(rb->nest, 0);
    119
    120	/*
    121	 * Ensure we decrement @rb->nest before we validate the @rb->head.
    122	 * Otherwise we cannot be sure we caught the 'last' nested update.
    123	 */
    124	barrier();
    125	if (unlikely(head != local_read(&rb->head))) {
    126		WRITE_ONCE(rb->nest, 1);
    127		goto again;
    128	}
    129
    130	if (handle->wakeup != local_read(&rb->wakeup))
    131		perf_output_wakeup(handle);
    132
    133out:
    134	preempt_enable();
    135}
    136
    137static __always_inline bool
    138ring_buffer_has_space(unsigned long head, unsigned long tail,
    139		      unsigned long data_size, unsigned int size,
    140		      bool backward)
    141{
    142	if (!backward)
    143		return CIRC_SPACE(head, tail, data_size) >= size;
    144	else
    145		return CIRC_SPACE(tail, head, data_size) >= size;
    146}
    147
    148static __always_inline int
    149__perf_output_begin(struct perf_output_handle *handle,
    150		    struct perf_sample_data *data,
    151		    struct perf_event *event, unsigned int size,
    152		    bool backward)
    153{
    154	struct perf_buffer *rb;
    155	unsigned long tail, offset, head;
    156	int have_lost, page_shift;
    157	struct {
    158		struct perf_event_header header;
    159		u64			 id;
    160		u64			 lost;
    161	} lost_event;
    162
    163	rcu_read_lock();
    164	/*
    165	 * For inherited events we send all the output towards the parent.
    166	 */
    167	if (event->parent)
    168		event = event->parent;
    169
    170	rb = rcu_dereference(event->rb);
    171	if (unlikely(!rb))
    172		goto out;
    173
    174	if (unlikely(rb->paused)) {
    175		if (rb->nr_pages)
    176			local_inc(&rb->lost);
    177		goto out;
    178	}
    179
    180	handle->rb    = rb;
    181	handle->event = event;
    182
    183	have_lost = local_read(&rb->lost);
    184	if (unlikely(have_lost)) {
    185		size += sizeof(lost_event);
    186		if (event->attr.sample_id_all)
    187			size += event->id_header_size;
    188	}
    189
    190	perf_output_get_handle(handle);
    191
    192	do {
    193		tail = READ_ONCE(rb->user_page->data_tail);
    194		offset = head = local_read(&rb->head);
    195		if (!rb->overwrite) {
    196			if (unlikely(!ring_buffer_has_space(head, tail,
    197							    perf_data_size(rb),
    198							    size, backward)))
    199				goto fail;
    200		}
    201
    202		/*
    203		 * The above forms a control dependency barrier separating the
    204		 * @tail load above from the data stores below. Since the @tail
    205		 * load is required to compute the branch to fail below.
    206		 *
    207		 * A, matches D; the full memory barrier userspace SHOULD issue
    208		 * after reading the data and before storing the new tail
    209		 * position.
    210		 *
    211		 * See perf_output_put_handle().
    212		 */
    213
    214		if (!backward)
    215			head += size;
    216		else
    217			head -= size;
    218	} while (local_cmpxchg(&rb->head, offset, head) != offset);
    219
    220	if (backward) {
    221		offset = head;
    222		head = (u64)(-head);
    223	}
    224
    225	/*
    226	 * We rely on the implied barrier() by local_cmpxchg() to ensure
    227	 * none of the data stores below can be lifted up by the compiler.
    228	 */
    229
    230	if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
    231		local_add(rb->watermark, &rb->wakeup);
    232
    233	page_shift = PAGE_SHIFT + page_order(rb);
    234
    235	handle->page = (offset >> page_shift) & (rb->nr_pages - 1);
    236	offset &= (1UL << page_shift) - 1;
    237	handle->addr = rb->data_pages[handle->page] + offset;
    238	handle->size = (1UL << page_shift) - offset;
    239
    240	if (unlikely(have_lost)) {
    241		lost_event.header.size = sizeof(lost_event);
    242		lost_event.header.type = PERF_RECORD_LOST;
    243		lost_event.header.misc = 0;
    244		lost_event.id          = event->id;
    245		lost_event.lost        = local_xchg(&rb->lost, 0);
    246
    247		/* XXX mostly redundant; @data is already fully initializes */
    248		perf_event_header__init_id(&lost_event.header, data, event);
    249		perf_output_put(handle, lost_event);
    250		perf_event__output_id_sample(event, handle, data);
    251	}
    252
    253	return 0;
    254
    255fail:
    256	local_inc(&rb->lost);
    257	perf_output_put_handle(handle);
    258out:
    259	rcu_read_unlock();
    260
    261	return -ENOSPC;
    262}
    263
    264int perf_output_begin_forward(struct perf_output_handle *handle,
    265			      struct perf_sample_data *data,
    266			      struct perf_event *event, unsigned int size)
    267{
    268	return __perf_output_begin(handle, data, event, size, false);
    269}
    270
    271int perf_output_begin_backward(struct perf_output_handle *handle,
    272			       struct perf_sample_data *data,
    273			       struct perf_event *event, unsigned int size)
    274{
    275	return __perf_output_begin(handle, data, event, size, true);
    276}
    277
    278int perf_output_begin(struct perf_output_handle *handle,
    279		      struct perf_sample_data *data,
    280		      struct perf_event *event, unsigned int size)
    281{
    282
    283	return __perf_output_begin(handle, data, event, size,
    284				   unlikely(is_write_backward(event)));
    285}
    286
    287unsigned int perf_output_copy(struct perf_output_handle *handle,
    288		      const void *buf, unsigned int len)
    289{
    290	return __output_copy(handle, buf, len);
    291}
    292
    293unsigned int perf_output_skip(struct perf_output_handle *handle,
    294			      unsigned int len)
    295{
    296	return __output_skip(handle, NULL, len);
    297}
    298
    299void perf_output_end(struct perf_output_handle *handle)
    300{
    301	perf_output_put_handle(handle);
    302	rcu_read_unlock();
    303}
    304
    305static void
    306ring_buffer_init(struct perf_buffer *rb, long watermark, int flags)
    307{
    308	long max_size = perf_data_size(rb);
    309
    310	if (watermark)
    311		rb->watermark = min(max_size, watermark);
    312
    313	if (!rb->watermark)
    314		rb->watermark = max_size / 2;
    315
    316	if (flags & RING_BUFFER_WRITABLE)
    317		rb->overwrite = 0;
    318	else
    319		rb->overwrite = 1;
    320
    321	refcount_set(&rb->refcount, 1);
    322
    323	INIT_LIST_HEAD(&rb->event_list);
    324	spin_lock_init(&rb->event_lock);
    325
    326	/*
    327	 * perf_output_begin() only checks rb->paused, therefore
    328	 * rb->paused must be true if we have no pages for output.
    329	 */
    330	if (!rb->nr_pages)
    331		rb->paused = 1;
    332}
    333
    334void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags)
    335{
    336	/*
    337	 * OVERWRITE is determined by perf_aux_output_end() and can't
    338	 * be passed in directly.
    339	 */
    340	if (WARN_ON_ONCE(flags & PERF_AUX_FLAG_OVERWRITE))
    341		return;
    342
    343	handle->aux_flags |= flags;
    344}
    345EXPORT_SYMBOL_GPL(perf_aux_output_flag);
    346
    347/*
    348 * This is called before hardware starts writing to the AUX area to
    349 * obtain an output handle and make sure there's room in the buffer.
    350 * When the capture completes, call perf_aux_output_end() to commit
    351 * the recorded data to the buffer.
    352 *
    353 * The ordering is similar to that of perf_output_{begin,end}, with
    354 * the exception of (B), which should be taken care of by the pmu
    355 * driver, since ordering rules will differ depending on hardware.
    356 *
    357 * Call this from pmu::start(); see the comment in perf_aux_output_end()
    358 * about its use in pmu callbacks. Both can also be called from the PMI
    359 * handler if needed.
    360 */
    361void *perf_aux_output_begin(struct perf_output_handle *handle,
    362			    struct perf_event *event)
    363{
    364	struct perf_event *output_event = event;
    365	unsigned long aux_head, aux_tail;
    366	struct perf_buffer *rb;
    367	unsigned int nest;
    368
    369	if (output_event->parent)
    370		output_event = output_event->parent;
    371
    372	/*
    373	 * Since this will typically be open across pmu::add/pmu::del, we
    374	 * grab ring_buffer's refcount instead of holding rcu read lock
    375	 * to make sure it doesn't disappear under us.
    376	 */
    377	rb = ring_buffer_get(output_event);
    378	if (!rb)
    379		return NULL;
    380
    381	if (!rb_has_aux(rb))
    382		goto err;
    383
    384	/*
    385	 * If aux_mmap_count is zero, the aux buffer is in perf_mmap_close(),
    386	 * about to get freed, so we leave immediately.
    387	 *
    388	 * Checking rb::aux_mmap_count and rb::refcount has to be done in
    389	 * the same order, see perf_mmap_close. Otherwise we end up freeing
    390	 * aux pages in this path, which is a bug, because in_atomic().
    391	 */
    392	if (!atomic_read(&rb->aux_mmap_count))
    393		goto err;
    394
    395	if (!refcount_inc_not_zero(&rb->aux_refcount))
    396		goto err;
    397
    398	nest = READ_ONCE(rb->aux_nest);
    399	/*
    400	 * Nesting is not supported for AUX area, make sure nested
    401	 * writers are caught early
    402	 */
    403	if (WARN_ON_ONCE(nest))
    404		goto err_put;
    405
    406	WRITE_ONCE(rb->aux_nest, nest + 1);
    407
    408	aux_head = rb->aux_head;
    409
    410	handle->rb = rb;
    411	handle->event = event;
    412	handle->head = aux_head;
    413	handle->size = 0;
    414	handle->aux_flags = 0;
    415
    416	/*
    417	 * In overwrite mode, AUX data stores do not depend on aux_tail,
    418	 * therefore (A) control dependency barrier does not exist. The
    419	 * (B) <-> (C) ordering is still observed by the pmu driver.
    420	 */
    421	if (!rb->aux_overwrite) {
    422		aux_tail = READ_ONCE(rb->user_page->aux_tail);
    423		handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
    424		if (aux_head - aux_tail < perf_aux_size(rb))
    425			handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb));
    426
    427		/*
    428		 * handle->size computation depends on aux_tail load; this forms a
    429		 * control dependency barrier separating aux_tail load from aux data
    430		 * store that will be enabled on successful return
    431		 */
    432		if (!handle->size) { /* A, matches D */
    433			event->pending_disable = smp_processor_id();
    434			perf_output_wakeup(handle);
    435			WRITE_ONCE(rb->aux_nest, 0);
    436			goto err_put;
    437		}
    438	}
    439
    440	return handle->rb->aux_priv;
    441
    442err_put:
    443	/* can't be last */
    444	rb_free_aux(rb);
    445
    446err:
    447	ring_buffer_put(rb);
    448	handle->event = NULL;
    449
    450	return NULL;
    451}
    452EXPORT_SYMBOL_GPL(perf_aux_output_begin);
    453
    454static __always_inline bool rb_need_aux_wakeup(struct perf_buffer *rb)
    455{
    456	if (rb->aux_overwrite)
    457		return false;
    458
    459	if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
    460		rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
    461		return true;
    462	}
    463
    464	return false;
    465}
    466
    467/*
    468 * Commit the data written by hardware into the ring buffer by adjusting
    469 * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the
    470 * pmu driver's responsibility to observe ordering rules of the hardware,
    471 * so that all the data is externally visible before this is called.
    472 *
    473 * Note: this has to be called from pmu::stop() callback, as the assumption
    474 * of the AUX buffer management code is that after pmu::stop(), the AUX
    475 * transaction must be stopped and therefore drop the AUX reference count.
    476 */
    477void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
    478{
    479	bool wakeup = !!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED);
    480	struct perf_buffer *rb = handle->rb;
    481	unsigned long aux_head;
    482
    483	/* in overwrite mode, driver provides aux_head via handle */
    484	if (rb->aux_overwrite) {
    485		handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE;
    486
    487		aux_head = handle->head;
    488		rb->aux_head = aux_head;
    489	} else {
    490		handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE;
    491
    492		aux_head = rb->aux_head;
    493		rb->aux_head += size;
    494	}
    495
    496	/*
    497	 * Only send RECORD_AUX if we have something useful to communicate
    498	 *
    499	 * Note: the OVERWRITE records by themselves are not considered
    500	 * useful, as they don't communicate any *new* information,
    501	 * aside from the short-lived offset, that becomes history at
    502	 * the next event sched-in and therefore isn't useful.
    503	 * The userspace that needs to copy out AUX data in overwrite
    504	 * mode should know to use user_page::aux_head for the actual
    505	 * offset. So, from now on we don't output AUX records that
    506	 * have *only* OVERWRITE flag set.
    507	 */
    508	if (size || (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE))
    509		perf_event_aux_event(handle->event, aux_head, size,
    510				     handle->aux_flags);
    511
    512	WRITE_ONCE(rb->user_page->aux_head, rb->aux_head);
    513	if (rb_need_aux_wakeup(rb))
    514		wakeup = true;
    515
    516	if (wakeup) {
    517		if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
    518			handle->event->pending_disable = smp_processor_id();
    519		perf_output_wakeup(handle);
    520	}
    521
    522	handle->event = NULL;
    523
    524	WRITE_ONCE(rb->aux_nest, 0);
    525	/* can't be last */
    526	rb_free_aux(rb);
    527	ring_buffer_put(rb);
    528}
    529EXPORT_SYMBOL_GPL(perf_aux_output_end);
    530
    531/*
    532 * Skip over a given number of bytes in the AUX buffer, due to, for example,
    533 * hardware's alignment constraints.
    534 */
    535int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
    536{
    537	struct perf_buffer *rb = handle->rb;
    538
    539	if (size > handle->size)
    540		return -ENOSPC;
    541
    542	rb->aux_head += size;
    543
    544	WRITE_ONCE(rb->user_page->aux_head, rb->aux_head);
    545	if (rb_need_aux_wakeup(rb)) {
    546		perf_output_wakeup(handle);
    547		handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
    548	}
    549
    550	handle->head = rb->aux_head;
    551	handle->size -= size;
    552
    553	return 0;
    554}
    555EXPORT_SYMBOL_GPL(perf_aux_output_skip);
    556
    557void *perf_get_aux(struct perf_output_handle *handle)
    558{
    559	/* this is only valid between perf_aux_output_begin and *_end */
    560	if (!handle->event)
    561		return NULL;
    562
    563	return handle->rb->aux_priv;
    564}
    565EXPORT_SYMBOL_GPL(perf_get_aux);
    566
    567/*
    568 * Copy out AUX data from an AUX handle.
    569 */
    570long perf_output_copy_aux(struct perf_output_handle *aux_handle,
    571			  struct perf_output_handle *handle,
    572			  unsigned long from, unsigned long to)
    573{
    574	struct perf_buffer *rb = aux_handle->rb;
    575	unsigned long tocopy, remainder, len = 0;
    576	void *addr;
    577
    578	from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
    579	to &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
    580
    581	do {
    582		tocopy = PAGE_SIZE - offset_in_page(from);
    583		if (to > from)
    584			tocopy = min(tocopy, to - from);
    585		if (!tocopy)
    586			break;
    587
    588		addr = rb->aux_pages[from >> PAGE_SHIFT];
    589		addr += offset_in_page(from);
    590
    591		remainder = perf_output_copy(handle, addr, tocopy);
    592		if (remainder)
    593			return -EFAULT;
    594
    595		len += tocopy;
    596		from += tocopy;
    597		from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
    598	} while (to != from);
    599
    600	return len;
    601}
    602
    603#define PERF_AUX_GFP	(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
    604
    605static struct page *rb_alloc_aux_page(int node, int order)
    606{
    607	struct page *page;
    608
    609	if (order > MAX_ORDER)
    610		order = MAX_ORDER;
    611
    612	do {
    613		page = alloc_pages_node(node, PERF_AUX_GFP, order);
    614	} while (!page && order--);
    615
    616	if (page && order) {
    617		/*
    618		 * Communicate the allocation size to the driver:
    619		 * if we managed to secure a high-order allocation,
    620		 * set its first page's private to this order;
    621		 * !PagePrivate(page) means it's just a normal page.
    622		 */
    623		split_page(page, order);
    624		SetPagePrivate(page);
    625		set_page_private(page, order);
    626	}
    627
    628	return page;
    629}
    630
    631static void rb_free_aux_page(struct perf_buffer *rb, int idx)
    632{
    633	struct page *page = virt_to_page(rb->aux_pages[idx]);
    634
    635	ClearPagePrivate(page);
    636	page->mapping = NULL;
    637	__free_page(page);
    638}
    639
    640static void __rb_free_aux(struct perf_buffer *rb)
    641{
    642	int pg;
    643
    644	/*
    645	 * Should never happen, the last reference should be dropped from
    646	 * perf_mmap_close() path, which first stops aux transactions (which
    647	 * in turn are the atomic holders of aux_refcount) and then does the
    648	 * last rb_free_aux().
    649	 */
    650	WARN_ON_ONCE(in_atomic());
    651
    652	if (rb->aux_priv) {
    653		rb->free_aux(rb->aux_priv);
    654		rb->free_aux = NULL;
    655		rb->aux_priv = NULL;
    656	}
    657
    658	if (rb->aux_nr_pages) {
    659		for (pg = 0; pg < rb->aux_nr_pages; pg++)
    660			rb_free_aux_page(rb, pg);
    661
    662		kfree(rb->aux_pages);
    663		rb->aux_nr_pages = 0;
    664	}
    665}
    666
    667int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
    668		 pgoff_t pgoff, int nr_pages, long watermark, int flags)
    669{
    670	bool overwrite = !(flags & RING_BUFFER_WRITABLE);
    671	int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
    672	int ret = -ENOMEM, max_order;
    673
    674	if (!has_aux(event))
    675		return -EOPNOTSUPP;
    676
    677	if (!overwrite) {
    678		/*
    679		 * Watermark defaults to half the buffer, and so does the
    680		 * max_order, to aid PMU drivers in double buffering.
    681		 */
    682		if (!watermark)
    683			watermark = nr_pages << (PAGE_SHIFT - 1);
    684
    685		/*
    686		 * Use aux_watermark as the basis for chunking to
    687		 * help PMU drivers honor the watermark.
    688		 */
    689		max_order = get_order(watermark);
    690	} else {
    691		/*
    692		 * We need to start with the max_order that fits in nr_pages,
    693		 * not the other way around, hence ilog2() and not get_order.
    694		 */
    695		max_order = ilog2(nr_pages);
    696		watermark = 0;
    697	}
    698
    699	rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL,
    700				     node);
    701	if (!rb->aux_pages)
    702		return -ENOMEM;
    703
    704	rb->free_aux = event->pmu->free_aux;
    705	for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;) {
    706		struct page *page;
    707		int last, order;
    708
    709		order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages));
    710		page = rb_alloc_aux_page(node, order);
    711		if (!page)
    712			goto out;
    713
    714		for (last = rb->aux_nr_pages + (1 << page_private(page));
    715		     last > rb->aux_nr_pages; rb->aux_nr_pages++)
    716			rb->aux_pages[rb->aux_nr_pages] = page_address(page++);
    717	}
    718
    719	/*
    720	 * In overwrite mode, PMUs that don't support SG may not handle more
    721	 * than one contiguous allocation, since they rely on PMI to do double
    722	 * buffering. In this case, the entire buffer has to be one contiguous
    723	 * chunk.
    724	 */
    725	if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) &&
    726	    overwrite) {
    727		struct page *page = virt_to_page(rb->aux_pages[0]);
    728
    729		if (page_private(page) != max_order)
    730			goto out;
    731	}
    732
    733	rb->aux_priv = event->pmu->setup_aux(event, rb->aux_pages, nr_pages,
    734					     overwrite);
    735	if (!rb->aux_priv)
    736		goto out;
    737
    738	ret = 0;
    739
    740	/*
    741	 * aux_pages (and pmu driver's private data, aux_priv) will be
    742	 * referenced in both producer's and consumer's contexts, thus
    743	 * we keep a refcount here to make sure either of the two can
    744	 * reference them safely.
    745	 */
    746	refcount_set(&rb->aux_refcount, 1);
    747
    748	rb->aux_overwrite = overwrite;
    749	rb->aux_watermark = watermark;
    750
    751out:
    752	if (!ret)
    753		rb->aux_pgoff = pgoff;
    754	else
    755		__rb_free_aux(rb);
    756
    757	return ret;
    758}
    759
    760void rb_free_aux(struct perf_buffer *rb)
    761{
    762	if (refcount_dec_and_test(&rb->aux_refcount))
    763		__rb_free_aux(rb);
    764}
    765
    766#ifndef CONFIG_PERF_USE_VMALLOC
    767
    768/*
    769 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
    770 */
    771
    772static struct page *
    773__perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
    774{
    775	if (pgoff > rb->nr_pages)
    776		return NULL;
    777
    778	if (pgoff == 0)
    779		return virt_to_page(rb->user_page);
    780
    781	return virt_to_page(rb->data_pages[pgoff - 1]);
    782}
    783
    784static void *perf_mmap_alloc_page(int cpu)
    785{
    786	struct page *page;
    787	int node;
    788
    789	node = (cpu == -1) ? cpu : cpu_to_node(cpu);
    790	page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
    791	if (!page)
    792		return NULL;
    793
    794	return page_address(page);
    795}
    796
    797static void perf_mmap_free_page(void *addr)
    798{
    799	struct page *page = virt_to_page(addr);
    800
    801	page->mapping = NULL;
    802	__free_page(page);
    803}
    804
    805struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
    806{
    807	struct perf_buffer *rb;
    808	unsigned long size;
    809	int i, node;
    810
    811	size = sizeof(struct perf_buffer);
    812	size += nr_pages * sizeof(void *);
    813
    814	if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER)
    815		goto fail;
    816
    817	node = (cpu == -1) ? cpu : cpu_to_node(cpu);
    818	rb = kzalloc_node(size, GFP_KERNEL, node);
    819	if (!rb)
    820		goto fail;
    821
    822	rb->user_page = perf_mmap_alloc_page(cpu);
    823	if (!rb->user_page)
    824		goto fail_user_page;
    825
    826	for (i = 0; i < nr_pages; i++) {
    827		rb->data_pages[i] = perf_mmap_alloc_page(cpu);
    828		if (!rb->data_pages[i])
    829			goto fail_data_pages;
    830	}
    831
    832	rb->nr_pages = nr_pages;
    833
    834	ring_buffer_init(rb, watermark, flags);
    835
    836	return rb;
    837
    838fail_data_pages:
    839	for (i--; i >= 0; i--)
    840		perf_mmap_free_page(rb->data_pages[i]);
    841
    842	perf_mmap_free_page(rb->user_page);
    843
    844fail_user_page:
    845	kfree(rb);
    846
    847fail:
    848	return NULL;
    849}
    850
    851void rb_free(struct perf_buffer *rb)
    852{
    853	int i;
    854
    855	perf_mmap_free_page(rb->user_page);
    856	for (i = 0; i < rb->nr_pages; i++)
    857		perf_mmap_free_page(rb->data_pages[i]);
    858	kfree(rb);
    859}
    860
    861#else
    862static struct page *
    863__perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
    864{
    865	/* The '>' counts in the user page. */
    866	if (pgoff > data_page_nr(rb))
    867		return NULL;
    868
    869	return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
    870}
    871
    872static void perf_mmap_unmark_page(void *addr)
    873{
    874	struct page *page = vmalloc_to_page(addr);
    875
    876	page->mapping = NULL;
    877}
    878
    879static void rb_free_work(struct work_struct *work)
    880{
    881	struct perf_buffer *rb;
    882	void *base;
    883	int i, nr;
    884
    885	rb = container_of(work, struct perf_buffer, work);
    886	nr = data_page_nr(rb);
    887
    888	base = rb->user_page;
    889	/* The '<=' counts in the user page. */
    890	for (i = 0; i <= nr; i++)
    891		perf_mmap_unmark_page(base + (i * PAGE_SIZE));
    892
    893	vfree(base);
    894	kfree(rb);
    895}
    896
    897void rb_free(struct perf_buffer *rb)
    898{
    899	schedule_work(&rb->work);
    900}
    901
    902struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
    903{
    904	struct perf_buffer *rb;
    905	unsigned long size;
    906	void *all_buf;
    907	int node;
    908
    909	size = sizeof(struct perf_buffer);
    910	size += sizeof(void *);
    911
    912	node = (cpu == -1) ? cpu : cpu_to_node(cpu);
    913	rb = kzalloc_node(size, GFP_KERNEL, node);
    914	if (!rb)
    915		goto fail;
    916
    917	INIT_WORK(&rb->work, rb_free_work);
    918
    919	all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
    920	if (!all_buf)
    921		goto fail_all_buf;
    922
    923	rb->user_page = all_buf;
    924	rb->data_pages[0] = all_buf + PAGE_SIZE;
    925	if (nr_pages) {
    926		rb->nr_pages = 1;
    927		rb->page_order = ilog2(nr_pages);
    928	}
    929
    930	ring_buffer_init(rb, watermark, flags);
    931
    932	return rb;
    933
    934fail_all_buf:
    935	kfree(rb);
    936
    937fail:
    938	return NULL;
    939}
    940
    941#endif
    942
    943struct page *
    944perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
    945{
    946	if (rb->aux_nr_pages) {
    947		/* above AUX space */
    948		if (pgoff > rb->aux_pgoff + rb->aux_nr_pages)
    949			return NULL;
    950
    951		/* AUX space */
    952		if (pgoff >= rb->aux_pgoff) {
    953			int aux_pgoff = array_index_nospec(pgoff - rb->aux_pgoff, rb->aux_nr_pages);
    954			return virt_to_page(rb->aux_pages[aux_pgoff]);
    955		}
    956	}
    957
    958	return __perf_mmap_to_page(rb, pgoff);
    959}