cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

builtin-record.c (108991B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * builtin-record.c
      4 *
      5 * Builtin record command: Record the profile of a workload
      6 * (or a CPU, or a PID) into the perf.data output file - for
      7 * later analysis via perf report.
      8 */
      9#include "builtin.h"
     10
     11#include "util/build-id.h"
     12#include <subcmd/parse-options.h>
     13#include "util/parse-events.h"
     14#include "util/config.h"
     15
     16#include "util/callchain.h"
     17#include "util/cgroup.h"
     18#include "util/header.h"
     19#include "util/event.h"
     20#include "util/evlist.h"
     21#include "util/evsel.h"
     22#include "util/debug.h"
     23#include "util/mmap.h"
     24#include "util/target.h"
     25#include "util/session.h"
     26#include "util/tool.h"
     27#include "util/symbol.h"
     28#include "util/record.h"
     29#include "util/cpumap.h"
     30#include "util/thread_map.h"
     31#include "util/data.h"
     32#include "util/perf_regs.h"
     33#include "util/auxtrace.h"
     34#include "util/tsc.h"
     35#include "util/parse-branch-options.h"
     36#include "util/parse-regs-options.h"
     37#include "util/perf_api_probe.h"
     38#include "util/llvm-utils.h"
     39#include "util/bpf-loader.h"
     40#include "util/trigger.h"
     41#include "util/perf-hooks.h"
     42#include "util/cpu-set-sched.h"
     43#include "util/synthetic-events.h"
     44#include "util/time-utils.h"
     45#include "util/units.h"
     46#include "util/bpf-event.h"
     47#include "util/util.h"
     48#include "util/pfm.h"
     49#include "util/clockid.h"
     50#include "util/pmu-hybrid.h"
     51#include "util/evlist-hybrid.h"
     52#include "util/off_cpu.h"
     53#include "asm/bug.h"
     54#include "perf.h"
     55#include "cputopo.h"
     56
     57#include <errno.h>
     58#include <inttypes.h>
     59#include <locale.h>
     60#include <poll.h>
     61#include <pthread.h>
     62#include <unistd.h>
     63#ifndef HAVE_GETTID
     64#include <syscall.h>
     65#endif
     66#include <sched.h>
     67#include <signal.h>
     68#ifdef HAVE_EVENTFD_SUPPORT
     69#include <sys/eventfd.h>
     70#endif
     71#include <sys/mman.h>
     72#include <sys/wait.h>
     73#include <sys/types.h>
     74#include <sys/stat.h>
     75#include <fcntl.h>
     76#include <linux/err.h>
     77#include <linux/string.h>
     78#include <linux/time64.h>
     79#include <linux/zalloc.h>
     80#include <linux/bitmap.h>
     81#include <sys/time.h>
     82
     83struct switch_output {
     84	bool		 enabled;
     85	bool		 signal;
     86	unsigned long	 size;
     87	unsigned long	 time;
     88	const char	*str;
     89	bool		 set;
     90	char		 **filenames;
     91	int		 num_files;
     92	int		 cur_file;
     93};
     94
     95struct thread_mask {
     96	struct mmap_cpu_mask	maps;
     97	struct mmap_cpu_mask	affinity;
     98};
     99
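        /*
         * Per-thread state for parallel trace streaming: the tid, the
         * maps/affinity CPU masks, the mmaps this thread services, the
         * message/ack pipes used to coordinate with the main thread, and
         * per-thread sample/byte counters.
         */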
    100struct record_thread {
    101	pid_t			tid;
    102	struct thread_mask	*mask;
    103	struct {
    104		int		msg[2];
    105		int		ack[2];
    106	} pipes;
    107	struct fdarray		pollfd;
    108	int			ctlfd_pos;
    109	int			nr_mmaps;
    110	struct mmap		**maps;
    111	struct mmap		**overwrite_maps;
    112	struct record		*rec;
    113	unsigned long long	samples;
    114	unsigned long		waking;
    115	u64			bytes_written;
    116	u64			bytes_transferred;
    117	u64			bytes_compressed;
    118};
    119
    120static __thread struct record_thread *thread;
    121
    122enum thread_msg {
    123	THREAD_MSG__UNDEFINED = 0,
    124	THREAD_MSG__READY,
    125	THREAD_MSG__MAX,
    126};
    127
    128static const char *thread_msg_tags[THREAD_MSG__MAX] = {
    129	"UNDEFINED", "READY"
    130};
    131
    132enum thread_spec {
    133	THREAD_SPEC__UNDEFINED = 0,
    134	THREAD_SPEC__CPU,
    135	THREAD_SPEC__CORE,
    136	THREAD_SPEC__PACKAGE,
    137	THREAD_SPEC__NUMA,
    138	THREAD_SPEC__USER,
    139	THREAD_SPEC__MAX,
    140};
    141
    142static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
    143	"undefined", "cpu", "core", "package", "numa", "user"
    144};
    145
    146struct record {
    147	struct perf_tool	tool;
    148	struct record_opts	opts;
    149	u64			bytes_written;
    150	struct perf_data	data;
    151	struct auxtrace_record	*itr;
    152	struct evlist	*evlist;
    153	struct perf_session	*session;
    154	struct evlist		*sb_evlist;
    155	pthread_t		thread_id;
    156	int			realtime_prio;
    157	bool			switch_output_event_set;
    158	bool			no_buildid;
    159	bool			no_buildid_set;
    160	bool			no_buildid_cache;
    161	bool			no_buildid_cache_set;
    162	bool			buildid_all;
    163	bool			buildid_mmap;
    164	bool			timestamp_filename;
    165	bool			timestamp_boundary;
    166	bool			off_cpu;
    167	struct switch_output	switch_output;
    168	unsigned long long	samples;
    169	unsigned long		output_max_size;	/* = 0: unlimited */
    170	struct perf_debuginfod	debuginfod;
    171	int			nr_threads;
    172	struct thread_mask	*thread_masks;
    173	struct record_thread	*thread_data;
    174};
    175
    176static volatile int done;
    177
    178static volatile int auxtrace_record__snapshot_started;
    179static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
    180static DEFINE_TRIGGER(switch_output_trigger);
    181
    182static const char *affinity_tags[PERF_AFFINITY_MAX] = {
    183	"SYS", "NODE", "CPU"
    184};
    185
    186#ifndef HAVE_GETTID
    187static inline pid_t gettid(void)
    188{
    189	return (pid_t)syscall(__NR_gettid);
    190}
    191#endif
    192
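        /* Non-zero when parallel trace streaming was requested via --threads. */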
    193static int record__threads_enabled(struct record *rec)
    194{
    195	return rec->opts.threads_spec;
    196}
    197
    198static bool switch_output_signal(struct record *rec)
    199{
    200	return rec->switch_output.signal &&
    201	       trigger_is_ready(&switch_output_trigger);
    202}
    203
    204static bool switch_output_size(struct record *rec)
    205{
    206	return rec->switch_output.size &&
    207	       trigger_is_ready(&switch_output_trigger) &&
    208	       (rec->bytes_written >= rec->switch_output.size);
    209}
    210
    211static bool switch_output_time(struct record *rec)
    212{
    213	return rec->switch_output.time &&
    214	       trigger_is_ready(&switch_output_trigger);
    215}
    216
    217static u64 record__bytes_written(struct record *rec)
    218{
    219	int t;
    220	u64 bytes_written = rec->bytes_written;
    221	struct record_thread *thread_data = rec->thread_data;
    222
    223	for (t = 0; t < rec->nr_threads; t++)
    224		bytes_written += thread_data[t].bytes_written;
    225
    226	return bytes_written;
    227}
    228
    229static bool record__output_max_size_exceeded(struct record *rec)
    230{
    231	return rec->output_max_size &&
    232	       (record__bytes_written(rec) >= rec->output_max_size);
    233}
    234
    235static int record__write(struct record *rec, struct mmap *map __maybe_unused,
    236			 void *bf, size_t size)
    237{
    238	struct perf_data_file *file = &rec->session->data->file;
    239
    240	if (map && map->file)
    241		file = map->file;
    242
    243	if (perf_data_file__write(file, bf, size) < 0) {
    244		pr_err("failed to write perf data, error: %m\n");
    245		return -1;
    246	}
    247
    248	if (map && map->file)
    249		thread->bytes_written += size;
    250	else
    251		rec->bytes_written += size;
    252
    253	if (record__output_max_size_exceeded(rec) && !done) {
    254		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
    255				" stopping session ]\n",
    256				record__bytes_written(rec) >> 10);
    257		done = 1;
    258	}
    259
    260	if (switch_output_size(rec))
    261		trigger_hit(&switch_output_trigger);
    262
    263	return 0;
    264}
    265
    266static int record__aio_enabled(struct record *rec);
    267static int record__comp_enabled(struct record *rec);
    268static size_t zstd_compress(struct perf_session *session, struct mmap *map,
    269			    void *dst, size_t dst_size, void *src, size_t src_size);
    270
    271#ifdef HAVE_AIO_SUPPORT
    272static int record__aio_write(struct aiocb *cblock, int trace_fd,
    273		void *buf, size_t size, off_t off)
    274{
    275	int rc;
    276
    277	cblock->aio_fildes = trace_fd;
    278	cblock->aio_buf    = buf;
    279	cblock->aio_nbytes = size;
    280	cblock->aio_offset = off;
    281	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
    282
    283	do {
    284		rc = aio_write(cblock);
    285		if (rc == 0) {
    286			break;
    287		} else if (errno != EAGAIN) {
    288			cblock->aio_fildes = -1;
    289			pr_err("failed to queue perf data, error: %m\n");
    290			break;
    291		}
    292	} while (1);
    293
    294	return rc;
    295}
    296
    297static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
    298{
    299	void *rem_buf;
    300	off_t rem_off;
    301	size_t rem_size;
    302	int rc, aio_errno;
    303	ssize_t aio_ret, written;
    304
    305	aio_errno = aio_error(cblock);
    306	if (aio_errno == EINPROGRESS)
    307		return 0;
    308
    309	written = aio_ret = aio_return(cblock);
    310	if (aio_ret < 0) {
    311		if (aio_errno != EINTR)
    312			pr_err("failed to write perf data, error: %m\n");
    313		written = 0;
    314	}
    315
    316	rem_size = cblock->aio_nbytes - written;
    317
    318	if (rem_size == 0) {
    319		cblock->aio_fildes = -1;
    320		/*
    321		 * md->refcount is incremented in record__aio_pushfn() for
    322		 * every aio write request started in record__aio_push() so
    323		 * decrement it because the request is now complete.
    324		 */
    325		perf_mmap__put(&md->core);
    326		rc = 1;
    327	} else {
    328		/*
     329		 * The aio write request may need to be restarted with the
     330		 * remainder if the kernel didn't write the whole
     331		 * chunk at once.
    332		 */
    333		rem_off = cblock->aio_offset + written;
    334		rem_buf = (void *)(cblock->aio_buf + written);
    335		record__aio_write(cblock, cblock->aio_fildes,
    336				rem_buf, rem_size, rem_off);
    337		rc = 0;
    338	}
    339
    340	return rc;
    341}
    342
    343static int record__aio_sync(struct mmap *md, bool sync_all)
    344{
    345	struct aiocb **aiocb = md->aio.aiocb;
    346	struct aiocb *cblocks = md->aio.cblocks;
    347	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
    348	int i, do_suspend;
    349
    350	do {
    351		do_suspend = 0;
    352		for (i = 0; i < md->aio.nr_cblocks; ++i) {
    353			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
    354				if (sync_all)
    355					aiocb[i] = NULL;
    356				else
    357					return i;
    358			} else {
    359				/*
     360				 * The started aio write is not complete yet,
     361				 * so it has to be waited on before the
     362				 * next allocation.
    363				 */
    364				aiocb[i] = &cblocks[i];
    365				do_suspend = 1;
    366			}
    367		}
    368		if (!do_suspend)
    369			return -1;
    370
    371		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
    372			if (!(errno == EAGAIN || errno == EINTR))
    373				pr_err("failed to sync perf data, error: %m\n");
    374		}
    375	} while (1);
    376}
    377
    378struct record_aio {
    379	struct record	*rec;
    380	void		*data;
    381	size_t		size;
    382};
    383
    384static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
    385{
    386	struct record_aio *aio = to;
    387
    388	/*
     389	 * The map->core.base data pointed to by buf is copied into a free map->aio.data[]
     390	 * buffer to release space in the kernel buffer as fast as possible, by calling
     391	 * perf_mmap__consume() from the perf_mmap__push() function.
     392	 *
     393	 * That lets the kernel proceed with storing more profiling data into
     394	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
     395	 *
     396	 * Copying can be done in two steps in case the chunk of profiling data
     397	 * crosses the upper bound of the kernel buffer. In this case we first move
     398	 * the part of the data from map->start till the upper bound and then the remainder
     399	 * from the beginning of the kernel buffer till the end of the data chunk.
    400	 */
    401
    402	if (record__comp_enabled(aio->rec)) {
    403		size = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
    404				     mmap__mmap_len(map) - aio->size,
    405				     buf, size);
    406	} else {
    407		memcpy(aio->data + aio->size, buf, size);
    408	}
    409
    410	if (!aio->size) {
    411		/*
     412		 * Increment map->refcount to guard the map->aio.data[] buffer
     413		 * from premature deallocation, because the map object can be
     414		 * released before the aio write request started on the
     415		 * map->aio.data[] buffer is complete.
     416		 *
     417		 * perf_mmap__put() is done in record__aio_complete()
     418		 * after the started aio request completes, or in record__aio_push()
     419		 * if the request failed to start.
    420		 */
    421		perf_mmap__get(&map->core);
    422	}
    423
    424	aio->size += size;
    425
    426	return size;
    427}
    428
    429static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
    430{
    431	int ret, idx;
    432	int trace_fd = rec->session->data->file.fd;
    433	struct record_aio aio = { .rec = rec, .size = 0 };
    434
    435	/*
     436	 * Call record__aio_sync() to wait until a map->aio.data[] buffer
     437	 * becomes available after the previous aio write operation.
    438	 */
    439
    440	idx = record__aio_sync(map, false);
    441	aio.data = map->aio.data[idx];
    442	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
    443	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
    444		return ret;
    445
    446	rec->samples++;
    447	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
    448	if (!ret) {
    449		*off += aio.size;
    450		rec->bytes_written += aio.size;
    451		if (switch_output_size(rec))
    452			trigger_hit(&switch_output_trigger);
    453	} else {
    454		/*
     455		 * Drop the map->refcount taken in record__aio_pushfn()
     456		 * if the record__aio_write() operation failed to start; otherwise
     457		 * map->refcount is decremented in record__aio_complete() after
     458		 * the aio write operation finishes successfully.
    459		 */
    460		perf_mmap__put(&map->core);
    461	}
    462
    463	return ret;
    464}
    465
    466static off_t record__aio_get_pos(int trace_fd)
    467{
    468	return lseek(trace_fd, 0, SEEK_CUR);
    469}
    470
    471static void record__aio_set_pos(int trace_fd, off_t pos)
    472{
    473	lseek(trace_fd, pos, SEEK_SET);
    474}
    475
    476static void record__aio_mmap_read_sync(struct record *rec)
    477{
    478	int i;
    479	struct evlist *evlist = rec->evlist;
    480	struct mmap *maps = evlist->mmap;
    481
    482	if (!record__aio_enabled(rec))
    483		return;
    484
    485	for (i = 0; i < evlist->core.nr_mmaps; i++) {
    486		struct mmap *map = &maps[i];
    487
    488		if (map->core.base)
    489			record__aio_sync(map, true);
    490	}
    491}
    492
    493static int nr_cblocks_default = 1;
    494static int nr_cblocks_max = 4;
    495
    496static int record__aio_parse(const struct option *opt,
    497			     const char *str,
    498			     int unset)
    499{
    500	struct record_opts *opts = (struct record_opts *)opt->value;
    501
    502	if (unset) {
    503		opts->nr_cblocks = 0;
    504	} else {
    505		if (str)
    506			opts->nr_cblocks = strtol(str, NULL, 0);
    507		if (!opts->nr_cblocks)
    508			opts->nr_cblocks = nr_cblocks_default;
    509	}
    510
    511	return 0;
    512}
    513#else /* HAVE_AIO_SUPPORT */
    514static int nr_cblocks_max = 0;
    515
    516static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
    517			    off_t *off __maybe_unused)
    518{
    519	return -1;
    520}
    521
    522static off_t record__aio_get_pos(int trace_fd __maybe_unused)
    523{
    524	return -1;
    525}
    526
    527static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
    528{
    529}
    530
    531static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
    532{
    533}
    534#endif
    535
    536static int record__aio_enabled(struct record *rec)
    537{
    538	return rec->opts.nr_cblocks > 0;
    539}
    540
    541#define MMAP_FLUSH_DEFAULT 1
    542static int record__mmap_flush_parse(const struct option *opt,
    543				    const char *str,
    544				    int unset)
    545{
    546	int flush_max;
    547	struct record_opts *opts = (struct record_opts *)opt->value;
    548	static struct parse_tag tags[] = {
    549			{ .tag  = 'B', .mult = 1       },
    550			{ .tag  = 'K', .mult = 1 << 10 },
    551			{ .tag  = 'M', .mult = 1 << 20 },
    552			{ .tag  = 'G', .mult = 1 << 30 },
    553			{ .tag  = 0 },
    554	};
    555
    556	if (unset)
    557		return 0;
    558
    559	if (str) {
    560		opts->mmap_flush = parse_tag_value(str, tags);
    561		if (opts->mmap_flush == (int)-1)
    562			opts->mmap_flush = strtol(str, NULL, 0);
    563	}
    564
    565	if (!opts->mmap_flush)
    566		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
    567
    568	flush_max = evlist__mmap_size(opts->mmap_pages);
    569	flush_max /= 4;
    570	if (opts->mmap_flush > flush_max)
    571		opts->mmap_flush = flush_max;
    572
    573	return 0;
    574}
    575
    576#ifdef HAVE_ZSTD_SUPPORT
    577static unsigned int comp_level_default = 1;
    578
    579static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
    580{
    581	struct record_opts *opts = opt->value;
    582
    583	if (unset) {
    584		opts->comp_level = 0;
    585	} else {
    586		if (str)
    587			opts->comp_level = strtol(str, NULL, 0);
    588		if (!opts->comp_level)
    589			opts->comp_level = comp_level_default;
    590	}
    591
    592	return 0;
    593}
    594#endif
    595static unsigned int comp_level_max = 22;
    596
    597static int record__comp_enabled(struct record *rec)
    598{
    599	return rec->opts.comp_level > 0;
    600}
    601
    602static int process_synthesized_event(struct perf_tool *tool,
    603				     union perf_event *event,
    604				     struct perf_sample *sample __maybe_unused,
    605				     struct machine *machine __maybe_unused)
    606{
    607	struct record *rec = container_of(tool, struct record, tool);
    608	return record__write(rec, NULL, event, event->header.size);
    609}
    610
    611static int process_locked_synthesized_event(struct perf_tool *tool,
    612				     union perf_event *event,
    613				     struct perf_sample *sample __maybe_unused,
    614				     struct machine *machine __maybe_unused)
    615{
    616	static pthread_mutex_t synth_lock = PTHREAD_MUTEX_INITIALIZER;
    617	int ret;
    618
    619	pthread_mutex_lock(&synth_lock);
    620	ret = process_synthesized_event(tool, event, sample, machine);
    621	pthread_mutex_unlock(&synth_lock);
    622	return ret;
    623}
    624
    625static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
    626{
    627	struct record *rec = to;
    628
    629	if (record__comp_enabled(rec)) {
    630		size = zstd_compress(rec->session, map, map->data, mmap__mmap_len(map), bf, size);
    631		bf   = map->data;
    632	}
    633
    634	thread->samples++;
    635	return record__write(rec, map, bf, size);
    636}
    637
    638static volatile int signr = -1;
    639static volatile int child_finished;
    640#ifdef HAVE_EVENTFD_SUPPORT
    641static int done_fd = -1;
    642#endif
    643
    644static void sig_handler(int sig)
    645{
    646	if (sig == SIGCHLD)
    647		child_finished = 1;
    648	else
    649		signr = sig;
    650
    651	done = 1;
    652#ifdef HAVE_EVENTFD_SUPPORT
    653{
    654	u64 tmp = 1;
    655	/*
    656	 * It is possible for this signal handler to run after done is checked
    657	 * in the main loop, but before the perf counter fds are polled. If this
    658	 * happens, the poll() will continue to wait even though done is set,
    659	 * and will only break out if either another signal is received, or the
    660	 * counters are ready for read. To ensure the poll() doesn't sleep when
    661	 * done is set, use an eventfd (done_fd) to wake up the poll().
    662	 */
    663	if (write(done_fd, &tmp, sizeof(tmp)) < 0)
    664		pr_err("failed to signal wakeup fd, error: %m\n");
    665}
    666#endif // HAVE_EVENTFD_SUPPORT
    667}
    668
    669static void sigsegv_handler(int sig)
    670{
    671	perf_hooks__recover();
    672	sighandler_dump_stack(sig);
    673}
    674
    675static void record__sig_exit(void)
    676{
    677	if (signr == -1)
    678		return;
    679
    680	signal(signr, SIG_DFL);
    681	raise(signr);
    682}
    683
    684#ifdef HAVE_AUXTRACE_SUPPORT
    685
    686static int record__process_auxtrace(struct perf_tool *tool,
    687				    struct mmap *map,
    688				    union perf_event *event, void *data1,
    689				    size_t len1, void *data2, size_t len2)
    690{
    691	struct record *rec = container_of(tool, struct record, tool);
    692	struct perf_data *data = &rec->data;
    693	size_t padding;
    694	u8 pad[8] = {0};
    695
    696	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
    697		off_t file_offset;
    698		int fd = perf_data__fd(data);
    699		int err;
    700
    701		file_offset = lseek(fd, 0, SEEK_CUR);
    702		if (file_offset == -1)
    703			return -1;
    704		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
    705						     event, file_offset);
    706		if (err)
    707			return err;
    708	}
    709
    710	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
    711	padding = (len1 + len2) & 7;
    712	if (padding)
    713		padding = 8 - padding;
    714
    715	record__write(rec, map, event, event->header.size);
    716	record__write(rec, map, data1, len1);
    717	if (len2)
    718		record__write(rec, map, data2, len2);
    719	record__write(rec, map, &pad, padding);
    720
    721	return 0;
    722}
    723
    724static int record__auxtrace_mmap_read(struct record *rec,
    725				      struct mmap *map)
    726{
    727	int ret;
    728
    729	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
    730				  record__process_auxtrace);
    731	if (ret < 0)
    732		return ret;
    733
    734	if (ret)
    735		rec->samples++;
    736
    737	return 0;
    738}
    739
    740static int record__auxtrace_mmap_read_snapshot(struct record *rec,
    741					       struct mmap *map)
    742{
    743	int ret;
    744
    745	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
    746					   record__process_auxtrace,
    747					   rec->opts.auxtrace_snapshot_size);
    748	if (ret < 0)
    749		return ret;
    750
    751	if (ret)
    752		rec->samples++;
    753
    754	return 0;
    755}
    756
    757static int record__auxtrace_read_snapshot_all(struct record *rec)
    758{
    759	int i;
    760	int rc = 0;
    761
    762	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
    763		struct mmap *map = &rec->evlist->mmap[i];
    764
    765		if (!map->auxtrace_mmap.base)
    766			continue;
    767
    768		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
    769			rc = -1;
    770			goto out;
    771		}
    772	}
    773out:
    774	return rc;
    775}
    776
    777static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
    778{
    779	pr_debug("Recording AUX area tracing snapshot\n");
    780	if (record__auxtrace_read_snapshot_all(rec) < 0) {
    781		trigger_error(&auxtrace_snapshot_trigger);
    782	} else {
    783		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
    784			trigger_error(&auxtrace_snapshot_trigger);
    785		else
    786			trigger_ready(&auxtrace_snapshot_trigger);
    787	}
    788}
    789
    790static int record__auxtrace_snapshot_exit(struct record *rec)
    791{
    792	if (trigger_is_error(&auxtrace_snapshot_trigger))
    793		return 0;
    794
    795	if (!auxtrace_record__snapshot_started &&
    796	    auxtrace_record__snapshot_start(rec->itr))
    797		return -1;
    798
    799	record__read_auxtrace_snapshot(rec, true);
    800	if (trigger_is_error(&auxtrace_snapshot_trigger))
    801		return -1;
    802
    803	return 0;
    804}
    805
    806static int record__auxtrace_init(struct record *rec)
    807{
    808	int err;
    809
    810	if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
    811	    && record__threads_enabled(rec)) {
    812		pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
    813		return -EINVAL;
    814	}
    815
    816	if (!rec->itr) {
    817		rec->itr = auxtrace_record__init(rec->evlist, &err);
    818		if (err)
    819			return err;
    820	}
    821
    822	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
    823					      rec->opts.auxtrace_snapshot_opts);
    824	if (err)
    825		return err;
    826
    827	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
    828					    rec->opts.auxtrace_sample_opts);
    829	if (err)
    830		return err;
    831
    832	auxtrace_regroup_aux_output(rec->evlist);
    833
    834	return auxtrace_parse_filters(rec->evlist);
    835}
    836
    837#else
    838
    839static inline
    840int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
    841			       struct mmap *map __maybe_unused)
    842{
    843	return 0;
    844}
    845
    846static inline
    847void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
    848				    bool on_exit __maybe_unused)
    849{
    850}
    851
    852static inline
    853int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
    854{
    855	return 0;
    856}
    857
    858static inline
    859int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
    860{
    861	return 0;
    862}
    863
    864static int record__auxtrace_init(struct record *rec __maybe_unused)
    865{
    866	return 0;
    867}
    868
    869#endif
    870
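        /*
         * Make sure the session records kernel text modifications: if no
         * event has attr.text_poke set yet, add an all-CPU dummy event with
         * text_poke and ksymbol enabled.
         */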
    871static int record__config_text_poke(struct evlist *evlist)
    872{
    873	struct evsel *evsel;
    874
    875	/* Nothing to do if text poke is already configured */
    876	evlist__for_each_entry(evlist, evsel) {
    877		if (evsel->core.attr.text_poke)
    878			return 0;
    879	}
    880
    881	evsel = evlist__add_dummy_on_all_cpus(evlist);
    882	if (!evsel)
    883		return -ENOMEM;
    884
    885	evsel->core.attr.text_poke = 1;
    886	evsel->core.attr.ksymbol = 1;
    887	evsel->immediate = true;
    888	evsel__set_sample_bit(evsel, TIME);
    889
    890	return 0;
    891}
    892
    893static int record__config_off_cpu(struct record *rec)
    894{
    895	return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
    896}
    897
    898static bool record__kcore_readable(struct machine *machine)
    899{
    900	char kcore[PATH_MAX];
    901	int fd;
    902
    903	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
    904
    905	fd = open(kcore, O_RDONLY);
    906	if (fd < 0)
    907		return false;
    908
    909	close(fd);
    910
    911	return true;
    912}
    913
    914static int record__kcore_copy(struct machine *machine, struct perf_data *data)
    915{
    916	char from_dir[PATH_MAX];
    917	char kcore_dir[PATH_MAX];
    918	int ret;
    919
    920	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
    921
    922	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
    923	if (ret)
    924		return ret;
    925
    926	return kcore_copy(from_dir, kcore_dir);
    927}
    928
    929static void record__thread_data_init_pipes(struct record_thread *thread_data)
    930{
    931	thread_data->pipes.msg[0] = -1;
    932	thread_data->pipes.msg[1] = -1;
    933	thread_data->pipes.ack[0] = -1;
    934	thread_data->pipes.ack[1] = -1;
    935}
    936
    937static int record__thread_data_open_pipes(struct record_thread *thread_data)
    938{
    939	if (pipe(thread_data->pipes.msg))
    940		return -EINVAL;
    941
    942	if (pipe(thread_data->pipes.ack)) {
    943		close(thread_data->pipes.msg[0]);
    944		thread_data->pipes.msg[0] = -1;
    945		close(thread_data->pipes.msg[1]);
    946		thread_data->pipes.msg[1] = -1;
    947		return -EINVAL;
    948	}
    949
    950	pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
    951		 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
    952		 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
    953
    954	return 0;
    955}
    956
    957static void record__thread_data_close_pipes(struct record_thread *thread_data)
    958{
    959	if (thread_data->pipes.msg[0] != -1) {
    960		close(thread_data->pipes.msg[0]);
    961		thread_data->pipes.msg[0] = -1;
    962	}
    963	if (thread_data->pipes.msg[1] != -1) {
    964		close(thread_data->pipes.msg[1]);
    965		thread_data->pipes.msg[1] = -1;
    966	}
    967	if (thread_data->pipes.ack[0] != -1) {
    968		close(thread_data->pipes.ack[0]);
    969		thread_data->pipes.ack[0] = -1;
    970	}
    971	if (thread_data->pipes.ack[1] != -1) {
    972		close(thread_data->pipes.ack[1]);
    973		thread_data->pipes.ack[1] = -1;
    974	}
    975}
    976
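        /* An evlist bound to the dummy CPU map records per-thread rather than per-CPU. */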
    977static bool evlist__per_thread(struct evlist *evlist)
    978{
    979	return cpu_map__is_dummy(evlist->core.user_requested_cpus);
    980}
    981
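        /*
         * Distribute the evlist's mmaps among the streaming threads: in
         * per-thread mode the thread gets all maps, otherwise it gets the
         * maps whose CPUs are set in its ->mask->maps bitmap.
         */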
    982static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
    983{
    984	int m, tm, nr_mmaps = evlist->core.nr_mmaps;
    985	struct mmap *mmap = evlist->mmap;
    986	struct mmap *overwrite_mmap = evlist->overwrite_mmap;
    987	struct perf_cpu_map *cpus = evlist->core.all_cpus;
    988	bool per_thread = evlist__per_thread(evlist);
    989
    990	if (per_thread)
    991		thread_data->nr_mmaps = nr_mmaps;
    992	else
    993		thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
    994						      thread_data->mask->maps.nbits);
    995	if (mmap) {
    996		thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
    997		if (!thread_data->maps)
    998			return -ENOMEM;
    999	}
   1000	if (overwrite_mmap) {
   1001		thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
   1002		if (!thread_data->overwrite_maps) {
   1003			zfree(&thread_data->maps);
   1004			return -ENOMEM;
   1005		}
   1006	}
   1007	pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
   1008		 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
   1009
   1010	for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
   1011		if (per_thread ||
   1012		    test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
   1013			if (thread_data->maps) {
   1014				thread_data->maps[tm] = &mmap[m];
   1015				pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
   1016					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
   1017			}
   1018			if (thread_data->overwrite_maps) {
   1019				thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
   1020				pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
   1021					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
   1022			}
   1023			tm++;
   1024		}
   1025	}
   1026
   1027	return 0;
   1028}
   1029
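        /*
         * Duplicate into the thread's private pollfd array the evlist pollfd
         * entries that belong to this thread's mmaps, so each thread only
         * polls the maps it services.
         */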
   1030static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
   1031{
   1032	int f, tm, pos;
   1033	struct mmap *map, *overwrite_map;
   1034
   1035	fdarray__init(&thread_data->pollfd, 64);
   1036
   1037	for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
   1038		map = thread_data->maps ? thread_data->maps[tm] : NULL;
   1039		overwrite_map = thread_data->overwrite_maps ?
   1040				thread_data->overwrite_maps[tm] : NULL;
   1041
   1042		for (f = 0; f < evlist->core.pollfd.nr; f++) {
   1043			void *ptr = evlist->core.pollfd.priv[f].ptr;
   1044
   1045			if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
   1046				pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
   1047							      &evlist->core.pollfd);
   1048				if (pos < 0)
   1049					return pos;
   1050				pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
   1051					 thread_data, pos, evlist->core.pollfd.entries[f].fd);
   1052			}
   1053		}
   1054	}
   1055
   1056	return 0;
   1057}
   1058
   1059static void record__free_thread_data(struct record *rec)
   1060{
   1061	int t;
   1062	struct record_thread *thread_data = rec->thread_data;
   1063
   1064	if (thread_data == NULL)
   1065		return;
   1066
   1067	for (t = 0; t < rec->nr_threads; t++) {
   1068		record__thread_data_close_pipes(&thread_data[t]);
   1069		zfree(&thread_data[t].maps);
   1070		zfree(&thread_data[t].overwrite_maps);
   1071		fdarray__exit(&thread_data[t].pollfd);
   1072	}
   1073
   1074	zfree(&rec->thread_data);
   1075}
   1076
   1077static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
   1078{
   1079	int t, ret;
   1080	struct record_thread *thread_data;
   1081
   1082	rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
   1083	if (!rec->thread_data) {
   1084		pr_err("Failed to allocate thread data\n");
   1085		return -ENOMEM;
   1086	}
   1087	thread_data = rec->thread_data;
   1088
   1089	for (t = 0; t < rec->nr_threads; t++)
   1090		record__thread_data_init_pipes(&thread_data[t]);
   1091
   1092	for (t = 0; t < rec->nr_threads; t++) {
   1093		thread_data[t].rec = rec;
   1094		thread_data[t].mask = &rec->thread_masks[t];
   1095		ret = record__thread_data_init_maps(&thread_data[t], evlist);
   1096		if (ret) {
   1097			pr_err("Failed to initialize thread[%d] maps\n", t);
   1098			goto out_free;
   1099		}
   1100		ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
   1101		if (ret) {
   1102			pr_err("Failed to initialize thread[%d] pollfd\n", t);
   1103			goto out_free;
   1104		}
   1105		if (t) {
   1106			thread_data[t].tid = -1;
   1107			ret = record__thread_data_open_pipes(&thread_data[t]);
   1108			if (ret) {
   1109				pr_err("Failed to open thread[%d] communication pipes\n", t);
   1110				goto out_free;
   1111			}
   1112			ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
   1113					   POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
   1114			if (ret < 0) {
   1115				pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
   1116				goto out_free;
   1117			}
   1118			thread_data[t].ctlfd_pos = ret;
   1119			pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
   1120				 thread_data, thread_data[t].ctlfd_pos,
   1121				 thread_data[t].pipes.msg[0]);
   1122		} else {
   1123			thread_data[t].tid = gettid();
   1124			if (evlist->ctl_fd.pos == -1)
   1125				continue;
   1126			ret = fdarray__dup_entry_from(&thread_data[t].pollfd, evlist->ctl_fd.pos,
   1127						      &evlist->core.pollfd);
   1128			if (ret < 0) {
   1129				pr_err("Failed to duplicate descriptor in main thread pollfd\n");
   1130				goto out_free;
   1131			}
   1132			thread_data[t].ctlfd_pos = ret;
   1133			pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
   1134				 thread_data, thread_data[t].ctlfd_pos,
   1135				 evlist->core.pollfd.entries[evlist->ctl_fd.pos].fd);
   1136		}
   1137	}
   1138
   1139	return 0;
   1140
   1141out_free:
   1142	record__free_thread_data(rec);
   1143
   1144	return ret;
   1145}
   1146
   1147static int record__mmap_evlist(struct record *rec,
   1148			       struct evlist *evlist)
   1149{
   1150	int i, ret;
   1151	struct record_opts *opts = &rec->opts;
   1152	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
   1153				  opts->auxtrace_sample_mode;
   1154	char msg[512];
   1155
   1156	if (opts->affinity != PERF_AFFINITY_SYS)
   1157		cpu__setup_cpunode_map();
   1158
   1159	if (evlist__mmap_ex(evlist, opts->mmap_pages,
   1160				 opts->auxtrace_mmap_pages,
   1161				 auxtrace_overwrite,
   1162				 opts->nr_cblocks, opts->affinity,
   1163				 opts->mmap_flush, opts->comp_level) < 0) {
   1164		if (errno == EPERM) {
   1165			pr_err("Permission error mapping pages.\n"
   1166			       "Consider increasing "
   1167			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
   1168			       "or try again with a smaller value of -m/--mmap_pages.\n"
   1169			       "(current value: %u,%u)\n",
   1170			       opts->mmap_pages, opts->auxtrace_mmap_pages);
   1171			return -errno;
   1172		} else {
   1173			pr_err("failed to mmap with %d (%s)\n", errno,
   1174				str_error_r(errno, msg, sizeof(msg)));
   1175			if (errno)
   1176				return -errno;
   1177			else
   1178				return -EINVAL;
   1179		}
   1180	}
   1181
   1182	if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
   1183		return -1;
   1184
   1185	ret = record__alloc_thread_data(rec, evlist);
   1186	if (ret)
   1187		return ret;
   1188
   1189	if (record__threads_enabled(rec)) {
   1190		ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
   1191		if (ret) {
   1192			pr_err("Failed to create data directory: %s\n", strerror(-ret));
   1193			return ret;
   1194		}
   1195		for (i = 0; i < evlist->core.nr_mmaps; i++) {
   1196			if (evlist->mmap)
   1197				evlist->mmap[i].file = &rec->data.dir.files[i];
   1198			if (evlist->overwrite_mmap)
   1199				evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
   1200		}
   1201	}
   1202
   1203	return 0;
   1204}
   1205
   1206static int record__mmap(struct record *rec)
   1207{
   1208	return record__mmap_evlist(rec, rec->evlist);
   1209}
   1210
   1211static int record__open(struct record *rec)
   1212{
   1213	char msg[BUFSIZ];
   1214	struct evsel *pos;
   1215	struct evlist *evlist = rec->evlist;
   1216	struct perf_session *session = rec->session;
   1217	struct record_opts *opts = &rec->opts;
   1218	int rc = 0;
   1219
   1220	/*
    1221	 * For initial_delay, a system-wide target or a hybrid system, we need to add a
   1222	 * dummy event so that we can track PERF_RECORD_MMAP to cover the delay
   1223	 * of waiting or event synthesis.
   1224	 */
   1225	if (opts->initial_delay || target__has_cpu(&opts->target) ||
   1226	    perf_pmu__has_hybrid()) {
   1227		pos = evlist__get_tracking_event(evlist);
   1228		if (!evsel__is_dummy_event(pos)) {
   1229			/* Set up dummy event. */
   1230			if (evlist__add_dummy(evlist))
   1231				return -ENOMEM;
   1232			pos = evlist__last(evlist);
   1233			evlist__set_tracking_event(evlist, pos);
   1234		}
   1235
   1236		/*
   1237		 * Enable the dummy event when the process is forked for
    1238		 * initial_delay, or immediately for a system-wide target.
   1239		 */
   1240		if (opts->initial_delay && !pos->immediate &&
   1241		    !target__has_cpu(&opts->target))
   1242			pos->core.attr.enable_on_exec = 1;
   1243		else
   1244			pos->immediate = 1;
   1245	}
   1246
   1247	evlist__config(evlist, opts, &callchain_param);
   1248
   1249	evlist__for_each_entry(evlist, pos) {
   1250try_again:
   1251		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
   1252			if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
   1253				if (verbose > 0)
   1254					ui__warning("%s\n", msg);
   1255				goto try_again;
   1256			}
   1257			if ((errno == EINVAL || errno == EBADF) &&
   1258			    pos->core.leader != &pos->core &&
   1259			    pos->weak_group) {
    1260				pos = evlist__reset_weak_group(evlist, pos, true);
   1261				goto try_again;
   1262			}
   1263			rc = -errno;
   1264			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
   1265			ui__error("%s\n", msg);
   1266			goto out;
   1267		}
   1268
   1269		pos->supported = true;
   1270	}
   1271
   1272	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
   1273		pr_warning(
   1274"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
   1275"check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
   1276"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
   1277"file is not found in the buildid cache or in the vmlinux path.\n\n"
   1278"Samples in kernel modules won't be resolved at all.\n\n"
   1279"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
   1280"even with a suitable vmlinux or kallsyms file.\n\n");
   1281	}
   1282
   1283	if (evlist__apply_filters(evlist, &pos)) {
   1284		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
   1285			pos->filter, evsel__name(pos), errno,
   1286			str_error_r(errno, msg, sizeof(msg)));
   1287		rc = -1;
   1288		goto out;
   1289	}
   1290
   1291	rc = record__mmap(rec);
   1292	if (rc)
   1293		goto out;
   1294
   1295	session->evlist = evlist;
   1296	perf_session__set_id_hdr_size(session);
   1297out:
   1298	return rc;
   1299}
   1300
   1301static void set_timestamp_boundary(struct record *rec, u64 sample_time)
   1302{
   1303	if (rec->evlist->first_sample_time == 0)
   1304		rec->evlist->first_sample_time = sample_time;
   1305
   1306	if (sample_time)
   1307		rec->evlist->last_sample_time = sample_time;
   1308}
   1309
   1310static int process_sample_event(struct perf_tool *tool,
   1311				union perf_event *event,
   1312				struct perf_sample *sample,
   1313				struct evsel *evsel,
   1314				struct machine *machine)
   1315{
   1316	struct record *rec = container_of(tool, struct record, tool);
   1317
   1318	set_timestamp_boundary(rec, sample->time);
   1319
   1320	if (rec->buildid_all)
   1321		return 0;
   1322
   1323	rec->samples++;
   1324	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
   1325}
   1326
   1327static int process_buildids(struct record *rec)
   1328{
   1329	struct perf_session *session = rec->session;
   1330
   1331	if (perf_data__size(&rec->data) == 0)
   1332		return 0;
   1333
   1334	/*
    1335	 * During this process, it'll load the kernel map and replace
    1336	 * dso->long_name with the real pathname it found.  In this case
   1337	 * we prefer the vmlinux path like
   1338	 *   /lib/modules/3.16.4/build/vmlinux
   1339	 *
   1340	 * rather than build-id path (in debug directory).
   1341	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
   1342	 */
   1343	symbol_conf.ignore_vmlinux_buildid = true;
   1344
   1345	/*
    1346	 * If --buildid-all is given, it marks all DSOs regardless of hits,
    1347	 * so there is no need to process samples. But if timestamp_boundary is
    1348	 * enabled, it still needs to walk all samples to get the timestamps of
    1349	 * the first/last samples.
   1350	 */
   1351	if (rec->buildid_all && !rec->timestamp_boundary)
   1352		rec->tool.sample = NULL;
   1353
   1354	return perf_session__process_events(session);
   1355}
   1356
   1357static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
   1358{
   1359	int err;
   1360	struct perf_tool *tool = data;
   1361	/*
    1362	 * For the guest kernel, when processing the record and report
    1363	 * subcommands, we arrange the module mmaps prior to the guest kernel
    1364	 * mmap and trigger a dso preload, because by default guest module
    1365	 * symbols are loaded from guest kallsyms instead of
    1366	 * /lib/modules/XXX/XXX. This avoids missing symbols when the first
    1367	 * address is in a module instead of in the guest kernel.
   1368	 */
   1369	err = perf_event__synthesize_modules(tool, process_synthesized_event,
   1370					     machine);
   1371	if (err < 0)
   1372		pr_err("Couldn't record guest kernel [%d]'s reference"
   1373		       " relocation symbol.\n", machine->pid);
   1374
   1375	/*
    1376	 * We use _stext for the guest kernel because the guest kernel's
    1377	 * /proc/kallsyms sometimes has no _text.
   1378	 */
   1379	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
   1380						 machine);
   1381	if (err < 0)
   1382		pr_err("Couldn't record guest kernel [%d]'s reference"
   1383		       " relocation symbol.\n", machine->pid);
   1384}
   1385
   1386static struct perf_event_header finished_round_event = {
   1387	.size = sizeof(struct perf_event_header),
   1388	.type = PERF_RECORD_FINISHED_ROUND,
   1389};
   1390
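        /*
         * For an mmap affinity mode other than the default (PERF_AFFINITY_SYS),
         * migrate the reading thread onto the CPUs recorded in the map's
         * affinity mask before flushing it, so ring-buffer accesses stay
         * local to the buffer's CPU or NUMA node.
         */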
   1391static void record__adjust_affinity(struct record *rec, struct mmap *map)
   1392{
   1393	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
   1394	    !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
   1395			  thread->mask->affinity.nbits)) {
   1396		bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
   1397		bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
   1398			  map->affinity_mask.bits, thread->mask->affinity.nbits);
   1399		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
   1400					(cpu_set_t *)thread->mask->affinity.bits);
   1401		if (verbose == 2) {
   1402			pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
   1403			mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
   1404		}
   1405	}
   1406}
   1407
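        /*
         * Layout callback for zstd_compress_stream_to_records(): the first
         * call (increment == 0) initializes a PERF_RECORD_COMPRESSED header,
         * later calls grow header.size by the amount of data appended.
         */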
   1408static size_t process_comp_header(void *record, size_t increment)
   1409{
   1410	struct perf_record_compressed *event = record;
   1411	size_t size = sizeof(*event);
   1412
   1413	if (increment) {
   1414		event->header.size += increment;
   1415		return increment;
   1416	}
   1417
   1418	event->header.type = PERF_RECORD_COMPRESSED;
   1419	event->header.size = size;
   1420
   1421	return size;
   1422}
   1423
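        /*
         * Compress src into dst as PERF_RECORD_COMPRESSED record(s). In
         * directory (threaded) mode the per-map zstd state and per-thread
         * byte counters are used, otherwise the session-wide ones.
         */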
   1424static size_t zstd_compress(struct perf_session *session, struct mmap *map,
   1425			    void *dst, size_t dst_size, void *src, size_t src_size)
   1426{
   1427	size_t compressed;
   1428	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
   1429	struct zstd_data *zstd_data = &session->zstd_data;
   1430
   1431	if (map && map->file)
   1432		zstd_data = &map->zstd_data;
   1433
   1434	compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
   1435						     max_record_size, process_comp_header);
   1436
   1437	if (map && map->file) {
   1438		thread->bytes_transferred += src_size;
   1439		thread->bytes_compressed  += compressed;
   1440	} else {
   1441		session->bytes_transferred += src_size;
   1442		session->bytes_compressed  += compressed;
   1443	}
   1444
   1445	return compressed;
   1446}
   1447
   1448static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
   1449				    bool overwrite, bool synch)
   1450{
   1451	u64 bytes_written = rec->bytes_written;
   1452	int i;
   1453	int rc = 0;
   1454	int nr_mmaps;
   1455	struct mmap **maps;
   1456	int trace_fd = rec->data.file.fd;
   1457	off_t off = 0;
   1458
   1459	if (!evlist)
   1460		return 0;
   1461
   1462	nr_mmaps = thread->nr_mmaps;
   1463	maps = overwrite ? thread->overwrite_maps : thread->maps;
   1464
   1465	if (!maps)
   1466		return 0;
   1467
   1468	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
   1469		return 0;
   1470
   1471	if (record__aio_enabled(rec))
   1472		off = record__aio_get_pos(trace_fd);
   1473
   1474	for (i = 0; i < nr_mmaps; i++) {
   1475		u64 flush = 0;
   1476		struct mmap *map = maps[i];
   1477
   1478		if (map->core.base) {
   1479			record__adjust_affinity(rec, map);
   1480			if (synch) {
   1481				flush = map->core.flush;
   1482				map->core.flush = 1;
   1483			}
   1484			if (!record__aio_enabled(rec)) {
   1485				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
   1486					if (synch)
   1487						map->core.flush = flush;
   1488					rc = -1;
   1489					goto out;
   1490				}
   1491			} else {
   1492				if (record__aio_push(rec, map, &off) < 0) {
   1493					record__aio_set_pos(trace_fd, off);
   1494					if (synch)
   1495						map->core.flush = flush;
   1496					rc = -1;
   1497					goto out;
   1498				}
   1499			}
   1500			if (synch)
   1501				map->core.flush = flush;
   1502		}
   1503
   1504		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
   1505		    !rec->opts.auxtrace_sample_mode &&
   1506		    record__auxtrace_mmap_read(rec, map) != 0) {
   1507			rc = -1;
   1508			goto out;
   1509		}
   1510	}
   1511
   1512	if (record__aio_enabled(rec))
   1513		record__aio_set_pos(trace_fd, off);
   1514
   1515	/*
   1516	 * Mark the round finished in case we wrote
   1517	 * at least one event.
   1518	 *
   1519	 * No need for round events in directory mode,
   1520	 * because per-cpu maps and files have data
    1521	 * sorted by the kernel.
   1522	 */
   1523	if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
   1524		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
   1525
   1526	if (overwrite)
   1527		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
   1528out:
   1529	return rc;
   1530}
   1531
   1532static int record__mmap_read_all(struct record *rec, bool synch)
   1533{
   1534	int err;
   1535
   1536	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
   1537	if (err)
   1538		return err;
   1539
   1540	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
   1541}
   1542
   1543static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
   1544					   void *arg __maybe_unused)
   1545{
   1546	struct perf_mmap *map = fda->priv[fd].ptr;
   1547
   1548	if (map)
   1549		perf_mmap__put(map);
   1550}
   1551
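        /*
         * Body of each parallel streaming thread: report readiness over the
         * ack pipe, then alternate between flushing this thread's mmaps and
         * polling its fds, until the read end of the message pipe reports
         * POLLHUP, which requests termination.
         */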
   1552static void *record__thread(void *arg)
   1553{
   1554	enum thread_msg msg = THREAD_MSG__READY;
   1555	bool terminate = false;
   1556	struct fdarray *pollfd;
   1557	int err, ctlfd_pos;
   1558
   1559	thread = arg;
   1560	thread->tid = gettid();
   1561
   1562	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
   1563	if (err == -1)
   1564		pr_warning("threads[%d]: failed to notify on start: %s\n",
   1565			   thread->tid, strerror(errno));
   1566
   1567	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
   1568
   1569	pollfd = &thread->pollfd;
   1570	ctlfd_pos = thread->ctlfd_pos;
   1571
   1572	for (;;) {
   1573		unsigned long long hits = thread->samples;
   1574
   1575		if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
   1576			break;
   1577
   1578		if (hits == thread->samples) {
   1579
   1580			err = fdarray__poll(pollfd, -1);
   1581			/*
    1582			 * Propagate the error only if there is one. Ignore a positive
    1583			 * number of returned events and interrupt errors.
   1584			 */
   1585			if (err > 0 || (err < 0 && errno == EINTR))
   1586				err = 0;
   1587			thread->waking++;
   1588
   1589			if (fdarray__filter(pollfd, POLLERR | POLLHUP,
   1590					    record__thread_munmap_filtered, NULL) == 0)
   1591				break;
   1592		}
   1593
   1594		if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
   1595			terminate = true;
   1596			close(thread->pipes.msg[0]);
   1597			thread->pipes.msg[0] = -1;
   1598			pollfd->entries[ctlfd_pos].fd = -1;
   1599			pollfd->entries[ctlfd_pos].events = 0;
   1600		}
   1601
   1602		pollfd->entries[ctlfd_pos].revents = 0;
   1603	}
   1604	record__mmap_read_all(thread->rec, true);
   1605
   1606	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
   1607	if (err == -1)
   1608		pr_warning("threads[%d]: failed to notify on termination: %s\n",
   1609			   thread->tid, strerror(errno));
   1610
   1611	return NULL;
   1612}
   1613
   1614static void record__init_features(struct record *rec)
   1615{
   1616	struct perf_session *session = rec->session;
   1617	int feat;
   1618
   1619	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
   1620		perf_header__set_feat(&session->header, feat);
   1621
   1622	if (rec->no_buildid)
   1623		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
   1624
   1625	if (!have_tracepoints(&rec->evlist->core.entries))
   1626		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
   1627
   1628	if (!rec->opts.branch_stack)
   1629		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
   1630
   1631	if (!rec->opts.full_auxtrace)
   1632		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
   1633
   1634	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
   1635		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
   1636
   1637	if (!rec->opts.use_clockid)
   1638		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
   1639
   1640	if (!record__threads_enabled(rec))
   1641		perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
   1642
   1643	if (!record__comp_enabled(rec))
   1644		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
   1645
   1646	perf_header__clear_feat(&session->header, HEADER_STAT);
   1647}
   1648
   1649static void
   1650record__finish_output(struct record *rec)
   1651{
   1652	int i;
   1653	struct perf_data *data = &rec->data;
   1654	int fd = perf_data__fd(data);
   1655
   1656	if (data->is_pipe)
   1657		return;
   1658
   1659	rec->session->header.data_size += rec->bytes_written;
   1660	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
   1661	if (record__threads_enabled(rec)) {
   1662		for (i = 0; i < data->dir.nr; i++)
   1663			data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
   1664	}
   1665
   1666	if (!rec->no_buildid) {
   1667		process_buildids(rec);
   1668
   1669		if (rec->buildid_all)
   1670			dsos__hit_all(rec->session);
   1671	}
   1672	perf_session__write_header(rec->session, rec->evlist, fd, true);
   1673
   1674	return;
   1675}
   1676
   1677static int record__synthesize_workload(struct record *rec, bool tail)
   1678{
   1679	int err;
   1680	struct perf_thread_map *thread_map;
   1681	bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
   1682
   1683	if (rec->opts.tail_synthesize != tail)
   1684		return 0;
   1685
   1686	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
   1687	if (thread_map == NULL)
   1688		return -1;
   1689
   1690	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
   1691						 process_synthesized_event,
   1692						 &rec->session->machines.host,
   1693						 needs_mmap,
   1694						 rec->opts.sample_address);
   1695	perf_thread_map__put(thread_map);
   1696	return err;
   1697}
   1698
   1699static int record__synthesize(struct record *rec, bool tail);
   1700
   1701static int
   1702record__switch_output(struct record *rec, bool at_exit)
   1703{
   1704	struct perf_data *data = &rec->data;
   1705	int fd, err;
   1706	char *new_filename;
   1707
    1708	/* Same size as a real timestamp, e.g. "2015122520103046" */
   1709	char timestamp[] = "InvalidTimestamp";
   1710
   1711	record__aio_mmap_read_sync(rec);
   1712
   1713	record__synthesize(rec, true);
   1714	if (target__none(&rec->opts.target))
   1715		record__synthesize_workload(rec, true);
   1716
   1717	rec->samples = 0;
   1718	record__finish_output(rec);
   1719	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
   1720	if (err) {
   1721		pr_err("Failed to get current timestamp\n");
   1722		return -EINVAL;
   1723	}
   1724
   1725	fd = perf_data__switch(data, timestamp,
   1726				    rec->session->header.data_offset,
   1727				    at_exit, &new_filename);
   1728	if (fd >= 0 && !at_exit) {
   1729		rec->bytes_written = 0;
   1730		rec->session->header.data_size = 0;
   1731	}
   1732
   1733	if (!quiet)
   1734		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
   1735			data->path, timestamp);
   1736
   1737	if (rec->switch_output.num_files) {
   1738		int n = rec->switch_output.cur_file + 1;
   1739
   1740		if (n >= rec->switch_output.num_files)
   1741			n = 0;
   1742		rec->switch_output.cur_file = n;
   1743		if (rec->switch_output.filenames[n]) {
   1744			remove(rec->switch_output.filenames[n]);
   1745			zfree(&rec->switch_output.filenames[n]);
   1746		}
   1747		rec->switch_output.filenames[n] = new_filename;
   1748	} else {
   1749		free(new_filename);
   1750	}
   1751
   1752	/* Output tracking events */
   1753	if (!at_exit) {
   1754		record__synthesize(rec, false);
   1755
   1756		/*
   1757		 * In 'perf record --switch-output' without -a,
   1758		 * record__synthesize() in record__switch_output() won't
   1759		 * generate tracking events because there's no thread_map
    1760		 * in the evlist, which causes the newly created perf.data
    1761		 * to lack map and comm information.
   1762		 * Create a fake thread_map and directly call
   1763		 * perf_event__synthesize_thread_map() for those events.
   1764		 */
   1765		if (target__none(&rec->opts.target))
   1766			record__synthesize_workload(rec, false);
   1767	}
   1768	return fd;
   1769}
   1770
   1771static volatile int workload_exec_errno;
   1772
   1773/*
   1774 * evlist__prepare_workload will send a SIGUSR1
    1775 * if the fork fails, since we asked for it by setting its
   1776 * want_signal to true.
   1777 */
   1778static void workload_exec_failed_signal(int signo __maybe_unused,
   1779					siginfo_t *info,
   1780					void *ucontext __maybe_unused)
   1781{
   1782	workload_exec_errno = info->si_value.sival_int;
   1783	done = 1;
   1784	child_finished = 1;
   1785}
   1786
   1787static void snapshot_sig_handler(int sig);
   1788static void alarm_sig_handler(int sig);
   1789
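        /*
         * Pick the user page of any mapped ring buffer; record__synthesize()
         * uses it to emit the time conversion data via
         * perf_event__synth_time_conv().
         */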
   1790static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
   1791{
   1792	if (evlist) {
   1793		if (evlist->mmap && evlist->mmap[0].core.base)
   1794			return evlist->mmap[0].core.base;
   1795		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
   1796			return evlist->overwrite_mmap[0].core.base;
   1797	}
   1798	return NULL;
   1799}
   1800
   1801static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
   1802{
   1803	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
   1804	if (pc)
   1805		return pc;
   1806	return NULL;
   1807}
   1808
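        /*
         * Synthesize the non-sample events describing the current system
         * state: pipe header data, time conversion, id index and AUX area
         * tracing info, kernel and module maps, guest machines, extra
         * attributes, thread and cpu maps, BPF and cgroup events, and
         * finally the existing tasks/mmaps, optionally using multiple
         * synthesis threads.
         */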
   1809static int record__synthesize(struct record *rec, bool tail)
   1810{
   1811	struct perf_session *session = rec->session;
   1812	struct machine *machine = &session->machines.host;
   1813	struct perf_data *data = &rec->data;
   1814	struct record_opts *opts = &rec->opts;
   1815	struct perf_tool *tool = &rec->tool;
   1816	int err = 0;
   1817	event_op f = process_synthesized_event;
   1818
   1819	if (rec->opts.tail_synthesize != tail)
   1820		return 0;
   1821
   1822	if (data->is_pipe) {
   1823		err = perf_event__synthesize_for_pipe(tool, session, data,
   1824						      process_synthesized_event);
   1825		if (err < 0)
   1826			goto out;
   1827
   1828		rec->bytes_written += err;
   1829	}
   1830
   1831	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
   1832					  process_synthesized_event, machine);
   1833	if (err)
   1834		goto out;
   1835
   1836	/* Synthesize id_index before auxtrace_info */
   1837	if (rec->opts.auxtrace_sample_mode || rec->opts.full_auxtrace) {
   1838		err = perf_event__synthesize_id_index(tool,
   1839						      process_synthesized_event,
   1840						      session->evlist, machine);
   1841		if (err)
   1842			goto out;
   1843	}
   1844
   1845	if (rec->opts.full_auxtrace) {
   1846		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
   1847					session, process_synthesized_event);
   1848		if (err)
   1849			goto out;
   1850	}
   1851
   1852	if (!evlist__exclude_kernel(rec->evlist)) {
   1853		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
   1854							 machine);
   1855		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
   1856				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
   1857				   "Check /proc/kallsyms permission or run as root.\n");
   1858
   1859		err = perf_event__synthesize_modules(tool, process_synthesized_event,
   1860						     machine);
   1861		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
   1862				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
   1863				   "Check /proc/modules permission or run as root.\n");
   1864	}
   1865
   1866	if (perf_guest) {
   1867		machines__process_guests(&session->machines,
   1868					 perf_event__synthesize_guest_os, tool);
   1869	}
   1870
   1871	err = perf_event__synthesize_extra_attr(&rec->tool,
   1872						rec->evlist,
   1873						process_synthesized_event,
   1874						data->is_pipe);
   1875	if (err)
   1876		goto out;
   1877
   1878	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
   1879						 process_synthesized_event,
   1880						NULL);
   1881	if (err < 0) {
   1882		pr_err("Couldn't synthesize thread map.\n");
   1883		return err;
   1884	}
   1885
   1886	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
   1887					     process_synthesized_event, NULL);
   1888	if (err < 0) {
   1889		pr_err("Couldn't synthesize cpu map.\n");
   1890		return err;
   1891	}
   1892
   1893	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
   1894						machine, opts);
   1895	if (err < 0)
   1896		pr_warning("Couldn't synthesize bpf events.\n");
   1897
   1898	if (rec->opts.synth & PERF_SYNTH_CGROUP) {
   1899		err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
   1900						     machine);
   1901		if (err < 0)
   1902			pr_warning("Couldn't synthesize cgroup events.\n");
   1903	}
   1904
   1905	if (rec->opts.nr_threads_synthesize > 1) {
   1906		perf_set_multithreaded();
   1907		f = process_locked_synthesized_event;
   1908	}
   1909
   1910	if (rec->opts.synth & PERF_SYNTH_TASK) {
   1911		bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
   1912
   1913		err = __machine__synthesize_threads(machine, tool, &opts->target,
   1914						    rec->evlist->core.threads,
   1915						    f, needs_mmap, opts->sample_address,
   1916						    rec->opts.nr_threads_synthesize);
   1917	}
   1918
   1919	if (rec->opts.nr_threads_synthesize > 1)
   1920		perf_set_singlethreaded();
   1921
   1922out:
   1923	return err;
   1924}
   1925
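        /*
         * Callback run by the side-band thread when a --switch-output-event
         * event arrives: poke the main thread with SIGUSR2 so that it
         * switches the output file.
         */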
   1926static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
   1927{
   1928	struct record *rec = data;
   1929	pthread_kill(rec->thread_id, SIGUSR2);
   1930	return 0;
   1931}
   1932
   1933static int record__setup_sb_evlist(struct record *rec)
   1934{
   1935	struct record_opts *opts = &rec->opts;
   1936
   1937	if (rec->sb_evlist != NULL) {
   1938		/*
   1939		 * We get here if --switch-output-event populated the
   1940		 * sb_evlist, so associate a callback that will send a SIGUSR2
   1941		 * to the main thread.
   1942		 */
   1943		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
   1944		rec->thread_id = pthread_self();
   1945	}
   1946#ifdef HAVE_LIBBPF_SUPPORT
   1947	if (!opts->no_bpf_event) {
   1948		if (rec->sb_evlist == NULL) {
   1949			rec->sb_evlist = evlist__new();
   1950
   1951			if (rec->sb_evlist == NULL) {
    1952				pr_err("Couldn't create side band evlist.\n");
   1953				return -1;
   1954			}
   1955		}
   1956
   1957		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
    1958			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n");
   1959			return -1;
   1960		}
   1961	}
   1962#endif
   1963	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
   1964		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
   1965		opts->no_bpf_event = true;
   1966	}
   1967
   1968	return 0;
   1969}
   1970
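        /*
         * With -k/--clockid, store the clockid and a pair of reference
         * timestamps (gettimeofday() and clock_gettime()) in the header so
         * that sample timestamps can later be related to wall-clock time.
         */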
   1971static int record__init_clock(struct record *rec)
   1972{
   1973	struct perf_session *session = rec->session;
   1974	struct timespec ref_clockid;
   1975	struct timeval ref_tod;
   1976	u64 ref;
   1977
   1978	if (!rec->opts.use_clockid)
   1979		return 0;
   1980
    1981	if (rec->opts.clockid_res_ns)
   1982		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
   1983
   1984	session->header.env.clock.clockid = rec->opts.clockid;
   1985
   1986	if (gettimeofday(&ref_tod, NULL) != 0) {
   1987		pr_err("gettimeofday failed, cannot set reference time.\n");
   1988		return -1;
   1989	}
   1990
   1991	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
   1992		pr_err("clock_gettime failed, cannot set reference time.\n");
   1993		return -1;
   1994	}
   1995
   1996	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
   1997	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
   1998
   1999	session->header.env.clock.tod_ns = ref;
   2000
   2001	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
   2002	      (u64) ref_clockid.tv_nsec;
   2003
   2004	session->header.env.clock.clockid_ns = ref;
   2005	return 0;
   2006}
   2007
   2008static void hit_auxtrace_snapshot_trigger(struct record *rec)
   2009{
   2010	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
   2011		trigger_hit(&auxtrace_snapshot_trigger);
   2012		auxtrace_record__snapshot_started = 1;
   2013		if (auxtrace_record__snapshot_start(rec->itr))
   2014			trigger_error(&auxtrace_snapshot_trigger);
   2015	}
   2016}
   2017
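        /*
         * On hybrid systems the same event name may exist on several PMUs;
         * rewrite plain event names as "pmu_name/event/" so the per-PMU
         * instances remain distinguishable.
         */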
   2018static void record__uniquify_name(struct record *rec)
   2019{
   2020	struct evsel *pos;
   2021	struct evlist *evlist = rec->evlist;
   2022	char *new_name;
   2023	int ret;
   2024
   2025	if (!perf_pmu__has_hybrid())
   2026		return;
   2027
   2028	evlist__for_each_entry(evlist, pos) {
   2029		if (!evsel__is_hybrid(pos))
   2030			continue;
   2031
   2032		if (strchr(pos->name, '/'))
   2033			continue;
   2034
   2035		ret = asprintf(&new_name, "%s/%s/",
   2036			       pos->pmu_name, pos->name);
    2037		if (ret >= 0) {
   2038			free(pos->name);
   2039			pos->name = new_name;
   2040		}
   2041	}
   2042}
   2043
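        /*
         * Ask a worker thread to terminate by closing the write end of its
         * message pipe, then wait for the acknowledgement sent back on its
         * ack pipe.
         */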
   2044static int record__terminate_thread(struct record_thread *thread_data)
   2045{
   2046	int err;
   2047	enum thread_msg ack = THREAD_MSG__UNDEFINED;
   2048	pid_t tid = thread_data->tid;
   2049
   2050	close(thread_data->pipes.msg[1]);
   2051	thread_data->pipes.msg[1] = -1;
   2052	err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
   2053	if (err > 0)
   2054		pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
   2055	else
   2056		pr_warning("threads[%d]: failed to receive termination notification from %d\n",
   2057			   thread->tid, tid);
   2058
   2059	return 0;
   2060}
   2061
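        /*
         * Start the trace streaming threads: block all signals so the
         * workers inherit a fully blocked signal mask, pin each worker to
         * its affinity mask where supported, wait for its ready message,
         * then pin the main thread to its own affinity mask and restore the
         * signal mask.
         */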
   2062static int record__start_threads(struct record *rec)
   2063{
   2064	int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
   2065	struct record_thread *thread_data = rec->thread_data;
   2066	sigset_t full, mask;
   2067	pthread_t handle;
   2068	pthread_attr_t attrs;
   2069
   2070	thread = &thread_data[0];
   2071
   2072	if (!record__threads_enabled(rec))
   2073		return 0;
   2074
   2075	sigfillset(&full);
   2076	if (sigprocmask(SIG_SETMASK, &full, &mask)) {
   2077		pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
   2078		return -1;
   2079	}
   2080
   2081	pthread_attr_init(&attrs);
   2082	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
   2083
   2084	for (t = 1; t < nr_threads; t++) {
   2085		enum thread_msg msg = THREAD_MSG__UNDEFINED;
   2086
   2087#ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
   2088		pthread_attr_setaffinity_np(&attrs,
   2089					    MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
   2090					    (cpu_set_t *)(thread_data[t].mask->affinity.bits));
   2091#endif
   2092		if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
   2093			for (tt = 1; tt < t; tt++)
    2094				record__terminate_thread(&thread_data[tt]);
   2095			pr_err("Failed to start threads: %s\n", strerror(errno));
   2096			ret = -1;
   2097			goto out_err;
   2098		}
   2099
   2100		err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
   2101		if (err > 0)
   2102			pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
   2103				  thread_msg_tags[msg]);
   2104		else
   2105			pr_warning("threads[%d]: failed to receive start notification from %d\n",
   2106				   thread->tid, rec->thread_data[t].tid);
   2107	}
   2108
   2109	sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
   2110			(cpu_set_t *)thread->mask->affinity.bits);
   2111
   2112	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
   2113
   2114out_err:
   2115	pthread_attr_destroy(&attrs);
   2116
   2117	if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
   2118		pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
   2119		ret = -1;
   2120	}
   2121
   2122	return ret;
   2123}
   2124
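        /*
         * Terminate all worker threads and fold their per-thread counters
         * (samples, transferred and compressed bytes) into the record
         * session totals.
         */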
   2125static int record__stop_threads(struct record *rec)
   2126{
   2127	int t;
   2128	struct record_thread *thread_data = rec->thread_data;
   2129
   2130	for (t = 1; t < rec->nr_threads; t++)
   2131		record__terminate_thread(&thread_data[t]);
   2132
   2133	for (t = 0; t < rec->nr_threads; t++) {
   2134		rec->samples += thread_data[t].samples;
   2135		if (!record__threads_enabled(rec))
   2136			continue;
   2137		rec->session->bytes_transferred += thread_data[t].bytes_transferred;
   2138		rec->session->bytes_compressed += thread_data[t].bytes_compressed;
    2139		pr_debug("threads[%d]: samples=%llu, wakes=%lu, ", thread_data[t].tid,
   2140			 thread_data[t].samples, thread_data[t].waking);
   2141		if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
   2142			pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
   2143				 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
   2144		else
   2145			pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
   2146	}
   2147
   2148	return 0;
   2149}
   2150
   2151static unsigned long record__waking(struct record *rec)
   2152{
   2153	int t;
   2154	unsigned long waking = 0;
   2155	struct record_thread *thread_data = rec->thread_data;
   2156
   2157	for (t = 0; t < rec->nr_threads; t++)
   2158		waking += thread_data[t].waking;
   2159
   2160	return waking;
   2161}
   2162
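        /*
         * Main record path: install signal handlers, create the session and
         * its output, open the events, synthesize the initial metadata,
         * start the optional workload and streaming threads, then loop
         * reading the mmap buffers until recording stops, and finally
         * finish or switch the output file.
         */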
   2163static int __cmd_record(struct record *rec, int argc, const char **argv)
   2164{
   2165	int err;
   2166	int status = 0;
   2167	const bool forks = argc > 0;
   2168	struct perf_tool *tool = &rec->tool;
   2169	struct record_opts *opts = &rec->opts;
   2170	struct perf_data *data = &rec->data;
   2171	struct perf_session *session;
   2172	bool disabled = false, draining = false;
   2173	int fd;
   2174	float ratio = 0;
   2175	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
   2176
   2177	atexit(record__sig_exit);
   2178	signal(SIGCHLD, sig_handler);
   2179	signal(SIGINT, sig_handler);
   2180	signal(SIGTERM, sig_handler);
   2181	signal(SIGSEGV, sigsegv_handler);
   2182
   2183	if (rec->opts.record_namespaces)
   2184		tool->namespace_events = true;
   2185
   2186	if (rec->opts.record_cgroup) {
   2187#ifdef HAVE_FILE_HANDLE
   2188		tool->cgroup_events = true;
   2189#else
   2190		pr_err("cgroup tracking is not supported\n");
   2191		return -1;
   2192#endif
   2193	}
   2194
   2195	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
   2196		signal(SIGUSR2, snapshot_sig_handler);
   2197		if (rec->opts.auxtrace_snapshot_mode)
   2198			trigger_on(&auxtrace_snapshot_trigger);
   2199		if (rec->switch_output.enabled)
   2200			trigger_on(&switch_output_trigger);
   2201	} else {
   2202		signal(SIGUSR2, SIG_IGN);
   2203	}
   2204
   2205	session = perf_session__new(data, tool);
   2206	if (IS_ERR(session)) {
   2207		pr_err("Perf session creation failed.\n");
   2208		return PTR_ERR(session);
   2209	}
   2210
   2211	if (record__threads_enabled(rec)) {
   2212		if (perf_data__is_pipe(&rec->data)) {
   2213			pr_err("Parallel trace streaming is not available in pipe mode.\n");
   2214			return -1;
   2215		}
   2216		if (rec->opts.full_auxtrace) {
   2217			pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
   2218			return -1;
   2219		}
   2220	}
   2221
   2222	fd = perf_data__fd(data);
   2223	rec->session = session;
   2224
   2225	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
   2226		pr_err("Compression initialization failed.\n");
   2227		return -1;
   2228	}
   2229#ifdef HAVE_EVENTFD_SUPPORT
   2230	done_fd = eventfd(0, EFD_NONBLOCK);
   2231	if (done_fd < 0) {
   2232		pr_err("Failed to create wakeup eventfd, error: %m\n");
   2233		status = -1;
   2234		goto out_delete_session;
   2235	}
   2236	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
   2237	if (err < 0) {
   2238		pr_err("Failed to add wakeup eventfd to poll list\n");
   2239		status = err;
   2240		goto out_delete_session;
   2241	}
   2242#endif // HAVE_EVENTFD_SUPPORT
   2243
   2244	session->header.env.comp_type  = PERF_COMP_ZSTD;
   2245	session->header.env.comp_level = rec->opts.comp_level;
   2246
   2247	if (rec->opts.kcore &&
   2248	    !record__kcore_readable(&session->machines.host)) {
   2249		pr_err("ERROR: kcore is not readable.\n");
   2250		return -1;
   2251	}
   2252
   2253	if (record__init_clock(rec))
   2254		return -1;
   2255
   2256	record__init_features(rec);
   2257
   2258	if (forks) {
   2259		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
   2260					       workload_exec_failed_signal);
   2261		if (err < 0) {
   2262			pr_err("Couldn't run the workload!\n");
   2263			status = err;
   2264			goto out_delete_session;
   2265		}
   2266	}
   2267
   2268	/*
    2269	 * If we have just a single event and are sending data
    2270	 * through a pipe, we need to force sample id allocation,
    2271	 * because we synthesize the event name through the pipe
    2272	 * and need the id for that.
   2273	 */
   2274	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
   2275		rec->opts.sample_id = true;
   2276
   2277	record__uniquify_name(rec);
   2278
   2279	if (record__open(rec) != 0) {
   2280		err = -1;
   2281		goto out_free_threads;
   2282	}
   2283	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
   2284
   2285	if (rec->opts.kcore) {
   2286		err = record__kcore_copy(&session->machines.host, data);
   2287		if (err) {
   2288			pr_err("ERROR: Failed to copy kcore\n");
   2289			goto out_free_threads;
   2290		}
   2291	}
   2292
   2293	err = bpf__apply_obj_config();
   2294	if (err) {
   2295		char errbuf[BUFSIZ];
   2296
   2297		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
   2298		pr_err("ERROR: Apply config to BPF failed: %s\n",
   2299			 errbuf);
   2300		goto out_free_threads;
   2301	}
   2302
   2303	/*
   2304	 * Normally perf_session__new would do this, but it doesn't have the
   2305	 * evlist.
   2306	 */
   2307	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
   2308		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
   2309		rec->tool.ordered_events = false;
   2310	}
   2311
   2312	if (!rec->evlist->core.nr_groups)
   2313		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
   2314
   2315	if (data->is_pipe) {
   2316		err = perf_header__write_pipe(fd);
   2317		if (err < 0)
   2318			goto out_free_threads;
   2319	} else {
   2320		err = perf_session__write_header(session, rec->evlist, fd, false);
   2321		if (err < 0)
   2322			goto out_free_threads;
   2323	}
   2324
   2325	err = -1;
   2326	if (!rec->no_buildid
   2327	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
   2328		pr_err("Couldn't generate buildids. "
   2329		       "Use --no-buildid to profile anyway.\n");
   2330		goto out_free_threads;
   2331	}
   2332
   2333	err = record__setup_sb_evlist(rec);
   2334	if (err)
   2335		goto out_free_threads;
   2336
   2337	err = record__synthesize(rec, false);
   2338	if (err < 0)
   2339		goto out_free_threads;
   2340
   2341	if (rec->realtime_prio) {
   2342		struct sched_param param;
   2343
   2344		param.sched_priority = rec->realtime_prio;
   2345		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
   2346			pr_err("Could not set realtime priority.\n");
   2347			err = -1;
   2348			goto out_free_threads;
   2349		}
   2350	}
   2351
   2352	if (record__start_threads(rec))
   2353		goto out_free_threads;
   2354
   2355	/*
   2356	 * When perf is starting the traced process, all the events
   2357	 * (apart from group members) have enable_on_exec=1 set,
   2358	 * so don't spoil it by prematurely enabling them.
   2359	 */
   2360	if (!target__none(&opts->target) && !opts->initial_delay)
   2361		evlist__enable(rec->evlist);
   2362
   2363	/*
   2364	 * Let the child rip
   2365	 */
   2366	if (forks) {
   2367		struct machine *machine = &session->machines.host;
   2368		union perf_event *event;
   2369		pid_t tgid;
   2370
   2371		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
   2372		if (event == NULL) {
   2373			err = -ENOMEM;
   2374			goto out_child;
   2375		}
   2376
   2377		/*
    2378		 * Some H/W events are generated before the COMM event,
    2379		 * which is emitted during exec(), so perf script
    2380		 * cannot see a correct process name for those events.
    2381		 * Synthesize a COMM event to prevent that.
   2382		 */
   2383		tgid = perf_event__synthesize_comm(tool, event,
   2384						   rec->evlist->workload.pid,
   2385						   process_synthesized_event,
   2386						   machine);
   2387		free(event);
   2388
   2389		if (tgid == -1)
   2390			goto out_child;
   2391
   2392		event = malloc(sizeof(event->namespaces) +
   2393			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
   2394			       machine->id_hdr_size);
   2395		if (event == NULL) {
   2396			err = -ENOMEM;
   2397			goto out_child;
   2398		}
   2399
   2400		/*
   2401		 * Synthesize NAMESPACES event for the command specified.
   2402		 */
   2403		perf_event__synthesize_namespaces(tool, event,
   2404						  rec->evlist->workload.pid,
   2405						  tgid, process_synthesized_event,
   2406						  machine);
   2407		free(event);
   2408
   2409		evlist__start_workload(rec->evlist);
   2410	}
   2411
   2412	if (opts->initial_delay) {
   2413		pr_info(EVLIST_DISABLED_MSG);
   2414		if (opts->initial_delay > 0) {
   2415			usleep(opts->initial_delay * USEC_PER_MSEC);
   2416			evlist__enable(rec->evlist);
   2417			pr_info(EVLIST_ENABLED_MSG);
   2418		}
   2419	}
   2420
   2421	trigger_ready(&auxtrace_snapshot_trigger);
   2422	trigger_ready(&switch_output_trigger);
   2423	perf_hooks__invoke_record_start();
   2424	for (;;) {
   2425		unsigned long long hits = thread->samples;
   2426
   2427		/*
    2428		 * rec->evlist->bkw_mmap_state may be BKW_MMAP_EMPTY
    2429		 * here: when done == true and hits != rec->samples in
    2430		 * the previous round.
    2431		 *
    2432		 * evlist__toggle_bkw_mmap ensures we never convert
    2433		 * BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
   2434		 */
   2435		if (trigger_is_hit(&switch_output_trigger) || done || draining)
   2436			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
   2437
   2438		if (record__mmap_read_all(rec, false) < 0) {
   2439			trigger_error(&auxtrace_snapshot_trigger);
   2440			trigger_error(&switch_output_trigger);
   2441			err = -1;
   2442			goto out_child;
   2443		}
   2444
   2445		if (auxtrace_record__snapshot_started) {
   2446			auxtrace_record__snapshot_started = 0;
   2447			if (!trigger_is_error(&auxtrace_snapshot_trigger))
   2448				record__read_auxtrace_snapshot(rec, false);
   2449			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
   2450				pr_err("AUX area tracing snapshot failed\n");
   2451				err = -1;
   2452				goto out_child;
   2453			}
   2454		}
   2455
   2456		if (trigger_is_hit(&switch_output_trigger)) {
   2457			/*
    2458			 * If switch_output_trigger is hit, the data in the
    2459			 * overwritable ring buffer should have been collected,
    2460			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
    2461			 *
    2462			 * If SIGUSR2 was raised after or during record__mmap_read_all(),
    2463			 * it didn't collect data from the overwritable ring
    2464			 * buffer. Read again.
   2465			 */
   2466			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
   2467				continue;
   2468			trigger_ready(&switch_output_trigger);
   2469
   2470			/*
   2471			 * Reenable events in overwrite ring buffer after
   2472			 * record__mmap_read_all(): we should have collected
   2473			 * data from it.
   2474			 */
   2475			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
   2476
   2477			if (!quiet)
   2478				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
   2479					record__waking(rec));
   2480			thread->waking = 0;
   2481			fd = record__switch_output(rec, false);
   2482			if (fd < 0) {
   2483				pr_err("Failed to switch to new file\n");
   2484				trigger_error(&switch_output_trigger);
   2485				err = fd;
   2486				goto out_child;
   2487			}
   2488
   2489			/* re-arm the alarm */
   2490			if (rec->switch_output.time)
   2491				alarm(rec->switch_output.time);
   2492		}
   2493
   2494		if (hits == thread->samples) {
   2495			if (done || draining)
   2496				break;
   2497			err = fdarray__poll(&thread->pollfd, -1);
   2498			/*
    2499			 * Propagate the error only if there is one. Ignore a
    2500			 * positive number of returned events and interruption (EINTR).
   2501			 */
   2502			if (err > 0 || (err < 0 && errno == EINTR))
   2503				err = 0;
   2504			thread->waking++;
   2505
   2506			if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
   2507					    record__thread_munmap_filtered, NULL) == 0)
   2508				draining = true;
   2509
   2510			evlist__ctlfd_update(rec->evlist,
   2511				&thread->pollfd.entries[thread->ctlfd_pos]);
   2512		}
   2513
   2514		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
   2515			switch (cmd) {
   2516			case EVLIST_CTL_CMD_SNAPSHOT:
   2517				hit_auxtrace_snapshot_trigger(rec);
   2518				evlist__ctlfd_ack(rec->evlist);
   2519				break;
   2520			case EVLIST_CTL_CMD_STOP:
   2521				done = 1;
   2522				break;
   2523			case EVLIST_CTL_CMD_ACK:
   2524			case EVLIST_CTL_CMD_UNSUPPORTED:
   2525			case EVLIST_CTL_CMD_ENABLE:
   2526			case EVLIST_CTL_CMD_DISABLE:
   2527			case EVLIST_CTL_CMD_EVLIST:
   2528			case EVLIST_CTL_CMD_PING:
   2529			default:
   2530				break;
   2531			}
   2532		}
   2533
   2534		/*
    2535		 * When perf is starting the traced process, the events die
    2536		 * with the process at the end and we wait for that, so there
    2537		 * is no need to disable the events in this case.
   2538		 */
   2539		if (done && !disabled && !target__none(&opts->target)) {
   2540			trigger_off(&auxtrace_snapshot_trigger);
   2541			evlist__disable(rec->evlist);
   2542			disabled = true;
   2543		}
   2544	}
   2545
   2546	trigger_off(&auxtrace_snapshot_trigger);
   2547	trigger_off(&switch_output_trigger);
   2548
   2549	if (opts->auxtrace_snapshot_on_exit)
   2550		record__auxtrace_snapshot_exit(rec);
   2551
   2552	if (forks && workload_exec_errno) {
   2553		char msg[STRERR_BUFSIZE], strevsels[2048];
   2554		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
   2555
   2556		evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
   2557
   2558		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
   2559			strevsels, argv[0], emsg);
   2560		err = -1;
   2561		goto out_child;
   2562	}
   2563
   2564	if (!quiet)
   2565		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
   2566			record__waking(rec));
   2567
   2568	if (target__none(&rec->opts.target))
   2569		record__synthesize_workload(rec, true);
   2570
   2571out_child:
   2572	record__stop_threads(rec);
   2573	record__mmap_read_all(rec, true);
   2574out_free_threads:
   2575	record__free_thread_data(rec);
   2576	evlist__finalize_ctlfd(rec->evlist);
   2577	record__aio_mmap_read_sync(rec);
   2578
   2579	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
   2580		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
   2581		session->header.env.comp_ratio = ratio + 0.5;
   2582	}
   2583
   2584	if (forks) {
   2585		int exit_status;
   2586
   2587		if (!child_finished)
   2588			kill(rec->evlist->workload.pid, SIGTERM);
   2589
   2590		wait(&exit_status);
   2591
   2592		if (err < 0)
   2593			status = err;
   2594		else if (WIFEXITED(exit_status))
   2595			status = WEXITSTATUS(exit_status);
   2596		else if (WIFSIGNALED(exit_status))
   2597			signr = WTERMSIG(exit_status);
   2598	} else
   2599		status = err;
   2600
   2601	if (rec->off_cpu)
   2602		rec->bytes_written += off_cpu_write(rec->session);
   2603
   2604	record__synthesize(rec, true);
   2605	/* this will be recalculated during process_buildids() */
   2606	rec->samples = 0;
   2607
   2608	if (!err) {
   2609		if (!rec->timestamp_filename) {
   2610			record__finish_output(rec);
   2611		} else {
   2612			fd = record__switch_output(rec, true);
   2613			if (fd < 0) {
   2614				status = fd;
   2615				goto out_delete_session;
   2616			}
   2617		}
   2618	}
   2619
   2620	perf_hooks__invoke_record_end();
   2621
   2622	if (!err && !quiet) {
   2623		char samples[128];
   2624		const char *postfix = rec->timestamp_filename ?
   2625					".<timestamp>" : "";
   2626
   2627		if (rec->samples && !rec->opts.full_auxtrace)
   2628			scnprintf(samples, sizeof(samples),
   2629				  " (%" PRIu64 " samples)", rec->samples);
   2630		else
   2631			samples[0] = '\0';
   2632
   2633		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
   2634			perf_data__size(data) / 1024.0 / 1024.0,
   2635			data->path, postfix, samples);
   2636		if (ratio) {
   2637			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
   2638					rec->session->bytes_transferred / 1024.0 / 1024.0,
   2639					ratio);
   2640		}
   2641		fprintf(stderr, " ]\n");
   2642	}
   2643
   2644out_delete_session:
   2645#ifdef HAVE_EVENTFD_SUPPORT
   2646	if (done_fd >= 0)
   2647		close(done_fd);
   2648#endif
   2649	zstd_fini(&session->zstd_data);
   2650	perf_session__delete(session);
   2651
   2652	if (!opts->no_bpf_event)
   2653		evlist__stop_sb_thread(rec->sb_evlist);
   2654	return status;
   2655}
   2656
   2657static void callchain_debug(struct callchain_param *callchain)
   2658{
   2659	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
   2660
   2661	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
   2662
   2663	if (callchain->record_mode == CALLCHAIN_DWARF)
   2664		pr_debug("callchain: stack dump size %d\n",
   2665			 callchain->dump_size);
   2666}
   2667
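        /*
         * Parse a --call-graph argument: --no-call-graph disables call chain
         * recording; otherwise the record mode and optional stack dump size
         * are parsed, and DWARF unwinding additionally enables data address
         * sampling.
         */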
   2668int record_opts__parse_callchain(struct record_opts *record,
   2669				 struct callchain_param *callchain,
   2670				 const char *arg, bool unset)
   2671{
   2672	int ret;
   2673	callchain->enabled = !unset;
   2674
   2675	/* --no-call-graph */
   2676	if (unset) {
   2677		callchain->record_mode = CALLCHAIN_NONE;
   2678		pr_debug("callchain: disabled\n");
   2679		return 0;
   2680	}
   2681
   2682	ret = parse_callchain_record_opt(arg, callchain);
   2683	if (!ret) {
   2684		/* Enable data address sampling for DWARF unwind. */
   2685		if (callchain->record_mode == CALLCHAIN_DWARF)
   2686			record->sample_address = true;
   2687		callchain_debug(callchain);
   2688	}
   2689
   2690	return ret;
   2691}
   2692
   2693int record_parse_callchain_opt(const struct option *opt,
   2694			       const char *arg,
   2695			       int unset)
   2696{
   2697	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
   2698}
   2699
   2700int record_callchain_opt(const struct option *opt,
   2701			 const char *arg __maybe_unused,
   2702			 int unset __maybe_unused)
   2703{
   2704	struct callchain_param *callchain = opt->value;
   2705
   2706	callchain->enabled = true;
   2707
   2708	if (callchain->record_mode == CALLCHAIN_NONE)
   2709		callchain->record_mode = CALLCHAIN_FP;
   2710
   2711	callchain_debug(callchain);
   2712	return 0;
   2713}
   2714
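        /*
         * Apply perfconfig keys: record.build-id (cache/no-cache/skip/mmap),
         * record.call-graph (forwarded as call-graph.record-mode),
         * record.aio and record.debuginfod.
         */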
   2715static int perf_record_config(const char *var, const char *value, void *cb)
   2716{
   2717	struct record *rec = cb;
   2718
   2719	if (!strcmp(var, "record.build-id")) {
   2720		if (!strcmp(value, "cache"))
   2721			rec->no_buildid_cache = false;
   2722		else if (!strcmp(value, "no-cache"))
   2723			rec->no_buildid_cache = true;
   2724		else if (!strcmp(value, "skip"))
   2725			rec->no_buildid = true;
   2726		else if (!strcmp(value, "mmap"))
   2727			rec->buildid_mmap = true;
   2728		else
   2729			return -1;
   2730		return 0;
   2731	}
   2732	if (!strcmp(var, "record.call-graph")) {
   2733		var = "call-graph.record-mode";
   2734		return perf_default_config(var, value, cb);
   2735	}
   2736#ifdef HAVE_AIO_SUPPORT
   2737	if (!strcmp(var, "record.aio")) {
   2738		rec->opts.nr_cblocks = strtol(value, NULL, 0);
   2739		if (!rec->opts.nr_cblocks)
   2740			rec->opts.nr_cblocks = nr_cblocks_default;
   2741	}
   2742#endif
   2743	if (!strcmp(var, "record.debuginfod")) {
   2744		rec->debuginfod.urls = strdup(value);
   2745		if (!rec->debuginfod.urls)
   2746			return -ENOMEM;
   2747		rec->debuginfod.set = true;
   2748	}
   2749
   2750	return 0;
   2751}
   2752
   2753
   2754static int record__parse_affinity(const struct option *opt, const char *str, int unset)
   2755{
   2756	struct record_opts *opts = (struct record_opts *)opt->value;
   2757
   2758	if (unset || !str)
   2759		return 0;
   2760
   2761	if (!strcasecmp(str, "node"))
   2762		opts->affinity = PERF_AFFINITY_NODE;
   2763	else if (!strcasecmp(str, "cpu"))
   2764		opts->affinity = PERF_AFFINITY_CPU;
   2765
   2766	return 0;
   2767}
   2768
   2769static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
   2770{
   2771	mask->nbits = nr_bits;
   2772	mask->bits = bitmap_zalloc(mask->nbits);
   2773	if (!mask->bits)
   2774		return -ENOMEM;
   2775
   2776	return 0;
   2777}
   2778
   2779static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
   2780{
   2781	bitmap_free(mask->bits);
   2782	mask->nbits = 0;
   2783}
   2784
   2785static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
   2786{
   2787	int ret;
   2788
   2789	ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
   2790	if (ret) {
   2791		mask->affinity.bits = NULL;
   2792		return ret;
   2793	}
   2794
   2795	ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
   2796	if (ret) {
   2797		record__mmap_cpu_mask_free(&mask->maps);
   2798		mask->maps.bits = NULL;
   2799	}
   2800
   2801	return ret;
   2802}
   2803
   2804static void record__thread_mask_free(struct thread_mask *mask)
   2805{
   2806	record__mmap_cpu_mask_free(&mask->maps);
   2807	record__mmap_cpu_mask_free(&mask->affinity);
   2808}
   2809
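        /*
         * Parse the --threads specification: an empty value defaults to one
         * streaming thread per CPU, otherwise the value is matched against
         * the predefined spec tags, and anything else is kept as a
         * user-provided spec.
         */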
   2810static int record__parse_threads(const struct option *opt, const char *str, int unset)
   2811{
   2812	int s;
   2813	struct record_opts *opts = opt->value;
   2814
   2815	if (unset || !str || !strlen(str)) {
   2816		opts->threads_spec = THREAD_SPEC__CPU;
   2817	} else {
   2818		for (s = 1; s < THREAD_SPEC__MAX; s++) {
   2819			if (s == THREAD_SPEC__USER) {
   2820				opts->threads_user_spec = strdup(str);
   2821				if (!opts->threads_user_spec)
   2822					return -ENOMEM;
   2823				opts->threads_spec = THREAD_SPEC__USER;
   2824				break;
   2825			}
   2826			if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
   2827				opts->threads_spec = s;
   2828				break;
   2829			}
   2830		}
   2831	}
   2832
   2833	if (opts->threads_spec == THREAD_SPEC__USER)
   2834		pr_debug("threads_spec: %s\n", opts->threads_user_spec);
   2835	else
   2836		pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
   2837
   2838	return 0;
   2839}
   2840
   2841static int parse_output_max_size(const struct option *opt,
   2842				 const char *str, int unset)
   2843{
   2844	unsigned long *s = (unsigned long *)opt->value;
   2845	static struct parse_tag tags_size[] = {
   2846		{ .tag  = 'B', .mult = 1       },
   2847		{ .tag  = 'K', .mult = 1 << 10 },
   2848		{ .tag  = 'M', .mult = 1 << 20 },
   2849		{ .tag  = 'G', .mult = 1 << 30 },
   2850		{ .tag  = 0 },
   2851	};
   2852	unsigned long val;
   2853
   2854	if (unset) {
   2855		*s = 0;
   2856		return 0;
   2857	}
   2858
   2859	val = parse_tag_value(str, tags_size);
   2860	if (val != (unsigned long) -1) {
   2861		*s = val;
   2862		return 0;
   2863	}
   2864
   2865	return -1;
   2866}
   2867
   2868static int record__parse_mmap_pages(const struct option *opt,
   2869				    const char *str,
   2870				    int unset __maybe_unused)
   2871{
   2872	struct record_opts *opts = opt->value;
   2873	char *s, *p;
   2874	unsigned int mmap_pages;
   2875	int ret;
   2876
   2877	if (!str)
   2878		return -EINVAL;
   2879
   2880	s = strdup(str);
   2881	if (!s)
   2882		return -ENOMEM;
   2883
   2884	p = strchr(s, ',');
   2885	if (p)
   2886		*p = '\0';
   2887
   2888	if (*s) {
   2889		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
   2890		if (ret)
   2891			goto out_free;
   2892		opts->mmap_pages = mmap_pages;
   2893	}
   2894
   2895	if (!p) {
   2896		ret = 0;
   2897		goto out_free;
   2898	}
   2899
   2900	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
   2901	if (ret)
   2902		goto out_free;
   2903
   2904	opts->auxtrace_mmap_pages = mmap_pages;
   2905
   2906out_free:
   2907	free(s);
   2908	return ret;
   2909}
   2910
   2911void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
   2912{
   2913}
   2914
   2915static int parse_control_option(const struct option *opt,
   2916				const char *str,
   2917				int unset __maybe_unused)
   2918{
   2919	struct record_opts *opts = opt->value;
   2920
   2921	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
   2922}
   2923
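        /*
         * Warn when the switch-output size threshold is below half of the
         * kernel wakeup buffer size: output is only switched after data has
         * been read, so the resulting perf.data files will be bigger than
         * requested.
         */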
   2924static void switch_output_size_warn(struct record *rec)
   2925{
   2926	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
   2927	struct switch_output *s = &rec->switch_output;
   2928
   2929	wakeup_size /= 2;
   2930
   2931	if (s->size < wakeup_size) {
   2932		char buf[100];
   2933
   2934		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
    2935		pr_warning("WARNING: switch-output data size lower than "
    2936			   "wakeup kernel buffer size (%s); "
    2937			   "expect bigger perf.data sizes\n", buf);
   2938	}
   2939}
   2940
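        /*
         * Parse the --switch-output specification: "signal", a size with a
         * B/K/M/G suffix, or a time with an s/m/h/d suffix. Enabling it also
         * turns on timestamped output filenames.
         */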
   2941static int switch_output_setup(struct record *rec)
   2942{
   2943	struct switch_output *s = &rec->switch_output;
   2944	static struct parse_tag tags_size[] = {
   2945		{ .tag  = 'B', .mult = 1       },
   2946		{ .tag  = 'K', .mult = 1 << 10 },
   2947		{ .tag  = 'M', .mult = 1 << 20 },
   2948		{ .tag  = 'G', .mult = 1 << 30 },
   2949		{ .tag  = 0 },
   2950	};
   2951	static struct parse_tag tags_time[] = {
   2952		{ .tag  = 's', .mult = 1        },
   2953		{ .tag  = 'm', .mult = 60       },
   2954		{ .tag  = 'h', .mult = 60*60    },
   2955		{ .tag  = 'd', .mult = 60*60*24 },
   2956		{ .tag  = 0 },
   2957	};
   2958	unsigned long val;
   2959
   2960	/*
    2961	 * If we're using --switch-output-event, then we imply
    2962	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
    2963	 * thread to its parent.
   2964	 */
   2965	if (rec->switch_output_event_set) {
   2966		if (record__threads_enabled(rec)) {
   2967			pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
   2968			return 0;
   2969		}
   2970		goto do_signal;
   2971	}
   2972
   2973	if (!s->set)
   2974		return 0;
   2975
   2976	if (record__threads_enabled(rec)) {
   2977		pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
   2978		return 0;
   2979	}
   2980
   2981	if (!strcmp(s->str, "signal")) {
   2982do_signal:
   2983		s->signal = true;
   2984		pr_debug("switch-output with SIGUSR2 signal\n");
   2985		goto enabled;
   2986	}
   2987
   2988	val = parse_tag_value(s->str, tags_size);
   2989	if (val != (unsigned long) -1) {
   2990		s->size = val;
   2991		pr_debug("switch-output with %s size threshold\n", s->str);
   2992		goto enabled;
   2993	}
   2994
   2995	val = parse_tag_value(s->str, tags_time);
   2996	if (val != (unsigned long) -1) {
   2997		s->time = val;
   2998		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
   2999			 s->str, s->time);
   3000		goto enabled;
   3001	}
   3002
   3003	return -1;
   3004
   3005enabled:
   3006	rec->timestamp_filename = true;
   3007	s->enabled              = true;
   3008
   3009	if (s->size && !rec->opts.no_buffering)
   3010		switch_output_size_warn(rec);
   3011
   3012	return 0;
   3013}
   3014
   3015static const char * const __record_usage[] = {
   3016	"perf record [<options>] [<command>]",
   3017	"perf record [<options>] -- <command> [<options>]",
   3018	NULL
   3019};
   3020const char * const *record_usage = __record_usage;
   3021
   3022static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
   3023				  struct perf_sample *sample, struct machine *machine)
   3024{
   3025	/*
    3026	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
    3027	 * so there is no need to add them twice.
   3028	 */
   3029	if (!(event->header.misc & PERF_RECORD_MISC_USER))
   3030		return 0;
   3031	return perf_event__process_mmap(tool, event, sample, machine);
   3032}
   3033
   3034static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
   3035				   struct perf_sample *sample, struct machine *machine)
   3036{
   3037	/*
    3038	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
    3039	 * so there is no need to add them twice.
   3040	 */
   3041	if (!(event->header.misc & PERF_RECORD_MISC_USER))
   3042		return 0;
   3043
   3044	return perf_event__process_mmap2(tool, event, sample, machine);
   3045}
   3046
   3047static int process_timestamp_boundary(struct perf_tool *tool,
   3048				      union perf_event *event __maybe_unused,
   3049				      struct perf_sample *sample,
   3050				      struct machine *machine __maybe_unused)
   3051{
   3052	struct record *rec = container_of(tool, struct record, tool);
   3053
   3054	set_timestamp_boundary(rec, sample->time);
   3055	return 0;
   3056}
   3057
   3058static int parse_record_synth_option(const struct option *opt,
   3059				     const char *str,
   3060				     int unset __maybe_unused)
   3061{
   3062	struct record_opts *opts = opt->value;
   3063	char *p = strdup(str);
   3064
   3065	if (p == NULL)
   3066		return -1;
   3067
   3068	opts->synth = parse_synth_opt(p);
   3069	free(p);
   3070
   3071	if (opts->synth < 0) {
   3072		pr_err("Invalid synth option: %s\n", str);
   3073		return -1;
   3074	}
   3075	return 0;
   3076}
   3077
   3078/*
    3079 * XXX Ideally this would be local to cmd_record() and passed to a record__new(),
    3080 * because we need to have access to it in record__exit(), which is called
    3081 * after cmd_record() exits, but since record_options needs to be accessible to
    3082 * builtin-script, leave it here.
    3083 *
    3084 * At least we don't touch it in all the other functions here directly.
   3085 *
   3086 * Just say no to tons of global variables, sigh.
   3087 */
   3088static struct record record = {
   3089	.opts = {
   3090		.sample_time	     = true,
   3091		.mmap_pages	     = UINT_MAX,
   3092		.user_freq	     = UINT_MAX,
   3093		.user_interval	     = ULLONG_MAX,
   3094		.freq		     = 4000,
   3095		.target		     = {
   3096			.uses_mmap   = true,
   3097			.default_per_cpu = true,
   3098		},
   3099		.mmap_flush          = MMAP_FLUSH_DEFAULT,
   3100		.nr_threads_synthesize = 1,
   3101		.ctl_fd              = -1,
   3102		.ctl_fd_ack          = -1,
   3103		.synth               = PERF_SYNTH_ALL,
   3104	},
   3105	.tool = {
   3106		.sample		= process_sample_event,
   3107		.fork		= perf_event__process_fork,
   3108		.exit		= perf_event__process_exit,
   3109		.comm		= perf_event__process_comm,
   3110		.namespaces	= perf_event__process_namespaces,
   3111		.mmap		= build_id__process_mmap,
   3112		.mmap2		= build_id__process_mmap2,
   3113		.itrace_start	= process_timestamp_boundary,
   3114		.aux		= process_timestamp_boundary,
   3115		.ordered_events	= true,
   3116	},
   3117};
   3118
   3119const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
   3120	"\n\t\t\t\tDefault: fp";
   3121
   3122static bool dry_run;
   3123
   3124/*
   3125 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
    3126 * with it and switch to using the library functions in perf_evlist that came
    3127 * from builtin-record.c, i.e. use record_opts,
    3128 * evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record'
    3129 * using pipes, etc.
   3130 */
   3131static struct option __record_options[] = {
   3132	OPT_CALLBACK('e', "event", &record.evlist, "event",
   3133		     "event selector. use 'perf list' to list available events",
   3134		     parse_events_option),
   3135	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
   3136		     "event filter", parse_filter),
   3137	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
   3138			   NULL, "don't record events from perf itself",
   3139			   exclude_perf),
   3140	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
   3141		    "record events on existing process id"),
   3142	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
   3143		    "record events on existing thread id"),
   3144	OPT_INTEGER('r', "realtime", &record.realtime_prio,
   3145		    "collect data with this RT SCHED_FIFO priority"),
   3146	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
   3147		    "collect data without buffering"),
   3148	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
   3149		    "collect raw sample records from all opened counters"),
   3150	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
   3151			    "system-wide collection from all CPUs"),
   3152	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
   3153		    "list of cpus to monitor"),
   3154	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
   3155	OPT_STRING('o', "output", &record.data.path, "file",
   3156		    "output file name"),
   3157	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
   3158			&record.opts.no_inherit_set,
   3159			"child tasks do not inherit counters"),
   3160	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
   3161		    "synthesize non-sample events at the end of output"),
   3162	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
   3163	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
   3164	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
   3165		    "Fail if the specified frequency can't be used"),
   3166	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
   3167		     "profile at this frequency",
   3168		      record__parse_freq),
   3169	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
   3170		     "number of mmap data pages and AUX area tracing mmap pages",
   3171		     record__parse_mmap_pages),
   3172	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
   3173		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
   3174		     record__mmap_flush_parse),
   3175	OPT_BOOLEAN(0, "group", &record.opts.group,
   3176		    "put the counters into a counter group"),
   3177	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
   3178			   NULL, "enables call-graph recording" ,
   3179			   &record_callchain_opt),
   3180	OPT_CALLBACK(0, "call-graph", &record.opts,
   3181		     "record_mode[,record_size]", record_callchain_help,
   3182		     &record_parse_callchain_opt),
   3183	OPT_INCR('v', "verbose", &verbose,
   3184		    "be more verbose (show counter open errors, etc)"),
   3185	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
   3186	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
   3187		    "per thread counts"),
   3188	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
   3189	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
   3190		    "Record the sample physical addresses"),
   3191	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
   3192		    "Record the sampled data address data page size"),
   3193	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
   3194		    "Record the sampled code address (ip) page size"),
   3195	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
   3196	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
   3197			&record.opts.sample_time_set,
   3198			"Record the sample timestamps"),
   3199	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
   3200			"Record the sample period"),
   3201	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
   3202		    "don't sample"),
   3203	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
   3204			&record.no_buildid_cache_set,
   3205			"do not update the buildid cache"),
   3206	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
   3207			&record.no_buildid_set,
   3208			"do not collect buildids in perf.data"),
   3209	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
   3210		     "monitor event in cgroup name only",
   3211		     parse_cgroups),
   3212	OPT_INTEGER('D', "delay", &record.opts.initial_delay,
   3213		  "ms to wait before starting measurement after program start (-1: start with events disabled)"),
   3214	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
   3215	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
   3216		   "user to profile"),
   3217
   3218	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
   3219		     "branch any", "sample any taken branches",
   3220		     parse_branch_stack),
   3221
   3222	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
   3223		     "branch filter mask", "branch stack filter modes",
   3224		     parse_branch_stack),
   3225	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
   3226		    "sample by weight (on special events only)"),
   3227	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
   3228		    "sample transaction flags (special events only)"),
   3229	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
   3230		    "use per-thread mmaps"),
   3231	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
   3232		    "sample selected machine registers on interrupt,"
   3233		    " use '-I?' to list register names", parse_intr_regs),
   3234	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
   3235		    "sample selected machine registers on interrupt,"
   3236		    " use '--user-regs=?' to list register names", parse_user_regs),
   3237	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
   3238		    "Record running/enabled time of read (:S) events"),
   3239	OPT_CALLBACK('k', "clockid", &record.opts,
   3240	"clockid", "clockid to use for events, see clock_gettime()",
   3241	parse_clockid),
   3242	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
   3243			  "opts", "AUX area tracing Snapshot Mode", ""),
   3244	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
   3245			  "opts", "sample AUX area", ""),
   3246	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
   3247			"per thread proc mmap processing timeout in ms"),
   3248	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
   3249		    "Record namespaces events"),
   3250	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
   3251		    "Record cgroup events"),
   3252	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
   3253			&record.opts.record_switch_events_set,
   3254			"Record context switch events"),
   3255	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
   3256			 "Configure all used events to run in kernel space.",
   3257			 PARSE_OPT_EXCLUSIVE),
   3258	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
   3259			 "Configure all used events to run in user space.",
   3260			 PARSE_OPT_EXCLUSIVE),
   3261	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
   3262		    "collect kernel callchains"),
   3263	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
   3264		    "collect user callchains"),
   3265	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
   3266		   "clang binary to use for compiling BPF scriptlets"),
   3267	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
   3268		   "options passed to clang when compiling BPF scriptlets"),
   3269	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
   3270		   "file", "vmlinux pathname"),
   3271	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
   3272		    "Record build-id of all DSOs regardless of hits"),
   3273	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
   3274		    "Record build-id in map events"),
   3275	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
   3276		    "append timestamp to output filename"),
   3277	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
   3278		    "Record timestamp boundary (time of first/last samples)"),
   3279	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
   3280			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
   3281			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
   3282			  "signal"),
   3283	OPT_CALLBACK_SET(0, "switch-output-event", &record.sb_evlist, &record.switch_output_event_set, "switch output event",
   3284			 "switch output event selector. use 'perf list' to list available events",
   3285			 parse_events_option_new_evlist),
   3286	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
   3287		   "Limit number of switch output generated files"),
   3288	OPT_BOOLEAN(0, "dry-run", &dry_run,
   3289		    "Parse options then exit"),
   3290#ifdef HAVE_AIO_SUPPORT
   3291	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
   3292		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
   3293		     record__aio_parse),
   3294#endif
   3295	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
   3296		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
   3297		     record__parse_affinity),
   3298#ifdef HAVE_ZSTD_SUPPORT
   3299	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
   3300			    "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
   3301			    record__parse_comp_level),
   3302#endif
   3303	OPT_CALLBACK(0, "max-size", &record.output_max_size,
   3304		     "size", "Limit the maximum size of the output file", parse_output_max_size),
   3305	OPT_UINTEGER(0, "num-thread-synthesize",
   3306		     &record.opts.nr_threads_synthesize,
   3307		     "number of threads to run for event synthesis"),
   3308#ifdef HAVE_LIBPFM
   3309	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
   3310		"libpfm4 event selector. use 'perf list' to list available events",
   3311		parse_libpfm_events_option),
   3312#endif
   3313	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
   3314		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
   3315		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
   3316		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
   3317		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
   3318		      parse_control_option),
   3319	OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
   3320		     "Fine-tune event synthesis: default=all", parse_record_synth_option),
   3321	OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
   3322			  &record.debuginfod.set, "debuginfod urls",
   3323			  "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
   3324			  "system"),
   3325	OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
   3326			    "write collected trace data into several data files using parallel threads",
   3327			    record__parse_threads),
   3328	OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
   3329	OPT_END()
   3330};
   3331
   3332struct option *record_options = __record_options;
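
        /*
         * A few illustrative invocations exercising the options above (not
         * exhaustive; -z and --aio are only available when built with zstd/AIO
         * support, and the workloads shown are placeholders):
         *
         *   perf record --switch-output=1G -- ./workload          rotate output at ~1GB
         *   perf record -z 3 -a -- sleep 10                       compress records at level 3
         *   perf record --threads=numa -a -- sleep 10             one writer thread per NUMA node
         *   perf record --control=fifo:ctl.fifo,ack.fifo -a -- sleep 30   runtime enable/disable control
         */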
   3333
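        /*
         * Set a bit in @mask for each CPU in @cpus. A dummy CPU map (as used
         * for per-thread targets) carries no real CPUs and is skipped.
         */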
   3334static void record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
   3335{
   3336	struct perf_cpu cpu;
   3337	int idx;
   3338
   3339	if (cpu_map__is_dummy(cpus))
   3340		return;
   3341
   3342	perf_cpu_map__for_each_cpu(cpu, idx, cpus)
   3343		set_bit(cpu.cpu, mask->bits);
   3344}
   3345
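        /*
         * Build a CPU bitmap from a CPU list string such as "0-3" or "0,2-4",
         * i.e. whatever perf_cpu_map__new() accepts.
         */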
   3346static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
   3347{
   3348	struct perf_cpu_map *cpus;
   3349
   3350	cpus = perf_cpu_map__new(mask_spec);
   3351	if (!cpus)
   3352		return -ENOMEM;
   3353
   3354	bitmap_zero(mask->bits, mask->nbits);
   3355	record__mmap_cpu_mask_init(mask, cpus);
   3356	perf_cpu_map__put(cpus);
   3357
   3358	return 0;
   3359}
   3360
   3361static void record__free_thread_masks(struct record *rec, int nr_threads)
   3362{
   3363	int t;
   3364
   3365	if (rec->thread_masks)
   3366		for (t = 0; t < nr_threads; t++)
   3367			record__thread_mask_free(&rec->thread_masks[t]);
   3368
   3369	zfree(&rec->thread_masks);
   3370}
   3371
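        /*
         * Allocate @nr_threads thread_mask entries, each with a maps and an
         * affinity bitmap @nr_bits wide; on partial failure everything
         * allocated so far is freed again.
         */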
   3372static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
   3373{
   3374	int t, ret;
   3375
   3376	rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
   3377	if (!rec->thread_masks) {
   3378		pr_err("Failed to allocate thread masks\n");
   3379		return -ENOMEM;
   3380	}
   3381
   3382	for (t = 0; t < nr_threads; t++) {
   3383		ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
   3384		if (ret) {
   3385			pr_err("Failed to allocate thread masks[%d]\n", t);
   3386			goto out_free;
   3387		}
   3388	}
   3389
   3390	return 0;
   3391
   3392out_free:
   3393	record__free_thread_masks(rec, nr_threads);
   3394
   3395	return ret;
   3396}
   3397
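        /*
         * --threads=cpu: one data streaming thread per mapped CPU, with both
         * the maps and the affinity mask reduced to that single CPU.
         */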
   3398static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
   3399{
   3400	int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
   3401
   3402	ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
   3403	if (ret)
   3404		return ret;
   3405
   3406	rec->nr_threads = nr_cpus;
   3407	pr_debug("nr_threads: %d\n", rec->nr_threads);
   3408
   3409	for (t = 0; t < rec->nr_threads; t++) {
   3410		set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
   3411		set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
   3412		if (verbose) {
   3413			pr_debug("thread_masks[%d]: ", t);
   3414			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
   3415			pr_debug("thread_masks[%d]: ", t);
   3416			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
   3417		}
   3418	}
   3419
   3420	return 0;
   3421}
   3422
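        /*
         * Create one data streaming thread per (maps, affinity) spec pair.
         * Each parsed mask is intersected with the set of mapped CPUs; empty
         * results and overlaps with previously accepted masks are rejected.
         */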
   3423static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
   3424					  const char **maps_spec, const char **affinity_spec,
   3425					  u32 nr_spec)
   3426{
   3427	u32 s;
   3428	int ret = 0, t = 0;
   3429	struct mmap_cpu_mask cpus_mask;
   3430	struct thread_mask thread_mask, full_mask, *thread_masks;
   3431
   3432	ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
   3433	if (ret) {
   3434		pr_err("Failed to allocate CPUs mask\n");
   3435		return ret;
   3436	}
   3437	record__mmap_cpu_mask_init(&cpus_mask, cpus);
   3438
   3439	ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
   3440	if (ret) {
   3441		pr_err("Failed to allocate full mask\n");
   3442		goto out_free_cpu_mask;
   3443	}
   3444
   3445	ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
   3446	if (ret) {
   3447		pr_err("Failed to allocate thread mask\n");
   3448		goto out_free_full_and_cpu_masks;
   3449	}
   3450
   3451	for (s = 0; s < nr_spec; s++) {
   3452		ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
   3453		if (ret) {
   3454			pr_err("Failed to initialize maps thread mask\n");
   3455			goto out_free;
   3456		}
   3457		ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
   3458		if (ret) {
   3459			pr_err("Failed to initialize affinity thread mask\n");
   3460			goto out_free;
   3461		}
   3462
   3463		/* ignore invalid CPUs but do not allow empty masks */
   3464		if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
   3465				cpus_mask.bits, thread_mask.maps.nbits)) {
   3466			pr_err("Empty maps mask: %s\n", maps_spec[s]);
   3467			ret = -EINVAL;
   3468			goto out_free;
   3469		}
   3470		if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
   3471				cpus_mask.bits, thread_mask.affinity.nbits)) {
   3472			pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
   3473			ret = -EINVAL;
   3474			goto out_free;
   3475		}
   3476
   3477		/* do not allow intersection with other masks (full_mask) */
   3478		if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
   3479				      thread_mask.maps.nbits)) {
   3480			pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
   3481			ret = -EINVAL;
   3482			goto out_free;
   3483		}
   3484		if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
   3485				      thread_mask.affinity.nbits)) {
   3486			pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
   3487			ret = -EINVAL;
   3488			goto out_free;
   3489		}
   3490
   3491		bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
   3492			  thread_mask.maps.bits, full_mask.maps.nbits);
   3493		bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
    3494			  thread_mask.affinity.bits, full_mask.affinity.nbits);
   3495
   3496		thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
   3497		if (!thread_masks) {
   3498			pr_err("Failed to reallocate thread masks\n");
   3499			ret = -ENOMEM;
   3500			goto out_free;
   3501		}
   3502		rec->thread_masks = thread_masks;
   3503		rec->thread_masks[t] = thread_mask;
   3504		if (verbose) {
   3505			pr_debug("thread_masks[%d]: ", t);
   3506			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
   3507			pr_debug("thread_masks[%d]: ", t);
   3508			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
   3509		}
   3510		t++;
   3511		ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
   3512		if (ret) {
   3513			pr_err("Failed to allocate thread mask\n");
   3514			goto out_free_full_and_cpu_masks;
   3515		}
   3516	}
   3517	rec->nr_threads = t;
   3518	pr_debug("nr_threads: %d\n", rec->nr_threads);
   3519	if (!rec->nr_threads)
   3520		ret = -EINVAL;
   3521
   3522out_free:
   3523	record__thread_mask_free(&thread_mask);
   3524out_free_full_and_cpu_masks:
   3525	record__thread_mask_free(&full_mask);
   3526out_free_cpu_mask:
   3527	record__mmap_cpu_mask_free(&cpus_mask);
   3528
   3529	return ret;
   3530}
   3531
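        /* --threads=core: one data streaming thread per core, taken from the CPU topology. */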
   3532static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
   3533{
   3534	int ret;
   3535	struct cpu_topology *topo;
   3536
   3537	topo = cpu_topology__new();
   3538	if (!topo) {
   3539		pr_err("Failed to allocate CPU topology\n");
   3540		return -ENOMEM;
   3541	}
   3542
   3543	ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
   3544					     topo->core_cpus_list, topo->core_cpus_lists);
   3545	cpu_topology__delete(topo);
   3546
   3547	return ret;
   3548}
   3549
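        /* --threads=package: one data streaming thread per processor package (socket). */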
   3550static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
   3551{
   3552	int ret;
   3553	struct cpu_topology *topo;
   3554
   3555	topo = cpu_topology__new();
   3556	if (!topo) {
   3557		pr_err("Failed to allocate CPU topology\n");
   3558		return -ENOMEM;
   3559	}
   3560
   3561	ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
   3562					     topo->package_cpus_list, topo->package_cpus_lists);
   3563	cpu_topology__delete(topo);
   3564
   3565	return ret;
   3566}
   3567
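        /* --threads=numa: one data streaming thread per NUMA node, using the node's CPU list. */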
   3568static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
   3569{
   3570	u32 s;
   3571	int ret;
   3572	const char **spec;
   3573	struct numa_topology *topo;
   3574
   3575	topo = numa_topology__new();
   3576	if (!topo) {
   3577		pr_err("Failed to allocate NUMA topology\n");
   3578		return -ENOMEM;
   3579	}
   3580
   3581	spec = zalloc(topo->nr * sizeof(char *));
   3582	if (!spec) {
   3583		pr_err("Failed to allocate NUMA spec\n");
   3584		ret = -ENOMEM;
   3585		goto out_delete_topo;
   3586	}
   3587	for (s = 0; s < topo->nr; s++)
   3588		spec[s] = topo->nodes[s].cpus;
   3589
   3590	ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
   3591
   3592	zfree(&spec);
   3593
   3594out_delete_topo:
   3595	numa_topology__delete(topo);
   3596
   3597	return ret;
   3598}
   3599
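        /*
         * --threads=<spec>: user defined layout, colon separated entries of the
         * form <maps cpus>/<affinity cpus>. For illustration, something like
         * --threads=0-3/3:4-7/7 would create two threads, one reading the mmaps
         * of CPUs 0-3 with affinity to CPU 3 and one reading CPUs 4-7 with
         * affinity to CPU 7.
         */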
   3600static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
   3601{
   3602	int t, ret;
   3603	u32 s, nr_spec = 0;
   3604	char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
   3605	char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
   3606
   3607	for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
   3608		spec = strtok_r(user_spec, ":", &spec_ptr);
   3609		if (spec == NULL)
   3610			break;
   3611		pr_debug2("threads_spec[%d]: %s\n", t, spec);
   3612		mask = strtok_r(spec, "/", &mask_ptr);
   3613		if (mask == NULL)
   3614			break;
   3615		pr_debug2("  maps mask: %s\n", mask);
   3616		tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
   3617		if (!tmp_spec) {
   3618			pr_err("Failed to reallocate maps spec\n");
   3619			ret = -ENOMEM;
   3620			goto out_free;
   3621		}
   3622		maps_spec = tmp_spec;
   3623		maps_spec[nr_spec] = dup_mask = strdup(mask);
   3624		if (!maps_spec[nr_spec]) {
   3625			pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
   3626			ret = -ENOMEM;
   3627			goto out_free;
   3628		}
   3629		mask = strtok_r(NULL, "/", &mask_ptr);
   3630		if (mask == NULL) {
   3631			pr_err("Invalid thread maps or affinity specs\n");
   3632			ret = -EINVAL;
   3633			goto out_free;
   3634		}
   3635		pr_debug2("  affinity mask: %s\n", mask);
   3636		tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
   3637		if (!tmp_spec) {
   3638			pr_err("Failed to reallocate affinity spec\n");
   3639			ret = -ENOMEM;
   3640			goto out_free;
   3641		}
   3642		affinity_spec = tmp_spec;
   3643		affinity_spec[nr_spec] = strdup(mask);
   3644		if (!affinity_spec[nr_spec]) {
   3645			pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
   3646			ret = -ENOMEM;
   3647			goto out_free;
   3648		}
   3649		dup_mask = NULL;
   3650		nr_spec++;
   3651	}
   3652
   3653	ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
   3654					     (const char **)affinity_spec, nr_spec);
   3655
   3656out_free:
   3657	free(dup_mask);
   3658	for (s = 0; s < nr_spec; s++) {
   3659		if (maps_spec)
   3660			free(maps_spec[s]);
   3661		if (affinity_spec)
   3662			free(affinity_spec[s]);
   3663	}
   3664	free(affinity_spec);
   3665	free(maps_spec);
   3666
   3667	return ret;
   3668}
   3669
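        /* Default layout (no --threads): a single streaming thread covering all mapped CPUs. */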
   3670static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
   3671{
   3672	int ret;
   3673
   3674	ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
   3675	if (ret)
   3676		return ret;
   3677
   3678	record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus);
   3679
   3680	rec->nr_threads = 1;
   3681
   3682	return 0;
   3683}
   3684
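        /*
         * Choose the thread layout requested via --threads. Parallel streaming
         * splits work by CPU, so it is rejected for --per-thread evlists.
         */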
   3685static int record__init_thread_masks(struct record *rec)
   3686{
   3687	int ret = 0;
   3688	struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
   3689
   3690	if (!record__threads_enabled(rec))
   3691		return record__init_thread_default_masks(rec, cpus);
   3692
   3693	if (evlist__per_thread(rec->evlist)) {
    3694		pr_err("--per-thread option is mutually exclusive with parallel streaming mode.\n");
   3695		return -EINVAL;
   3696	}
   3697
   3698	switch (rec->opts.threads_spec) {
   3699	case THREAD_SPEC__CPU:
   3700		ret = record__init_thread_cpu_masks(rec, cpus);
   3701		break;
   3702	case THREAD_SPEC__CORE:
   3703		ret = record__init_thread_core_masks(rec, cpus);
   3704		break;
   3705	case THREAD_SPEC__PACKAGE:
   3706		ret = record__init_thread_package_masks(rec, cpus);
   3707		break;
   3708	case THREAD_SPEC__NUMA:
   3709		ret = record__init_thread_numa_masks(rec, cpus);
   3710		break;
   3711	case THREAD_SPEC__USER:
   3712		ret = record__init_thread_user_masks(rec, cpus);
   3713		break;
   3714	default:
   3715		break;
   3716	}
   3717
   3718	return ret;
   3719}
   3720
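        /*
         * Entry point for 'perf record': parse and validate options, set up the
         * evlist, build ids, auxtrace and streaming thread masks, then hand off
         * to __cmd_record().
         */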
   3721int cmd_record(int argc, const char **argv)
   3722{
   3723	int err;
   3724	struct record *rec = &record;
   3725	char errbuf[BUFSIZ];
   3726
   3727	setlocale(LC_ALL, "");
   3728
   3729#ifndef HAVE_LIBBPF_SUPPORT
   3730# define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
   3731	set_nobuild('\0', "clang-path", true);
   3732	set_nobuild('\0', "clang-opt", true);
   3733# undef set_nobuild
   3734#endif
   3735
   3736#ifndef HAVE_BPF_PROLOGUE
   3737# if !defined (HAVE_DWARF_SUPPORT)
   3738#  define REASON  "NO_DWARF=1"
   3739# elif !defined (HAVE_LIBBPF_SUPPORT)
   3740#  define REASON  "NO_LIBBPF=1"
   3741# else
   3742#  define REASON  "this architecture doesn't support BPF prologue"
   3743# endif
   3744# define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
   3745	set_nobuild('\0', "vmlinux", true);
   3746# undef set_nobuild
   3747# undef REASON
   3748#endif
   3749
   3750#ifndef HAVE_BPF_SKEL
   3751# define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
   3752	set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
   3753# undef set_nobuild
   3754#endif
   3755
   3756	rec->opts.affinity = PERF_AFFINITY_SYS;
   3757
   3758	rec->evlist = evlist__new();
   3759	if (rec->evlist == NULL)
   3760		return -ENOMEM;
   3761
   3762	err = perf_config(perf_record_config, rec);
   3763	if (err)
   3764		return err;
   3765
   3766	argc = parse_options(argc, argv, record_options, record_usage,
   3767			    PARSE_OPT_STOP_AT_NON_OPTION);
   3768	if (quiet)
   3769		perf_quiet_option();
   3770
   3771	err = symbol__validate_sym_arguments();
   3772	if (err)
   3773		return err;
   3774
   3775	perf_debuginfod_setup(&record.debuginfod);
   3776
   3777	/* Make system wide (-a) the default target. */
   3778	if (!argc && target__none(&rec->opts.target))
   3779		rec->opts.target.system_wide = true;
   3780
   3781	if (nr_cgroups && !rec->opts.target.system_wide) {
   3782		usage_with_options_msg(record_usage, record_options,
   3783			"cgroup monitoring only available in system-wide mode");
   3784
   3785	}
   3786
   3787	if (rec->buildid_mmap) {
   3788		if (!perf_can_record_build_id()) {
    3789			pr_err("Failed: no kernel support for recording build id in mmap events, update your kernel.\n");
   3790			err = -EINVAL;
   3791			goto out_opts;
   3792		}
   3793		pr_debug("Enabling build id in mmap2 events.\n");
   3794		/* Enable mmap build id synthesizing. */
   3795		symbol_conf.buildid_mmap2 = true;
   3796		/* Enable perf_event_attr::build_id bit. */
   3797		rec->opts.build_id = true;
   3798		/* Disable build id cache. */
   3799		rec->no_buildid = true;
   3800	}
   3801
   3802	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
   3803		pr_err("Kernel has no cgroup sampling support.\n");
   3804		err = -EINVAL;
   3805		goto out_opts;
   3806	}
   3807
   3808	if (rec->opts.kcore || record__threads_enabled(rec))
   3809		rec->data.is_dir = true;
   3810
   3811	if (record__threads_enabled(rec)) {
    3812		if (rec->opts.affinity != PERF_AFFINITY_SYS) {
    3813			pr_err("--affinity option is mutually exclusive with parallel streaming mode.\n");
        			err = -EINVAL;
    3814			goto out_opts;
    3815		}
    3816		if (record__aio_enabled(rec)) {
    3817			pr_err("Asynchronous streaming mode (--aio) is mutually exclusive with parallel streaming mode.\n");
        			err = -EINVAL;
    3818			goto out_opts;
    3819		}
   3820	}
   3821
   3822	if (rec->opts.comp_level != 0) {
   3823		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
   3824		rec->no_buildid = true;
   3825	}
   3826
   3827	if (rec->opts.record_switch_events &&
   3828	    !perf_can_record_switch_events()) {
   3829		ui__error("kernel does not support recording context switch events\n");
   3830		parse_options_usage(record_usage, record_options, "switch-events", 0);
   3831		err = -EINVAL;
   3832		goto out_opts;
   3833	}
   3834
   3835	if (switch_output_setup(rec)) {
   3836		parse_options_usage(record_usage, record_options, "switch-output", 0);
   3837		err = -EINVAL;
   3838		goto out_opts;
   3839	}
   3840
   3841	if (rec->switch_output.time) {
   3842		signal(SIGALRM, alarm_sig_handler);
   3843		alarm(rec->switch_output.time);
   3844	}
   3845
   3846	if (rec->switch_output.num_files) {
    3847		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
    3848						      sizeof(char *));
    3849		if (!rec->switch_output.filenames) {
    3850			err = -ENOMEM;
   3851			goto out_opts;
   3852		}
   3853	}
   3854
   3855	if (rec->timestamp_filename && record__threads_enabled(rec)) {
   3856		rec->timestamp_filename = false;
   3857		pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
   3858	}
   3859
   3860	/*
   3861	 * Allow aliases to facilitate the lookup of symbols for address
   3862	 * filters. Refer to auxtrace_parse_filters().
   3863	 */
   3864	symbol_conf.allow_aliases = true;
   3865
   3866	symbol__init(NULL);
   3867
   3868	err = record__auxtrace_init(rec);
   3869	if (err)
   3870		goto out;
   3871
   3872	if (dry_run)
   3873		goto out;
   3874
   3875	err = bpf__setup_stdout(rec->evlist);
   3876	if (err) {
   3877		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
   3878		pr_err("ERROR: Setup BPF stdout failed: %s\n",
   3879			 errbuf);
   3880		goto out;
   3881	}
   3882
   3883	err = -ENOMEM;
   3884
   3885	if (rec->no_buildid_cache || rec->no_buildid) {
   3886		disable_buildid_cache();
   3887	} else if (rec->switch_output.enabled) {
   3888		/*
   3889		 * In 'perf record --switch-output', disable buildid
   3890		 * generation by default to reduce data file switching
    3891		 * overhead. Still generate buildids if they are explicitly
    3892		 * required using
    3893		 *
    3894		 *  perf record --switch-output --no-no-buildid \
    3895		 *              --no-no-buildid-cache
    3896		 *
    3897		 * The following code is equivalent to:
   3898		 *
   3899		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
   3900		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
   3901		 *         disable_buildid_cache();
   3902		 */
   3903		bool disable = true;
   3904
   3905		if (rec->no_buildid_set && !rec->no_buildid)
   3906			disable = false;
   3907		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
   3908			disable = false;
   3909		if (disable) {
   3910			rec->no_buildid = true;
   3911			rec->no_buildid_cache = true;
   3912			disable_buildid_cache();
   3913		}
   3914	}
   3915
   3916	if (record.opts.overwrite)
   3917		record.opts.tail_synthesize = true;
   3918
   3919	if (rec->evlist->core.nr_entries == 0) {
   3920		if (perf_pmu__has_hybrid()) {
   3921			err = evlist__add_default_hybrid(rec->evlist,
   3922							 !record.opts.no_samples);
   3923		} else {
   3924			err = __evlist__add_default(rec->evlist,
   3925						    !record.opts.no_samples);
   3926		}
   3927
   3928		if (err < 0) {
   3929			pr_err("Not enough memory for event selector list\n");
   3930			goto out;
   3931		}
   3932	}
   3933
   3934	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
   3935		rec->opts.no_inherit = true;
   3936
   3937	err = target__validate(&rec->opts.target);
   3938	if (err) {
   3939		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
   3940		ui__warning("%s\n", errbuf);
   3941	}
   3942
   3943	err = target__parse_uid(&rec->opts.target);
   3944	if (err) {
   3945		int saved_errno = errno;
   3946
   3947		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
   3948		ui__error("%s", errbuf);
   3949
   3950		err = -saved_errno;
   3951		goto out;
   3952	}
   3953
   3954	/* Enable ignoring missing threads when -u/-p option is defined. */
   3955	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
   3956
    3957	if (evlist__fix_hybrid_cpus(rec->evlist, rec->opts.target.cpu_list)) {
    3958		pr_err("failed to use cpu list %s\n",
    3959		       rec->opts.target.cpu_list);
        		err = -EINVAL;
    3960		goto out;
   3961	}
   3962
   3963	rec->opts.target.hybrid = perf_pmu__has_hybrid();
   3964
   3965	if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
   3966		arch__add_leaf_frame_record_opts(&rec->opts);
   3967
   3968	err = -ENOMEM;
   3969	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
   3970		usage_with_options(record_usage, record_options);
   3971
   3972	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
   3973	if (err)
   3974		goto out;
   3975
   3976	/*
    3977	 * We take all buildids when the file contains
    3978	 * AUX area tracing data, because we do not decode the
    3979	 * trace; decoding it would take too long.
   3980	 */
   3981	if (rec->opts.full_auxtrace)
   3982		rec->buildid_all = true;
   3983
   3984	if (rec->opts.text_poke) {
   3985		err = record__config_text_poke(rec->evlist);
   3986		if (err) {
   3987			pr_err("record__config_text_poke failed, error %d\n", err);
   3988			goto out;
   3989		}
   3990	}
   3991
   3992	if (rec->off_cpu) {
   3993		err = record__config_off_cpu(rec);
   3994		if (err) {
   3995			pr_err("record__config_off_cpu failed, error %d\n", err);
   3996			goto out;
   3997		}
   3998	}
   3999
   4000	if (record_opts__config(&rec->opts)) {
   4001		err = -EINVAL;
   4002		goto out;
   4003	}
   4004
   4005	err = record__init_thread_masks(rec);
   4006	if (err) {
   4007		pr_err("Failed to initialize parallel data streaming masks\n");
   4008		goto out;
   4009	}
   4010
   4011	if (rec->opts.nr_cblocks > nr_cblocks_max)
   4012		rec->opts.nr_cblocks = nr_cblocks_max;
   4013	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
   4014
   4015	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
   4016	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
   4017
   4018	if (rec->opts.comp_level > comp_level_max)
   4019		rec->opts.comp_level = comp_level_max;
   4020	pr_debug("comp level: %d\n", rec->opts.comp_level);
   4021
   4022	err = __cmd_record(&record, argc, argv);
   4023out:
   4024	evlist__delete(rec->evlist);
   4025	symbol__exit();
   4026	auxtrace_record__free(rec->itr);
   4027out_opts:
   4028	record__free_thread_masks(rec, rec->nr_threads);
   4029	rec->nr_threads = 0;
   4030	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
   4031	return err;
   4032}
   4033
   4034static void snapshot_sig_handler(int sig __maybe_unused)
   4035{
   4036	struct record *rec = &record;
   4037
   4038	hit_auxtrace_snapshot_trigger(rec);
   4039
   4040	if (switch_output_signal(rec))
   4041		trigger_hit(&switch_output_trigger);
   4042}
   4043
   4044static void alarm_sig_handler(int sig __maybe_unused)
   4045{
   4046	struct record *rec = &record;
   4047
   4048	if (switch_output_time(rec))
   4049		trigger_hit(&switch_output_trigger);
   4050}