cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

builtin-trace.c (151351B)


/*
 * builtin-trace.c
 *
 * Builtin 'trace' command:
 *
 * Display a continuously updated trace of any workload, CPU, specific PID,
 * system wide, etc.  Default format is loosely strace like, but any other
 * event may be specified using --event.
 *
 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
 *
 * Initially based on the 'trace' prototype by Thomas Gleixner:
 *
 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
 */

#include "util/record.h"
#include <traceevent/event-parse.h>
#include <api/fs/tracing_path.h>
#include <bpf/bpf.h>
#include "util/bpf_map.h"
#include "util/rlimit.h"
#include "builtin.h"
#include "util/cgroup.h"
#include "util/color.h"
#include "util/config.h"
#include "util/debug.h"
#include "util/dso.h"
#include "util/env.h"
#include "util/event.h"
#include "util/evsel.h"
#include "util/evsel_fprintf.h"
#include "util/synthetic-events.h"
#include "util/evlist.h"
#include "util/evswitch.h"
#include "util/mmap.h"
#include <subcmd/pager.h>
#include <subcmd/exec-cmd.h>
#include "util/machine.h"
#include "util/map.h"
#include "util/symbol.h"
#include "util/path.h"
#include "util/session.h"
#include "util/thread.h"
#include <subcmd/parse-options.h>
#include "util/strlist.h"
#include "util/intlist.h"
#include "util/thread_map.h"
#include "util/stat.h"
#include "util/tool.h"
#include "util/util.h"
#include "trace/beauty/beauty.h"
#include "trace-event.h"
#include "util/parse-events.h"
#include "util/bpf-loader.h"
#include "callchain.h"
#include "print_binary.h"
#include "string2.h"
#include "syscalltbl.h"
#include "rb_resort.h"
#include "../perf.h"

#include <errno.h>
#include <inttypes.h>
#include <poll.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <linux/err.h>
#include <linux/filter.h>
#include <linux/kernel.h>
#include <linux/random.h>
#include <linux/stringify.h>
#include <linux/time64.h>
#include <linux/zalloc.h>
#include <fcntl.h>
#include <sys/sysmacros.h>

#include <linux/ctype.h>
#include <perf/mmap.h>

#ifndef O_CLOEXEC
# define O_CLOEXEC		02000000
#endif

#ifndef F_LINUX_SPECIFIC_BASE
# define F_LINUX_SPECIFIC_BASE	1024
#endif

/*
 * strtoul: Go from a string to a value, e.g. for msr: MSR_FS_BASE to 0xc0000100
 */
struct syscall_arg_fmt {
	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	bool	   (*strtoul)(char *bf, size_t size, struct syscall_arg *arg, u64 *val);
	unsigned long (*mask_val)(struct syscall_arg *arg, unsigned long val);
	void	   *parm;
	const char *name;
	u16	   nr_entries; // for arrays
	bool	   show_zero;
};

struct syscall_fmt {
	const char *name;
	const char *alias;
	struct {
		const char *sys_enter,
			   *sys_exit;
	}	   bpf_prog_name;
	struct syscall_arg_fmt arg[6];
	u8	   nr_args;
	bool	   errpid;
	bool	   timeout;
	bool	   hexret;
};

struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;
	struct {
		struct syscall  *table;
		struct bpf_map  *map;
		struct { // per syscall BPF_MAP_TYPE_PROG_ARRAY
			struct bpf_map  *sys_enter,
					*sys_exit;
		}		prog_array;
		struct {
			struct evsel *sys_enter,
					  *sys_exit,
					  *augmented;
		}		events;
		struct bpf_program *unaugmented_prog;
	} syscalls;
	struct {
		struct bpf_map *map;
	} dump;
	struct record_opts	opts;
	struct evlist	*evlist;
	struct machine		*host;
	struct thread		*current;
	struct bpf_object	*bpf_obj;
	struct cgroup		*cgroup;
	u64			base_time;
	FILE			*output;
	unsigned long		nr_events;
	unsigned long		nr_events_printed;
	unsigned long		max_events;
	struct evswitch		evswitch;
	struct strlist		*ev_qualifier;
	struct {
		size_t		nr;
		int		*entries;
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;
		struct bpf_map  *map;
	}			filter_pids;
	double			duration_filter;
	double			runtime_ms;
	struct {
		u64		vfs_getname,
				proc_getname;
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	int			raw_augmented_syscalls_args_size;
	bool			raw_augmented_syscalls;
	bool			fd_path_disabled;
	bool			sort_events;
	bool			not_ev_qualifier;
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			errno_summary;
	bool			failure_only;
	bool			show_comm;
	bool			print_sample;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			libtraceevent_print;
	bool			kernel_syscallchains;
	s16			args_alignment;
	bool			show_tstamp;
	bool			show_duration;
	bool			show_zeros;
	bool			show_arg_names;
	bool			show_string_prefix;
	bool			force;
	bool			vfs_getname;
	int			trace_pgfaults;
	char			*perfconfig_events;
	struct {
		struct ordered_events	data;
		u64			last;
	} oe;
};

struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};

#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);

#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
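
/*
 * The two macros above expand to readers such as tp_field__u32() and
 * tp_field__swapped_u32(): each memcpy()s a fixed-size integer out of
 * sample->raw_data at the field's offset, the __SWAPPED variants also
 * byte-swapping when the perf.data was recorded on a host of the
 * opposite endianness.
 */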

static int __tp_field__init_uint(struct tp_field *field, int size, int offset, bool needs_swap)
{
	field->offset = offset;

	switch (size) {
	case 1:
		field->integer = tp_field__u8;
		break;
	case 2:
		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
		break;
	case 4:
		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
		break;
	case 8:
		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
		break;
	default:
		return -1;
	}

	return 0;
}

static int tp_field__init_uint(struct tp_field *field, struct tep_format_field *format_field, bool needs_swap)
{
	return __tp_field__init_uint(field, format_field->size, format_field->offset, needs_swap);
}

static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
	return sample->raw_data + field->offset;
}

static int __tp_field__init_ptr(struct tp_field *field, int offset)
{
	field->offset = offset;
	field->pointer = tp_field__ptr;
	return 0;
}

static int tp_field__init_ptr(struct tp_field *field, struct tep_format_field *format_field)
{
	return __tp_field__init_ptr(field, format_field->offset);
}

struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
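
/*
 * 'args' and 'ret' can share storage: raw_syscalls:sys_enter carries the
 * argument array while raw_syscalls:sys_exit carries the return value, so
 * a given evsel only ever uses one member of the union.
 */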

/*
 * The evsel->priv as used by 'perf trace'
 * sc:	for raw_syscalls:sys_{enter,exit} and syscalls:sys_{enter,exit}_SYSCALLNAME
 * fmt: for all the other tracepoints
 */
struct evsel_trace {
	struct syscall_tp	sc;
	struct syscall_arg_fmt  *fmt;
};

static struct evsel_trace *evsel_trace__new(void)
{
	return zalloc(sizeof(struct evsel_trace));
}

static void evsel_trace__delete(struct evsel_trace *et)
{
	if (et == NULL)
		return;

	zfree(&et->fmt);
	free(et);
}

/*
 * Used with raw_syscalls:sys_{enter,exit} and with the
 * syscalls:sys_{enter,exit}_SYSCALL tracepoints
 */
static inline struct syscall_tp *__evsel__syscall_tp(struct evsel *evsel)
{
	struct evsel_trace *et = evsel->priv;

	return &et->sc;
}

static struct syscall_tp *evsel__syscall_tp(struct evsel *evsel)
{
	if (evsel->priv == NULL) {
		evsel->priv = evsel_trace__new();
		if (evsel->priv == NULL)
			return NULL;
	}

	return __evsel__syscall_tp(evsel);
}

/*
 * Used with all the other tracepoints.
 */
static inline struct syscall_arg_fmt *__evsel__syscall_arg_fmt(struct evsel *evsel)
{
	struct evsel_trace *et = evsel->priv;

	return et->fmt;
}

static struct syscall_arg_fmt *evsel__syscall_arg_fmt(struct evsel *evsel)
{
	struct evsel_trace *et = evsel->priv;

	if (evsel->priv == NULL) {
		et = evsel->priv = evsel_trace__new();

		if (et == NULL)
			return NULL;
	}

	if (et->fmt == NULL) {
		et->fmt = calloc(evsel->tp_format->format.nr_fields, sizeof(struct syscall_arg_fmt));
		if (et->fmt == NULL)
			goto out_delete;
	}

	return __evsel__syscall_arg_fmt(evsel);

out_delete:
	evsel_trace__delete(evsel->priv);
	evsel->priv = NULL;
	return NULL;
}

static int evsel__init_tp_uint_field(struct evsel *evsel, struct tp_field *field, const char *name)
{
	struct tep_format_field *format_field = evsel__field(evsel, name);

	if (format_field == NULL)
		return -1;

	return tp_field__init_uint(field, format_field, evsel->needs_swap);
}

#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = __evsel__syscall_tp(evsel);\
	   evsel__init_tp_uint_field(evsel, &sc->name, #name); })

static int evsel__init_tp_ptr_field(struct evsel *evsel, struct tp_field *field, const char *name)
{
	struct tep_format_field *format_field = evsel__field(evsel, name);

	if (format_field == NULL)
		return -1;

	return tp_field__init_ptr(field, format_field);
}

#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = __evsel__syscall_tp(evsel);\
	   evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
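
/*
 * These init macros stringify the member name, so e.g.
 * perf_evsel__init_sc_tp_uint_field(evsel, id) looks up the tracepoint
 * field called "id" and wires sc->id to the matching fixed-size reader.
 */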

static void evsel__delete_priv(struct evsel *evsel)
{
	zfree(&evsel->priv);
	evsel__delete(evsel);
}

static int evsel__init_syscall_tp(struct evsel *evsel)
{
	struct syscall_tp *sc = evsel__syscall_tp(evsel);

	if (sc != NULL) {
		if (evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr") &&
		    evsel__init_tp_uint_field(evsel, &sc->id, "nr"))
			return -ENOENT;
		return 0;
	}

	return -ENOMEM;
}

static int evsel__init_augmented_syscall_tp(struct evsel *evsel, struct evsel *tp)
{
	struct syscall_tp *sc = evsel__syscall_tp(evsel);

	if (sc != NULL) {
		struct tep_format_field *syscall_id = evsel__field(tp, "id");
		if (syscall_id == NULL)
			syscall_id = evsel__field(tp, "__syscall_nr");
		if (syscall_id == NULL ||
		    __tp_field__init_uint(&sc->id, syscall_id->size, syscall_id->offset, evsel->needs_swap))
			return -EINVAL;

		return 0;
	}

	return -ENOMEM;
}

static int evsel__init_augmented_syscall_tp_args(struct evsel *evsel)
{
	struct syscall_tp *sc = __evsel__syscall_tp(evsel);

	return __tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64));
}

static int evsel__init_augmented_syscall_tp_ret(struct evsel *evsel)
{
	struct syscall_tp *sc = __evsel__syscall_tp(evsel);

	return __tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap);
}

static int evsel__init_raw_syscall_tp(struct evsel *evsel, void *handler)
{
	if (evsel__syscall_tp(evsel) != NULL) {
		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
			return -ENOENT;

		evsel->handler = handler;
		return 0;
	}

	return -ENOMEM;
}

static struct evsel *perf_evsel__raw_syscall_newtp(const char *direction, void *handler)
{
	struct evsel *evsel = evsel__newtp("raw_syscalls", direction);

	/* older kernels (e.g., RHEL6) use syscalls:{enter,exit} */
	if (IS_ERR(evsel))
		evsel = evsel__newtp("syscalls", direction);

	if (IS_ERR(evsel))
		return NULL;

	if (evsel__init_raw_syscall_tp(evsel, handler))
		goto out_delete;

	return evsel;

out_delete:
	evsel__delete_priv(evsel);
	return NULL;
}

#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = __evsel__syscall_tp(evsel); \
	   fields->name.integer(&fields->name, sample); })

#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = __evsel__syscall_tp(evsel); \
	   fields->name.pointer(&fields->name, sample); })
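
/*
 * Accessor counterparts to the init macros above: fetch the 'id'/'ret'
 * integer or the 'args' pointer for a given sample via the readers
 * installed in the syscall_tp fields.
 */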

size_t strarray__scnprintf_suffix(struct strarray *sa, char *bf, size_t size, const char *intfmt, bool show_suffix, int val)
{
	int idx = val - sa->offset;

	if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL) {
		size_t printed = scnprintf(bf, size, intfmt, val);
		if (show_suffix)
			printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sa->prefix);
		return printed;
	}

	return scnprintf(bf, size, "%s%s", sa->entries[idx], show_suffix ? sa->prefix : "");
}
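
/*
 * A strarray maps small integers to names, e.g. lseek whence values to
 * "SET"/"CUR"/"END". The _suffix variant above prints the common string
 * after the entry name rather than before it; out-of-range values fall
 * back to the integer format plus a "PREFIX???" comment marker.
 */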

size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, bool show_prefix, int val)
{
	int idx = val - sa->offset;

	if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL) {
		size_t printed = scnprintf(bf, size, intfmt, val);
		if (show_prefix)
			printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sa->prefix);
		return printed;
	}

	return scnprintf(bf, size, "%s%s", show_prefix ? sa->prefix : "", sa->entries[idx]);
}

static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
						const char *intfmt,
						struct syscall_arg *arg)
{
	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->show_string_prefix, arg->val);
}

static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray

bool syscall_arg__strtoul_strarray(char *bf, size_t size, struct syscall_arg *arg, u64 *ret)
{
	return strarray__strtoul(arg->parm, bf, size, ret);
}

bool syscall_arg__strtoul_strarray_flags(char *bf, size_t size, struct syscall_arg *arg, u64 *ret)
{
	return strarray__strtoul_flags(arg->parm, bf, size, ret);
}

bool syscall_arg__strtoul_strarrays(char *bf, size_t size, struct syscall_arg *arg, u64 *ret)
{
	return strarrays__strtoul(arg->parm, bf, size, ret);
}

size_t syscall_arg__scnprintf_strarray_flags(char *bf, size_t size, struct syscall_arg *arg)
{
	return strarray__scnprintf_flags(arg->parm, bf, size, arg->show_string_prefix, arg->val);
}

size_t strarrays__scnprintf(struct strarrays *sas, char *bf, size_t size, const char *intfmt, bool show_prefix, int val)
{
	size_t printed;
	int i;

	for (i = 0; i < sas->nr_entries; ++i) {
		struct strarray *sa = sas->entries[i];
		int idx = val - sa->offset;

		if (idx >= 0 && idx < sa->nr_entries) {
			if (sa->entries[idx] == NULL)
				break;
			return scnprintf(bf, size, "%s%s", show_prefix ? sa->prefix : "", sa->entries[idx]);
		}
	}

	printed = scnprintf(bf, size, intfmt, val);
	if (show_prefix)
		printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sas->entries[0]->prefix);
	return printed;
}

bool strarray__strtoul(struct strarray *sa, char *bf, size_t size, u64 *ret)
{
	int i;

	for (i = 0; i < sa->nr_entries; ++i) {
		if (sa->entries[i] && strncmp(sa->entries[i], bf, size) == 0 && sa->entries[i][size] == '\0') {
			*ret = sa->offset + i;
			return true;
		}
	}

	return false;
}

bool strarray__strtoul_flags(struct strarray *sa, char *bf, size_t size, u64 *ret)
{
	u64 val = 0;
	char *tok = bf, *sep, *end;

	*ret = 0;

	while (size != 0) {
		int toklen = size;

		sep = memchr(tok, '|', size);
		if (sep != NULL) {
			size -= sep - tok + 1;

			end = sep - 1;
			while (end > tok && isspace(*end))
				--end;

			toklen = end - tok + 1;
		}

		while (isspace(*tok))
			++tok;

		if (isalpha(*tok) || *tok == '_') {
			if (!strarray__strtoul(sa, tok, toklen, &val))
				return false;
		} else {
			bool is_hexa = tok[0] == '0' && (tok[1] == 'x' || tok[1] == 'X');

			val = strtoul(tok, NULL, is_hexa ? 16 : 0);
		}

		*ret |= (1 << (val - 1));

		if (sep == NULL)
			break;
		tok = sep + 1;
	}

	return true;
}
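
/*
 * Parses a '|'-separated list of flag names back into a bitmask: a name
 * found at array slot 'val' sets bit (1 << (val - 1)), and bare numeric
 * tokens are accepted via strtoul() for flags with no name in the array.
 */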

bool strarrays__strtoul(struct strarrays *sas, char *bf, size_t size, u64 *ret)
{
	int i;

	for (i = 0; i < sas->nr_entries; ++i) {
		struct strarray *sa = sas->entries[i];

		if (strarray__strtoul(sa, bf, size, ret))
			return true;
	}

	return false;
}

size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
					struct syscall_arg *arg)
{
	return strarrays__scnprintf(arg->parm, bf, size, "%d", arg->show_string_prefix, arg->val);
}

#ifndef AT_FDCWD
#define AT_FDCWD	-100
#endif

static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
					   struct syscall_arg *arg)
{
	int fd = arg->val;
	const char *prefix = "AT_FD";

	if (fd == AT_FDCWD)
		return scnprintf(bf, size, "%s%s", arg->show_string_prefix ? prefix : "", "CWD");

	return syscall_arg__scnprintf_fd(bf, size, arg);
}

#define SCA_FDAT syscall_arg__scnprintf_fd_at

static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd

size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

size_t syscall_arg__scnprintf_ptr(char *bf, size_t size, struct syscall_arg *arg)
{
	if (arg->val == 0)
		return scnprintf(bf, size, "NULL");
	return syscall_arg__scnprintf_hex(bf, size, arg);
}

size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}

size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%ld", arg->val);
}

static size_t syscall_arg__scnprintf_char_array(char *bf, size_t size, struct syscall_arg *arg)
{
	// XXX Hey, maybe for sched:sched_switch prev/next comm fields we can
	//     fill missing comms using thread__set_comm()...
	//     here or in a special syscall_arg__scnprintf_pid_sched_tp...
	return scnprintf(bf, size, "\"%-.*s\"", arg->fmt->nr_entries ?: arg->len, arg->val);
}

#define SCA_CHAR_ARRAY syscall_arg__scnprintf_char_array

static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD", "OBJ_PIN", "OBJ_GET", "PROG_ATTACH",
	"PROG_DETACH", "PROG_TEST_RUN", "PROG_GET_NEXT_ID", "MAP_GET_NEXT_ID",
	"PROG_GET_FD_BY_ID", "MAP_GET_FD_BY_ID", "OBJ_GET_INFO_BY_FD",
	"PROG_QUERY", "RAW_TRACEPOINT_OPEN", "BTF_LOAD", "BTF_GET_FD_BY_ID",
	"TASK_FD_QUERY", "MAP_LOOKUP_AND_DELETE_ELEM", "MAP_FREEZE",
	"BTF_GET_NEXT_ID", "MAP_LOOKUP_BATCH", "MAP_LOOKUP_AND_DELETE_BATCH",
	"MAP_UPDATE_BATCH", "MAP_DELETE_BATCH", "LINK_CREATE", "LINK_UPDATE",
	"LINK_GET_FD_BY_ID", "LINK_GET_NEXT_ID", "ENABLE_STATS", "ITER_CREATE",
	"LINK_DETACH", "PROG_BIND_MAP",
};
static DEFINE_STRARRAY(bpf_cmd, "BPF_");

static const char *fsmount_flags[] = {
	[1] = "CLOEXEC",
};
static DEFINE_STRARRAY(fsmount_flags, "FSMOUNT_");

#include "trace/beauty/generated/fsconfig_arrays.c"

static DEFINE_STRARRAY(fsconfig_cmds, "FSCONFIG_");

static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, "EPOLL_CTL_", 1);
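/* The offset of 1 accounts for EPOLL_CTL_ADD being 1, so entry [0] ("ADD") maps to op 1. */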

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers, "ITIMER_");

static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options, "KEYCTL_");

static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences, "SEEK_");

static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
	"GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds, "F_");

static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", [5] =	"CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};

static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, "F_", F_LINUX_SPECIFIC_BASE);

static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);
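
/*
 * fcntl commands live in two disjoint numeric ranges (the classic F_*
 * values and the ones starting at F_LINUX_SPECIFIC_BASE), so both
 * strarrays are grouped into one strarrays and a single lookup covers
 * the two ranges.
 */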

static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources, "RLIMIT_");

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow, "SIG_");

static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid, "CLOCK_");

static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	bool show_prefix = arg->show_string_prefix;
	const char *suffix = "_OK";
	size_t printed = 0;
	int mode = arg->val;

	if (mode == F_OK) /* 0 */
		return scnprintf(bf, size, "F%s", show_prefix ? suffix : "");
#define	P_MODE(n) \
	if (mode & n##_OK) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", #n, show_prefix ? suffix : ""); \
		mode &= ~n##_OK; \
	}

	P_MODE(R);
	P_MODE(W);
	P_MODE(X);
#undef P_MODE

	if (mode)
		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);

	return printed;
}
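
/*
 * e.g. mode == R_OK|W_OK prints as "RW" (or "R_OKW_OK" with string
 * suffixes enabled); any residual unknown bits are appended as "|0x...".
 */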

#define SCA_ACCMODE syscall_arg__scnprintf_access_mode

static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename

static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	bool show_prefix = arg->show_string_prefix;
	const char *prefix = "O_";
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \
		flags &= ~O_##n; \
	}

	P_FLAG(CLOEXEC);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags

#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK	0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM	0x0002
#endif

static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
						   struct syscall_arg *arg)
{
	bool show_prefix = arg->show_string_prefix;
	const char *prefix = "GRND_";
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & GRND_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \
		flags &= ~GRND_##n; \
	}

	P_FLAG(RANDOM);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags

#define STRARRAY(name, array) \
	  { .scnprintf	= SCA_STRARRAY, \
	    .strtoul	= STUL_STRARRAY, \
	    .parm	= &strarray__##array, }

#define STRARRAY_FLAGS(name, array) \
	  { .scnprintf	= SCA_STRARRAY_FLAGS, \
	    .strtoul	= STUL_STRARRAY_FLAGS, \
	    .parm	= &strarray__##array, }
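
/*
 * STRARRAY/STRARRAY_FLAGS are syscall_arg_fmt initializers used in the
 * syscall_fmts[] table below to bind an argument slot to one of the
 * string tables defined above.
 */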

#include "trace/beauty/arch_errno_names.c"
#include "trace/beauty/eventfd.c"
#include "trace/beauty/futex_op.c"
#include "trace/beauty/futex_val3.c"
#include "trace/beauty/mmap.c"
#include "trace/beauty/mode_t.c"
#include "trace/beauty/msg_flags.c"
#include "trace/beauty/open_flags.c"
#include "trace/beauty/perf_event_open.c"
#include "trace/beauty/pid.c"
#include "trace/beauty/sched_policy.c"
#include "trace/beauty/seccomp.c"
#include "trace/beauty/signum.c"
#include "trace/beauty/socket_type.c"
#include "trace/beauty/waitid_options.c"

static struct syscall_fmt syscall_fmts[] = {
	{ .name	    = "access",
	  .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
	{ .name	    = "arch_prctl",
	  .arg = { [0] = { .scnprintf = SCA_X86_ARCH_PRCTL_CODE, /* code */ },
		   [1] = { .scnprintf = SCA_PTR, /* arg2 */ }, }, },
	{ .name	    = "bind",
	  .arg = { [0] = { .scnprintf = SCA_INT, /* fd */ },
		   [1] = { .scnprintf = SCA_SOCKADDR, /* umyaddr */ },
		   [2] = { .scnprintf = SCA_INT, /* addrlen */ }, }, },
	{ .name	    = "bpf",
	  .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
	{ .name	    = "brk",	    .hexret = true,
	  .arg = { [0] = { .scnprintf = SCA_PTR, /* brk */ }, }, },
	{ .name     = "clock_gettime",
	  .arg = { [0] = STRARRAY(clk_id, clockid), }, },
	{ .name	    = "clone",	    .errpid = true, .nr_args = 5,
	  .arg = { [0] = { .name = "flags",	    .scnprintf = SCA_CLONE_FLAGS, },
		   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
		   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
		   [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
		   [4] = { .name = "tls",	    .scnprintf = SCA_HEX, }, }, },
	{ .name	    = "close",
	  .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
	{ .name	    = "connect",
	  .arg = { [0] = { .scnprintf = SCA_INT, /* fd */ },
		   [1] = { .scnprintf = SCA_SOCKADDR, /* servaddr */ },
		   [2] = { .scnprintf = SCA_INT, /* addrlen */ }, }, },
	{ .name	    = "epoll_ctl",
	  .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
	{ .name	    = "eventfd2",
	  .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
	{ .name	    = "fchmodat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "fchownat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "fcntl",
	  .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD,  /* cmd */
			   .strtoul   = STUL_STRARRAYS,
			   .parm      = &strarrays__fcntl_cmds_arrays,
			   .show_zero = true, },
		   [2] = { .scnprintf = SCA_FCNTL_ARG, /* arg */ }, }, },
	{ .name	    = "flock",
	  .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
	{ .name     = "fsconfig",
	  .arg = { [1] = STRARRAY(cmd, fsconfig_cmds), }, },
	{ .name     = "fsmount",
	  .arg = { [1] = STRARRAY_FLAGS(flags, fsmount_flags),
		   [2] = { .scnprintf = SCA_FSMOUNT_ATTR_FLAGS, /* attr_flags */ }, }, },
	{ .name     = "fspick",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	  /* dfd */ },
		   [1] = { .scnprintf = SCA_FILENAME,	  /* path */ },
		   [2] = { .scnprintf = SCA_FSPICK_FLAGS, /* flags */ }, }, },
	{ .name	    = "fstat", .alias = "newfstat", },
	{ .name	    = "fstatat", .alias = "newfstatat", },
	{ .name	    = "futex",
	  .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
		   [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
	{ .name	    = "futimesat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "getitimer",
	  .arg = { [0] = STRARRAY(which, itimers), }, },
	{ .name	    = "getpid",	    .errpid = true, },
	{ .name	    = "getpgid",    .errpid = true, },
	{ .name	    = "getppid",    .errpid = true, },
	{ .name	    = "getrandom",
	  .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
	{ .name	    = "getrlimit",
	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "getsockopt",
	  .arg = { [1] = STRARRAY(level, socket_level), }, },
	{ .name	    = "gettid",	    .errpid = true, },
	{ .name	    = "ioctl",
	  .arg = {
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
		   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
#else
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
#endif
	{ .name	    = "kcmp",	    .nr_args = 5,
	  .arg = { [0] = { .name = "pid1",	.scnprintf = SCA_PID, },
		   [1] = { .name = "pid2",	.scnprintf = SCA_PID, },
		   [2] = { .name = "type",	.scnprintf = SCA_KCMP_TYPE, },
		   [3] = { .name = "idx1",	.scnprintf = SCA_KCMP_IDX, },
		   [4] = { .name = "idx2",	.scnprintf = SCA_KCMP_IDX, }, }, },
	{ .name	    = "keyctl",
	  .arg = { [0] = STRARRAY(option, keyctl_options), }, },
	{ .name	    = "kill",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "linkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "lseek",
	  .arg = { [2] = STRARRAY(whence, whences), }, },
	{ .name	    = "lstat", .alias = "newlstat", },
	{ .name     = "madvise",
	  .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
		   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
	{ .name	    = "mkdirat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "mknodat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "mmap",	    .hexret = true,
/* The standard mmap maps to old_mmap on s390x */
#if defined(__s390x__)
	.alias = "old_mmap",
#endif
	  .arg = { [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
		   [3] = { .scnprintf = SCA_MMAP_FLAGS,	/* flags */
			   .strtoul   = STUL_STRARRAY_FLAGS,
			   .parm      = &strarray__mmap_flags, },
		   [5] = { .scnprintf = SCA_HEX,	/* offset */ }, }, },
	{ .name	    = "mount",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* dev_name */ },
		   [3] = { .scnprintf = SCA_MOUNT_FLAGS, /* flags */
			   .mask_val  = SCAMV_MOUNT_FLAGS, /* flags */ }, }, },
	{ .name	    = "move_mount",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* from_dfd */ },
		   [1] = { .scnprintf = SCA_FILENAME, /* from_pathname */ },
		   [2] = { .scnprintf = SCA_FDAT,	/* to_dfd */ },
		   [3] = { .scnprintf = SCA_FILENAME, /* to_pathname */ },
		   [4] = { .scnprintf = SCA_MOVE_MOUNT_FLAGS, /* flags */ }, }, },
	{ .name	    = "mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ }, }, },
	{ .name	    = "mq_unlink",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
	{ .name	    = "mremap",	    .hexret = true,
	  .arg = { [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ }, }, },
	{ .name	    = "name_to_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "newfstatat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "open",
	  .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "open_by_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "openat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "perf_event_open",
	  .arg = { [2] = { .scnprintf = SCA_INT,	/* cpu */ },
		   [3] = { .scnprintf = SCA_FD,		/* group_fd */ },
		   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
	{ .name	    = "pipe2",
	  .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
	{ .name	    = "pkey_alloc",
	  .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,	/* access_rights */ }, }, },
	{ .name	    = "pkey_free",
	  .arg = { [0] = { .scnprintf = SCA_INT,	/* key */ }, }, },
	{ .name	    = "pkey_mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
		   [3] = { .scnprintf = SCA_INT,	/* pkey */ }, }, },
	{ .name	    = "poll", .timeout = true, },
	{ .name	    = "ppoll", .timeout = true, },
	{ .name	    = "prctl",
	  .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */
			   .strtoul   = STUL_STRARRAY,
			   .parm      = &strarray__prctl_options, },
		   [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
		   [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
	{ .name	    = "pread", .alias = "pread64", },
	{ .name	    = "preadv", .alias = "pread", },
	{ .name	    = "prlimit64",
	  .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "pwrite", .alias = "pwrite64", },
	{ .name	    = "readlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "recvfrom",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "recvmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "recvmsg",
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "renameat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* olddirfd */ },
		   [2] = { .scnprintf = SCA_FDAT, /* newdirfd */ }, }, },
	{ .name	    = "renameat2",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* olddirfd */ },
		   [2] = { .scnprintf = SCA_FDAT, /* newdirfd */ },
		   [4] = { .scnprintf = SCA_RENAMEAT2_FLAGS, /* flags */ }, }, },
	{ .name	    = "rt_sigaction",
	  .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "rt_sigprocmask",
	  .arg = { [0] = STRARRAY(how, sighow), }, },
	{ .name	    = "rt_sigqueueinfo",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "rt_tgsigqueueinfo",
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "sched_setscheduler",
	  .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
	{ .name	    = "seccomp",
	  .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,	   /* op */ },
		   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
	{ .name	    = "select", .timeout = true, },
	{ .name	    = "sendfile", .alias = "sendfile64", },
	{ .name	    = "sendmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "sendmsg",
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "sendto",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ },
		   [4] = { .scnprintf = SCA_SOCKADDR, /* addr */ }, }, },
	{ .name	    = "set_tid_address", .errpid = true, },
	{ .name	    = "setitimer",
	  .arg = { [0] = STRARRAY(which, itimers), }, },
	{ .name	    = "setrlimit",
	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "setsockopt",
	  .arg = { [1] = STRARRAY(level, socket_level), }, },
	{ .name	    = "socket",
	  .arg = { [0] = STRARRAY(family, socket_families),
		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
		   [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
	{ .name	    = "socketpair",
	  .arg = { [0] = STRARRAY(family, socket_families),
		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
		   [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
	{ .name	    = "stat", .alias = "newstat", },
	{ .name	    = "statx",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	 /* fdat */ },
		   [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ },
		   [3] = { .scnprintf = SCA_STATX_MASK,	 /* mask */ }, }, },
	{ .name	    = "swapoff",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
	{ .name	    = "swapon",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
	{ .name	    = "symlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "sync_file_range",
	  .arg = { [3] = { .scnprintf = SCA_SYNC_FILE_RANGE_FLAGS, /* flags */ }, }, },
	{ .name	    = "tgkill",
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "tkill",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name     = "umount2", .alias = "umount",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* name */ }, }, },
	{ .name	    = "uname", .alias = "newuname", },
	{ .name	    = "unlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "utimensat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
	{ .name	    = "wait4",	    .errpid = true,
	  .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
	{ .name	    = "waitid",	    .errpid = true,
	  .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
};
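
/*
 * Note: syscall_fmts[] must be kept sorted by .name, since
 * syscall_fmt__find() below binary-searches it with bsearch().
 */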

static int syscall_fmt__cmp(const void *name, const void *fmtp)
{
	const struct syscall_fmt *fmt = fmtp;
	return strcmp(name, fmt->name);
}

static struct syscall_fmt *__syscall_fmt__find(struct syscall_fmt *fmts, const int nmemb, const char *name)
{
	return bsearch(name, fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
}

static struct syscall_fmt *syscall_fmt__find(const char *name)
{
	const int nmemb = ARRAY_SIZE(syscall_fmts);
	return __syscall_fmt__find(syscall_fmts, nmemb, name);
}

static struct syscall_fmt *__syscall_fmt__find_by_alias(struct syscall_fmt *fmts, const int nmemb, const char *alias)
{
	int i;

	for (i = 0; i < nmemb; ++i) {
		if (fmts[i].alias && strcmp(fmts[i].alias, alias) == 0)
			return &fmts[i];
	}

	return NULL;
}

static struct syscall_fmt *syscall_fmt__find_by_alias(const char *alias)
{
	const int nmemb = ARRAY_SIZE(syscall_fmts);
	return __syscall_fmt__find_by_alias(syscall_fmts, nmemb, alias);
}

/*
 * is_exit: is this "exit" or "exit_group"?
 * is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter.
 * args_size: sum of the sizes of the syscall arguments, anything after that is augmented stuff: pathname for openat, etc.
 * nonexistent: Just a hole in the syscall table, syscall id not allocated
 */
struct syscall {
	struct tep_event    *tp_format;
	int		    nr_args;
	int		    args_size;
	struct {
		struct bpf_program *sys_enter,
				   *sys_exit;
	}		    bpf_prog;
	bool		    is_exit;
	bool		    is_open;
	bool		    nonexistent;
	struct tep_format_field *args;
	const char	    *name;
	struct syscall_fmt  *fmt;
	struct syscall_arg_fmt *arg_fmt;
};

/*
 * Must match what is in the BPF program:
 *
 * tools/perf/examples/bpf/augmented_raw_syscalls.c
 */
struct bpf_map_syscall_entry {
	bool	enabled;
	u16	string_args_len[6];
};
/*
 * We need to have this 'calculated' boolean because in some cases we really
 * don't know what the duration of a syscall is, for instance, when we start
 * a session and some threads are waiting for a syscall to finish, say 'poll',
 * in which case all we can do is to print "( ? )" for the duration and for
 * the start timestamp.
 */
static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
{
	double duration = (double)t / NSEC_PER_MSEC;
	size_t printed = fprintf(fp, "(");

	if (!calculated)
		printed += fprintf(fp, "         ");
	else if (duration >= 1.0)
		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
	else if (duration >= 0.01)
		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
	else
		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
	return printed + fprintf(fp, "): ");
}
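
/*
 * Durations of 1 ms or more are highlighted in red and 0.01 ms or more
 * in yellow, so slow syscalls stand out in the live output.
 */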

/**
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 * ret_scnprintf: syscall args may set this to a different syscall return
 *                formatter, for instance, fcntl may return fds, file flags, etc.
 */
struct thread_trace {
	u64		  entry_time;
	bool		  entry_pending;
	unsigned long	  nr_events;
	unsigned long	  pfmaj, pfmin;
	char		  *entry_str;
	double		  runtime_ms;
	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	struct {
		unsigned long ptr;
		short int     entry_str_pos;
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;
	} filename;
	struct {
		int	      max;
		struct file   *table;
	} files;

	struct intlist *syscall_stats;
};

static struct thread_trace *thread_trace__new(void)
{
	struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));

	if (ttrace) {
		ttrace->files.max = -1;
		ttrace->syscall_stats = intlist__new(NULL);
	}

	return ttrace;
}

static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
{
	struct thread_trace *ttrace;

	if (thread == NULL)
		goto fail;

	if (thread__priv(thread) == NULL)
		thread__set_priv(thread, thread_trace__new());

	if (thread__priv(thread) == NULL)
		goto fail;

	ttrace = thread__priv(thread);
	++ttrace->nr_events;

	return ttrace;
fail:
	color_fprintf(fp, PERF_COLOR_RED,
		      "WARNING: not enough memory, dropping samples!\n");
	return NULL;
}


void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
{
	struct thread_trace *ttrace = thread__priv(arg->thread);

	ttrace->ret_scnprintf = ret_scnprintf;
}

#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

static const size_t trace__entry_str_size = 2048;

static struct file *thread_trace__files_entry(struct thread_trace *ttrace, int fd)
{
	if (fd < 0)
		return NULL;

	if (fd > ttrace->files.max) {
		struct file *nfiles = realloc(ttrace->files.table, (fd + 1) * sizeof(struct file));

		if (nfiles == NULL)
			return NULL;

		if (ttrace->files.max != -1) {
			memset(nfiles + ttrace->files.max + 1, 0,
			       (fd - ttrace->files.max) * sizeof(struct file));
		} else {
			memset(nfiles, 0, (fd + 1) * sizeof(struct file));
		}

		ttrace->files.table = nfiles;
		ttrace->files.max   = fd;
	}

	return ttrace->files.table + fd;
}
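
/*
 * The per-thread fd -> file table grows on demand: realloc() to cover
 * the new highest fd, then zero only the newly exposed slots.
 */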

struct file *thread__files_entry(struct thread *thread, int fd)
{
	return thread_trace__files_entry(thread__priv(thread), fd);
}

static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
{
	struct thread_trace *ttrace = thread__priv(thread);
	struct file *file = thread_trace__files_entry(ttrace, fd);

	if (file != NULL) {
		struct stat st;
		if (stat(pathname, &st) == 0)
			file->dev_maj = major(st.st_rdev);
		file->pathname = strdup(pathname);
		if (file->pathname)
			return 0;
	}

	return -1;
}

static int thread__read_fd_path(struct thread *thread, int fd)
{
	char linkname[PATH_MAX], pathname[PATH_MAX];
	struct stat st;
	int ret;

	if (thread->pid_ == thread->tid) {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/fd/%d", thread->pid_, fd);
	} else {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
	}

	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
		return -1;

	ret = readlink(linkname, pathname, sizeof(pathname));

	if (ret < 0 || ret > st.st_size)
		return -1;

	pathname[ret] = '\0';
	return trace__set_fd_pathname(thread, fd, pathname);
}
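
/*
 * Live fallback for fds we didn't see being opened: readlink() the
 * /proc/<pid>/fd/<fd> (or /proc/<pid>/task/<tid>/fd/<fd>) symlink to
 * recover the pathname.
 */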

static const char *thread__fd_path(struct thread *thread, int fd,
				   struct trace *trace)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (ttrace == NULL || trace->fd_path_disabled)
		return NULL;

	if (fd < 0)
		return NULL;

	if ((fd > ttrace->files.max || ttrace->files.table[fd].pathname == NULL)) {
		if (!trace->live)
			return NULL;
		++trace->stats.proc_getname;
		if (thread__read_fd_path(thread, fd))
			return NULL;
	}

	return ttrace->files.table[fd].pathname;
}

size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
{
	int fd = arg->val;
	size_t printed = scnprintf(bf, size, "%d", fd);
	const char *path = thread__fd_path(arg->thread, fd, arg->trace);

	if (path)
		printed += scnprintf(bf + printed, size - printed, "<%s>", path);

	return printed;
}

size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
{
	size_t printed = scnprintf(bf, size, "%d", fd);
	struct thread *thread = machine__find_thread(trace->host, pid, pid);

	if (thread) {
		const char *path = thread__fd_path(thread, fd, trace);

		if (path)
			printed += scnprintf(bf + printed, size - printed, "<%s>", path);

		thread__put(thread);
	}

	return printed;
}

static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	int fd = arg->val;
	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
	struct thread_trace *ttrace = thread__priv(arg->thread);

	if (ttrace && fd >= 0 && fd <= ttrace->files.max)
		zfree(&ttrace->files.table[fd].pathname);

	return printed;
}

static void thread__set_filename_pos(struct thread *thread, const char *bf,
				     unsigned long ptr)
{
	struct thread_trace *ttrace = thread__priv(thread);

	ttrace->filename.ptr = ptr;
	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
}

static size_t syscall_arg__scnprintf_augmented_string(struct syscall_arg *arg, char *bf, size_t size)
{
	struct augmented_arg *augmented_arg = arg->augmented.args;
	size_t printed = scnprintf(bf, size, "\"%.*s\"", augmented_arg->size, augmented_arg->value);
	/*
	 * So that the next arg with a payload can consume its augmented arg, i.e. for rename* syscalls
	 * we would have two strings, each prefixed by its size.
	 */
	int consumed = sizeof(*augmented_arg) + augmented_arg->size;

	arg->augmented.args = ((void *)arg->augmented.args) + consumed;
	arg->augmented.size -= consumed;

	return printed;
}

static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	unsigned long ptr = arg->val;

	if (arg->augmented.args)
		return syscall_arg__scnprintf_augmented_string(arg, bf, size);

	if (!arg->trace->vfs_getname)
		return scnprintf(bf, size, "%#lx", ptr);

	thread__set_filename_pos(arg->thread, bf, ptr);
	return 0;
}

static bool trace__filter_duration(struct trace *trace, double t)
{
	return t < (trace->duration_filter * NSEC_PER_MSEC);
}

static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
{
	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;

	return fprintf(fp, "%10.3f ", ts);
}

/*
 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
 * using ttrace->entry_time for a thread that receives a sys_exit without
 * first having received a sys_enter ("poll" issued before tracing session
 * starts, lost sys_enter exit due to ring buffer overflow).
 */
static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
{
	if (tstamp > 0)
		return __trace__fprintf_tstamp(trace, tstamp, fp);

	return fprintf(fp, "         ? ");
}

static pid_t workload_pid = -1;
static bool done = false;
static bool interrupted = false;

static void sighandler_interrupt(int sig __maybe_unused)
{
	done = interrupted = true;
}

static void sighandler_chld(int sig __maybe_unused, siginfo_t *info,
			    void *context __maybe_unused)
{
	if (info->si_pid == workload_pid)
		done = true;
}

static size_t trace__fprintf_comm_tid(struct trace *trace, struct thread *thread, FILE *fp)
{
	size_t printed = 0;

	if (trace->multiple_threads) {
		if (trace->show_comm)
			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
		printed += fprintf(fp, "%d ", thread->tid);
	}

	return printed;
}

static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
{
	size_t printed = 0;

	if (trace->show_tstamp)
		printed = trace__fprintf_tstamp(trace, tstamp, fp);
	if (trace->show_duration)
		printed += fprintf_duration(duration, duration_calculated, fp);
	return printed + trace__fprintf_comm_tid(trace, thread, fp);
}

static int trace__process_event(struct trace *trace, struct machine *machine,
				union perf_event *event, struct perf_sample *sample)
{
	int ret = 0;

	switch (event->header.type) {
	case PERF_RECORD_LOST:
		color_fprintf(trace->output, PERF_COLOR_RED,
			      "LOST %" PRIu64 " events!\n", event->lost.lost);
		ret = machine__process_lost_event(machine, event, sample);
		break;
	default:
		ret = machine__process_event(machine, event, sample);
		break;
	}

	return ret;
}

static int trace__tool_process(struct perf_tool *tool,
			       union perf_event *event,
			       struct perf_sample *sample,
			       struct machine *machine)
{
	struct trace *trace = container_of(tool, struct trace, tool);
	return trace__process_event(trace, machine, event, sample);
}

static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
{
	struct machine *machine = vmachine;

	if (machine->kptr_restrict_warned)
		return NULL;

	if (symbol_conf.kptr_restrict) {
		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
			   "Check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
			   "Kernel samples will not be resolved.\n");
		machine->kptr_restrict_warned = true;
		return NULL;
	}

	return machine__resolve_kernel_addr(vmachine, addrp, modp);
}

static int trace__symbols_init(struct trace *trace, struct evlist *evlist)
{
	int err = symbol__init(NULL);

	if (err)
		return err;

	trace->host = machine__new_host();
	if (trace->host == NULL)
		return -ENOMEM;

	err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
	if (err < 0)
		goto out;

	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->core.threads, trace__tool_process,
					    true, false, 1);
out:
	if (err)
		symbol__exit();

	return err;
}

static void trace__symbols__exit(struct trace *trace)
{
	machine__exit(trace->host);
	trace->host = NULL;

	symbol__exit();
   1657}
   1658
   1659static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
   1660{
   1661	int idx;
   1662
   1663	if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
   1664		nr_args = sc->fmt->nr_args;
   1665
   1666	sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
   1667	if (sc->arg_fmt == NULL)
   1668		return -1;
   1669
   1670	for (idx = 0; idx < nr_args; ++idx) {
   1671		if (sc->fmt)
   1672			sc->arg_fmt[idx] = sc->fmt->arg[idx];
   1673	}
   1674
   1675	sc->nr_args = nr_args;
   1676	return 0;
   1677}
   1678
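        /*
         * Keep this table sorted by ->name: it is searched with bsearch() in
         * __syscall_arg_fmt__find_by_name() below.
         */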
   1679static struct syscall_arg_fmt syscall_arg_fmts__by_name[] = {
   1680	{ .name = "msr",	.scnprintf = SCA_X86_MSR,	  .strtoul = STUL_X86_MSR,	   },
   1681	{ .name = "vector",	.scnprintf = SCA_X86_IRQ_VECTORS, .strtoul = STUL_X86_IRQ_VECTORS, },
   1682};
   1683
   1684static int syscall_arg_fmt__cmp(const void *name, const void *fmtp)
   1685{
    1686	const struct syscall_arg_fmt *fmt = fmtp;
    1687	return strcmp(name, fmt->name);
   1688}
   1689
   1690static struct syscall_arg_fmt *
   1691__syscall_arg_fmt__find_by_name(struct syscall_arg_fmt *fmts, const int nmemb, const char *name)
   1692{
    1693	return bsearch(name, fmts, nmemb, sizeof(struct syscall_arg_fmt), syscall_arg_fmt__cmp);
   1694}
   1695
   1696static struct syscall_arg_fmt *syscall_arg_fmt__find_by_name(const char *name)
   1697{
    1698	const int nmemb = ARRAY_SIZE(syscall_arg_fmts__by_name);
    1699	return __syscall_arg_fmt__find_by_name(syscall_arg_fmts__by_name, nmemb, name);
   1700}
   1701
   1702static struct tep_format_field *
   1703syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field *field)
   1704{
   1705	struct tep_format_field *last_field = NULL;
   1706	int len;
   1707
   1708	for (; field; field = field->next, ++arg) {
   1709		last_field = field;
   1710
   1711		if (arg->scnprintf)
   1712			continue;
   1713
   1714		len = strlen(field->name);
   1715
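		/*
		 * Nothing was set explicitly, so pick a beautifier from the
		 * field's type and name, e.g. a "const char *" ending in
		 * "name" gets SCA_FILENAME, an integer named "*fd" gets
		 * SCA_FD.
		 */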
   1716		if (strcmp(field->type, "const char *") == 0 &&
   1717		    ((len >= 4 && strcmp(field->name + len - 4, "name") == 0) ||
   1718		     strstr(field->name, "path") != NULL))
   1719			arg->scnprintf = SCA_FILENAME;
   1720		else if ((field->flags & TEP_FIELD_IS_POINTER) || strstr(field->name, "addr"))
   1721			arg->scnprintf = SCA_PTR;
   1722		else if (strcmp(field->type, "pid_t") == 0)
   1723			arg->scnprintf = SCA_PID;
   1724		else if (strcmp(field->type, "umode_t") == 0)
   1725			arg->scnprintf = SCA_MODE_T;
   1726		else if ((field->flags & TEP_FIELD_IS_ARRAY) && strstr(field->type, "char")) {
   1727			arg->scnprintf = SCA_CHAR_ARRAY;
   1728			arg->nr_entries = field->arraylen;
   1729		} else if ((strcmp(field->type, "int") == 0 ||
   1730			  strcmp(field->type, "unsigned int") == 0 ||
   1731			  strcmp(field->type, "long") == 0) &&
   1732			 len >= 2 && strcmp(field->name + len - 2, "fd") == 0) {
   1733			/*
   1734			 * /sys/kernel/tracing/events/syscalls/sys_enter*
   1735			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
   1736			 * 65 int
   1737			 * 23 unsigned int
   1738			 * 7 unsigned long
   1739			 */
   1740			arg->scnprintf = SCA_FD;
    1741		} else {
   1742			struct syscall_arg_fmt *fmt = syscall_arg_fmt__find_by_name(field->name);
   1743
   1744			if (fmt) {
   1745				arg->scnprintf = fmt->scnprintf;
   1746				arg->strtoul   = fmt->strtoul;
   1747			}
   1748		}
   1749	}
   1750
   1751	return last_field;
   1752}
   1753
   1754static int syscall__set_arg_fmts(struct syscall *sc)
   1755{
   1756	struct tep_format_field *last_field = syscall_arg_fmt__init_array(sc->arg_fmt, sc->args);
   1757
   1758	if (last_field)
   1759		sc->args_size = last_field->offset + last_field->size;
   1760
   1761	return 0;
   1762}
   1763
   1764static int trace__read_syscall_info(struct trace *trace, int id)
   1765{
   1766	char tp_name[128];
   1767	struct syscall *sc;
   1768	const char *name = syscalltbl__name(trace->sctbl, id);
   1769
   1770#ifdef HAVE_SYSCALL_TABLE_SUPPORT
   1771	if (trace->syscalls.table == NULL) {
   1772		trace->syscalls.table = calloc(trace->sctbl->syscalls.max_id + 1, sizeof(*sc));
   1773		if (trace->syscalls.table == NULL)
   1774			return -ENOMEM;
   1775	}
   1776#else
   1777	if (id > trace->sctbl->syscalls.max_id || (id == 0 && trace->syscalls.table == NULL)) {
   1778		// When using libaudit we don't know beforehand what is the max syscall id
   1779		struct syscall *table = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
   1780
   1781		if (table == NULL)
   1782			return -ENOMEM;
   1783
    1784		// memset from offset 0 if the table is brand new, else just the newly added members
   1785		if (trace->syscalls.table == NULL)
   1786			memset(table, 0, (id + 1) * sizeof(*sc));
   1787		else
   1788			memset(table + trace->sctbl->syscalls.max_id + 1, 0, (id - trace->sctbl->syscalls.max_id) * sizeof(*sc));
   1789
   1790		trace->syscalls.table	      = table;
   1791		trace->sctbl->syscalls.max_id = id;
   1792	}
   1793#endif
   1794	sc = trace->syscalls.table + id;
   1795	if (sc->nonexistent)
   1796		return 0;
   1797
   1798	if (name == NULL) {
   1799		sc->nonexistent = true;
   1800		return 0;
   1801	}
   1802
   1803	sc->name = name;
   1804	sc->fmt  = syscall_fmt__find(sc->name);
   1805
   1806	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
   1807	sc->tp_format = trace_event__tp_format("syscalls", tp_name);
   1808
   1809	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
   1810		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
   1811		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
   1812	}
   1813
   1814	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
   1815		return -ENOMEM;
   1816
   1817	if (IS_ERR(sc->tp_format))
   1818		return PTR_ERR(sc->tp_format);
   1819
   1820	sc->args = sc->tp_format->format.fields;
    1821	/*
    1822	 * Check and discard the first field, '__syscall_nr' (or just 'nr' on
    1823	 * older kernels): it carries the syscall number, which we already
    1824	 * know, so it is needless here.
    1825	 */
   1826	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
   1827		sc->args = sc->args->next;
   1828		--sc->nr_args;
   1829	}
   1830
   1831	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
   1832	sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat");
   1833
   1834	return syscall__set_arg_fmts(sc);
   1835}
   1836
   1837static int evsel__init_tp_arg_scnprintf(struct evsel *evsel)
   1838{
   1839	struct syscall_arg_fmt *fmt = evsel__syscall_arg_fmt(evsel);
   1840
   1841	if (fmt != NULL) {
   1842		syscall_arg_fmt__init_array(fmt, evsel->tp_format->format.fields);
   1843		return 0;
   1844	}
   1845
   1846	return -ENOMEM;
   1847}
   1848
   1849static int intcmp(const void *a, const void *b)
   1850{
   1851	const int *one = a, *another = b;
   1852
   1853	return *one - *another;
   1854}
   1855
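        /*
         * Resolve the syscall names in trace->ev_qualifier to ids: each entry
         * may be a plain name like "close" or a glob like "epoll*", which
         * expands to every matching syscall in the table.
         */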
   1856static int trace__validate_ev_qualifier(struct trace *trace)
   1857{
   1858	int err = 0;
   1859	bool printed_invalid_prefix = false;
   1860	struct str_node *pos;
   1861	size_t nr_used = 0, nr_allocated = strlist__nr_entries(trace->ev_qualifier);
   1862
   1863	trace->ev_qualifier_ids.entries = malloc(nr_allocated *
   1864						 sizeof(trace->ev_qualifier_ids.entries[0]));
   1865
   1866	if (trace->ev_qualifier_ids.entries == NULL) {
   1867		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
   1868		       trace->output);
   1869		err = -EINVAL;
   1870		goto out;
   1871	}
   1872
   1873	strlist__for_each_entry(pos, trace->ev_qualifier) {
   1874		const char *sc = pos->s;
   1875		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;
   1876
   1877		if (id < 0) {
   1878			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
   1879			if (id >= 0)
   1880				goto matches;
   1881
   1882			if (!printed_invalid_prefix) {
   1883				pr_debug("Skipping unknown syscalls: ");
   1884				printed_invalid_prefix = true;
   1885			} else {
   1886				pr_debug(", ");
   1887			}
   1888
   1889			pr_debug("%s", sc);
   1890			continue;
   1891		}
   1892matches:
   1893		trace->ev_qualifier_ids.entries[nr_used++] = id;
   1894		if (match_next == -1)
   1895			continue;
   1896
   1897		while (1) {
   1898			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
   1899			if (id < 0)
   1900				break;
   1901			if (nr_allocated == nr_used) {
   1902				void *entries;
   1903
   1904				nr_allocated += 8;
   1905				entries = realloc(trace->ev_qualifier_ids.entries,
   1906						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
   1907				if (entries == NULL) {
   1908					err = -ENOMEM;
   1909					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
   1910					goto out_free;
   1911				}
   1912				trace->ev_qualifier_ids.entries = entries;
   1913			}
   1914			trace->ev_qualifier_ids.entries[nr_used++] = id;
   1915		}
   1916	}
   1917
   1918	trace->ev_qualifier_ids.nr = nr_used;
   1919	qsort(trace->ev_qualifier_ids.entries, nr_used, sizeof(int), intcmp);
   1920out:
   1921	if (printed_invalid_prefix)
   1922		pr_debug("\n");
   1923	return err;
   1924out_free:
   1925	zfree(&trace->ev_qualifier_ids.entries);
   1926	trace->ev_qualifier_ids.nr = 0;
   1927	goto out;
   1928}
   1929
   1930static __maybe_unused bool trace__syscall_enabled(struct trace *trace, int id)
   1931{
   1932	bool in_ev_qualifier;
   1933
   1934	if (trace->ev_qualifier_ids.nr == 0)
   1935		return true;
   1936
   1937	in_ev_qualifier = bsearch(&id, trace->ev_qualifier_ids.entries,
   1938				  trace->ev_qualifier_ids.nr, sizeof(int), intcmp) != NULL;
   1939
   1940	if (in_ev_qualifier)
   1941	       return !trace->not_ev_qualifier;
   1942
   1943	return trace->not_ev_qualifier;
   1944}
   1945
   1946/*
   1947 * args is to be interpreted as a series of longs but we need to handle
   1948 * 8-byte unaligned accesses. args points to raw_data within the event
   1949 * and raw_data is guaranteed to be 8-byte unaligned because it is
   1950 * preceded by raw_size which is a u32. So we need to copy args to a temp
   1951 * variable to read it. Most notably this avoids extended load instructions
    1952 * on unaligned addresses.
   1953 */
   1954unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
   1955{
   1956	unsigned long val;
   1957	unsigned char *p = arg->args + sizeof(unsigned long) * idx;
   1958
   1959	memcpy(&val, p, sizeof(val));
   1960	return val;
   1961}
   1962
   1963static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
   1964				      struct syscall_arg *arg)
   1965{
   1966	if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
   1967		return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
   1968
   1969	return scnprintf(bf, size, "arg%d: ", arg->idx);
   1970}
   1971
   1972/*
   1973 * Check if the value is in fact zero, i.e. mask whatever needs masking, such
   1974 * as mount 'flags' argument that needs ignoring some magic flag, see comment
   1975 * in tools/perf/trace/beauty/mount_flags.c
   1976 */
   1977static unsigned long syscall_arg_fmt__mask_val(struct syscall_arg_fmt *fmt, struct syscall_arg *arg, unsigned long val)
   1978{
   1979	if (fmt && fmt->mask_val)
   1980		return fmt->mask_val(arg, val);
   1981
   1982	return val;
   1983}
   1984
   1985static size_t syscall_arg_fmt__scnprintf_val(struct syscall_arg_fmt *fmt, char *bf, size_t size,
   1986					     struct syscall_arg *arg, unsigned long val)
   1987{
   1988	if (fmt && fmt->scnprintf) {
   1989		arg->val = val;
   1990		if (fmt->parm)
   1991			arg->parm = fmt->parm;
   1992		return fmt->scnprintf(bf, size, arg);
   1993	}
   1994	return scnprintf(bf, size, "%ld", val);
   1995}
   1996
   1997static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
   1998				      unsigned char *args, void *augmented_args, int augmented_args_size,
   1999				      struct trace *trace, struct thread *thread)
   2000{
   2001	size_t printed = 0;
   2002	unsigned long val;
   2003	u8 bit = 1;
   2004	struct syscall_arg arg = {
   2005		.args	= args,
   2006		.augmented = {
   2007			.size = augmented_args_size,
   2008			.args = augmented_args,
   2009		},
   2010		.idx	= 0,
   2011		.mask	= 0,
   2012		.trace  = trace,
   2013		.thread = thread,
   2014		.show_string_prefix = trace->show_string_prefix,
   2015	};
   2016	struct thread_trace *ttrace = thread__priv(thread);
   2017
   2018	/*
   2019	 * Things like fcntl will set this in its 'cmd' formatter to pick the
   2020	 * right formatter for the return value (an fd? file flags?), which is
   2021	 * not needed for syscalls that always return a given type, say an fd.
   2022	 */
   2023	ttrace->ret_scnprintf = NULL;
   2024
   2025	if (sc->args != NULL) {
   2026		struct tep_format_field *field;
   2027
   2028		for (field = sc->args; field;
   2029		     field = field->next, ++arg.idx, bit <<= 1) {
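			/*
			 * A beautifier may mask out args it already consumed
			 * or that are meaningless for this syscall, e.g.
			 * fcntl's 'cmd' formatter can mask the unused third
			 * argument.
			 */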
   2030			if (arg.mask & bit)
   2031				continue;
   2032
   2033			arg.fmt = &sc->arg_fmt[arg.idx];
   2034			val = syscall_arg__val(&arg, arg.idx);
   2035			/*
   2036			 * Some syscall args need some mask, most don't and
   2037			 * return val untouched.
   2038			 */
   2039			val = syscall_arg_fmt__mask_val(&sc->arg_fmt[arg.idx], &arg, val);
   2040
    2041			/*
    2042			 * Suppress this argument if its value is zero
    2043			 * and we don't have a string associated with
    2044			 * it in a strarray.
    2045			 */
   2046			if (val == 0 &&
   2047			    !trace->show_zeros &&
   2048			    !(sc->arg_fmt &&
   2049			      (sc->arg_fmt[arg.idx].show_zero ||
   2050			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
   2051			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
   2052			      sc->arg_fmt[arg.idx].parm))
   2053				continue;
   2054
   2055			printed += scnprintf(bf + printed, size - printed, "%s", printed ? ", " : "");
   2056
   2057			if (trace->show_arg_names)
   2058				printed += scnprintf(bf + printed, size - printed, "%s: ", field->name);
   2059
   2060			printed += syscall_arg_fmt__scnprintf_val(&sc->arg_fmt[arg.idx],
   2061								  bf + printed, size - printed, &arg, val);
   2062		}
   2063	} else if (IS_ERR(sc->tp_format)) {
   2064		/*
   2065		 * If we managed to read the tracepoint /format file, then we
   2066		 * may end up not having any args, like with gettid(), so only
   2067		 * print the raw args when we didn't manage to read it.
   2068		 */
   2069		while (arg.idx < sc->nr_args) {
   2070			if (arg.mask & bit)
   2071				goto next_arg;
   2072			val = syscall_arg__val(&arg, arg.idx);
   2073			if (printed)
   2074				printed += scnprintf(bf + printed, size - printed, ", ");
   2075			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
   2076			printed += syscall_arg_fmt__scnprintf_val(&sc->arg_fmt[arg.idx], bf + printed, size - printed, &arg, val);
   2077next_arg:
   2078			++arg.idx;
   2079			bit <<= 1;
   2080		}
   2081	}
   2082
   2083	return printed;
   2084}
   2085
   2086typedef int (*tracepoint_handler)(struct trace *trace, struct evsel *evsel,
   2087				  union perf_event *event,
   2088				  struct perf_sample *sample);
   2089
   2090static struct syscall *trace__syscall_info(struct trace *trace,
   2091					   struct evsel *evsel, int id)
   2092{
   2093	int err = 0;
   2094
   2095	if (id < 0) {
   2096
    2097		/*
    2098		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't
    2099		 * tried before that, leaving it at a higher verbosity level till
    2100		 * that is explained. Reproduced with plain ftrace with:
    2101		 *
    2102		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
    2103		 * grep "NR -1 " /t/trace_pipe
    2104		 *
    2105		 * after generating some load on the machine.
    2106		 */
   2107		if (verbose > 1) {
   2108			static u64 n;
   2109			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
   2110				id, evsel__name(evsel), ++n);
   2111		}
   2112		return NULL;
   2113	}
   2114
   2115	err = -EINVAL;
   2116
   2117#ifdef HAVE_SYSCALL_TABLE_SUPPORT
   2118	if (id > trace->sctbl->syscalls.max_id) {
   2119#else
   2120	if (id >= trace->sctbl->syscalls.max_id) {
   2121		/*
   2122		 * With libaudit we don't know beforehand what is the max_id,
   2123		 * so we let trace__read_syscall_info() figure that out as we
   2124		 * go on reading syscalls.
   2125		 */
   2126		err = trace__read_syscall_info(trace, id);
   2127		if (err)
   2128#endif
   2129		goto out_cant_read;
   2130	}
   2131
   2132	if ((trace->syscalls.table == NULL || trace->syscalls.table[id].name == NULL) &&
   2133	    (err = trace__read_syscall_info(trace, id)) != 0)
   2134		goto out_cant_read;
   2135
   2136	if (trace->syscalls.table[id].name == NULL) {
   2137		if (trace->syscalls.table[id].nonexistent)
   2138			return NULL;
   2139		goto out_cant_read;
   2140	}
   2141
   2142	return &trace->syscalls.table[id];
   2143
   2144out_cant_read:
   2145	if (verbose > 0) {
   2146		char sbuf[STRERR_BUFSIZE];
   2147		fprintf(trace->output, "Problems reading syscall %d: %d (%s)", id, -err, str_error_r(-err, sbuf, sizeof(sbuf)));
   2148		if (id <= trace->sctbl->syscalls.max_id && trace->syscalls.table[id].name != NULL)
   2149			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
   2150		fputs(" information\n", trace->output);
   2151	}
   2152	return NULL;
   2153}
   2154
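        /*
         * Per-syscall summary state: 'errnos' is an array of counters indexed
         * by (errno - 1), grown on demand up to 'max_errno'.
         */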
   2155struct syscall_stats {
   2156	struct stats stats;
   2157	u64	     nr_failures;
   2158	int	     max_errno;
   2159	u32	     *errnos;
   2160};
   2161
   2162static void thread__update_stats(struct thread *thread, struct thread_trace *ttrace,
   2163				 int id, struct perf_sample *sample, long err, bool errno_summary)
   2164{
   2165	struct int_node *inode;
   2166	struct syscall_stats *stats;
   2167	u64 duration = 0;
   2168
   2169	inode = intlist__findnew(ttrace->syscall_stats, id);
   2170	if (inode == NULL)
   2171		return;
   2172
   2173	stats = inode->priv;
   2174	if (stats == NULL) {
   2175		stats = malloc(sizeof(*stats));
   2176		if (stats == NULL)
   2177			return;
   2178
   2179		stats->nr_failures = 0;
   2180		stats->max_errno   = 0;
   2181		stats->errnos	   = NULL;
   2182		init_stats(&stats->stats);
   2183		inode->priv = stats;
   2184	}
   2185
   2186	if (ttrace->entry_time && sample->time > ttrace->entry_time)
   2187		duration = sample->time - ttrace->entry_time;
   2188
   2189	update_stats(&stats->stats, duration);
   2190
   2191	if (err < 0) {
   2192		++stats->nr_failures;
   2193
   2194		if (!errno_summary)
   2195			return;
   2196
   2197		err = -err;
   2198		if (err > stats->max_errno) {
   2199			u32 *new_errnos = realloc(stats->errnos, err * sizeof(u32));
   2200
   2201			if (new_errnos) {
   2202				memset(new_errnos + stats->max_errno, 0, (err - stats->max_errno) * sizeof(u32));
   2203			} else {
   2204				pr_debug("Not enough memory for errno stats for thread \"%s\"(%d/%d), results will be incomplete\n",
   2205					 thread__comm_str(thread), thread->pid_, thread->tid);
   2206				return;
   2207			}
   2208
   2209			stats->errnos = new_errnos;
   2210			stats->max_errno = err;
   2211		}
   2212
   2213		++stats->errnos[err - 1];
   2214	}
   2215}
   2216
   2217static int trace__printf_interrupted_entry(struct trace *trace)
   2218{
   2219	struct thread_trace *ttrace;
   2220	size_t printed;
   2221	int len;
   2222
   2223	if (trace->failure_only || trace->current == NULL)
   2224		return 0;
   2225
   2226	ttrace = thread__priv(trace->current);
   2227
   2228	if (!ttrace->entry_pending)
   2229		return 0;
   2230
   2231	printed  = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
   2232	printed += len = fprintf(trace->output, "%s)", ttrace->entry_str);
   2233
   2234	if (len < trace->args_alignment - 4)
   2235		printed += fprintf(trace->output, "%-*s", trace->args_alignment - 4 - len, " ");
   2236
   2237	printed += fprintf(trace->output, " ...\n");
   2238
   2239	ttrace->entry_pending = false;
   2240	++trace->nr_events_printed;
   2241
   2242	return printed;
   2243}
   2244
   2245static int trace__fprintf_sample(struct trace *trace, struct evsel *evsel,
   2246				 struct perf_sample *sample, struct thread *thread)
   2247{
   2248	int printed = 0;
   2249
   2250	if (trace->print_sample) {
   2251		double ts = (double)sample->time / NSEC_PER_MSEC;
   2252
   2253		printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
   2254				   evsel__name(evsel), ts,
   2255				   thread__comm_str(thread),
   2256				   sample->pid, sample->tid, sample->cpu);
   2257	}
   2258
   2259	return printed;
   2260}
   2261
   2262static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sample, int *augmented_args_size, int raw_augmented_args_size)
   2263{
   2264	void *augmented_args = NULL;
   2265	/*
   2266	 * For now with BPF raw_augmented we hook into raw_syscalls:sys_enter
   2267	 * and there we get all 6 syscall args plus the tracepoint common fields
   2268	 * that gets calculated at the start and the syscall_nr (another long).
   2269	 * So we check if that is the case and if so don't look after the
   2270	 * sc->args_size but always after the full raw_syscalls:sys_enter payload,
   2271	 * which is fixed.
   2272	 *
    2273	 * We'll revisit this later to pass sc->args_size to the BPF augmenter
    2274	 * (now tools/perf/examples/bpf/augmented_raw_syscalls.c), so that it
    2275	 * copies only what we need for each syscall, like what happens when
    2276	 * we use syscalls:sys_enter_NAME, reducing the kernel/userspace
    2277	 * traffic to just what is needed for each syscall.
   2278	 */
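	/*
	 * I.e. with raw augmentation the fixed part of the payload is the
	 * common fields + syscall_nr + all 6 args (raw_augmented_args_size),
	 * and whatever follows it is the augmented data.
	 */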
   2279	int args_size = raw_augmented_args_size ?: sc->args_size;
   2280
   2281	*augmented_args_size = sample->raw_size - args_size;
   2282	if (*augmented_args_size > 0)
   2283		augmented_args = sample->raw_data + args_size;
   2284
   2285	return augmented_args;
   2286}
   2287
   2288static void syscall__exit(struct syscall *sc)
   2289{
   2290	if (!sc)
   2291		return;
   2292
   2293	free(sc->arg_fmt);
   2294}
   2295
   2296static int trace__sys_enter(struct trace *trace, struct evsel *evsel,
   2297			    union perf_event *event __maybe_unused,
   2298			    struct perf_sample *sample)
   2299{
   2300	char *msg;
   2301	void *args;
   2302	int printed = 0;
   2303	struct thread *thread;
   2304	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
   2305	int augmented_args_size = 0;
   2306	void *augmented_args = NULL;
   2307	struct syscall *sc = trace__syscall_info(trace, evsel, id);
   2308	struct thread_trace *ttrace;
   2309
   2310	if (sc == NULL)
   2311		return -1;
   2312
   2313	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
   2314	ttrace = thread__trace(thread, trace->output);
   2315	if (ttrace == NULL)
   2316		goto out_put;
   2317
   2318	trace__fprintf_sample(trace, evsel, sample, thread);
   2319
   2320	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
   2321
   2322	if (ttrace->entry_str == NULL) {
   2323		ttrace->entry_str = malloc(trace__entry_str_size);
   2324		if (!ttrace->entry_str)
   2325			goto out_put;
   2326	}
   2327
   2328	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
   2329		trace__printf_interrupted_entry(trace);
    2330	/*
    2331	 * If this is raw_syscalls.sys_enter, then it always comes with the 6
    2332	 * possible arguments, even if the syscall being handled, say "openat",
    2333	 * uses only 4. That breaks the syscall__augmented_args() check, as we
    2334	 * calculate syscall->args_size from each syscalls:sys_enter_NAME tracefs
    2335	 * format file, so for openat we would get 6 args when we expected just 4
    2336	 * and mistakenly take the extra 2 u64 args as the augmented filename.
    2337	 * So check here and avoid using augmented syscalls when the evsel is
    2338	 * the raw_syscalls one.
    2339	 */
   2340	if (evsel != trace->syscalls.events.sys_enter)
   2341		augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size);
   2342	ttrace->entry_time = sample->time;
   2343	msg = ttrace->entry_str;
   2344	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
   2345
   2346	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
   2347					   args, augmented_args, augmented_args_size, trace, thread);
   2348
   2349	if (sc->is_exit) {
   2350		if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
   2351			int alignment = 0;
   2352
   2353			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
   2354			printed = fprintf(trace->output, "%s)", ttrace->entry_str);
   2355			if (trace->args_alignment > printed)
   2356				alignment = trace->args_alignment - printed;
   2357			fprintf(trace->output, "%*s= ?\n", alignment, " ");
   2358		}
   2359	} else {
   2360		ttrace->entry_pending = true;
   2361		/* See trace__vfs_getname & trace__sys_exit */
   2362		ttrace->filename.pending_open = false;
   2363	}
   2364
   2365	if (trace->current != thread) {
   2366		thread__put(trace->current);
   2367		trace->current = thread__get(thread);
   2368	}
   2369	err = 0;
   2370out_put:
   2371	thread__put(thread);
   2372	return err;
   2373}
   2374
   2375static int trace__fprintf_sys_enter(struct trace *trace, struct evsel *evsel,
   2376				    struct perf_sample *sample)
   2377{
   2378	struct thread_trace *ttrace;
   2379	struct thread *thread;
   2380	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
   2381	struct syscall *sc = trace__syscall_info(trace, evsel, id);
   2382	char msg[1024];
   2383	void *args, *augmented_args = NULL;
   2384	int augmented_args_size;
   2385
   2386	if (sc == NULL)
   2387		return -1;
   2388
   2389	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
   2390	ttrace = thread__trace(thread, trace->output);
   2391	/*
   2392	 * We need to get ttrace just to make sure it is there when syscall__scnprintf_args()
    2393	 * and the rest of the beautifiers access it via struct syscall_arg.
   2394	 */
   2395	if (ttrace == NULL)
   2396		goto out_put;
   2397
   2398	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
   2399	augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size);
   2400	syscall__scnprintf_args(sc, msg, sizeof(msg), args, augmented_args, augmented_args_size, trace, thread);
   2401	fprintf(trace->output, "%s", msg);
   2402	err = 0;
   2403out_put:
   2404	thread__put(thread);
   2405	return err;
   2406}
   2407
   2408static int trace__resolve_callchain(struct trace *trace, struct evsel *evsel,
   2409				    struct perf_sample *sample,
   2410				    struct callchain_cursor *cursor)
   2411{
   2412	struct addr_location al;
   2413	int max_stack = evsel->core.attr.sample_max_stack ?
   2414			evsel->core.attr.sample_max_stack :
   2415			trace->max_stack;
   2416	int err;
   2417
   2418	if (machine__resolve(trace->host, &al, sample) < 0)
   2419		return -1;
   2420
   2421	err = thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack);
   2422	addr_location__put(&al);
   2423	return err;
   2424}
   2425
   2426static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
   2427{
   2428	/* TODO: user-configurable print_opts */
   2429	const unsigned int print_opts = EVSEL__PRINT_SYM |
   2430				        EVSEL__PRINT_DSO |
   2431				        EVSEL__PRINT_UNKNOWN_AS_ADDR;
   2432
   2433	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, symbol_conf.bt_stop_list, trace->output);
   2434}
   2435
   2436static const char *errno_to_name(struct evsel *evsel, int err)
   2437{
   2438	struct perf_env *env = evsel__env(evsel);
   2439	const char *arch_name = perf_env__arch(env);
   2440
   2441	return arch_syscalls__strerrno(arch_name, err);
   2442}
   2443
   2444static int trace__sys_exit(struct trace *trace, struct evsel *evsel,
   2445			   union perf_event *event __maybe_unused,
   2446			   struct perf_sample *sample)
   2447{
   2448	long ret;
   2449	u64 duration = 0;
   2450	bool duration_calculated = false;
   2451	struct thread *thread;
   2452	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0, printed = 0;
   2453	int alignment = trace->args_alignment;
   2454	struct syscall *sc = trace__syscall_info(trace, evsel, id);
   2455	struct thread_trace *ttrace;
   2456
   2457	if (sc == NULL)
   2458		return -1;
   2459
   2460	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
   2461	ttrace = thread__trace(thread, trace->output);
   2462	if (ttrace == NULL)
   2463		goto out_put;
   2464
   2465	trace__fprintf_sample(trace, evsel, sample, thread);
   2466
   2467	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
   2468
   2469	if (trace->summary)
   2470		thread__update_stats(thread, ttrace, id, sample, ret, trace->errno_summary);
   2471
   2472	if (!trace->fd_path_disabled && sc->is_open && ret >= 0 && ttrace->filename.pending_open) {
   2473		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
   2474		ttrace->filename.pending_open = false;
   2475		++trace->stats.vfs_getname;
   2476	}
   2477
   2478	if (ttrace->entry_time) {
   2479		duration = sample->time - ttrace->entry_time;
   2480		if (trace__filter_duration(trace, duration))
   2481			goto out;
   2482		duration_calculated = true;
   2483	} else if (trace->duration_filter)
   2484		goto out;
   2485
   2486	if (sample->callchain) {
   2487		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
   2488		if (callchain_ret == 0) {
   2489			if (callchain_cursor.nr < trace->min_stack)
   2490				goto out;
   2491			callchain_ret = 1;
   2492		}
   2493	}
   2494
   2495	if (trace->summary_only || (ret >= 0 && trace->failure_only))
   2496		goto out;
   2497
   2498	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
   2499
   2500	if (ttrace->entry_pending) {
   2501		printed = fprintf(trace->output, "%s", ttrace->entry_str);
   2502	} else {
   2503		printed += fprintf(trace->output, " ... [");
   2504		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
   2505		printed += 9;
   2506		printed += fprintf(trace->output, "]: %s()", sc->name);
   2507	}
   2508
   2509	printed++; /* the closing ')' */
   2510
   2511	if (alignment > printed)
   2512		alignment -= printed;
   2513	else
   2514		alignment = 0;
   2515
   2516	fprintf(trace->output, ")%*s= ", alignment, " ");
   2517
   2518	if (sc->fmt == NULL) {
   2519		if (ret < 0)
   2520			goto errno_print;
   2521signed_print:
   2522		fprintf(trace->output, "%ld", ret);
   2523	} else if (ret < 0) {
   2524errno_print: {
   2525		char bf[STRERR_BUFSIZE];
   2526		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
   2527			   *e = errno_to_name(evsel, -ret);
   2528
   2529		fprintf(trace->output, "-1 %s (%s)", e, emsg);
   2530	}
   2531	} else if (ret == 0 && sc->fmt->timeout)
   2532		fprintf(trace->output, "0 (Timeout)");
   2533	else if (ttrace->ret_scnprintf) {
   2534		char bf[1024];
   2535		struct syscall_arg arg = {
   2536			.val	= ret,
   2537			.thread	= thread,
   2538			.trace	= trace,
   2539		};
   2540		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
   2541		ttrace->ret_scnprintf = NULL;
   2542		fprintf(trace->output, "%s", bf);
   2543	} else if (sc->fmt->hexret)
   2544		fprintf(trace->output, "%#lx", ret);
   2545	else if (sc->fmt->errpid) {
   2546		struct thread *child = machine__find_thread(trace->host, ret, ret);
   2547
   2548		if (child != NULL) {
   2549			fprintf(trace->output, "%ld", ret);
   2550			if (child->comm_set)
   2551				fprintf(trace->output, " (%s)", thread__comm_str(child));
   2552			thread__put(child);
   2553		}
   2554	} else
   2555		goto signed_print;
   2556
   2557	fputc('\n', trace->output);
   2558
   2559	/*
   2560	 * We only consider an 'event' for the sake of --max-events a non-filtered
   2561	 * sys_enter + sys_exit and other tracepoint events.
   2562	 */
   2563	if (++trace->nr_events_printed == trace->max_events && trace->max_events != ULONG_MAX)
   2564		interrupted = true;
   2565
   2566	if (callchain_ret > 0)
   2567		trace__fprintf_callchain(trace, sample);
   2568	else if (callchain_ret < 0)
   2569		pr_err("Problem processing %s callchain, skipping...\n", evsel__name(evsel));
   2570out:
   2571	ttrace->entry_pending = false;
   2572	err = 0;
   2573out_put:
   2574	thread__put(thread);
   2575	return err;
   2576}
   2577
   2578static int trace__vfs_getname(struct trace *trace, struct evsel *evsel,
   2579			      union perf_event *event __maybe_unused,
   2580			      struct perf_sample *sample)
   2581{
   2582	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
   2583	struct thread_trace *ttrace;
   2584	size_t filename_len, entry_str_len, to_move;
   2585	ssize_t remaining_space;
   2586	char *pos;
   2587	const char *filename = evsel__rawptr(evsel, sample, "pathname");
   2588
   2589	if (!thread)
   2590		goto out;
   2591
   2592	ttrace = thread__priv(thread);
   2593	if (!ttrace)
   2594		goto out_put;
   2595
   2596	filename_len = strlen(filename);
   2597	if (filename_len == 0)
   2598		goto out_put;
   2599
   2600	if (ttrace->filename.namelen < filename_len) {
   2601		char *f = realloc(ttrace->filename.name, filename_len + 1);
   2602
   2603		if (f == NULL)
   2604			goto out_put;
   2605
   2606		ttrace->filename.namelen = filename_len;
   2607		ttrace->filename.name = f;
   2608	}
   2609
   2610	strcpy(ttrace->filename.name, filename);
   2611	ttrace->filename.pending_open = true;
   2612
   2613	if (!ttrace->filename.ptr)
   2614		goto out_put;
   2615
   2616	entry_str_len = strlen(ttrace->entry_str);
   2617	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
   2618	if (remaining_space <= 0)
   2619		goto out_put;
   2620
   2621	if (filename_len > (size_t)remaining_space) {
   2622		filename += filename_len - remaining_space;
   2623		filename_len = remaining_space;
   2624	}
   2625
   2626	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
   2627	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
   2628	memmove(pos + filename_len, pos, to_move);
   2629	memcpy(pos, filename, filename_len);
   2630
   2631	ttrace->filename.ptr = 0;
   2632	ttrace->filename.entry_str_pos = 0;
   2633out_put:
   2634	thread__put(thread);
   2635out:
   2636	return 0;
   2637}
   2638
   2639static int trace__sched_stat_runtime(struct trace *trace, struct evsel *evsel,
   2640				     union perf_event *event __maybe_unused,
   2641				     struct perf_sample *sample)
   2642{
    2643	u64 runtime = evsel__intval(evsel, sample, "runtime");
   2644	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
   2645	struct thread *thread = machine__findnew_thread(trace->host,
   2646							sample->pid,
   2647							sample->tid);
   2648	struct thread_trace *ttrace = thread__trace(thread, trace->output);
   2649
   2650	if (ttrace == NULL)
   2651		goto out_dump;
   2652
   2653	ttrace->runtime_ms += runtime_ms;
   2654	trace->runtime_ms += runtime_ms;
   2655out_put:
   2656	thread__put(thread);
   2657	return 0;
   2658
   2659out_dump:
    2660	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 "\n",
   2661	       evsel->name,
   2662	       evsel__strval(evsel, sample, "comm"),
   2663	       (pid_t)evsel__intval(evsel, sample, "pid"),
   2664	       runtime,
   2665	       evsel__intval(evsel, sample, "vruntime"));
   2666	goto out_put;
   2667}
   2668
   2669static int bpf_output__printer(enum binary_printer_ops op,
   2670			       unsigned int val, void *extra __maybe_unused, FILE *fp)
   2671{
   2672	unsigned char ch = (unsigned char)val;
   2673
   2674	switch (op) {
   2675	case BINARY_PRINT_CHAR_DATA:
   2676		return fprintf(fp, "%c", isprint(ch) ? ch : '.');
   2677	case BINARY_PRINT_DATA_BEGIN:
   2678	case BINARY_PRINT_LINE_BEGIN:
   2679	case BINARY_PRINT_ADDR:
   2680	case BINARY_PRINT_NUM_DATA:
   2681	case BINARY_PRINT_NUM_PAD:
   2682	case BINARY_PRINT_SEP:
   2683	case BINARY_PRINT_CHAR_PAD:
   2684	case BINARY_PRINT_LINE_END:
   2685	case BINARY_PRINT_DATA_END:
   2686	default:
   2687		break;
   2688	}
   2689
   2690	return 0;
   2691}
   2692
   2693static void bpf_output__fprintf(struct trace *trace,
   2694				struct perf_sample *sample)
   2695{
   2696	binary__fprintf(sample->raw_data, sample->raw_size, 8,
   2697			bpf_output__printer, NULL, trace->output);
   2698	++trace->nr_events_printed;
   2699}
   2700
   2701static size_t trace__fprintf_tp_fields(struct trace *trace, struct evsel *evsel, struct perf_sample *sample,
   2702				       struct thread *thread, void *augmented_args, int augmented_args_size)
   2703{
   2704	char bf[2048];
   2705	size_t size = sizeof(bf);
   2706	struct tep_format_field *field = evsel->tp_format->format.fields;
   2707	struct syscall_arg_fmt *arg = __evsel__syscall_arg_fmt(evsel);
   2708	size_t printed = 0;
   2709	unsigned long val;
   2710	u8 bit = 1;
   2711	struct syscall_arg syscall_arg = {
   2712		.augmented = {
   2713			.size = augmented_args_size,
   2714			.args = augmented_args,
   2715		},
   2716		.idx	= 0,
   2717		.mask	= 0,
   2718		.trace  = trace,
   2719		.thread = thread,
   2720		.show_string_prefix = trace->show_string_prefix,
   2721	};
   2722
   2723	for (; field && arg; field = field->next, ++syscall_arg.idx, bit <<= 1, ++arg) {
   2724		if (syscall_arg.mask & bit)
   2725			continue;
   2726
   2727		syscall_arg.len = 0;
   2728		syscall_arg.fmt = arg;
   2729		if (field->flags & TEP_FIELD_IS_ARRAY) {
   2730			int offset = field->offset;
   2731
   2732			if (field->flags & TEP_FIELD_IS_DYNAMIC) {
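				/*
				 * __data_loc fields: the 32-bit value in the
				 * payload packs the data length in the high
				 * 16 bits and its offset in the low 16 bits.
				 */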
   2733				offset = format_field__intval(field, sample, evsel->needs_swap);
   2734				syscall_arg.len = offset >> 16;
   2735				offset &= 0xffff;
   2736				if (field->flags & TEP_FIELD_IS_RELATIVE)
   2737					offset += field->offset + field->size;
   2738			}
   2739
   2740			val = (uintptr_t)(sample->raw_data + offset);
   2741		} else
   2742			val = format_field__intval(field, sample, evsel->needs_swap);
   2743		/*
   2744		 * Some syscall args need some mask, most don't and
   2745		 * return val untouched.
   2746		 */
   2747		val = syscall_arg_fmt__mask_val(arg, &syscall_arg, val);
   2748
    2749		/*
    2750		 * Suppress this argument if its value is zero
    2751		 * and we don't have a string associated with
    2752		 * it in a strarray.
    2753		 */
   2754		if (val == 0 &&
   2755		    !trace->show_zeros &&
   2756		    !((arg->show_zero ||
   2757		       arg->scnprintf == SCA_STRARRAY ||
   2758		       arg->scnprintf == SCA_STRARRAYS) &&
   2759		      arg->parm))
   2760			continue;
   2761
   2762		printed += scnprintf(bf + printed, size - printed, "%s", printed ? ", " : "");
   2763
   2764		/*
   2765		 * XXX Perhaps we should have a show_tp_arg_names,
   2766		 * leaving show_arg_names just for syscalls?
   2767		 */
   2768		if (1 || trace->show_arg_names)
   2769			printed += scnprintf(bf + printed, size - printed, "%s: ", field->name);
   2770
   2771		printed += syscall_arg_fmt__scnprintf_val(arg, bf + printed, size - printed, &syscall_arg, val);
   2772	}
   2773
   2774	return printed + fprintf(trace->output, "%s", bf);
   2775}
   2776
   2777static int trace__event_handler(struct trace *trace, struct evsel *evsel,
   2778				union perf_event *event __maybe_unused,
   2779				struct perf_sample *sample)
   2780{
   2781	struct thread *thread;
   2782	int callchain_ret = 0;
   2783	/*
   2784	 * Check if we called perf_evsel__disable(evsel) due to, for instance,
   2785	 * this event's max_events having been hit and this is an entry coming
   2786	 * from the ring buffer that we should discard, since the max events
   2787	 * have already been considered/printed.
   2788	 */
   2789	if (evsel->disabled)
   2790		return 0;
   2791
   2792	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
   2793
   2794	if (sample->callchain) {
   2795		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
   2796		if (callchain_ret == 0) {
   2797			if (callchain_cursor.nr < trace->min_stack)
   2798				goto out;
   2799			callchain_ret = 1;
   2800		}
   2801	}
   2802
   2803	trace__printf_interrupted_entry(trace);
   2804	trace__fprintf_tstamp(trace, sample->time, trace->output);
   2805
   2806	if (trace->trace_syscalls && trace->show_duration)
   2807		fprintf(trace->output, "(         ): ");
   2808
   2809	if (thread)
   2810		trace__fprintf_comm_tid(trace, thread, trace->output);
   2811
   2812	if (evsel == trace->syscalls.events.augmented) {
   2813		int id = perf_evsel__sc_tp_uint(evsel, id, sample);
   2814		struct syscall *sc = trace__syscall_info(trace, evsel, id);
   2815
   2816		if (sc) {
   2817			fprintf(trace->output, "%s(", sc->name);
   2818			trace__fprintf_sys_enter(trace, evsel, sample);
   2819			fputc(')', trace->output);
   2820			goto newline;
   2821		}
   2822
   2823		/*
   2824		 * XXX: Not having the associated syscall info or not finding/adding
   2825		 * 	the thread should never happen, but if it does...
   2826		 * 	fall thru and print it as a bpf_output event.
   2827		 */
   2828	}
   2829
   2830	fprintf(trace->output, "%s(", evsel->name);
   2831
   2832	if (evsel__is_bpf_output(evsel)) {
   2833		bpf_output__fprintf(trace, sample);
   2834	} else if (evsel->tp_format) {
   2835		if (strncmp(evsel->tp_format->name, "sys_enter_", 10) ||
   2836		    trace__fprintf_sys_enter(trace, evsel, sample)) {
   2837			if (trace->libtraceevent_print) {
   2838				event_format__fprintf(evsel->tp_format, sample->cpu,
   2839						      sample->raw_data, sample->raw_size,
   2840						      trace->output);
   2841			} else {
   2842				trace__fprintf_tp_fields(trace, evsel, sample, thread, NULL, 0);
   2843			}
   2844		}
   2845	}
   2846
   2847newline:
   2848	fprintf(trace->output, ")\n");
   2849
   2850	if (callchain_ret > 0)
   2851		trace__fprintf_callchain(trace, sample);
   2852	else if (callchain_ret < 0)
   2853		pr_err("Problem processing %s callchain, skipping...\n", evsel__name(evsel));
   2854
   2855	++trace->nr_events_printed;
   2856
   2857	if (evsel->max_events != ULONG_MAX && ++evsel->nr_events_printed == evsel->max_events) {
   2858		evsel__disable(evsel);
   2859		evsel__close(evsel);
   2860	}
   2861out:
   2862	thread__put(thread);
   2863	return 0;
   2864}
   2865
   2866static void print_location(FILE *f, struct perf_sample *sample,
   2867			   struct addr_location *al,
   2868			   bool print_dso, bool print_sym)
   2869{
   2870
   2871	if ((verbose > 0 || print_dso) && al->map)
   2872		fprintf(f, "%s@", al->map->dso->long_name);
   2873
   2874	if ((verbose > 0 || print_sym) && al->sym)
   2875		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
   2876			al->addr - al->sym->start);
   2877	else if (al->map)
   2878		fprintf(f, "0x%" PRIx64, al->addr);
   2879	else
   2880		fprintf(f, "0x%" PRIx64, sample->addr);
   2881}
   2882
   2883static int trace__pgfault(struct trace *trace,
   2884			  struct evsel *evsel,
   2885			  union perf_event *event __maybe_unused,
   2886			  struct perf_sample *sample)
   2887{
   2888	struct thread *thread;
   2889	struct addr_location al;
   2890	char map_type = 'd';
   2891	struct thread_trace *ttrace;
   2892	int err = -1;
   2893	int callchain_ret = 0;
   2894
   2895	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
   2896
   2897	if (sample->callchain) {
   2898		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
   2899		if (callchain_ret == 0) {
   2900			if (callchain_cursor.nr < trace->min_stack)
   2901				goto out_put;
   2902			callchain_ret = 1;
   2903		}
   2904	}
   2905
   2906	ttrace = thread__trace(thread, trace->output);
   2907	if (ttrace == NULL)
   2908		goto out_put;
   2909
   2910	if (evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
   2911		ttrace->pfmaj++;
   2912	else
   2913		ttrace->pfmin++;
   2914
   2915	if (trace->summary_only)
   2916		goto out;
   2917
   2918	thread__find_symbol(thread, sample->cpumode, sample->ip, &al);
   2919
   2920	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
   2921
   2922	fprintf(trace->output, "%sfault [",
   2923		evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
   2924		"maj" : "min");
   2925
   2926	print_location(trace->output, sample, &al, false, true);
   2927
   2928	fprintf(trace->output, "] => ");
   2929
   2930	thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
   2931
   2932	if (!al.map) {
   2933		thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
   2934
   2935		if (al.map)
   2936			map_type = 'x';
   2937		else
   2938			map_type = '?';
   2939	}
   2940
   2941	print_location(trace->output, sample, &al, true, false);
   2942
   2943	fprintf(trace->output, " (%c%c)\n", map_type, al.level);
   2944
   2945	if (callchain_ret > 0)
   2946		trace__fprintf_callchain(trace, sample);
   2947	else if (callchain_ret < 0)
   2948		pr_err("Problem processing %s callchain, skipping...\n", evsel__name(evsel));
   2949
   2950	++trace->nr_events_printed;
   2951out:
   2952	err = 0;
   2953out_put:
   2954	thread__put(thread);
   2955	return err;
   2956}
   2957
   2958static void trace__set_base_time(struct trace *trace,
   2959				 struct evsel *evsel,
   2960				 struct perf_sample *sample)
   2961{
   2962	/*
   2963	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
   2964	 * and don't use sample->time unconditionally, we may end up having
   2965	 * some other event in the future without PERF_SAMPLE_TIME for good
   2966	 * reason, i.e. we may not be interested in its timestamps, just in
   2967	 * it taking place, picking some piece of information when it
   2968	 * appears in our event stream (vfs_getname comes to mind).
   2969	 */
   2970	if (trace->base_time == 0 && !trace->full_time &&
   2971	    (evsel->core.attr.sample_type & PERF_SAMPLE_TIME))
   2972		trace->base_time = sample->time;
   2973}
   2974
   2975static int trace__process_sample(struct perf_tool *tool,
   2976				 union perf_event *event,
   2977				 struct perf_sample *sample,
   2978				 struct evsel *evsel,
   2979				 struct machine *machine __maybe_unused)
   2980{
   2981	struct trace *trace = container_of(tool, struct trace, tool);
   2982	struct thread *thread;
   2983	int err = 0;
   2984
   2985	tracepoint_handler handler = evsel->handler;
   2986
   2987	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
   2988	if (thread && thread__is_filtered(thread))
   2989		goto out;
   2990
   2991	trace__set_base_time(trace, evsel, sample);
   2992
   2993	if (handler) {
   2994		++trace->nr_events;
   2995		handler(trace, evsel, event, sample);
   2996	}
   2997out:
   2998	thread__put(thread);
   2999	return err;
   3000}
   3001
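        /*
         * Forwards to 'perf record' with a synthesized command line; with
         * syscall tracing on, it ends up being something like:
         *
         *   perf record -R -m 1024 -c 1 \
         *       -e raw_syscalls:sys_enter,raw_syscalls:sys_exit \
         *       --filter <expression excluding perf's own pid> <workload args>
         */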
   3002static int trace__record(struct trace *trace, int argc, const char **argv)
   3003{
   3004	unsigned int rec_argc, i, j;
   3005	const char **rec_argv;
   3006	const char * const record_args[] = {
   3007		"record",
   3008		"-R",
   3009		"-m", "1024",
   3010		"-c", "1",
   3011	};
   3012	pid_t pid = getpid();
   3013	char *filter = asprintf__tp_filter_pids(1, &pid);
   3014	const char * const sc_args[] = { "-e", };
   3015	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
   3016	const char * const majpf_args[] = { "-e", "major-faults" };
   3017	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
   3018	const char * const minpf_args[] = { "-e", "minor-faults" };
   3019	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
   3020	int err = -1;
   3021
   3022	/* +3 is for the event string below and the pid filter */
   3023	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 3 +
   3024		majpf_args_nr + minpf_args_nr + argc;
   3025	rec_argv = calloc(rec_argc + 1, sizeof(char *));
   3026
   3027	if (rec_argv == NULL || filter == NULL)
   3028		goto out_free;
   3029
   3030	j = 0;
   3031	for (i = 0; i < ARRAY_SIZE(record_args); i++)
   3032		rec_argv[j++] = record_args[i];
   3033
   3034	if (trace->trace_syscalls) {
   3035		for (i = 0; i < sc_args_nr; i++)
   3036			rec_argv[j++] = sc_args[i];
   3037
   3038		/* event string may be different for older kernels - e.g., RHEL6 */
   3039		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
   3040			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
   3041		else if (is_valid_tracepoint("syscalls:sys_enter"))
   3042			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
   3043		else {
   3044			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
   3045			goto out_free;
   3046		}
   3047	}
   3048
   3049	rec_argv[j++] = "--filter";
   3050	rec_argv[j++] = filter;
   3051
   3052	if (trace->trace_pgfaults & TRACE_PFMAJ)
   3053		for (i = 0; i < majpf_args_nr; i++)
   3054			rec_argv[j++] = majpf_args[i];
   3055
   3056	if (trace->trace_pgfaults & TRACE_PFMIN)
   3057		for (i = 0; i < minpf_args_nr; i++)
   3058			rec_argv[j++] = minpf_args[i];
   3059
   3060	for (i = 0; i < (unsigned int)argc; i++)
   3061		rec_argv[j++] = argv[i];
   3062
   3063	err = cmd_record(j, rec_argv);
   3064out_free:
   3065	free(filter);
   3066	free(rec_argv);
   3067	return err;
   3068}
   3069
   3070static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
   3071
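        /*
         * Hooks up a previously created "vfs_getname" probe that exposes a
         * "pathname" string, e.g. one set up with something like:
         *
         *   perf probe 'vfs_getname=getname_flags:72 pathname=result->name:string'
         *
         * (the exact line number varies across kernel versions).
         */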
   3072static bool evlist__add_vfs_getname(struct evlist *evlist)
   3073{
   3074	bool found = false;
   3075	struct evsel *evsel, *tmp;
   3076	struct parse_events_error err;
   3077	int ret;
   3078
   3079	parse_events_error__init(&err);
   3080	ret = parse_events(evlist, "probe:vfs_getname*", &err);
   3081	parse_events_error__exit(&err);
   3082	if (ret)
   3083		return false;
   3084
   3085	evlist__for_each_entry_safe(evlist, evsel, tmp) {
   3086		if (!strstarts(evsel__name(evsel), "probe:vfs_getname"))
   3087			continue;
   3088
   3089		if (evsel__field(evsel, "pathname")) {
   3090			evsel->handler = trace__vfs_getname;
   3091			found = true;
   3092			continue;
   3093		}
   3094
   3095		list_del_init(&evsel->core.node);
   3096		evsel->evlist = NULL;
   3097		evsel__delete(evsel);
   3098	}
   3099
   3100	return found;
   3101}
   3102
   3103static struct evsel *evsel__new_pgfault(u64 config)
   3104{
   3105	struct evsel *evsel;
   3106	struct perf_event_attr attr = {
   3107		.type = PERF_TYPE_SOFTWARE,
   3108		.mmap_data = 1,
   3109	};
   3110
   3111	attr.config = config;
   3112	attr.sample_period = 1;
   3113
   3114	event_attr_init(&attr);
   3115
   3116	evsel = evsel__new(&attr);
   3117	if (evsel)
   3118		evsel->handler = trace__pgfault;
   3119
   3120	return evsel;
   3121}
   3122
   3123static void evlist__free_syscall_tp_fields(struct evlist *evlist)
   3124{
   3125	struct evsel *evsel;
   3126
   3127	evlist__for_each_entry(evlist, evsel) {
   3128		struct evsel_trace *et = evsel->priv;
   3129
   3130		if (!et || !evsel->tp_format || strcmp(evsel->tp_format->system, "syscalls"))
   3131			continue;
   3132
   3133		free(et->fmt);
   3134		free(et);
   3135	}
   3136}
   3137
   3138static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
   3139{
   3140	const u32 type = event->header.type;
   3141	struct evsel *evsel;
   3142
   3143	if (type != PERF_RECORD_SAMPLE) {
   3144		trace__process_event(trace, trace->host, event, sample);
   3145		return;
   3146	}
   3147
   3148	evsel = evlist__id2evsel(trace->evlist, sample->id);
   3149	if (evsel == NULL) {
   3150		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
   3151		return;
   3152	}
   3153
   3154	if (evswitch__discard(&trace->evswitch, evsel))
   3155		return;
   3156
   3157	trace__set_base_time(trace, evsel, sample);
   3158
   3159	if (evsel->core.attr.type == PERF_TYPE_TRACEPOINT &&
   3160	    sample->raw_data == NULL) {
   3161		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
   3162		       evsel__name(evsel), sample->tid,
   3163		       sample->cpu, sample->raw_size);
   3164	} else {
   3165		tracepoint_handler handler = evsel->handler;
   3166		handler(trace, evsel, event, sample);
   3167	}
   3168
   3169	if (trace->nr_events_printed >= trace->max_events && trace->max_events != ULONG_MAX)
   3170		interrupted = true;
   3171}
   3172
   3173static int trace__add_syscall_newtp(struct trace *trace)
   3174{
   3175	int ret = -1;
   3176	struct evlist *evlist = trace->evlist;
   3177	struct evsel *sys_enter, *sys_exit;
   3178
   3179	sys_enter = perf_evsel__raw_syscall_newtp("sys_enter", trace__sys_enter);
   3180	if (sys_enter == NULL)
   3181		goto out;
   3182
   3183	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
   3184		goto out_delete_sys_enter;
   3185
   3186	sys_exit = perf_evsel__raw_syscall_newtp("sys_exit", trace__sys_exit);
   3187	if (sys_exit == NULL)
   3188		goto out_delete_sys_enter;
   3189
   3190	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
   3191		goto out_delete_sys_exit;
   3192
   3193	evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
   3194	evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);
   3195
   3196	evlist__add(evlist, sys_enter);
   3197	evlist__add(evlist, sys_exit);
   3198
   3199	if (callchain_param.enabled && !trace->kernel_syscallchains) {
   3200		/*
    3201		 * We're interested only in the user space callchain
    3202		 * leading to the syscall; allow overriding that for
    3203		 * debugging reasons using --kernel-syscall-graph
   3204		 */
   3205		sys_exit->core.attr.exclude_callchain_kernel = 1;
   3206	}
   3207
   3208	trace->syscalls.events.sys_enter = sys_enter;
   3209	trace->syscalls.events.sys_exit  = sys_exit;
   3210
   3211	ret = 0;
   3212out:
   3213	return ret;
   3214
   3215out_delete_sys_exit:
   3216	evsel__delete_priv(sys_exit);
   3217out_delete_sys_enter:
   3218	evsel__delete_priv(sys_enter);
   3219	goto out;
   3220}
   3221
   3222static int trace__set_ev_qualifier_tp_filter(struct trace *trace)
   3223{
   3224	int err = -1;
   3225	struct evsel *sys_exit;
   3226	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
   3227						trace->ev_qualifier_ids.nr,
   3228						trace->ev_qualifier_ids.entries);
   3229
   3230	if (filter == NULL)
   3231		goto out_enomem;
   3232
   3233	if (!evsel__append_tp_filter(trace->syscalls.events.sys_enter, filter)) {
   3234		sys_exit = trace->syscalls.events.sys_exit;
   3235		err = evsel__append_tp_filter(sys_exit, filter);
   3236	}
   3237
   3238	free(filter);
   3239out:
   3240	return err;
   3241out_enomem:
   3242	errno = ENOMEM;
   3243	goto out;
   3244}
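
/*
 * Example (illustrative; syscall ids are arch specific, x86_64 shown):
 * 'perf trace -e open,openat' ends up appending a tracepoint filter like
 *
 *	id == 2 || id == 257
 *
 * to both raw_syscalls:sys_enter and raw_syscalls:sys_exit, while the
 * negated qualifier ('perf trace -e \!open,openat') generates
 * "id != 2 && id != 257" instead.
 */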
   3245
   3246#ifdef HAVE_LIBBPF_SUPPORT
   3247static struct bpf_map *trace__find_bpf_map_by_name(struct trace *trace, const char *name)
   3248{
   3249	if (trace->bpf_obj == NULL)
   3250		return NULL;
   3251
   3252	return bpf_object__find_map_by_name(trace->bpf_obj, name);
   3253}
   3254
   3255static void trace__set_bpf_map_filtered_pids(struct trace *trace)
   3256{
   3257	trace->filter_pids.map = trace__find_bpf_map_by_name(trace, "pids_filtered");
   3258}
   3259
   3260static void trace__set_bpf_map_syscalls(struct trace *trace)
   3261{
   3262	trace->syscalls.map = trace__find_bpf_map_by_name(trace, "syscalls");
   3263	trace->syscalls.prog_array.sys_enter = trace__find_bpf_map_by_name(trace, "syscalls_sys_enter");
   3264	trace->syscalls.prog_array.sys_exit  = trace__find_bpf_map_by_name(trace, "syscalls_sys_exit");
   3265}
   3266
   3267static struct bpf_program *trace__find_bpf_program_by_title(struct trace *trace, const char *name)
   3268{
   3269	struct bpf_program *pos, *prog = NULL;
   3270	const char *sec_name;
   3271
   3272	if (trace->bpf_obj == NULL)
   3273		return NULL;
   3274
   3275	bpf_object__for_each_program(pos, trace->bpf_obj) {
   3276		sec_name = bpf_program__section_name(pos);
   3277		if (sec_name && !strcmp(sec_name, name)) {
   3278			prog = pos;
   3279			break;
   3280		}
   3281	}
   3282
   3283	return prog;
   3284}
   3285
   3286static struct bpf_program *trace__find_syscall_bpf_prog(struct trace *trace, struct syscall *sc,
   3287							const char *prog_name, const char *type)
   3288{
   3289	struct bpf_program *prog;
   3290
   3291	if (prog_name == NULL) {
   3292		char default_prog_name[256];
   3293		scnprintf(default_prog_name, sizeof(default_prog_name), "!syscalls:sys_%s_%s", type, sc->name);
   3294		prog = trace__find_bpf_program_by_title(trace, default_prog_name);
   3295		if (prog != NULL)
   3296			goto out_found;
   3297		if (sc->fmt && sc->fmt->alias) {
   3298			scnprintf(default_prog_name, sizeof(default_prog_name), "!syscalls:sys_%s_%s", type, sc->fmt->alias);
   3299			prog = trace__find_bpf_program_by_title(trace, default_prog_name);
   3300			if (prog != NULL)
   3301				goto out_found;
   3302		}
   3303		goto out_unaugmented;
   3304	}
   3305
   3306	prog = trace__find_bpf_program_by_title(trace, prog_name);
   3307
   3308	if (prog != NULL) {
   3309out_found:
   3310		return prog;
   3311	}
   3312
   3313	pr_debug("Couldn't find BPF prog \"%s\" to associate with syscalls:sys_%s_%s, not augmenting it\n",
   3314		 prog_name, type, sc->name);
   3315out_unaugmented:
   3316	return trace->syscalls.unaugmented_prog;
   3317}
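
/*
 * The "!syscalls:sys_%s_%s" naming scheme used above matches the ELF
 * section names given to the augmenters in their source, along the lines
 * of (a sketch, following tools/perf/examples/bpf/augmented_raw_syscalls.c):
 *
 *	SEC("!syscalls:sys_enter_openat")
 *	int sys_enter_openat(struct syscall_enter_args *args)
 *	{
 *		// collect the pathname payload, then pass the event on
 *	}
 */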
   3318
   3319static void trace__init_syscall_bpf_progs(struct trace *trace, int id)
   3320{
   3321	struct syscall *sc = trace__syscall_info(trace, NULL, id);
   3322
   3323	if (sc == NULL)
   3324		return;
   3325
   3326	sc->bpf_prog.sys_enter = trace__find_syscall_bpf_prog(trace, sc, sc->fmt ? sc->fmt->bpf_prog_name.sys_enter : NULL, "enter");
   3327	sc->bpf_prog.sys_exit  = trace__find_syscall_bpf_prog(trace, sc, sc->fmt ? sc->fmt->bpf_prog_name.sys_exit  : NULL,  "exit");
   3328}
   3329
   3330static int trace__bpf_prog_sys_enter_fd(struct trace *trace, int id)
   3331{
   3332	struct syscall *sc = trace__syscall_info(trace, NULL, id);
   3333	return sc ? bpf_program__fd(sc->bpf_prog.sys_enter) : bpf_program__fd(trace->syscalls.unaugmented_prog);
   3334}
   3335
   3336static int trace__bpf_prog_sys_exit_fd(struct trace *trace, int id)
   3337{
   3338	struct syscall *sc = trace__syscall_info(trace, NULL, id);
   3339	return sc ? bpf_program__fd(sc->bpf_prog.sys_exit) : bpf_program__fd(trace->syscalls.unaugmented_prog);
   3340}
   3341
   3342static void trace__init_bpf_map_syscall_args(struct trace *trace, int id, struct bpf_map_syscall_entry *entry)
   3343{
   3344	struct syscall *sc = trace__syscall_info(trace, NULL, id);
   3345	int arg = 0;
   3346
   3347	if (sc == NULL)
   3348		goto out;
   3349
   3350	for (; arg < sc->nr_args; ++arg) {
   3351		entry->string_args_len[arg] = 0;
   3352		if (sc->arg_fmt[arg].scnprintf == SCA_FILENAME) {
   3353			/* Should be set like strace -s strsize */
   3354			entry->string_args_len[arg] = PATH_MAX;
   3355		}
   3356	}
   3357out:
   3358	for (; arg < 6; ++arg)
   3359		entry->string_args_len[arg] = 0;
    3360}

   3361static int trace__set_ev_qualifier_bpf_filter(struct trace *trace)
   3362{
   3363	int fd = bpf_map__fd(trace->syscalls.map);
   3364	struct bpf_map_syscall_entry value = {
   3365		.enabled = !trace->not_ev_qualifier,
   3366	};
   3367	int err = 0;
   3368	size_t i;
   3369
   3370	for (i = 0; i < trace->ev_qualifier_ids.nr; ++i) {
   3371		int key = trace->ev_qualifier_ids.entries[i];
   3372
   3373		if (value.enabled) {
   3374			trace__init_bpf_map_syscall_args(trace, key, &value);
   3375			trace__init_syscall_bpf_progs(trace, key);
   3376		}
   3377
   3378		err = bpf_map_update_elem(fd, &key, &value, BPF_EXIST);
   3379		if (err)
   3380			break;
   3381	}
   3382
   3383	return err;
   3384}
   3385
   3386static int __trace__init_syscalls_bpf_map(struct trace *trace, bool enabled)
   3387{
   3388	int fd = bpf_map__fd(trace->syscalls.map);
   3389	struct bpf_map_syscall_entry value = {
   3390		.enabled = enabled,
   3391	};
   3392	int err = 0, key;
   3393
   3394	for (key = 0; key < trace->sctbl->syscalls.nr_entries; ++key) {
   3395		if (enabled)
   3396			trace__init_bpf_map_syscall_args(trace, key, &value);
   3397
   3398		err = bpf_map_update_elem(fd, &key, &value, BPF_ANY);
   3399		if (err)
   3400			break;
   3401	}
   3402
   3403	return err;
   3404}
   3405
   3406static int trace__init_syscalls_bpf_map(struct trace *trace)
   3407{
   3408	bool enabled = true;
   3409
   3410	if (trace->ev_qualifier_ids.nr)
   3411		enabled = trace->not_ev_qualifier;
   3412
   3413	return __trace__init_syscalls_bpf_map(trace, enabled);
   3414}
   3415
   3416static struct bpf_program *trace__find_usable_bpf_prog_entry(struct trace *trace, struct syscall *sc)
   3417{
   3418	struct tep_format_field *field, *candidate_field;
   3419	int id;
   3420
   3421	/*
   3422	 * We're only interested in syscalls that have a pointer:
   3423	 */
   3424	for (field = sc->args; field; field = field->next) {
   3425		if (field->flags & TEP_FIELD_IS_POINTER)
   3426			goto try_to_find_pair;
   3427	}
   3428
   3429	return NULL;
   3430
   3431try_to_find_pair:
   3432	for (id = 0; id < trace->sctbl->syscalls.nr_entries; ++id) {
   3433		struct syscall *pair = trace__syscall_info(trace, NULL, id);
   3434		struct bpf_program *pair_prog;
   3435		bool is_candidate = false;
   3436
   3437		if (pair == NULL || pair == sc ||
   3438		    pair->bpf_prog.sys_enter == trace->syscalls.unaugmented_prog)
   3439			continue;
   3440
   3441		for (field = sc->args, candidate_field = pair->args;
   3442		     field && candidate_field; field = field->next, candidate_field = candidate_field->next) {
   3443			bool is_pointer = field->flags & TEP_FIELD_IS_POINTER,
   3444			     candidate_is_pointer = candidate_field->flags & TEP_FIELD_IS_POINTER;
   3445
   3446			if (is_pointer) {
   3447			       if (!candidate_is_pointer) {
    3448					// The candidate just doesn't copy our pointer arg; it might copy other pointers we want.
   3449					continue;
   3450			       }
   3451			} else {
   3452				if (candidate_is_pointer) {
   3453					// The candidate might copy a pointer we don't have, skip it.
   3454					goto next_candidate;
   3455				}
   3456				continue;
   3457			}
   3458
   3459			if (strcmp(field->type, candidate_field->type))
   3460				goto next_candidate;
   3461
   3462			is_candidate = true;
   3463		}
   3464
   3465		if (!is_candidate)
   3466			goto next_candidate;
   3467
   3468		/*
    3469		 * Check if the tentative pair syscall augmenter has more pointers;
    3470		 * if it does, it may be collecting those too, and then we can't use
    3471		 * it, as it would collect more than what is common to the two syscalls.
   3472		 */
   3473		if (candidate_field) {
   3474			for (candidate_field = candidate_field->next; candidate_field; candidate_field = candidate_field->next)
   3475				if (candidate_field->flags & TEP_FIELD_IS_POINTER)
   3476					goto next_candidate;
   3477		}
   3478
   3479		pair_prog = pair->bpf_prog.sys_enter;
   3480		/*
   3481		 * If the pair isn't enabled, then its bpf_prog.sys_enter will not
   3482		 * have been searched for, so search it here and if it returns the
   3483		 * unaugmented one, then ignore it, otherwise we'll reuse that BPF
   3484		 * program for a filtered syscall on a non-filtered one.
   3485		 *
   3486		 * For instance, we have "!syscalls:sys_enter_renameat" and that is
   3487		 * useful for "renameat2".
   3488		 */
   3489		if (pair_prog == NULL) {
   3490			pair_prog = trace__find_syscall_bpf_prog(trace, pair, pair->fmt ? pair->fmt->bpf_prog_name.sys_enter : NULL, "enter");
   3491			if (pair_prog == trace->syscalls.unaugmented_prog)
   3492				goto next_candidate;
   3493		}
   3494
   3495		pr_debug("Reusing \"%s\" BPF sys_enter augmenter for \"%s\"\n", pair->name, sc->name);
   3496		return pair_prog;
   3497	next_candidate:
   3498		continue;
   3499	}
   3500
   3501	return NULL;
   3502}
   3503
   3504static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace)
   3505{
   3506	int map_enter_fd = bpf_map__fd(trace->syscalls.prog_array.sys_enter),
   3507	    map_exit_fd  = bpf_map__fd(trace->syscalls.prog_array.sys_exit);
   3508	int err = 0, key;
   3509
   3510	for (key = 0; key < trace->sctbl->syscalls.nr_entries; ++key) {
   3511		int prog_fd;
   3512
   3513		if (!trace__syscall_enabled(trace, key))
   3514			continue;
   3515
   3516		trace__init_syscall_bpf_progs(trace, key);
   3517
    3518		// It'll get at least the "!raw_syscalls:unaugmented" program
   3519		prog_fd = trace__bpf_prog_sys_enter_fd(trace, key);
   3520		err = bpf_map_update_elem(map_enter_fd, &key, &prog_fd, BPF_ANY);
   3521		if (err)
   3522			break;
   3523		prog_fd = trace__bpf_prog_sys_exit_fd(trace, key);
   3524		err = bpf_map_update_elem(map_exit_fd, &key, &prog_fd, BPF_ANY);
   3525		if (err)
   3526			break;
   3527	}
   3528
   3529	/*
    3530	 * Now let's do a second pass looking for enabled syscalls without
    3531	 * an augmenter that have a signature that is a superset of another
    3532	 * syscall with an augmenter, so that we can auto-reuse it.
    3533	 *
    3534	 * I.e. if we have an augmenter for the "open" syscall that has
    3535	 * this signature:
    3536	 *
    3537	 *   int open(const char *pathname, int flags, mode_t mode);
    3538	 *
    3539	 * i.e. one that will collect just the first string argument, then we
    3540	 * can reuse it for the 'creat' syscall, which has this signature:
    3541	 *
    3542	 *   int creat(const char *pathname, mode_t mode);
    3543	 *
    3544	 * and for:
    3545	 *
    3546	 *   int stat(const char *pathname, struct stat *statbuf);
    3547	 *   int lstat(const char *pathname, struct stat *statbuf);
    3548	 *
    3549	 * because the 'open' augmenter will collect the first arg as a string
    3550	 * and leave all the other args alone, which already helps with
    3551	 * beautifying the pathname arg of 'stat' and 'lstat'.
    3552	 *
    3553	 * Then, in time, when 'stat' gets an augmenter that collects both
    3554	 * the first and second args (this one on the raw_syscalls:sys_exit
    3555	 * prog array tail call), that one will be used instead.
   3556	 */
   3557	for (key = 0; key < trace->sctbl->syscalls.nr_entries; ++key) {
   3558		struct syscall *sc = trace__syscall_info(trace, NULL, key);
   3559		struct bpf_program *pair_prog;
   3560		int prog_fd;
   3561
   3562		if (sc == NULL || sc->bpf_prog.sys_enter == NULL)
   3563			continue;
   3564
   3565		/*
   3566		 * For now we're just reusing the sys_enter prog, and if it
   3567		 * already has an augmenter, we don't need to find one.
   3568		 */
   3569		if (sc->bpf_prog.sys_enter != trace->syscalls.unaugmented_prog)
   3570			continue;
   3571
   3572		/*
   3573		 * Look at all the other syscalls for one that has a signature
   3574		 * that is close enough that we can share:
   3575		 */
   3576		pair_prog = trace__find_usable_bpf_prog_entry(trace, sc);
   3577		if (pair_prog == NULL)
   3578			continue;
   3579
   3580		sc->bpf_prog.sys_enter = pair_prog;
   3581
   3582		/*
   3583		 * Update the BPF_MAP_TYPE_PROG_SHARED for raw_syscalls:sys_enter
   3584		 * with the fd for the program we're reusing:
   3585		 */
   3586		prog_fd = bpf_program__fd(sc->bpf_prog.sys_enter);
   3587		err = bpf_map_update_elem(map_enter_fd, &key, &prog_fd, BPF_ANY);
   3588		if (err)
   3589			break;
   3590	}
    3591
   3593	return err;
   3594}
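
/*
 * On the BPF side the prog arrays filled above are consumed via tail
 * calls from the raw_syscalls:sys_{enter,exit} entry programs, roughly
 * (a sketch in the style of the augmented_raw_syscalls.c augmenter):
 *
 *	bpf_tail_call(args, &syscalls_sys_enter, syscall_nr);
 *
 * i.e. whatever prog fd was installed for a given syscall id above is
 * what ends up augmenting (or, for the unaugmented prog, just passing
 * through) that syscall's events.
 */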
   3595
   3596static void trace__delete_augmented_syscalls(struct trace *trace)
   3597{
   3598	struct evsel *evsel, *tmp;
   3599
   3600	evlist__remove(trace->evlist, trace->syscalls.events.augmented);
   3601	evsel__delete(trace->syscalls.events.augmented);
   3602	trace->syscalls.events.augmented = NULL;
   3603
   3604	evlist__for_each_entry_safe(trace->evlist, tmp, evsel) {
   3605		if (evsel->bpf_obj == trace->bpf_obj) {
   3606			evlist__remove(trace->evlist, evsel);
   3607			evsel__delete(evsel);
   3608		}
   3609
   3610	}
   3611
   3612	bpf_object__close(trace->bpf_obj);
   3613	trace->bpf_obj = NULL;
   3614}
   3615#else // HAVE_LIBBPF_SUPPORT
   3616static struct bpf_map *trace__find_bpf_map_by_name(struct trace *trace __maybe_unused,
   3617						   const char *name __maybe_unused)
   3618{
   3619	return NULL;
   3620}
   3621
   3622static void trace__set_bpf_map_filtered_pids(struct trace *trace __maybe_unused)
   3623{
   3624}
   3625
   3626static void trace__set_bpf_map_syscalls(struct trace *trace __maybe_unused)
   3627{
   3628}
   3629
   3630static int trace__set_ev_qualifier_bpf_filter(struct trace *trace __maybe_unused)
   3631{
   3632	return 0;
   3633}
   3634
   3635static int trace__init_syscalls_bpf_map(struct trace *trace __maybe_unused)
   3636{
   3637	return 0;
   3638}
   3639
   3640static struct bpf_program *trace__find_bpf_program_by_title(struct trace *trace __maybe_unused,
   3641							    const char *name __maybe_unused)
   3642{
   3643	return NULL;
   3644}
   3645
   3646static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace __maybe_unused)
   3647{
   3648	return 0;
   3649}
   3650
   3651static void trace__delete_augmented_syscalls(struct trace *trace __maybe_unused)
   3652{
   3653}
   3654#endif // HAVE_LIBBPF_SUPPORT
   3655
   3656static bool trace__only_augmented_syscalls_evsels(struct trace *trace)
   3657{
   3658	struct evsel *evsel;
   3659
   3660	evlist__for_each_entry(trace->evlist, evsel) {
   3661		if (evsel == trace->syscalls.events.augmented ||
   3662		    evsel->bpf_obj == trace->bpf_obj)
   3663			continue;
   3664
   3665		return false;
   3666	}
   3667
   3668	return true;
   3669}
   3670
   3671static int trace__set_ev_qualifier_filter(struct trace *trace)
   3672{
   3673	if (trace->syscalls.map)
   3674		return trace__set_ev_qualifier_bpf_filter(trace);
   3675	if (trace->syscalls.events.sys_enter)
   3676		return trace__set_ev_qualifier_tp_filter(trace);
   3677	return 0;
   3678}
   3679
   3680static int bpf_map__set_filter_pids(struct bpf_map *map __maybe_unused,
   3681				    size_t npids __maybe_unused, pid_t *pids __maybe_unused)
   3682{
   3683	int err = 0;
   3684#ifdef HAVE_LIBBPF_SUPPORT
   3685	bool value = true;
   3686	int map_fd = bpf_map__fd(map);
   3687	size_t i;
   3688
   3689	for (i = 0; i < npids; ++i) {
   3690		err = bpf_map_update_elem(map_fd, &pids[i], &value, BPF_ANY);
   3691		if (err)
   3692			break;
   3693	}
   3694#endif
   3695	return err;
   3696}
   3697
   3698static int trace__set_filter_loop_pids(struct trace *trace)
   3699{
   3700	unsigned int nr = 1, err;
   3701	pid_t pids[32] = {
   3702		getpid(),
   3703	};
   3704	struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
   3705
   3706	while (thread && nr < ARRAY_SIZE(pids)) {
   3707		struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
   3708
   3709		if (parent == NULL)
   3710			break;
   3711
   3712		if (!strcmp(thread__comm_str(parent), "sshd") ||
   3713		    strstarts(thread__comm_str(parent), "gnome-terminal")) {
   3714			pids[nr++] = parent->tid;
   3715			break;
   3716		}
   3717		thread = parent;
   3718	}
   3719
   3720	err = evlist__append_tp_filter_pids(trace->evlist, nr, pids);
   3721	if (!err && trace->filter_pids.map)
   3722		err = bpf_map__set_filter_pids(trace->filter_pids.map, nr, pids);
   3723
   3724	return err;
   3725}
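
/*
 * The tracepoint filter above covers the non-BPF case; when the augmenter
 * is loaded, the "pids_filtered" map additionally lets the BPF programs
 * drop these feedback-loop events in-kernel, along the lines of (sketch):
 *
 *	pid_t pid = bpf_get_current_pid_tgid() >> 32;
 *
 *	if (bpf_map_lookup_elem(&pids_filtered, &pid))
 *		return 0;	// perf trace itself (or its terminal), drop it
 */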
   3726
   3727static int trace__set_filter_pids(struct trace *trace)
   3728{
   3729	int err = 0;
   3730	/*
   3731	 * Better not use !target__has_task() here because we need to cover the
   3732	 * case where no threads were specified in the command line, but a
   3733	 * workload was, and in that case we will fill in the thread_map when
   3734	 * we fork the workload in evlist__prepare_workload.
   3735	 */
   3736	if (trace->filter_pids.nr > 0) {
   3737		err = evlist__append_tp_filter_pids(trace->evlist, trace->filter_pids.nr,
   3738						    trace->filter_pids.entries);
   3739		if (!err && trace->filter_pids.map) {
   3740			err = bpf_map__set_filter_pids(trace->filter_pids.map, trace->filter_pids.nr,
   3741						       trace->filter_pids.entries);
   3742		}
   3743	} else if (perf_thread_map__pid(trace->evlist->core.threads, 0) == -1) {
   3744		err = trace__set_filter_loop_pids(trace);
   3745	}
   3746
   3747	return err;
   3748}
   3749
   3750static int __trace__deliver_event(struct trace *trace, union perf_event *event)
   3751{
   3752	struct evlist *evlist = trace->evlist;
   3753	struct perf_sample sample;
   3754	int err = evlist__parse_sample(evlist, event, &sample);
   3755
   3756	if (err)
   3757		fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
   3758	else
   3759		trace__handle_event(trace, event, &sample);
   3760
   3761	return 0;
   3762}
   3763
   3764static int __trace__flush_events(struct trace *trace)
   3765{
   3766	u64 first = ordered_events__first_time(&trace->oe.data);
   3767	u64 flush = trace->oe.last - NSEC_PER_SEC;
   3768
    3769	/* Is there something to flush? */
   3770	if (first && first < flush)
   3771		return ordered_events__flush_time(&trace->oe.data, flush);
   3772
   3773	return 0;
   3774}
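
/*
 * I.e. with the 1 second reordering window above: if the newest queued
 * timestamp is at 5.3s, everything queued before 4.3s gets flushed, which
 * bounds how long events sitting in slower ring buffers can hold back the
 * time-sorted output.
 */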
   3775
   3776static int trace__flush_events(struct trace *trace)
   3777{
   3778	return !trace->sort_events ? 0 : __trace__flush_events(trace);
   3779}
   3780
   3781static int trace__deliver_event(struct trace *trace, union perf_event *event)
   3782{
   3783	int err;
   3784
   3785	if (!trace->sort_events)
   3786		return __trace__deliver_event(trace, event);
   3787
   3788	err = evlist__parse_sample_timestamp(trace->evlist, event, &trace->oe.last);
   3789	if (err && err != -1)
   3790		return err;
   3791
   3792	err = ordered_events__queue(&trace->oe.data, event, trace->oe.last, 0, NULL);
   3793	if (err)
   3794		return err;
   3795
   3796	return trace__flush_events(trace);
   3797}
   3798
   3799static int ordered_events__deliver_event(struct ordered_events *oe,
   3800					 struct ordered_event *event)
   3801{
   3802	struct trace *trace = container_of(oe, struct trace, oe.data);
   3803
   3804	return __trace__deliver_event(trace, event->event);
   3805}
   3806
   3807static struct syscall_arg_fmt *evsel__find_syscall_arg_fmt_by_name(struct evsel *evsel, char *arg)
   3808{
   3809	struct tep_format_field *field;
   3810	struct syscall_arg_fmt *fmt = __evsel__syscall_arg_fmt(evsel);
   3811
   3812	if (evsel->tp_format == NULL || fmt == NULL)
   3813		return NULL;
   3814
   3815	for (field = evsel->tp_format->format.fields; field; field = field->next, ++fmt)
   3816		if (strcmp(field->name, arg) == 0)
   3817			return fmt;
   3818
   3819	return NULL;
   3820}
   3821
   3822static int trace__expand_filter(struct trace *trace __maybe_unused, struct evsel *evsel)
   3823{
   3824	char *tok, *left = evsel->filter, *new_filter = evsel->filter;
   3825
   3826	while ((tok = strpbrk(left, "=<>!")) != NULL) {
   3827		char *right = tok + 1, *right_end;
   3828
   3829		if (*right == '=')
   3830			++right;
   3831
   3832		while (isspace(*right))
   3833			++right;
   3834
   3835		if (*right == '\0')
   3836			break;
   3837
   3838		while (!isalpha(*left))
   3839			if (++left == tok) {
   3840				/*
    3841				 * Bail out: we can't find the name of the argument being used in
    3842				 * the filter; let the filter be set as-is, it will fail later.
   3843				 */
   3844				return 0;
   3845			}
   3846
   3847		right_end = right + 1;
   3848		while (isalnum(*right_end) || *right_end == '_' || *right_end == '|')
   3849			++right_end;
   3850
   3851		if (isalpha(*right)) {
   3852			struct syscall_arg_fmt *fmt;
   3853			int left_size = tok - left,
   3854			    right_size = right_end - right;
   3855			char arg[128];
   3856
   3857			while (isspace(left[left_size - 1]))
   3858				--left_size;
   3859
   3860			scnprintf(arg, sizeof(arg), "%.*s", left_size, left);
   3861
   3862			fmt = evsel__find_syscall_arg_fmt_by_name(evsel, arg);
   3863			if (fmt == NULL) {
   3864				pr_err("\"%s\" not found in \"%s\", can't set filter \"%s\"\n",
   3865				       arg, evsel->name, evsel->filter);
   3866				return -1;
   3867			}
   3868
   3869			pr_debug2("trying to expand \"%s\" \"%.*s\" \"%.*s\" -> ",
   3870				 arg, (int)(right - tok), tok, right_size, right);
   3871
   3872			if (fmt->strtoul) {
   3873				u64 val;
   3874				struct syscall_arg syscall_arg = {
   3875					.parm = fmt->parm,
   3876				};
   3877
   3878				if (fmt->strtoul(right, right_size, &syscall_arg, &val)) {
   3879					char *n, expansion[19];
    3880					int expansion_length = scnprintf(expansion, sizeof(expansion), "%#" PRIx64, val);
   3881					int expansion_offset = right - new_filter;
   3882
   3883					pr_debug("%s", expansion);
   3884
   3885					if (asprintf(&n, "%.*s%s%s", expansion_offset, new_filter, expansion, right_end) < 0) {
   3886						pr_debug(" out of memory!\n");
   3887						free(new_filter);
   3888						return -1;
   3889					}
   3890					if (new_filter != evsel->filter)
   3891						free(new_filter);
    3892					left = n + expansion_offset + expansion_length;
   3893					new_filter = n;
   3894				} else {
   3895					pr_err("\"%.*s\" not found for \"%s\" in \"%s\", can't set filter \"%s\"\n",
   3896					       right_size, right, arg, evsel->name, evsel->filter);
   3897					return -1;
   3898				}
   3899			} else {
   3900				pr_err("No resolver (strtoul) for \"%s\" in \"%s\", can't set filter \"%s\"\n",
   3901				       arg, evsel->name, evsel->filter);
   3902				return -1;
   3903			}
   3904
   3905			pr_debug("\n");
   3906		} else {
   3907			left = right_end;
   3908		}
   3909	}
   3910
   3911	if (new_filter != evsel->filter) {
   3912		pr_debug("New filter for %s: %s\n", evsel->name, new_filter);
   3913		evsel__set_filter(evsel, new_filter);
   3914		free(new_filter);
   3915	}
   3916
   3917	return 0;
   3918}
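
/*
 * Worked example of the expansion above (illustrative; which symbolic
 * names resolve depends on the argument's strtoul beautifier):
 *
 *	perf trace -e openat --filter="flags==O_CLOEXEC"
 *
 * looks up the "flags" syscall_arg_fmt, resolves O_CLOEXEC (02000000) via
 * fmt->strtoul() and rewrites the filter to "flags==0x200000" before it
 * is handed to the kernel.
 */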
   3919
   3920static int trace__expand_filters(struct trace *trace, struct evsel **err_evsel)
   3921{
   3922	struct evlist *evlist = trace->evlist;
   3923	struct evsel *evsel;
   3924
   3925	evlist__for_each_entry(evlist, evsel) {
   3926		if (evsel->filter == NULL)
   3927			continue;
   3928
   3929		if (trace__expand_filter(trace, evsel)) {
   3930			*err_evsel = evsel;
   3931			return -1;
   3932		}
   3933	}
   3934
   3935	return 0;
   3936}
   3937
   3938static int trace__run(struct trace *trace, int argc, const char **argv)
   3939{
   3940	struct evlist *evlist = trace->evlist;
   3941	struct evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
   3942	int err = -1, i;
   3943	unsigned long before;
   3944	const bool forks = argc > 0;
   3945	bool draining = false;
   3946
   3947	trace->live = true;
   3948
   3949	if (!trace->raw_augmented_syscalls) {
   3950		if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
   3951			goto out_error_raw_syscalls;
   3952
   3953		if (trace->trace_syscalls)
   3954			trace->vfs_getname = evlist__add_vfs_getname(evlist);
   3955	}
   3956
   3957	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
   3958		pgfault_maj = evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
   3959		if (pgfault_maj == NULL)
   3960			goto out_error_mem;
   3961		evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
   3962		evlist__add(evlist, pgfault_maj);
   3963	}
   3964
   3965	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
   3966		pgfault_min = evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
   3967		if (pgfault_min == NULL)
   3968			goto out_error_mem;
   3969		evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
   3970		evlist__add(evlist, pgfault_min);
   3971	}
   3972
   3973	/* Enable ignoring missing threads when -u/-p option is defined. */
   3974	trace->opts.ignore_missing_thread = trace->opts.target.uid != UINT_MAX || trace->opts.target.pid;
   3975
   3976	if (trace->sched &&
   3977	    evlist__add_newtp(evlist, "sched", "sched_stat_runtime", trace__sched_stat_runtime))
   3978		goto out_error_sched_stat_runtime;
   3979	/*
   3980	 * If a global cgroup was set, apply it to all the events without an
   3981	 * explicit cgroup. I.e.:
   3982	 *
   3983	 * 	trace -G A -e sched:*switch
   3984	 *
   3985	 * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
   3986	 * _and_ sched:sched_switch to the 'A' cgroup, while:
   3987	 *
   3988	 * trace -e sched:*switch -G A
   3989	 *
   3990	 * will only set the sched:sched_switch event to the 'A' cgroup, all the
    3991	 * other events (raw_syscalls:sys_{enter,exit}, etc) are left "without"
   3992	 * a cgroup (on the root cgroup, sys wide, etc).
   3993	 *
   3994	 * Multiple cgroups:
   3995	 *
   3996	 * trace -G A -e sched:*switch -G B
   3997	 *
   3998	 * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
   3999	 * to the 'B' cgroup.
   4000	 *
   4001	 * evlist__set_default_cgroup() grabs a reference of the passed cgroup
   4002	 * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
   4003	 */
   4004	if (trace->cgroup)
   4005		evlist__set_default_cgroup(trace->evlist, trace->cgroup);
   4006
   4007	err = evlist__create_maps(evlist, &trace->opts.target);
   4008	if (err < 0) {
   4009		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
   4010		goto out_delete_evlist;
   4011	}
   4012
   4013	err = trace__symbols_init(trace, evlist);
   4014	if (err < 0) {
   4015		fprintf(trace->output, "Problems initializing symbol libraries!\n");
   4016		goto out_delete_evlist;
   4017	}
   4018
   4019	evlist__config(evlist, &trace->opts, &callchain_param);
   4020
   4021	if (forks) {
   4022		err = evlist__prepare_workload(evlist, &trace->opts.target, argv, false, NULL);
   4023		if (err < 0) {
   4024			fprintf(trace->output, "Couldn't run the workload!\n");
   4025			goto out_delete_evlist;
   4026		}
   4027		workload_pid = evlist->workload.pid;
   4028	}
   4029
   4030	err = evlist__open(evlist);
   4031	if (err < 0)
   4032		goto out_error_open;
   4033
   4034	err = bpf__apply_obj_config();
   4035	if (err) {
   4036		char errbuf[BUFSIZ];
   4037
   4038		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
   4039		pr_err("ERROR: Apply config to BPF failed: %s\n",
   4040			 errbuf);
   4041		goto out_error_open;
   4042	}
   4043
   4044	err = trace__set_filter_pids(trace);
   4045	if (err < 0)
   4046		goto out_error_mem;
   4047
   4048	if (trace->syscalls.map)
   4049		trace__init_syscalls_bpf_map(trace);
   4050
   4051	if (trace->syscalls.prog_array.sys_enter)
   4052		trace__init_syscalls_bpf_prog_array_maps(trace);
   4053
   4054	if (trace->ev_qualifier_ids.nr > 0) {
   4055		err = trace__set_ev_qualifier_filter(trace);
   4056		if (err < 0)
   4057			goto out_errno;
   4058
   4059		if (trace->syscalls.events.sys_exit) {
   4060			pr_debug("event qualifier tracepoint filter: %s\n",
   4061				 trace->syscalls.events.sys_exit->filter);
   4062		}
   4063	}
   4064
   4065	/*
    4066	 * If the "close" syscall is not traced, then we will not have the
    4067	 * opportunity to invalidate the fd->pathname table in
    4068	 * syscall_arg__scnprintf_close_fd(), and we would end up showing the
    4069	 * last value set by syscalls opening a pathname and associating it
    4070	 * with a descriptor, or reading it from /proc/pid/fd/ in cases where
    4071	 * that doesn't make sense.
    4072	 *
    4073	 * So just disable this beautifier (SCA_FD, SCA_FDAT) when 'close' is
    4074	 * not in use.
   4075	 */
   4076	trace->fd_path_disabled = !trace__syscall_enabled(trace, syscalltbl__id(trace->sctbl, "close"));
   4077
   4078	err = trace__expand_filters(trace, &evsel);
   4079	if (err)
   4080		goto out_delete_evlist;
   4081	err = evlist__apply_filters(evlist, &evsel);
   4082	if (err < 0)
   4083		goto out_error_apply_filters;
   4084
   4085	if (trace->dump.map)
   4086		bpf_map__fprintf(trace->dump.map, trace->output);
   4087
   4088	err = evlist__mmap(evlist, trace->opts.mmap_pages);
   4089	if (err < 0)
   4090		goto out_error_mmap;
   4091
   4092	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
   4093		evlist__enable(evlist);
   4094
   4095	if (forks)
   4096		evlist__start_workload(evlist);
   4097
   4098	if (trace->opts.initial_delay) {
   4099		usleep(trace->opts.initial_delay * 1000);
   4100		evlist__enable(evlist);
   4101	}
   4102
   4103	trace->multiple_threads = perf_thread_map__pid(evlist->core.threads, 0) == -1 ||
   4104				  evlist->core.threads->nr > 1 ||
   4105				  evlist__first(evlist)->core.attr.inherit;
   4106
   4107	/*
   4108	 * Now that we already used evsel->core.attr to ask the kernel to setup the
   4109	 * events, lets reuse evsel->core.attr.sample_max_stack as the limit in
   4110	 * trace__resolve_callchain(), allowing per-event max-stack settings
   4111	 * to override an explicitly set --max-stack global setting.
   4112	 */
   4113	evlist__for_each_entry(evlist, evsel) {
   4114		if (evsel__has_callchain(evsel) &&
   4115		    evsel->core.attr.sample_max_stack == 0)
   4116			evsel->core.attr.sample_max_stack = trace->max_stack;
   4117	}
   4118again:
   4119	before = trace->nr_events;
   4120
   4121	for (i = 0; i < evlist->core.nr_mmaps; i++) {
   4122		union perf_event *event;
   4123		struct mmap *md;
   4124
   4125		md = &evlist->mmap[i];
   4126		if (perf_mmap__read_init(&md->core) < 0)
   4127			continue;
   4128
   4129		while ((event = perf_mmap__read_event(&md->core)) != NULL) {
   4130			++trace->nr_events;
   4131
   4132			err = trace__deliver_event(trace, event);
   4133			if (err)
   4134				goto out_disable;
   4135
   4136			perf_mmap__consume(&md->core);
   4137
   4138			if (interrupted)
   4139				goto out_disable;
   4140
   4141			if (done && !draining) {
   4142				evlist__disable(evlist);
   4143				draining = true;
   4144			}
   4145		}
   4146		perf_mmap__read_done(&md->core);
   4147	}
   4148
   4149	if (trace->nr_events == before) {
   4150		int timeout = done ? 100 : -1;
   4151
   4152		if (!draining && evlist__poll(evlist, timeout) > 0) {
   4153			if (evlist__filter_pollfd(evlist, POLLERR | POLLHUP | POLLNVAL) == 0)
   4154				draining = true;
   4155
   4156			goto again;
   4157		} else {
   4158			if (trace__flush_events(trace))
   4159				goto out_disable;
   4160		}
   4161	} else {
   4162		goto again;
   4163	}
   4164
   4165out_disable:
   4166	thread__zput(trace->current);
   4167
   4168	evlist__disable(evlist);
   4169
   4170	if (trace->sort_events)
   4171		ordered_events__flush(&trace->oe.data, OE_FLUSH__FINAL);
   4172
   4173	if (!err) {
   4174		if (trace->summary)
   4175			trace__fprintf_thread_summary(trace, trace->output);
   4176
   4177		if (trace->show_tool_stats) {
   4178			fprintf(trace->output, "Stats:\n "
   4179					       " vfs_getname : %" PRIu64 "\n"
   4180					       " proc_getname: %" PRIu64 "\n",
   4181				trace->stats.vfs_getname,
   4182				trace->stats.proc_getname);
   4183		}
   4184	}
   4185
   4186out_delete_evlist:
   4187	trace__symbols__exit(trace);
   4188	evlist__free_syscall_tp_fields(evlist);
   4189	evlist__delete(evlist);
   4190	cgroup__put(trace->cgroup);
   4191	trace->evlist = NULL;
   4192	trace->live = false;
   4193	return err;
   4194{
   4195	char errbuf[BUFSIZ];
   4196
   4197out_error_sched_stat_runtime:
   4198	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
   4199	goto out_error;
   4200
   4201out_error_raw_syscalls:
   4202	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
   4203	goto out_error;
   4204
   4205out_error_mmap:
   4206	evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
   4207	goto out_error;
   4208
   4209out_error_open:
   4210	evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
   4211
   4212out_error:
   4213	fprintf(trace->output, "%s\n", errbuf);
   4214	goto out_delete_evlist;
   4215
   4216out_error_apply_filters:
   4217	fprintf(trace->output,
   4218		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
   4219		evsel->filter, evsel__name(evsel), errno,
   4220		str_error_r(errno, errbuf, sizeof(errbuf)));
   4221	goto out_delete_evlist;
   4222}
   4223out_error_mem:
   4224	fprintf(trace->output, "Not enough memory to run!\n");
   4225	goto out_delete_evlist;
   4226
   4227out_errno:
   4228	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
   4229	goto out_delete_evlist;
   4230}
   4231
   4232static int trace__replay(struct trace *trace)
   4233{
   4234	const struct evsel_str_handler handlers[] = {
   4235		{ "probe:vfs_getname",	     trace__vfs_getname, },
   4236	};
   4237	struct perf_data data = {
   4238		.path  = input_name,
   4239		.mode  = PERF_DATA_MODE_READ,
   4240		.force = trace->force,
   4241	};
   4242	struct perf_session *session;
   4243	struct evsel *evsel;
   4244	int err = -1;
   4245
   4246	trace->tool.sample	  = trace__process_sample;
   4247	trace->tool.mmap	  = perf_event__process_mmap;
   4248	trace->tool.mmap2	  = perf_event__process_mmap2;
   4249	trace->tool.comm	  = perf_event__process_comm;
   4250	trace->tool.exit	  = perf_event__process_exit;
   4251	trace->tool.fork	  = perf_event__process_fork;
   4252	trace->tool.attr	  = perf_event__process_attr;
   4253	trace->tool.tracing_data  = perf_event__process_tracing_data;
   4254	trace->tool.build_id	  = perf_event__process_build_id;
   4255	trace->tool.namespaces	  = perf_event__process_namespaces;
   4256
   4257	trace->tool.ordered_events = true;
   4258	trace->tool.ordering_requires_timestamps = true;
   4259
   4260	/* add tid to output */
   4261	trace->multiple_threads = true;
   4262
   4263	session = perf_session__new(&data, &trace->tool);
   4264	if (IS_ERR(session))
   4265		return PTR_ERR(session);
   4266
   4267	if (trace->opts.target.pid)
   4268		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
   4269
   4270	if (trace->opts.target.tid)
   4271		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
   4272
   4273	if (symbol__init(&session->header.env) < 0)
   4274		goto out;
   4275
   4276	trace->host = &session->machines.host;
   4277
   4278	err = perf_session__set_tracepoints_handlers(session, handlers);
   4279	if (err)
   4280		goto out;
   4281
   4282	evsel = evlist__find_tracepoint_by_name(session->evlist, "raw_syscalls:sys_enter");
    4283	/* older kernels have syscalls:* tracepoints instead of raw_syscalls:* */
   4284	if (evsel == NULL)
   4285		evsel = evlist__find_tracepoint_by_name(session->evlist, "syscalls:sys_enter");
   4286
   4287	if (evsel &&
   4288	    (evsel__init_raw_syscall_tp(evsel, trace__sys_enter) < 0 ||
   4289	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
    4290		pr_err("Error initializing raw_syscalls:sys_enter event\n");
   4291		goto out;
   4292	}
   4293
   4294	evsel = evlist__find_tracepoint_by_name(session->evlist, "raw_syscalls:sys_exit");
   4295	if (evsel == NULL)
   4296		evsel = evlist__find_tracepoint_by_name(session->evlist, "syscalls:sys_exit");
   4297	if (evsel &&
   4298	    (evsel__init_raw_syscall_tp(evsel, trace__sys_exit) < 0 ||
   4299	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
    4300		pr_err("Error initializing raw_syscalls:sys_exit event\n");
   4301		goto out;
   4302	}
   4303
   4304	evlist__for_each_entry(session->evlist, evsel) {
   4305		if (evsel->core.attr.type == PERF_TYPE_SOFTWARE &&
   4306		    (evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
   4307		     evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
   4308		     evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS))
   4309			evsel->handler = trace__pgfault;
   4310	}
   4311
   4312	setup_pager();
   4313
   4314	err = perf_session__process_events(session);
    4315	if (err)
    4316		pr_err("Failed to process events, error %d\n", err);
    4317	else if (trace->summary)
   4319		trace__fprintf_thread_summary(trace, trace->output);
   4320
   4321out:
   4322	perf_session__delete(session);
   4323
   4324	return err;
   4325}
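
/*
 * Typical replay flow (illustrative):
 *
 *	perf trace record -- sleep 1
 *	perf trace -i perf.data -s
 *
 * i.e. record the syscall tracepoints (plus whatever else was asked for)
 * into a perf.data file, then beautify and/or summarize it offline here.
 */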
   4326
   4327static size_t trace__fprintf_threads_header(FILE *fp)
   4328{
   4329	size_t printed;
   4330
   4331	printed  = fprintf(fp, "\n Summary of events:\n\n");
   4332
   4333	return printed;
   4334}
   4335
   4336DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
   4337	struct syscall_stats *stats;
   4338	double		     msecs;
   4339	int		     syscall;
   4340)
   4341{
   4342	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
   4343	struct syscall_stats *stats = source->priv;
   4344
   4345	entry->syscall = source->i;
   4346	entry->stats   = stats;
   4347	entry->msecs   = stats ? (u64)stats->stats.n * (avg_stats(&stats->stats) / NSEC_PER_MSEC) : 0;
   4348}
   4349
   4350static size_t thread__dump_stats(struct thread_trace *ttrace,
   4351				 struct trace *trace, FILE *fp)
   4352{
   4353	size_t printed = 0;
   4354	struct syscall *sc;
   4355	struct rb_node *nd;
   4356	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
   4357
   4358	if (syscall_stats == NULL)
   4359		return 0;
   4360
   4361	printed += fprintf(fp, "\n");
   4362
   4363	printed += fprintf(fp, "   syscall            calls  errors  total       min       avg       max       stddev\n");
   4364	printed += fprintf(fp, "                                     (msec)    (msec)    (msec)    (msec)        (%%)\n");
   4365	printed += fprintf(fp, "   --------------- --------  ------ -------- --------- --------- ---------     ------\n");
   4366
   4367	resort_rb__for_each_entry(nd, syscall_stats) {
   4368		struct syscall_stats *stats = syscall_stats_entry->stats;
   4369		if (stats) {
   4370			double min = (double)(stats->stats.min) / NSEC_PER_MSEC;
   4371			double max = (double)(stats->stats.max) / NSEC_PER_MSEC;
   4372			double avg = avg_stats(&stats->stats);
   4373			double pct;
   4374			u64 n = (u64)stats->stats.n;
   4375
   4376			pct = avg ? 100.0 * stddev_stats(&stats->stats) / avg : 0.0;
   4377			avg /= NSEC_PER_MSEC;
   4378
   4379			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
   4380			printed += fprintf(fp, "   %-15s", sc->name);
   4381			printed += fprintf(fp, " %8" PRIu64 " %6" PRIu64 " %9.3f %9.3f %9.3f",
   4382					   n, stats->nr_failures, syscall_stats_entry->msecs, min, avg);
   4383			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
   4384
   4385			if (trace->errno_summary && stats->nr_failures) {
   4386				const char *arch_name = perf_env__arch(trace->host->env);
   4387				int e;
   4388
   4389				for (e = 0; e < stats->max_errno; ++e) {
   4390					if (stats->errnos[e] != 0)
   4391						fprintf(fp, "\t\t\t\t%s: %d\n", arch_syscalls__strerrno(arch_name, e + 1), stats->errnos[e]);
   4392				}
   4393			}
   4394		}
   4395	}
   4396
   4397	resort_rb__delete(syscall_stats);
   4398	printed += fprintf(fp, "\n\n");
   4399
   4400	return printed;
   4401}
   4402
   4403static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
   4404{
   4405	size_t printed = 0;
   4406	struct thread_trace *ttrace = thread__priv(thread);
   4407	double ratio;
   4408
   4409	if (ttrace == NULL)
   4410		return 0;
   4411
   4412	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
   4413
   4414	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
   4415	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
   4416	printed += fprintf(fp, "%.1f%%", ratio);
   4417	if (ttrace->pfmaj)
   4418		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
   4419	if (ttrace->pfmin)
   4420		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
   4421	if (trace->sched)
   4422		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
   4423	else if (fputc('\n', fp) != EOF)
   4424		++printed;
   4425
   4426	printed += thread__dump_stats(ttrace, trace, fp);
   4427
   4428	return printed;
   4429}
   4430
   4431static unsigned long thread__nr_events(struct thread_trace *ttrace)
   4432{
   4433	return ttrace ? ttrace->nr_events : 0;
   4434}
   4435
   4436DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
   4437	struct thread *thread;
   4438)
   4439{
   4440	entry->thread = rb_entry(nd, struct thread, rb_node);
   4441}
   4442
   4443static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
   4444{
   4445	size_t printed = trace__fprintf_threads_header(fp);
   4446	struct rb_node *nd;
   4447	int i;
   4448
   4449	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
   4450		DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);
   4451
   4452		if (threads == NULL) {
   4453			fprintf(fp, "%s", "Error sorting output by nr_events!\n");
   4454			return 0;
   4455		}
   4456
   4457		resort_rb__for_each_entry(nd, threads)
   4458			printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
   4459
   4460		resort_rb__delete(threads);
   4461	}
   4462	return printed;
   4463}
   4464
   4465static int trace__set_duration(const struct option *opt, const char *str,
   4466			       int unset __maybe_unused)
   4467{
   4468	struct trace *trace = opt->value;
   4469
   4470	trace->duration_filter = atof(str);
   4471	return 0;
   4472}
   4473
   4474static int trace__set_filter_pids_from_option(const struct option *opt, const char *str,
   4475					      int unset __maybe_unused)
   4476{
   4477	int ret = -1;
   4478	size_t i;
   4479	struct trace *trace = opt->value;
   4480	/*
    4481	 * FIXME: introduce an intarray class, plainly parse the csv and create a
   4482	 * { int nr, int entries[] } struct...
   4483	 */
   4484	struct intlist *list = intlist__new(str);
   4485
   4486	if (list == NULL)
   4487		return -1;
   4488
   4489	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
   4490	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
   4491
   4492	if (trace->filter_pids.entries == NULL)
   4493		goto out;
   4494
   4495	trace->filter_pids.entries[0] = getpid();
   4496
   4497	for (i = 1; i < trace->filter_pids.nr; ++i)
   4498		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
   4499
   4500	intlist__delete(list);
   4501	ret = 0;
   4502out:
   4503	return ret;
   4504}
   4505
   4506static int trace__open_output(struct trace *trace, const char *filename)
   4507{
   4508	struct stat st;
   4509
   4510	if (!stat(filename, &st) && st.st_size) {
   4511		char oldname[PATH_MAX];
   4512
   4513		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
   4514		unlink(oldname);
   4515		rename(filename, oldname);
   4516	}
   4517
   4518	trace->output = fopen(filename, "w");
   4519
   4520	return trace->output == NULL ? -errno : 0;
   4521}
   4522
   4523static int parse_pagefaults(const struct option *opt, const char *str,
   4524			    int unset __maybe_unused)
   4525{
   4526	int *trace_pgfaults = opt->value;
   4527
   4528	if (strcmp(str, "all") == 0)
   4529		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
   4530	else if (strcmp(str, "maj") == 0)
   4531		*trace_pgfaults |= TRACE_PFMAJ;
   4532	else if (strcmp(str, "min") == 0)
   4533		*trace_pgfaults |= TRACE_PFMIN;
   4534	else
   4535		return -1;
   4536
   4537	return 0;
   4538}
   4539
   4540static void evlist__set_default_evsel_handler(struct evlist *evlist, void *handler)
   4541{
   4542	struct evsel *evsel;
   4543
   4544	evlist__for_each_entry(evlist, evsel) {
   4545		if (evsel->handler == NULL)
   4546			evsel->handler = handler;
   4547	}
   4548}
   4549
   4550static void evsel__set_syscall_arg_fmt(struct evsel *evsel, const char *name)
   4551{
   4552	struct syscall_arg_fmt *fmt = evsel__syscall_arg_fmt(evsel);
   4553
   4554	if (fmt) {
   4555		struct syscall_fmt *scfmt = syscall_fmt__find(name);
   4556
   4557		if (scfmt) {
   4558			int skip = 0;
   4559
   4560			if (strcmp(evsel->tp_format->format.fields->name, "__syscall_nr") == 0 ||
   4561			    strcmp(evsel->tp_format->format.fields->name, "nr") == 0)
   4562				++skip;
   4563
   4564			memcpy(fmt + skip, scfmt->arg, (evsel->tp_format->format.nr_fields - skip) * sizeof(*fmt));
   4565		}
   4566	}
   4567}
   4568
   4569static int evlist__set_syscall_tp_fields(struct evlist *evlist)
   4570{
   4571	struct evsel *evsel;
   4572
   4573	evlist__for_each_entry(evlist, evsel) {
   4574		if (evsel->priv || !evsel->tp_format)
   4575			continue;
   4576
   4577		if (strcmp(evsel->tp_format->system, "syscalls")) {
   4578			evsel__init_tp_arg_scnprintf(evsel);
   4579			continue;
   4580		}
   4581
   4582		if (evsel__init_syscall_tp(evsel))
   4583			return -1;
   4584
   4585		if (!strncmp(evsel->tp_format->name, "sys_enter_", 10)) {
   4586			struct syscall_tp *sc = __evsel__syscall_tp(evsel);
   4587
   4588			if (__tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64)))
   4589				return -1;
   4590
   4591			evsel__set_syscall_arg_fmt(evsel, evsel->tp_format->name + sizeof("sys_enter_") - 1);
   4592		} else if (!strncmp(evsel->tp_format->name, "sys_exit_", 9)) {
   4593			struct syscall_tp *sc = __evsel__syscall_tp(evsel);
   4594
   4595			if (__tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap))
   4596				return -1;
   4597
   4598			evsel__set_syscall_arg_fmt(evsel, evsel->tp_format->name + sizeof("sys_exit_") - 1);
   4599		}
   4600	}
   4601
   4602	return 0;
   4603}
   4604
   4605/*
    4606 * XXX: Hackish, just splitting the combined -e/--event list (syscalls
    4607 * (raw_syscalls:sys_{enter,exit}) + events (tracepoints, HW, SW, etc.)) to
    4608 * use existing facilities unchanged (trace->ev_qualifier + parse_options()).
   4609 *
   4610 * It'd be better to introduce a parse_options() variant that would return a
   4611 * list with the terms it didn't match to an event...
   4612 */
   4613static int trace__parse_events_option(const struct option *opt, const char *str,
   4614				      int unset __maybe_unused)
   4615{
   4616	struct trace *trace = (struct trace *)opt->value;
   4617	const char *s = str;
   4618	char *sep = NULL, *lists[2] = { NULL, NULL, };
   4619	int len = strlen(str) + 1, err = -1, list, idx;
   4620	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
   4621	char group_name[PATH_MAX];
   4622	struct syscall_fmt *fmt;
   4623
   4624	if (strace_groups_dir == NULL)
   4625		return -1;
   4626
   4627	if (*s == '!') {
   4628		++s;
   4629		trace->not_ev_qualifier = true;
   4630	}
   4631
   4632	while (1) {
   4633		if ((sep = strchr(s, ',')) != NULL)
   4634			*sep = '\0';
   4635
   4636		list = 0;
   4637		if (syscalltbl__id(trace->sctbl, s) >= 0 ||
   4638		    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
   4639			list = 1;
   4640			goto do_concat;
   4641		}
   4642
   4643		fmt = syscall_fmt__find_by_alias(s);
   4644		if (fmt != NULL) {
   4645			list = 1;
   4646			s = fmt->name;
   4647		} else {
   4648			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
   4649			if (access(group_name, R_OK) == 0)
   4650				list = 1;
   4651		}
   4652do_concat:
   4653		if (lists[list]) {
   4654			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
   4655		} else {
   4656			lists[list] = malloc(len);
   4657			if (lists[list] == NULL)
   4658				goto out;
   4659			strcpy(lists[list], s);
   4660		}
   4661
   4662		if (!sep)
   4663			break;
   4664
   4665		*sep = ',';
   4666		s = sep + 1;
   4667	}
   4668
   4669	if (lists[1] != NULL) {
   4670		struct strlist_config slist_config = {
   4671			.dirname = strace_groups_dir,
   4672		};
   4673
   4674		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
   4675		if (trace->ev_qualifier == NULL) {
   4676			fputs("Not enough memory to parse event qualifier", trace->output);
   4677			goto out;
   4678		}
   4679
   4680		if (trace__validate_ev_qualifier(trace))
   4681			goto out;
   4682		trace->trace_syscalls = true;
   4683	}
   4684
   4685	err = 0;
   4686
   4687	if (lists[0]) {
   4688		struct option o = {
   4689			.value = &trace->evlist,
   4690		};
   4691		err = parse_events_option(&o, lists[0], 0);
   4692	}
   4693out:
   4694	free(strace_groups_dir);
   4695	free(lists[0]);
   4696	free(lists[1]);
   4697	if (sep)
   4698		*sep = ',';
   4699
   4700	return err;
   4701}
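
/*
 * Example of the splitting described above (illustrative): in
 *
 *	perf trace -e open*,sched:sched_switch
 *
 * "open*" glob-matches entries in the syscall table and lands in the
 * ev_qualifier strlist, while "sched:sched_switch" falls through to
 * parse_events_option(), as any other tracepoint/HW/SW event would.
 */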
   4702
   4703static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
   4704{
   4705	struct trace *trace = opt->value;
   4706
   4707	if (!list_empty(&trace->evlist->core.entries)) {
   4708		struct option o = {
   4709			.value = &trace->evlist,
   4710		};
   4711		return parse_cgroups(&o, str, unset);
   4712	}
   4713	trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
   4714
   4715	return 0;
   4716}
   4717
   4718static int trace__config(const char *var, const char *value, void *arg)
   4719{
   4720	struct trace *trace = arg;
   4721	int err = 0;
   4722
   4723	if (!strcmp(var, "trace.add_events")) {
   4724		trace->perfconfig_events = strdup(value);
   4725		if (trace->perfconfig_events == NULL) {
   4726			pr_err("Not enough memory for %s\n", "trace.add_events");
   4727			return -1;
   4728		}
   4729	} else if (!strcmp(var, "trace.show_timestamp")) {
   4730		trace->show_tstamp = perf_config_bool(var, value);
   4731	} else if (!strcmp(var, "trace.show_duration")) {
   4732		trace->show_duration = perf_config_bool(var, value);
   4733	} else if (!strcmp(var, "trace.show_arg_names")) {
   4734		trace->show_arg_names = perf_config_bool(var, value);
   4735		if (!trace->show_arg_names)
   4736			trace->show_zeros = true;
   4737	} else if (!strcmp(var, "trace.show_zeros")) {
   4738		bool new_show_zeros = perf_config_bool(var, value);
   4739		if (!trace->show_arg_names && !new_show_zeros) {
   4740			pr_warning("trace.show_zeros has to be set when trace.show_arg_names=no\n");
   4741			goto out;
   4742		}
   4743		trace->show_zeros = new_show_zeros;
   4744	} else if (!strcmp(var, "trace.show_prefix")) {
   4745		trace->show_string_prefix = perf_config_bool(var, value);
   4746	} else if (!strcmp(var, "trace.no_inherit")) {
   4747		trace->opts.no_inherit = perf_config_bool(var, value);
   4748	} else if (!strcmp(var, "trace.args_alignment")) {
   4749		int args_alignment = 0;
   4750		if (perf_config_int(&args_alignment, var, value) == 0)
   4751			trace->args_alignment = args_alignment;
   4752	} else if (!strcmp(var, "trace.tracepoint_beautifiers")) {
   4753		if (strcasecmp(value, "libtraceevent") == 0)
   4754			trace->libtraceevent_print = true;
   4755		else if (strcasecmp(value, "libbeauty") == 0)
   4756			trace->libtraceevent_print = false;
   4757	}
   4758out:
   4759	return err;
   4760}
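
/*
 * Example ~/.perfconfig stanza handled above (values are illustrative):
 *
 *	[trace]
 *		add_events = sched:sched_switch
 *		show_duration = no
 *		args_alignment = 40
 *		tracepoint_beautifiers = libbeauty
 */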
   4761
   4762static void trace__exit(struct trace *trace)
   4763{
   4764	int i;
   4765
   4766	strlist__delete(trace->ev_qualifier);
   4767	free(trace->ev_qualifier_ids.entries);
   4768	if (trace->syscalls.table) {
   4769		for (i = 0; i <= trace->sctbl->syscalls.max_id; i++)
   4770			syscall__exit(&trace->syscalls.table[i]);
   4771		free(trace->syscalls.table);
   4772	}
   4773	syscalltbl__delete(trace->sctbl);
   4774	zfree(&trace->perfconfig_events);
   4775}
   4776
   4777int cmd_trace(int argc, const char **argv)
   4778{
   4779	const char *trace_usage[] = {
   4780		"perf trace [<options>] [<command>]",
   4781		"perf trace [<options>] -- <command> [<options>]",
   4782		"perf trace record [<options>] [<command>]",
   4783		"perf trace record [<options>] -- <command> [<options>]",
   4784		NULL
   4785	};
   4786	struct trace trace = {
   4787		.opts = {
   4788			.target = {
   4789				.uid	   = UINT_MAX,
   4790				.uses_mmap = true,
   4791			},
   4792			.user_freq     = UINT_MAX,
   4793			.user_interval = ULLONG_MAX,
   4794			.no_buffering  = true,
   4795			.mmap_pages    = UINT_MAX,
   4796		},
   4797		.output = stderr,
   4798		.show_comm = true,
   4799		.show_tstamp = true,
   4800		.show_duration = true,
   4801		.show_arg_names = true,
   4802		.args_alignment = 70,
   4803		.trace_syscalls = false,
   4804		.kernel_syscallchains = false,
   4805		.max_stack = UINT_MAX,
   4806		.max_events = ULONG_MAX,
   4807	};
   4808	const char *map_dump_str = NULL;
   4809	const char *output_name = NULL;
   4810	const struct option trace_options[] = {
   4811	OPT_CALLBACK('e', "event", &trace, "event",
   4812		     "event/syscall selector. use 'perf list' to list available events",
   4813		     trace__parse_events_option),
   4814	OPT_CALLBACK(0, "filter", &trace.evlist, "filter",
   4815		     "event filter", parse_filter),
   4816	OPT_BOOLEAN(0, "comm", &trace.show_comm,
   4817		    "show the thread COMM next to its id"),
   4818	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
   4819	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
   4820		     trace__parse_events_option),
   4821	OPT_STRING('o', "output", &output_name, "file", "output file name"),
   4822	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
   4823	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
   4824		    "trace events on existing process id"),
   4825	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
   4826		    "trace events on existing thread id"),
   4827	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
   4828		     "pids to filter (by the kernel)", trace__set_filter_pids_from_option),
   4829	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
   4830		    "system-wide collection from all CPUs"),
   4831	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
   4832		    "list of cpus to monitor"),
   4833	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
   4834		    "child tasks do not inherit counters"),
   4835	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
   4836		     "number of mmap data pages", evlist__parse_mmap_pages),
   4837	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
   4838		   "user to profile"),
   4839	OPT_CALLBACK(0, "duration", &trace, "float",
   4840		     "show only events with duration > N.M ms",
   4841		     trace__set_duration),
   4842#ifdef HAVE_LIBBPF_SUPPORT
   4843	OPT_STRING(0, "map-dump", &map_dump_str, "BPF map", "BPF map to periodically dump"),
   4844#endif
   4845	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
   4846	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
   4847	OPT_BOOLEAN('T', "time", &trace.full_time,
   4848		    "Show full timestamp, not time relative to first start"),
   4849	OPT_BOOLEAN(0, "failure", &trace.failure_only,
   4850		    "Show only syscalls that failed"),
   4851	OPT_BOOLEAN('s', "summary", &trace.summary_only,
   4852		    "Show only syscall summary with statistics"),
   4853	OPT_BOOLEAN('S', "with-summary", &trace.summary,
   4854		    "Show all syscalls and summary with statistics"),
   4855	OPT_BOOLEAN(0, "errno-summary", &trace.errno_summary,
   4856		    "Show errno stats per syscall, use with -s or -S"),
   4857	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
   4858		     "Trace pagefaults", parse_pagefaults, "maj"),
   4859	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
   4860	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
   4861	OPT_CALLBACK(0, "call-graph", &trace.opts,
   4862		     "record_mode[,record_size]", record_callchain_help,
   4863		     &record_parse_callchain_opt),
   4864	OPT_BOOLEAN(0, "libtraceevent_print", &trace.libtraceevent_print,
   4865		    "Use libtraceevent to print the tracepoint arguments."),
   4866	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
   4867		    "Show the kernel callchains on the syscall exit path"),
   4868	OPT_ULONG(0, "max-events", &trace.max_events,
   4869		"Set the maximum number of events to print, exit after that is reached."),
   4870	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
   4871		     "Set the minimum stack depth when parsing the callchain, "
   4872		     "anything below the specified depth will be ignored."),
   4873	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
   4874		     "Set the maximum stack depth when parsing the callchain, "
   4875		     "anything beyond the specified depth will be ignored. "
   4876		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
   4877	OPT_BOOLEAN(0, "sort-events", &trace.sort_events,
   4878			"Sort batch of events before processing, use if getting out-of-order events"),
   4879	OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
   4880			"print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
   4881	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
   4882			"per thread proc mmap processing timeout in ms"),
   4883	OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
   4884		     trace__parse_cgroups),
   4885	OPT_INTEGER('D', "delay", &trace.opts.initial_delay,
   4886		     "ms to wait before starting measurement after program "
   4887		     "start"),
   4888	OPTS_EVSWITCH(&trace.evswitch),
   4889	OPT_END()
   4890	};
   4891	bool __maybe_unused max_stack_user_set = true;
   4892	bool mmap_pages_user_set = true;
   4893	struct evsel *evsel;
   4894	const char * const trace_subcommands[] = { "record", NULL };
   4895	int err = -1;
   4896	char bf[BUFSIZ];
   4897	struct sigaction sigchld_act;
   4898
   4899	signal(SIGSEGV, sighandler_dump_stack);
   4900	signal(SIGFPE, sighandler_dump_stack);
   4901	signal(SIGINT, sighandler_interrupt);
   4902
   4903	memset(&sigchld_act, 0, sizeof(sigchld_act));
   4904	sigchld_act.sa_flags = SA_SIGINFO;
   4905	sigchld_act.sa_sigaction = sighandler_chld;
   4906	sigaction(SIGCHLD, &sigchld_act, NULL);
   4907
   4908	trace.evlist = evlist__new();
   4909	trace.sctbl = syscalltbl__new();
   4910
   4911	if (trace.evlist == NULL || trace.sctbl == NULL) {
   4912		pr_err("Not enough memory to run!\n");
   4913		err = -ENOMEM;
   4914		goto out;
   4915	}
   4916
   4917	/*
   4918	 * Parsing .perfconfig may entail creating a BPF event, that may need
   4919	 * to create BPF maps, so bump RLIM_MEMLOCK as the default 64K setting
   4920	 * is too small. This affects just this process, not touching the
   4921	 * global setting. If it fails we'll get something in 'perf trace -v'
   4922	 * to help diagnose the problem.
   4923	 */
   4924	rlimit__bump_memlock();
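	/*
	 * A minimal sketch of what such a bump amounts to (the real code
	 * lives in util/rlimit.c; the details below are assumptions, not
	 * a copy of it): raise the soft RLIMIT_MEMLOCK up to the hard
	 * limit, for this process only:
	 *
	 *	struct rlimit rlim;
	 *
	 *	if (getrlimit(RLIMIT_MEMLOCK, &rlim) == 0) {
	 *		rlim.rlim_cur = rlim.rlim_max;
	 *		setrlimit(RLIMIT_MEMLOCK, &rlim);
	 *	}
	 */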
   4925
   4926	err = perf_config(trace__config, &trace);
   4927	if (err)
   4928		goto out;
   4929
   4930	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
   4931				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
   4932
   4933	/*
   4934	 * Here we have already passed through trace__parse_events_option() and
   4935	 * it has already figured out if -e syscall_name was used; if not, but
   4936	 * --event foo:bar was, the user is interested _just_ in those, say,
   4937	 * tracepoint events, not in the strace-like syscall-name-based mode.
   4938	 *
   4939	 * This is important because we need to check if strace-like mode is
   4940	 * needed to decide whether we should filter out the eBPF
   4941	 * __augmented_syscalls__ code if it is in the mix, say, added via
   4942	 * .perfconfig trace.add_events.
   4943	 */
   4944	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
   4945	    trace.evlist->core.nr_entries == 0 /* Was -e/--event used? */) {
   4946		trace.trace_syscalls = true;
   4947	}
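	/*
	 * Illustrative invocations for the modes decided above:
	 *
	 *	perf trace ls				# strace-like, all syscalls
	 *	perf trace -e 'connect*' -- ls		# strace-like, selected syscalls
	 *	perf trace --event sched:sched_switch -a	# just tracepoints
	 */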
   4948	/*
   4949	 * Now that we have --verbose figured out, let's see if we need to parse
   4950	 * events from .perfconfig, so that if those events fail parsing, say some
   4951	 * BPF program fails, then we'll be able to use --verbose to see what went
   4952	 * wrong in more detail.
   4953	 */
   4954	if (trace.perfconfig_events != NULL) {
   4955		struct parse_events_error parse_err;
   4956
   4957		parse_events_error__init(&parse_err);
   4958		err = parse_events(trace.evlist, trace.perfconfig_events, &parse_err);
   4959		if (err)
   4960			parse_events_error__print(&parse_err, trace.perfconfig_events);
   4961		parse_events_error__exit(&parse_err);
   4962		if (err)
   4963			goto out;
   4964	}
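	/*
	 * trace.perfconfig_events gets populated from a .perfconfig entry
	 * such as this one (the object path is hypothetical):
	 *
	 *	[trace]
	 *		add_events = /home/user/lib/bpf/augmented_raw_syscalls.o
	 */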
   4965
   4966	if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
   4967		usage_with_options_msg(trace_usage, trace_options,
   4968				       "cgroup monitoring only available in system-wide mode");
   4969	}
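	/*
	 * I.e. 'perf trace -G mygroup ls' is rejected here, while the
	 * system-wide form is accepted (cgroup name is illustrative):
	 *
	 *	perf trace -a -G mygroup -e 'connect*'
	 */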
   4970
   4971	evsel = bpf__setup_output_event(trace.evlist, "__augmented_syscalls__");
   4972	if (IS_ERR(evsel)) {
   4973		bpf__strerror_setup_output_event(trace.evlist, PTR_ERR(evsel), bf, sizeof(bf));
   4974		pr_err("ERROR: Setup trace syscalls enter failed: %s\n", bf);
   4975		goto out;
   4976	}
   4977
   4978	if (evsel) {
   4979		trace.syscalls.events.augmented = evsel;
   4980
   4981		evsel = evlist__find_tracepoint_by_name(trace.evlist, "raw_syscalls:sys_enter");
   4982		if (evsel == NULL) {
   4983			pr_err("ERROR: raw_syscalls:sys_enter not found in the augmented BPF object\n");
   4984			goto out;
   4985		}
   4986
   4987		if (evsel->bpf_obj == NULL) {
   4988			pr_err("ERROR: raw_syscalls:sys_enter not associated with a BPF object\n");
   4989			goto out;
   4990		}
   4991
   4992		trace.bpf_obj = evsel->bpf_obj;
   4993
   4994		/*
   4995		 * If we have _just_ the augmenter event but don't have an
   4996		 * explicit --syscalls, then assume we want all strace-like
   4997		 * syscalls:
   4998		 */
   4999		if (!trace.trace_syscalls && trace__only_augmented_syscalls_evsels(&trace))
   5000			trace.trace_syscalls = true;
   5001		/*
   5002		 * So, if we have a syscall augmenter, but trace_syscalls, aka
   5003		 * strace-like syscall tracing, is not set, then we need to throw
   5004		 * away the augmenter, i.e. all the events that were created
   5005		 * from that BPF object file.
   5006		 *
   5007		 * This is more to fix the current .perfconfig trace.add_events
   5008		 * style of setting up the strace-like eBPF based syscall tracepoint
   5009		 * payload augmenter.
   5010		 *
   5011		 * All this complexity will be avoided by adding an alternative
   5012		 * to trace.add_events in the form of
   5013		 * trace.bpf_augmented_syscalls, which will only be parsed if we
   5014		 * need it.
   5015		 *
   5016		 * .perfconfig trace.add_events is still useful if we want to, for
   5017		 * instance, have msr_write.msr in some .perfconfig profile based
   5018		 * 'perf trace --config determinism.profile' mode, where for some
   5019		 * particular goal/workload type we want a set of events and
   5020		 * output mode (with timings, etc) instead of having to add
   5021		 * all via the command line.
   5022		 *
   5023		 * Also --config to specify an alternate .perfconfig file needs
   5024		 * to be implemented.
   5025		 */
   5026		if (!trace.trace_syscalls) {
   5027			trace__delete_augmented_syscalls(&trace);
   5028		} else {
   5029			trace__set_bpf_map_filtered_pids(&trace);
   5030			trace__set_bpf_map_syscalls(&trace);
   5031			trace.syscalls.unaugmented_prog = trace__find_bpf_program_by_title(&trace, "!raw_syscalls:unaugmented");
   5032		}
   5033	}
   5034
   5035	err = bpf__setup_stdout(trace.evlist);
   5036	if (err) {
   5037		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
   5038		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
   5039		goto out;
   5040	}
   5041
   5042	err = -1;
   5043
   5044	if (map_dump_str) {
   5045		trace.dump.map = trace__find_bpf_map_by_name(&trace, map_dump_str);
   5046		if (trace.dump.map == NULL) {
   5047			pr_err("ERROR: BPF map \"%s\" not found\n", map_dump_str);
   5048			goto out;
   5049		}
   5050	}
   5051
   5052	if (trace.trace_pgfaults) {
   5053		trace.opts.sample_address = true;
   5054		trace.opts.sample_time = true;
   5055	}
   5056
   5057	if (trace.opts.mmap_pages == UINT_MAX)
   5058		mmap_pages_user_set = false;
   5059
   5060	if (trace.max_stack == UINT_MAX) {
   5061		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack();
   5062		max_stack_user_set = false;
   5063	}
   5064
   5065#ifdef HAVE_DWARF_UNWIND_SUPPORT
   5066	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
   5067		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
   5068	}
   5069#endif
   5070
   5071	if (callchain_param.enabled) {
   5072		if (!mmap_pages_user_set && geteuid() == 0)
   5073			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
   5074
   5075		symbol_conf.use_callchain = true;
   5076	}
   5077
   5078	if (trace.evlist->core.nr_entries > 0) {
   5079		evlist__set_default_evsel_handler(trace.evlist, trace__event_handler);
   5080		if (evlist__set_syscall_tp_fields(trace.evlist)) {
   5081			perror("failed to set syscalls:* tracepoint fields");
   5082			goto out;
   5083		}
   5084	}
   5085
   5086	if (trace.sort_events) {
   5087		ordered_events__init(&trace.oe.data, ordered_events__deliver_event, &trace);
   5088		ordered_events__set_copy_on_queue(&trace.oe.data, true);
   5089	}
   5090
   5091	/*
   5092	 * If we are augmenting syscalls, then combine what we put in the
   5093	 * __augmented_syscalls__ BPF map with what is in the
   5094	 * syscalls:sys_exit_FOO tracepoints, i.e. just like we do without BPF,
   5095	 * combining raw_syscalls:sys_enter with raw_syscalls:sys_exit.
   5096	 *
   5097	 * We'll switch to looking at two BPF maps, one for sys_enter and the
   5098	 * other for sys_exit when we start augmenting the sys_exit paths with
   5099	 * buffers that are being copied from kernel to userspace, think 'read'
   5100	 * syscall.
   5101	 */
   5102	if (trace.syscalls.events.augmented) {
   5103		evlist__for_each_entry(trace.evlist, evsel) {
   5104			bool raw_syscalls_sys_exit = strcmp(evsel__name(evsel), "raw_syscalls:sys_exit") == 0;
   5105
   5106			if (raw_syscalls_sys_exit) {
   5107				trace.raw_augmented_syscalls = true;
   5108				goto init_augmented_syscall_tp;
   5109			}
   5110
   5111			if (trace.syscalls.events.augmented->priv == NULL &&
   5112			    strstr(evsel__name(evsel), "syscalls:sys_enter")) {
   5113				struct evsel *augmented = trace.syscalls.events.augmented;
   5114				if (evsel__init_augmented_syscall_tp(augmented, evsel) ||
   5115				    evsel__init_augmented_syscall_tp_args(augmented))
   5116					goto out;
   5117				/*
   5118				 * 'augmented' is the __augmented_syscalls__ BPF_OUTPUT event.
   5119				 * Above we made sure we can get from its payload the tp fields
   5120				 * that come in the syscalls:sys_enter tracefs format file.
   5121				 */
   5122				augmented->handler = trace__sys_enter;
   5123				/*
   5124				 * Now we do the same for the *syscalls:sys_enter event so that
   5125				 * if we handle it directly, i.e. if the BPF prog returns 0 so
   5126				 * as not to filter it, then we'll handle it just like we would
   5127				 * for the BPF_OUTPUT one:
   5128				 */
   5129				if (evsel__init_augmented_syscall_tp(evsel, evsel) ||
   5130				    evsel__init_augmented_syscall_tp_args(evsel))
   5131					goto out;
   5132				evsel->handler = trace__sys_enter;
   5133			}
   5134
   5135			if (strstarts(evsel__name(evsel), "syscalls:sys_exit_")) {
   5136				struct syscall_tp *sc;
   5137init_augmented_syscall_tp:
   5138				if (evsel__init_augmented_syscall_tp(evsel, evsel))
   5139					goto out;
   5140				sc = __evsel__syscall_tp(evsel);
   5141				/*
   5142				 * For now with BPF raw_augmented we hook into
   5143				 * raw_syscalls:sys_enter and there we get all
   5144				 * 6 syscall args plus the tracepoint common
   5145				 * fields and the syscall_nr (another long).
   5146				 * So we check if that is the case and, if so,
   5147				 * don't limit ourselves to sc->args_size but
   5148				 * always use the full raw_syscalls:sys_enter
   5149				 * payload, which is fixed.
   5150				 *
   5151				 * We'll revisit this later to pass
   5152				 * sc->args_size to the BPF augmenter (now
   5153				 * tools/perf/examples/bpf/augmented_raw_syscalls.c),
   5154				 * so that it copies only what we need for each
   5155				 * syscall, like what happens when we use
   5156				 * syscalls:sys_enter_NAME, so that we reduce
   5157				 * the kernel/userspace traffic to just what is
   5158				 * needed for each syscall.
   5159				 */
   5160				if (trace.raw_augmented_syscalls)
   5161					trace.raw_augmented_syscalls_args_size = (6 + 1) * sizeof(long) + sc->id.offset;
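				/*
				 * Worked example, assuming a 64-bit arch
				 * (sizeof(long) == 8) and, hypothetically,
				 * sc->id.offset == 8 for the common fields:
				 * (6 + 1) * 8 + 8 = 64 fixed payload bytes.
				 */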
   5162				evsel__init_augmented_syscall_tp_ret(evsel);
   5163				evsel->handler = trace__sys_exit;
   5164			}
   5165		}
   5166	}
   5167
   5168	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
   5169		return trace__record(&trace, argc-1, &argv[1]);
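	/*
	 * I.e. recording for later analysis works like (illustrative):
	 *
	 *	perf trace record -- sleep 1	# write perf.data
	 *	perf trace -i perf.data -s	# replay it, summary only
	 */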
   5170
   5171	/* Using just --errno-summary will trigger --summary */
   5172	if (trace.errno_summary && !trace.summary && !trace.summary_only)
   5173		trace.summary_only = true;
   5174
   5175	/* summary_only implies summary option, but don't overwrite summary if set */
   5176	if (trace.summary_only)
   5177		trace.summary = trace.summary_only;
   5178
   5179	if (output_name != NULL) {
   5180		err = trace__open_output(&trace, output_name);
   5181		if (err < 0) {
   5182			perror("failed to create output file");
   5183			goto out;
   5184		}
   5185	}
   5186
   5187	err = evswitch__init(&trace.evswitch, trace.evlist, stderr);
   5188	if (err)
   5189		goto out_close;
   5190
   5191	err = target__validate(&trace.opts.target);
   5192	if (err) {
   5193		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
   5194		fprintf(trace.output, "%s", bf);
   5195		goto out_close;
   5196	}
   5197
   5198	err = target__parse_uid(&trace.opts.target);
   5199	if (err) {
   5200		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
   5201		fprintf(trace.output, "%s", bf);
   5202		goto out_close;
   5203	}
   5204
   5205	if (!argc && target__none(&trace.opts.target))
   5206		trace.opts.target.system_wide = true;
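	/*
	 * I.e. a bare 'perf trace', with no workload and no --pid/--tid/
	 * --cpu target, behaves like 'perf trace -a', tracing system wide.
	 */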
   5207
   5208	if (input_name)
   5209		err = trace__replay(&trace);
   5210	else
   5211		err = trace__run(&trace, argc, argv);
   5212
   5213out_close:
   5214	if (output_name != NULL)
   5215		fclose(trace.output);
   5216out:
   5217	trace__exit(&trace);
   5218	return err;
   5219}