cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

builtin-sched.c (92170B)


      1// SPDX-License-Identifier: GPL-2.0
      2#include "builtin.h"
      3#include "perf.h"
      4#include "perf-sys.h"
      5
      6#include "util/cpumap.h"
      7#include "util/evlist.h"
      8#include "util/evsel.h"
      9#include "util/evsel_fprintf.h"
     10#include "util/symbol.h"
     11#include "util/thread.h"
     12#include "util/header.h"
     13#include "util/session.h"
     14#include "util/tool.h"
     15#include "util/cloexec.h"
     16#include "util/thread_map.h"
     17#include "util/color.h"
     18#include "util/stat.h"
     19#include "util/string2.h"
     20#include "util/callchain.h"
     21#include "util/time-utils.h"
     22
     23#include <subcmd/pager.h>
     24#include <subcmd/parse-options.h>
     25#include "util/trace-event.h"
     26
     27#include "util/debug.h"
     28#include "util/event.h"
     29
     30#include <linux/kernel.h>
     31#include <linux/log2.h>
     32#include <linux/zalloc.h>
     33#include <sys/prctl.h>
     34#include <sys/resource.h>
     35#include <inttypes.h>
     36
     37#include <errno.h>
     38#include <semaphore.h>
     39#include <pthread.h>
     40#include <math.h>
     41#include <api/fs/fs.h>
     42#include <perf/cpumap.h>
     43#include <linux/time64.h>
     44#include <linux/err.h>
     45
     46#include <linux/ctype.h>
     47
     48#define PR_SET_NAME		15               /* Set process name */
     49#define MAX_CPUS		4096
     50#define COMM_LEN		20
     51#define SYM_LEN			129
     52#define MAX_PID			1024000
     53
     54static const char *cpu_list;
     55static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
     56
     57struct sched_atom;
     58
     59struct task_desc {
     60	unsigned long		nr;
     61	unsigned long		pid;
     62	char			comm[COMM_LEN];
     63
     64	unsigned long		nr_events;
     65	unsigned long		curr_event;
     66	struct sched_atom	**atoms;
     67
     68	pthread_t		thread;
     69	sem_t			sleep_sem;
     70
     71	sem_t			ready_for_work;
     72	sem_t			work_done_sem;
     73
     74	u64			cpu_usage;
     75};
     76
     77enum sched_event_type {
     78	SCHED_EVENT_RUN,
     79	SCHED_EVENT_SLEEP,
     80	SCHED_EVENT_WAKEUP,
     81	SCHED_EVENT_MIGRATION,
     82};
     83
     84struct sched_atom {
     85	enum sched_event_type	type;
     86	int			specific_wait;
     87	u64			timestamp;
     88	u64			duration;
     89	unsigned long		nr;
     90	sem_t			*wait_sem;
     91	struct task_desc	*wakee;
     92};
     93
     94#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"
     95
     96/* task state bitmask, copied from include/linux/sched.h */
     97#define TASK_RUNNING		0
     98#define TASK_INTERRUPTIBLE	1
     99#define TASK_UNINTERRUPTIBLE	2
    100#define __TASK_STOPPED		4
    101#define __TASK_TRACED		8
    102/* in tsk->exit_state */
    103#define EXIT_DEAD		16
    104#define EXIT_ZOMBIE		32
    105#define EXIT_TRACE		(EXIT_ZOMBIE | EXIT_DEAD)
    106/* in tsk->state again */
    107#define TASK_DEAD		64
    108#define TASK_WAKEKILL		128
    109#define TASK_WAKING		256
    110#define TASK_PARKED		512
    111
    112enum thread_state {
    113	THREAD_SLEEPING = 0,
    114	THREAD_WAIT_CPU,
    115	THREAD_SCHED_IN,
    116	THREAD_IGNORE
    117};
    118
    119struct work_atom {
    120	struct list_head	list;
    121	enum thread_state	state;
    122	u64			sched_out_time;
    123	u64			wake_up_time;
    124	u64			sched_in_time;
    125	u64			runtime;
    126};
    127
    128struct work_atoms {
    129	struct list_head	work_list;
    130	struct thread		*thread;
    131	struct rb_node		node;
    132	u64			max_lat;
    133	u64			max_lat_start;
    134	u64			max_lat_end;
    135	u64			total_lat;
    136	u64			nb_atoms;
    137	u64			total_runtime;
    138	int			num_merged;
    139};
    140
    141typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);
    142
    143struct perf_sched;
    144
    145struct trace_sched_handler {
    146	int (*switch_event)(struct perf_sched *sched, struct evsel *evsel,
    147			    struct perf_sample *sample, struct machine *machine);
    148
    149	int (*runtime_event)(struct perf_sched *sched, struct evsel *evsel,
    150			     struct perf_sample *sample, struct machine *machine);
    151
    152	int (*wakeup_event)(struct perf_sched *sched, struct evsel *evsel,
    153			    struct perf_sample *sample, struct machine *machine);
    154
    155	/* PERF_RECORD_FORK event, not sched_process_fork tracepoint */
    156	int (*fork_event)(struct perf_sched *sched, union perf_event *event,
    157			  struct machine *machine);
    158
    159	int (*migrate_task_event)(struct perf_sched *sched,
    160				  struct evsel *evsel,
    161				  struct perf_sample *sample,
    162				  struct machine *machine);
    163};
    164
    165#define COLOR_PIDS PERF_COLOR_BLUE
    166#define COLOR_CPUS PERF_COLOR_BG_RED
    167
    168struct perf_sched_map {
    169	DECLARE_BITMAP(comp_cpus_mask, MAX_CPUS);
    170	struct perf_cpu		*comp_cpus;
    171	bool			 comp;
    172	struct perf_thread_map *color_pids;
    173	const char		*color_pids_str;
    174	struct perf_cpu_map	*color_cpus;
    175	const char		*color_cpus_str;
    176	struct perf_cpu_map	*cpus;
    177	const char		*cpus_str;
    178};
    179
    180struct perf_sched {
    181	struct perf_tool tool;
    182	const char	 *sort_order;
    183	unsigned long	 nr_tasks;
    184	struct task_desc **pid_to_task;
    185	struct task_desc **tasks;
    186	const struct trace_sched_handler *tp_handler;
    187	pthread_mutex_t	 start_work_mutex;
    188	pthread_mutex_t	 work_done_wait_mutex;
    189	int		 profile_cpu;
    190/*
    191 * Track the current task - that way we can know whether there are any
    192 * weird events, such as a task being switched away that is not current.
    193 */
    194	struct perf_cpu	 max_cpu;
    195	u32		 curr_pid[MAX_CPUS];
    196	struct thread	 *curr_thread[MAX_CPUS];
    197	char		 next_shortname1;
    198	char		 next_shortname2;
    199	unsigned int	 replay_repeat;
    200	unsigned long	 nr_run_events;
    201	unsigned long	 nr_sleep_events;
    202	unsigned long	 nr_wakeup_events;
    203	unsigned long	 nr_sleep_corrections;
    204	unsigned long	 nr_run_events_optimized;
    205	unsigned long	 targetless_wakeups;
    206	unsigned long	 multitarget_wakeups;
    207	unsigned long	 nr_runs;
    208	unsigned long	 nr_timestamps;
    209	unsigned long	 nr_unordered_timestamps;
    210	unsigned long	 nr_context_switch_bugs;
    211	unsigned long	 nr_events;
    212	unsigned long	 nr_lost_chunks;
    213	unsigned long	 nr_lost_events;
    214	u64		 run_measurement_overhead;
    215	u64		 sleep_measurement_overhead;
    216	u64		 start_time;
    217	u64		 cpu_usage;
    218	u64		 runavg_cpu_usage;
    219	u64		 parent_cpu_usage;
    220	u64		 runavg_parent_cpu_usage;
    221	u64		 sum_runtime;
    222	u64		 sum_fluct;
    223	u64		 run_avg;
    224	u64		 all_runtime;
    225	u64		 all_count;
    226	u64		 cpu_last_switched[MAX_CPUS];
    227	struct rb_root_cached atom_root, sorted_atom_root, merged_atom_root;
    228	struct list_head sort_list, cmp_pid;
    229	bool force;
    230	bool skip_merge;
    231	struct perf_sched_map map;
    232
    233	/* options for timehist command */
    234	bool		summary;
    235	bool		summary_only;
    236	bool		idle_hist;
    237	bool		show_callchain;
    238	unsigned int	max_stack;
    239	bool		show_cpu_visual;
    240	bool		show_wakeups;
    241	bool		show_next;
    242	bool		show_migrations;
    243	bool		show_state;
    244	u64		skipped_samples;
    245	const char	*time_str;
    246	struct perf_time_interval ptime;
    247	struct perf_time_interval hist_time;
    248};
    249
    250/* per thread run time data */
    251struct thread_runtime {
    252	u64 last_time;      /* time of previous sched in/out event */
    253	u64 dt_run;         /* run time */
    254	u64 dt_sleep;       /* time between CPU access by sleep (off cpu) */
    255	u64 dt_iowait;      /* time between CPU access by iowait (off cpu) */
    256	u64 dt_preempt;     /* time between CPU access by preempt (off cpu) */
    257	u64 dt_delay;       /* time between wakeup and sched-in */
    258	u64 ready_to_run;   /* time of wakeup */
    259
    260	struct stats run_stats;
    261	u64 total_run_time;
    262	u64 total_sleep_time;
    263	u64 total_iowait_time;
    264	u64 total_preempt_time;
    265	u64 total_delay_time;
    266
    267	int last_state;
    268
    269	char shortname[3];
    270	bool comm_changed;
    271
    272	u64 migrations;
    273};
    274
    275/* per event run time data */
    276struct evsel_runtime {
    277	u64 *last_time; /* time this event was last seen per cpu */
    278	u32 ncpu;       /* highest cpu slot allocated */
    279};
    280
    281/* per cpu idle time data */
    282struct idle_thread_runtime {
    283	struct thread_runtime	tr;
    284	struct thread		*last_thread;
    285	struct rb_root_cached	sorted_root;
    286	struct callchain_root	callchain;
    287	struct callchain_cursor	cursor;
    288};
    289
    290/* track idle times per cpu */
    291static struct thread **idle_threads;
    292static int idle_max_cpu;
    293static char idle_comm[] = "<idle>";
    294
    295static u64 get_nsecs(void)
    296{
    297	struct timespec ts;
    298
    299	clock_gettime(CLOCK_MONOTONIC, &ts);
    300
    301	return ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
    302}
    303
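       /*
        * busy-loop for roughly @nsecs, compensating for the calibrated
        * run measurement overhead
        */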
    304static void burn_nsecs(struct perf_sched *sched, u64 nsecs)
    305{
    306	u64 T0 = get_nsecs(), T1;
    307
    308	do {
    309		T1 = get_nsecs();
    310	} while (T1 + sched->run_measurement_overhead < T0 + nsecs);
    311}
    312
    313static void sleep_nsecs(u64 nsecs)
    314{
    315	struct timespec ts;
    316
    317	ts.tv_nsec = nsecs % 999999999;
    318	ts.tv_sec = nsecs / 999999999;
    319
    320	nanosleep(&ts, NULL);
    321}
    322
    323static void calibrate_run_measurement_overhead(struct perf_sched *sched)
    324{
    325	u64 T0, T1, delta, min_delta = NSEC_PER_SEC;
    326	int i;
    327
    328	for (i = 0; i < 10; i++) {
    329		T0 = get_nsecs();
    330		burn_nsecs(sched, 0);
    331		T1 = get_nsecs();
    332		delta = T1-T0;
    333		min_delta = min(min_delta, delta);
    334	}
    335	sched->run_measurement_overhead = min_delta;
    336
    337	printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
    338}
    339
    340static void calibrate_sleep_measurement_overhead(struct perf_sched *sched)
    341{
    342	u64 T0, T1, delta, min_delta = NSEC_PER_SEC;
    343	int i;
    344
    345	for (i = 0; i < 10; i++) {
    346		T0 = get_nsecs();
    347		sleep_nsecs(10000);
    348		T1 = get_nsecs();
    349		delta = T1-T0;
    350		min_delta = min(min_delta, delta);
    351	}
    352	min_delta -= 10000;
    353	sched->sleep_measurement_overhead = min_delta;
    354
    355	printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
    356}
    357
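       /*
        * allocate a new sched_atom at @timestamp and append it to @task's
        * atoms array, growing the array as needed
        */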
    358static struct sched_atom *
    359get_new_event(struct task_desc *task, u64 timestamp)
    360{
    361	struct sched_atom *event = zalloc(sizeof(*event));
    362	unsigned long idx = task->nr_events;
    363	size_t size;
    364
    365	event->timestamp = timestamp;
    366	event->nr = idx;
    367
    368	task->nr_events++;
    369	size = sizeof(struct sched_atom *) * task->nr_events;
    370	task->atoms = realloc(task->atoms, size);
    371	BUG_ON(!task->atoms);
    372
    373	task->atoms[idx] = event;
    374
    375	return event;
    376}
    377
    378static struct sched_atom *last_event(struct task_desc *task)
    379{
    380	if (!task->nr_events)
    381		return NULL;
    382
    383	return task->atoms[task->nr_events - 1];
    384}
    385
    386static void add_sched_event_run(struct perf_sched *sched, struct task_desc *task,
    387				u64 timestamp, u64 duration)
    388{
    389	struct sched_atom *event, *curr_event = last_event(task);
    390
    391	/*
    392	 * optimize an existing RUN event by merging this one
    393	 * to it:
    394	 */
    395	if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
    396		sched->nr_run_events_optimized++;
    397		curr_event->duration += duration;
    398		return;
    399	}
    400
    401	event = get_new_event(task, timestamp);
    402
    403	event->type = SCHED_EVENT_RUN;
    404	event->duration = duration;
    405
    406	sched->nr_run_events++;
    407}
    408
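       /*
        * record a wakeup of @wakee by @task; if the wakee's last atom is a
        * pending sleep, attach a shared semaphore so the replayed wakeup can
        * post it
        */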
    409static void add_sched_event_wakeup(struct perf_sched *sched, struct task_desc *task,
    410				   u64 timestamp, struct task_desc *wakee)
    411{
    412	struct sched_atom *event, *wakee_event;
    413
    414	event = get_new_event(task, timestamp);
    415	event->type = SCHED_EVENT_WAKEUP;
    416	event->wakee = wakee;
    417
    418	wakee_event = last_event(wakee);
    419	if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
    420		sched->targetless_wakeups++;
    421		return;
    422	}
    423	if (wakee_event->wait_sem) {
    424		sched->multitarget_wakeups++;
    425		return;
    426	}
    427
    428	wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
    429	sem_init(wakee_event->wait_sem, 0, 0);
    430	wakee_event->specific_wait = 1;
    431	event->wait_sem = wakee_event->wait_sem;
    432
    433	sched->nr_wakeup_events++;
    434}
    435
    436static void add_sched_event_sleep(struct perf_sched *sched, struct task_desc *task,
    437				  u64 timestamp, u64 task_state __maybe_unused)
    438{
    439	struct sched_atom *event = get_new_event(task, timestamp);
    440
    441	event->type = SCHED_EVENT_SLEEP;
    442
    443	sched->nr_sleep_events++;
    444}
    445
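       /*
        * return the task_desc for @pid, creating it on first sight (growing
        * pid_to_task as needed); every new task starts with a sleep atom
        */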
    446static struct task_desc *register_pid(struct perf_sched *sched,
    447				      unsigned long pid, const char *comm)
    448{
    449	struct task_desc *task;
    450	static int pid_max;
    451
    452	if (sched->pid_to_task == NULL) {
    453		if (sysctl__read_int("kernel/pid_max", &pid_max) < 0)
    454			pid_max = MAX_PID;
    455		BUG_ON((sched->pid_to_task = calloc(pid_max, sizeof(struct task_desc *))) == NULL);
    456	}
    457	if (pid >= (unsigned long)pid_max) {
    458		BUG_ON((sched->pid_to_task = realloc(sched->pid_to_task, (pid + 1) *
    459			sizeof(struct task_desc *))) == NULL);
    460		while (pid >= (unsigned long)pid_max)
    461			sched->pid_to_task[pid_max++] = NULL;
    462	}
    463
    464	task = sched->pid_to_task[pid];
    465
    466	if (task)
    467		return task;
    468
    469	task = zalloc(sizeof(*task));
    470	task->pid = pid;
    471	task->nr = sched->nr_tasks;
    472	strcpy(task->comm, comm);
    473	/*
    474	 * every task starts in sleeping state - this gets ignored
    475	 * if there's no wakeup pointing to this sleep state:
    476	 */
    477	add_sched_event_sleep(sched, task, 0, 0);
    478
    479	sched->pid_to_task[pid] = task;
    480	sched->nr_tasks++;
    481	sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_desc *));
    482	BUG_ON(!sched->tasks);
    483	sched->tasks[task->nr] = task;
    484
    485	if (verbose > 0)
    486		printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm);
    487
    488	return task;
    489}
    490
    491
    492static void print_task_traces(struct perf_sched *sched)
    493{
    494	struct task_desc *task;
    495	unsigned long i;
    496
    497	for (i = 0; i < sched->nr_tasks; i++) {
    498		task = sched->tasks[i];
    499		printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
    500			task->nr, task->comm, task->pid, task->nr_events);
    501	}
    502}
    503
    504static void add_cross_task_wakeups(struct perf_sched *sched)
    505{
    506	struct task_desc *task1, *task2;
    507	unsigned long i, j;
    508
    509	for (i = 0; i < sched->nr_tasks; i++) {
    510		task1 = sched->tasks[i];
    511		j = i + 1;
    512		if (j == sched->nr_tasks)
    513			j = 0;
    514		task2 = sched->tasks[j];
    515		add_sched_event_wakeup(sched, task1, 0, task2);
    516	}
    517}
    518
    519static void perf_sched__process_event(struct perf_sched *sched,
    520				      struct sched_atom *atom)
    521{
    522	int ret = 0;
    523
    524	switch (atom->type) {
    525		case SCHED_EVENT_RUN:
    526			burn_nsecs(sched, atom->duration);
    527			break;
    528		case SCHED_EVENT_SLEEP:
    529			if (atom->wait_sem)
    530				ret = sem_wait(atom->wait_sem);
    531			BUG_ON(ret);
    532			break;
    533		case SCHED_EVENT_WAKEUP:
    534			if (atom->wait_sem)
    535				ret = sem_post(atom->wait_sem);
    536			BUG_ON(ret);
    537			break;
    538		case SCHED_EVENT_MIGRATION:
    539			break;
    540		default:
    541			BUG_ON(1);
    542	}
    543}
    544
    545static u64 get_cpu_usage_nsec_parent(void)
    546{
    547	struct rusage ru;
    548	u64 sum;
    549	int err;
    550
    551	err = getrusage(RUSAGE_SELF, &ru);
    552	BUG_ON(err);
    553
    554	sum =  ru.ru_utime.tv_sec * NSEC_PER_SEC + ru.ru_utime.tv_usec * NSEC_PER_USEC;
    555	sum += ru.ru_stime.tv_sec * NSEC_PER_SEC + ru.ru_stime.tv_usec * NSEC_PER_USEC;
    556
    557	return sum;
    558}
    559
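       /*
        * open a task-clock software counter for the calling thread; on
        * EMFILE, raise RLIMIT_NOFILE and retry if -f was given, otherwise
        * exit with a hint to use -f
        */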
    560static int self_open_counters(struct perf_sched *sched, unsigned long cur_task)
    561{
    562	struct perf_event_attr attr;
    563	char sbuf[STRERR_BUFSIZE], info[STRERR_BUFSIZE];
    564	int fd;
    565	struct rlimit limit;
    566	bool need_privilege = false;
    567
    568	memset(&attr, 0, sizeof(attr));
    569
    570	attr.type = PERF_TYPE_SOFTWARE;
    571	attr.config = PERF_COUNT_SW_TASK_CLOCK;
    572
    573force_again:
    574	fd = sys_perf_event_open(&attr, 0, -1, -1,
    575				 perf_event_open_cloexec_flag());
    576
    577	if (fd < 0) {
    578		if (errno == EMFILE) {
    579			if (sched->force) {
    580				BUG_ON(getrlimit(RLIMIT_NOFILE, &limit) == -1);
    581				limit.rlim_cur += sched->nr_tasks - cur_task;
    582				if (limit.rlim_cur > limit.rlim_max) {
    583					limit.rlim_max = limit.rlim_cur;
    584					need_privilege = true;
    585				}
    586				if (setrlimit(RLIMIT_NOFILE, &limit) == -1) {
    587					if (need_privilege && errno == EPERM)
    588						strcpy(info, "Need privilege\n");
    589				} else
    590					goto force_again;
    591			} else
    592				strcpy(info, "Have a try with -f option\n");
    593		}
    594		pr_err("Error: sys_perf_event_open() syscall returned "
    595		       "with %d (%s)\n%s", fd,
    596		       str_error_r(errno, sbuf, sizeof(sbuf)), info);
    597		exit(EXIT_FAILURE);
    598	}
    599	return fd;
    600}
    601
    602static u64 get_cpu_usage_nsec_self(int fd)
    603{
    604	u64 runtime;
    605	int ret;
    606
    607	ret = read(fd, &runtime, sizeof(runtime));
    608	BUG_ON(ret != sizeof(runtime));
    609
    610	return runtime;
    611}
    612
    613struct sched_thread_parms {
    614	struct task_desc  *task;
    615	struct perf_sched *sched;
    616	int fd;
    617};
    618
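       /*
        * replay worker: signal readiness, wait for the start mutex, replay
        * this task's atoms while measuring its task-clock usage, then report
        * completion and loop for the next repetition
        */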
    619static void *thread_func(void *ctx)
    620{
    621	struct sched_thread_parms *parms = ctx;
    622	struct task_desc *this_task = parms->task;
    623	struct perf_sched *sched = parms->sched;
    624	u64 cpu_usage_0, cpu_usage_1;
    625	unsigned long i, ret;
    626	char comm2[22];
    627	int fd = parms->fd;
    628
    629	zfree(&parms);
    630
    631	sprintf(comm2, ":%s", this_task->comm);
    632	prctl(PR_SET_NAME, comm2);
    633	if (fd < 0)
    634		return NULL;
    635again:
    636	ret = sem_post(&this_task->ready_for_work);
    637	BUG_ON(ret);
    638	ret = pthread_mutex_lock(&sched->start_work_mutex);
    639	BUG_ON(ret);
    640	ret = pthread_mutex_unlock(&sched->start_work_mutex);
    641	BUG_ON(ret);
    642
    643	cpu_usage_0 = get_cpu_usage_nsec_self(fd);
    644
    645	for (i = 0; i < this_task->nr_events; i++) {
    646		this_task->curr_event = i;
    647		perf_sched__process_event(sched, this_task->atoms[i]);
    648	}
    649
    650	cpu_usage_1 = get_cpu_usage_nsec_self(fd);
    651	this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
    652	ret = sem_post(&this_task->work_done_sem);
    653	BUG_ON(ret);
    654
    655	ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
    656	BUG_ON(ret);
    657	ret = pthread_mutex_unlock(&sched->work_done_wait_mutex);
    658	BUG_ON(ret);
    659
    660	goto again;
    661}
    662
    663static void create_tasks(struct perf_sched *sched)
    664{
    665	struct task_desc *task;
    666	pthread_attr_t attr;
    667	unsigned long i;
    668	int err;
    669
    670	err = pthread_attr_init(&attr);
    671	BUG_ON(err);
    672	err = pthread_attr_setstacksize(&attr,
    673			(size_t) max(16 * 1024, (int)PTHREAD_STACK_MIN));
    674	BUG_ON(err);
    675	err = pthread_mutex_lock(&sched->start_work_mutex);
    676	BUG_ON(err);
    677	err = pthread_mutex_lock(&sched->work_done_wait_mutex);
    678	BUG_ON(err);
    679	for (i = 0; i < sched->nr_tasks; i++) {
    680		struct sched_thread_parms *parms = malloc(sizeof(*parms));
    681		BUG_ON(parms == NULL);
    682		parms->task = task = sched->tasks[i];
    683		parms->sched = sched;
    684		parms->fd = self_open_counters(sched, i);
    685		sem_init(&task->sleep_sem, 0, 0);
    686		sem_init(&task->ready_for_work, 0, 0);
    687		sem_init(&task->work_done_sem, 0, 0);
    688		task->curr_event = 0;
    689		err = pthread_create(&task->thread, &attr, thread_func, parms);
    690		BUG_ON(err);
    691	}
    692}
    693
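       /*
        * drive one replay iteration: wait until every worker is ready,
        * release them via the start mutex, then collect per-task CPU usage
        * and update the running averages
        */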
    694static void wait_for_tasks(struct perf_sched *sched)
    695{
    696	u64 cpu_usage_0, cpu_usage_1;
    697	struct task_desc *task;
    698	unsigned long i, ret;
    699
    700	sched->start_time = get_nsecs();
    701	sched->cpu_usage = 0;
    702	pthread_mutex_unlock(&sched->work_done_wait_mutex);
    703
    704	for (i = 0; i < sched->nr_tasks; i++) {
    705		task = sched->tasks[i];
    706		ret = sem_wait(&task->ready_for_work);
    707		BUG_ON(ret);
    708		sem_init(&task->ready_for_work, 0, 0);
    709	}
    710	ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
    711	BUG_ON(ret);
    712
    713	cpu_usage_0 = get_cpu_usage_nsec_parent();
    714
    715	pthread_mutex_unlock(&sched->start_work_mutex);
    716
    717	for (i = 0; i < sched->nr_tasks; i++) {
    718		task = sched->tasks[i];
    719		ret = sem_wait(&task->work_done_sem);
    720		BUG_ON(ret);
    721		sem_init(&task->work_done_sem, 0, 0);
    722		sched->cpu_usage += task->cpu_usage;
    723		task->cpu_usage = 0;
    724	}
    725
    726	cpu_usage_1 = get_cpu_usage_nsec_parent();
    727	if (!sched->runavg_cpu_usage)
    728		sched->runavg_cpu_usage = sched->cpu_usage;
    729	sched->runavg_cpu_usage = (sched->runavg_cpu_usage * (sched->replay_repeat - 1) + sched->cpu_usage) / sched->replay_repeat;
    730
    731	sched->parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
    732	if (!sched->runavg_parent_cpu_usage)
    733		sched->runavg_parent_cpu_usage = sched->parent_cpu_usage;
    734	sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * (sched->replay_repeat - 1) +
    735					 sched->parent_cpu_usage)/sched->replay_repeat;
    736
    737	ret = pthread_mutex_lock(&sched->start_work_mutex);
    738	BUG_ON(ret);
    739
    740	for (i = 0; i < sched->nr_tasks; i++) {
    741		task = sched->tasks[i];
    742		sem_init(&task->sleep_sem, 0, 0);
    743		task->curr_event = 0;
    744	}
    745}
    746
    747static void run_one_test(struct perf_sched *sched)
    748{
    749	u64 T0, T1, delta, avg_delta, fluct;
    750
    751	T0 = get_nsecs();
    752	wait_for_tasks(sched);
    753	T1 = get_nsecs();
    754
    755	delta = T1 - T0;
    756	sched->sum_runtime += delta;
    757	sched->nr_runs++;
    758
    759	avg_delta = sched->sum_runtime / sched->nr_runs;
    760	if (delta < avg_delta)
    761		fluct = avg_delta - delta;
    762	else
    763		fluct = delta - avg_delta;
    764	sched->sum_fluct += fluct;
    765	if (!sched->run_avg)
    766		sched->run_avg = delta;
    767	sched->run_avg = (sched->run_avg * (sched->replay_repeat - 1) + delta) / sched->replay_repeat;
    768
    769	printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / NSEC_PER_MSEC);
    770
    771	printf("ravg: %0.2f, ", (double)sched->run_avg / NSEC_PER_MSEC);
    772
    773	printf("cpu: %0.2f / %0.2f",
    774		(double)sched->cpu_usage / NSEC_PER_MSEC, (double)sched->runavg_cpu_usage / NSEC_PER_MSEC);
    775
    776#if 0
    777	/*
    778	 * rusage statistics done by the parent; these are less
    779	 * accurate than the sched->sum_exec_runtime based statistics:
    780	 */
    781	printf(" [%0.2f / %0.2f]",
    782		(double)sched->parent_cpu_usage / NSEC_PER_MSEC,
    783		(double)sched->runavg_parent_cpu_usage / NSEC_PER_MSEC);
    784#endif
    785
    786	printf("\n");
    787
    788	if (sched->nr_sleep_corrections)
    789		printf(" (%ld sleep corrections)\n", sched->nr_sleep_corrections);
    790	sched->nr_sleep_corrections = 0;
    791}
    792
    793static void test_calibrations(struct perf_sched *sched)
    794{
    795	u64 T0, T1;
    796
    797	T0 = get_nsecs();
    798	burn_nsecs(sched, NSEC_PER_MSEC);
    799	T1 = get_nsecs();
    800
    801	printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);
    802
    803	T0 = get_nsecs();
    804	sleep_nsecs(NSEC_PER_MSEC);
    805	T1 = get_nsecs();
    806
    807	printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
    808}
    809
    810static int
    811replay_wakeup_event(struct perf_sched *sched,
    812		    struct evsel *evsel, struct perf_sample *sample,
    813		    struct machine *machine __maybe_unused)
    814{
    815	const char *comm = evsel__strval(evsel, sample, "comm");
    816	const u32 pid	 = evsel__intval(evsel, sample, "pid");
    817	struct task_desc *waker, *wakee;
    818
    819	if (verbose > 0) {
    820		printf("sched_wakeup event %p\n", evsel);
    821
    822		printf(" ... pid %d woke up %s/%d\n", sample->tid, comm, pid);
    823	}
    824
    825	waker = register_pid(sched, sample->tid, "<unknown>");
    826	wakee = register_pid(sched, pid, comm);
    827
    828	add_sched_event_wakeup(sched, waker, sample->time, wakee);
    829	return 0;
    830}
    831
    832static int replay_switch_event(struct perf_sched *sched,
    833			       struct evsel *evsel,
    834			       struct perf_sample *sample,
    835			       struct machine *machine __maybe_unused)
    836{
    837	const char *prev_comm  = evsel__strval(evsel, sample, "prev_comm"),
    838		   *next_comm  = evsel__strval(evsel, sample, "next_comm");
    839	const u32 prev_pid = evsel__intval(evsel, sample, "prev_pid"),
    840		  next_pid = evsel__intval(evsel, sample, "next_pid");
    841	const u64 prev_state = evsel__intval(evsel, sample, "prev_state");
    842	struct task_desc *prev, __maybe_unused *next;
    843	u64 timestamp0, timestamp = sample->time;
    844	int cpu = sample->cpu;
    845	s64 delta;
    846
    847	if (verbose > 0)
    848		printf("sched_switch event %p\n", evsel);
    849
    850	if (cpu >= MAX_CPUS || cpu < 0)
    851		return 0;
    852
    853	timestamp0 = sched->cpu_last_switched[cpu];
    854	if (timestamp0)
    855		delta = timestamp - timestamp0;
    856	else
    857		delta = 0;
    858
    859	if (delta < 0) {
    860		pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
    861		return -1;
    862	}
    863
    864	pr_debug(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n",
    865		 prev_comm, prev_pid, next_comm, next_pid, delta);
    866
    867	prev = register_pid(sched, prev_pid, prev_comm);
    868	next = register_pid(sched, next_pid, next_comm);
    869
    870	sched->cpu_last_switched[cpu] = timestamp;
    871
    872	add_sched_event_run(sched, prev, timestamp, delta);
    873	add_sched_event_sleep(sched, prev, timestamp, prev_state);
    874
    875	return 0;
    876}
    877
    878static int replay_fork_event(struct perf_sched *sched,
    879			     union perf_event *event,
    880			     struct machine *machine)
    881{
    882	struct thread *child, *parent;
    883
    884	child = machine__findnew_thread(machine, event->fork.pid,
    885					event->fork.tid);
    886	parent = machine__findnew_thread(machine, event->fork.ppid,
    887					 event->fork.ptid);
    888
    889	if (child == NULL || parent == NULL) {
    890		pr_debug("thread does not exist on fork event: child %p, parent %p\n",
    891				 child, parent);
    892		goto out_put;
    893	}
    894
    895	if (verbose > 0) {
    896		printf("fork event\n");
    897		printf("... parent: %s/%d\n", thread__comm_str(parent), parent->tid);
    898		printf("...  child: %s/%d\n", thread__comm_str(child), child->tid);
    899	}
    900
    901	register_pid(sched, parent->tid, thread__comm_str(parent));
    902	register_pid(sched, child->tid, thread__comm_str(child));
    903out_put:
    904	thread__put(child);
    905	thread__put(parent);
    906	return 0;
    907}
    908
    909struct sort_dimension {
    910	const char		*name;
    911	sort_fn_t		cmp;
    912	struct list_head	list;
    913};
    914
    915/*
    916 * handle runtime stats saved per thread
    917 */
    918static struct thread_runtime *thread__init_runtime(struct thread *thread)
    919{
    920	struct thread_runtime *r;
    921
    922	r = zalloc(sizeof(struct thread_runtime));
    923	if (!r)
    924		return NULL;
    925
    926	init_stats(&r->run_stats);
    927	thread__set_priv(thread, r);
    928
    929	return r;
    930}
    931
    932static struct thread_runtime *thread__get_runtime(struct thread *thread)
    933{
    934	struct thread_runtime *tr;
    935
    936	tr = thread__priv(thread);
    937	if (tr == NULL) {
    938		tr = thread__init_runtime(thread);
    939		if (tr == NULL)
    940			pr_debug("Failed to malloc memory for runtime data.\n");
    941	}
    942
    943	return tr;
    944}
    945
    946static int
    947thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r)
    948{
    949	struct sort_dimension *sort;
    950	int ret = 0;
    951
    952	BUG_ON(list_empty(list));
    953
    954	list_for_each_entry(sort, list, list) {
    955		ret = sort->cmp(l, r);
    956		if (ret)
    957			return ret;
    958	}
    959
    960	return ret;
    961}
    962
    963static struct work_atoms *
    964thread_atoms_search(struct rb_root_cached *root, struct thread *thread,
    965			 struct list_head *sort_list)
    966{
    967	struct rb_node *node = root->rb_root.rb_node;
    968	struct work_atoms key = { .thread = thread };
    969
    970	while (node) {
    971		struct work_atoms *atoms;
    972		int cmp;
    973
    974		atoms = container_of(node, struct work_atoms, node);
    975
    976		cmp = thread_lat_cmp(sort_list, &key, atoms);
    977		if (cmp > 0)
    978			node = node->rb_left;
    979		else if (cmp < 0)
    980			node = node->rb_right;
    981		else {
    982			BUG_ON(thread != atoms->thread);
    983			return atoms;
    984		}
    985	}
    986	return NULL;
    987}
    988
    989static void
    990__thread_latency_insert(struct rb_root_cached *root, struct work_atoms *data,
    991			 struct list_head *sort_list)
    992{
    993	struct rb_node **new = &(root->rb_root.rb_node), *parent = NULL;
    994	bool leftmost = true;
    995
    996	while (*new) {
    997		struct work_atoms *this;
    998		int cmp;
    999
   1000		this = container_of(*new, struct work_atoms, node);
   1001		parent = *new;
   1002
   1003		cmp = thread_lat_cmp(sort_list, data, this);
   1004
   1005		if (cmp > 0)
   1006			new = &((*new)->rb_left);
   1007		else {
   1008			new = &((*new)->rb_right);
   1009			leftmost = false;
   1010		}
   1011	}
   1012
   1013	rb_link_node(&data->node, parent, new);
   1014	rb_insert_color_cached(&data->node, root, leftmost);
   1015}
   1016
   1017static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread)
   1018{
   1019	struct work_atoms *atoms = zalloc(sizeof(*atoms));
   1020	if (!atoms) {
   1021		pr_err("No memory at %s\n", __func__);
   1022		return -1;
   1023	}
   1024
   1025	atoms->thread = thread__get(thread);
   1026	INIT_LIST_HEAD(&atoms->work_list);
   1027	__thread_latency_insert(&sched->atom_root, atoms, &sched->cmp_pid);
   1028	return 0;
   1029}
   1030
   1031static char sched_out_state(u64 prev_state)
   1032{
   1033	const char *str = TASK_STATE_TO_CHAR_STR;
   1034
   1035	return str[prev_state];
   1036}
   1037
   1038static int
   1039add_sched_out_event(struct work_atoms *atoms,
   1040		    char run_state,
   1041		    u64 timestamp)
   1042{
   1043	struct work_atom *atom = zalloc(sizeof(*atom));
   1044	if (!atom) {
   1045		pr_err("No memory at %s", __func__);
   1046		return -1;
   1047	}
   1048
   1049	atom->sched_out_time = timestamp;
   1050
   1051	if (run_state == 'R') {
   1052		atom->state = THREAD_WAIT_CPU;
   1053		atom->wake_up_time = atom->sched_out_time;
   1054	}
   1055
   1056	list_add_tail(&atom->list, &atoms->work_list);
   1057	return 0;
   1058}
   1059
   1060static void
   1061add_runtime_event(struct work_atoms *atoms, u64 delta,
   1062		  u64 timestamp __maybe_unused)
   1063{
   1064	struct work_atom *atom;
   1065
   1066	BUG_ON(list_empty(&atoms->work_list));
   1067
   1068	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
   1069
   1070	atom->runtime += delta;
   1071	atoms->total_runtime += delta;
   1072}
   1073
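       /*
        * complete the latest atom when the thread is scheduled in: account
        * the wakeup-to-sched-in delta as its latency and track the maximum
        */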
   1074static void
   1075add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
   1076{
   1077	struct work_atom *atom;
   1078	u64 delta;
   1079
   1080	if (list_empty(&atoms->work_list))
   1081		return;
   1082
   1083	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
   1084
   1085	if (atom->state != THREAD_WAIT_CPU)
   1086		return;
   1087
   1088	if (timestamp < atom->wake_up_time) {
   1089		atom->state = THREAD_IGNORE;
   1090		return;
   1091	}
   1092
   1093	atom->state = THREAD_SCHED_IN;
   1094	atom->sched_in_time = timestamp;
   1095
   1096	delta = atom->sched_in_time - atom->wake_up_time;
   1097	atoms->total_lat += delta;
   1098	if (delta > atoms->max_lat) {
   1099		atoms->max_lat = delta;
   1100		atoms->max_lat_start = atom->wake_up_time;
   1101		atoms->max_lat_end = timestamp;
   1102	}
   1103	atoms->nb_atoms++;
   1104}
   1105
   1106static int latency_switch_event(struct perf_sched *sched,
   1107				struct evsel *evsel,
   1108				struct perf_sample *sample,
   1109				struct machine *machine)
   1110{
   1111	const u32 prev_pid = evsel__intval(evsel, sample, "prev_pid"),
   1112		  next_pid = evsel__intval(evsel, sample, "next_pid");
   1113	const u64 prev_state = evsel__intval(evsel, sample, "prev_state");
   1114	struct work_atoms *out_events, *in_events;
   1115	struct thread *sched_out, *sched_in;
   1116	u64 timestamp0, timestamp = sample->time;
   1117	int cpu = sample->cpu, err = -1;
   1118	s64 delta;
   1119
   1120	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
   1121
   1122	timestamp0 = sched->cpu_last_switched[cpu];
   1123	sched->cpu_last_switched[cpu] = timestamp;
   1124	if (timestamp0)
   1125		delta = timestamp - timestamp0;
   1126	else
   1127		delta = 0;
   1128
   1129	if (delta < 0) {
   1130		pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
   1131		return -1;
   1132	}
   1133
   1134	sched_out = machine__findnew_thread(machine, -1, prev_pid);
   1135	sched_in = machine__findnew_thread(machine, -1, next_pid);
   1136	if (sched_out == NULL || sched_in == NULL)
   1137		goto out_put;
   1138
   1139	out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
   1140	if (!out_events) {
   1141		if (thread_atoms_insert(sched, sched_out))
   1142			goto out_put;
   1143		out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
   1144		if (!out_events) {
   1145			pr_err("out-event: Internal tree error");
   1146			goto out_put;
   1147		}
   1148	}
   1149	if (add_sched_out_event(out_events, sched_out_state(prev_state), timestamp))
   1150		return -1;
   1151
   1152	in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
   1153	if (!in_events) {
   1154		if (thread_atoms_insert(sched, sched_in))
   1155			goto out_put;
   1156		in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
   1157		if (!in_events) {
   1158			pr_err("in-event: Internal tree error");
   1159			goto out_put;
   1160		}
   1161		/*
   1162		 * Task came in that we have not heard about yet,
   1163		 * add an initial atom in runnable state:
   1164		 */
   1165		if (add_sched_out_event(in_events, 'R', timestamp))
   1166			goto out_put;
   1167	}
   1168	add_sched_in_event(in_events, timestamp);
   1169	err = 0;
   1170out_put:
   1171	thread__put(sched_out);
   1172	thread__put(sched_in);
   1173	return err;
   1174}
   1175
   1176static int latency_runtime_event(struct perf_sched *sched,
   1177				 struct evsel *evsel,
   1178				 struct perf_sample *sample,
   1179				 struct machine *machine)
   1180{
   1181	const u32 pid	   = evsel__intval(evsel, sample, "pid");
   1182	const u64 runtime  = evsel__intval(evsel, sample, "runtime");
   1183	struct thread *thread = machine__findnew_thread(machine, -1, pid);
   1184	struct work_atoms *atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
   1185	u64 timestamp = sample->time;
   1186	int cpu = sample->cpu, err = -1;
   1187
   1188	if (thread == NULL)
   1189		return -1;
   1190
   1191	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
   1192	if (!atoms) {
   1193		if (thread_atoms_insert(sched, thread))
   1194			goto out_put;
   1195		atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
   1196		if (!atoms) {
   1197			pr_err("in-event: Internal tree error");
   1198			goto out_put;
   1199		}
   1200		if (add_sched_out_event(atoms, 'R', timestamp))
   1201			goto out_put;
   1202	}
   1203
   1204	add_runtime_event(atoms, runtime, timestamp);
   1205	err = 0;
   1206out_put:
   1207	thread__put(thread);
   1208	return err;
   1209}
   1210
   1211static int latency_wakeup_event(struct perf_sched *sched,
   1212				struct evsel *evsel,
   1213				struct perf_sample *sample,
   1214				struct machine *machine)
   1215{
   1216	const u32 pid	  = evsel__intval(evsel, sample, "pid");
   1217	struct work_atoms *atoms;
   1218	struct work_atom *atom;
   1219	struct thread *wakee;
   1220	u64 timestamp = sample->time;
   1221	int err = -1;
   1222
   1223	wakee = machine__findnew_thread(machine, -1, pid);
   1224	if (wakee == NULL)
   1225		return -1;
   1226	atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
   1227	if (!atoms) {
   1228		if (thread_atoms_insert(sched, wakee))
   1229			goto out_put;
   1230		atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
   1231		if (!atoms) {
   1232			pr_err("wakeup-event: Internal tree error");
   1233			goto out_put;
   1234		}
   1235		if (add_sched_out_event(atoms, 'S', timestamp))
   1236			goto out_put;
   1237	}
   1238
   1239	BUG_ON(list_empty(&atoms->work_list));
   1240
   1241	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
   1242
   1243	/*
   1244	 * A wakeup event is not guaranteed to happen while the task
   1245	 * is off the run queue; it may also fire while the task is
   1246	 * already on the run queue and only changes ->state to
   1247	 * TASK_RUNNING, in which case we should not set the
   1248	 * ->wake_up_time of a task that is already runnable.
   1249	 *
   1250	 * You WILL be missing events if you've recorded only
   1251	 * one CPU, or are only looking at one, so don't
   1252	 * skip in this case.
   1253	 */
   1254	if (sched->profile_cpu == -1 && atom->state != THREAD_SLEEPING)
   1255		goto out_ok;
   1256
   1257	sched->nr_timestamps++;
   1258	if (atom->sched_out_time > timestamp) {
   1259		sched->nr_unordered_timestamps++;
   1260		goto out_ok;
   1261	}
   1262
   1263	atom->state = THREAD_WAIT_CPU;
   1264	atom->wake_up_time = timestamp;
   1265out_ok:
   1266	err = 0;
   1267out_put:
   1268	thread__put(wakee);
   1269	return err;
   1270}
   1271
   1272static int latency_migrate_task_event(struct perf_sched *sched,
   1273				      struct evsel *evsel,
   1274				      struct perf_sample *sample,
   1275				      struct machine *machine)
   1276{
   1277	const u32 pid = evsel__intval(evsel, sample, "pid");
   1278	u64 timestamp = sample->time;
   1279	struct work_atoms *atoms;
   1280	struct work_atom *atom;
   1281	struct thread *migrant;
   1282	int err = -1;
   1283
   1284	/*
   1285	 * Only need to worry about migration when profiling one CPU.
   1286	 */
   1287	if (sched->profile_cpu == -1)
   1288		return 0;
   1289
   1290	migrant = machine__findnew_thread(machine, -1, pid);
   1291	if (migrant == NULL)
   1292		return -1;
   1293	atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
   1294	if (!atoms) {
   1295		if (thread_atoms_insert(sched, migrant))
   1296			goto out_put;
   1297		register_pid(sched, migrant->tid, thread__comm_str(migrant));
   1298		atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
   1299		if (!atoms) {
   1300			pr_err("migration-event: Internal tree error");
   1301			goto out_put;
   1302		}
   1303		if (add_sched_out_event(atoms, 'R', timestamp))
   1304			goto out_put;
   1305	}
   1306
   1307	BUG_ON(list_empty(&atoms->work_list));
   1308
   1309	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
   1310	atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp;
   1311
   1312	sched->nr_timestamps++;
   1313
   1314	if (atom->sched_out_time > timestamp)
   1315		sched->nr_unordered_timestamps++;
   1316	err = 0;
   1317out_put:
   1318	thread__put(migrant);
   1319	return err;
   1320}
   1321
   1322static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_list)
   1323{
   1324	int i;
   1325	int ret;
   1326	u64 avg;
   1327	char max_lat_start[32], max_lat_end[32];
   1328
   1329	if (!work_list->nb_atoms)
   1330		return;
   1331	/*
   1332	 * Ignore idle threads:
   1333	 */
   1334	if (!strcmp(thread__comm_str(work_list->thread), "swapper"))
   1335		return;
   1336
   1337	sched->all_runtime += work_list->total_runtime;
   1338	sched->all_count   += work_list->nb_atoms;
   1339
   1340	if (work_list->num_merged > 1)
   1341		ret = printf("  %s:(%d) ", thread__comm_str(work_list->thread), work_list->num_merged);
   1342	else
   1343		ret = printf("  %s:%d ", thread__comm_str(work_list->thread), work_list->thread->tid);
   1344
   1345	for (i = 0; i < 24 - ret; i++)
   1346		printf(" ");
   1347
   1348	avg = work_list->total_lat / work_list->nb_atoms;
   1349	timestamp__scnprintf_usec(work_list->max_lat_start, max_lat_start, sizeof(max_lat_start));
   1350	timestamp__scnprintf_usec(work_list->max_lat_end, max_lat_end, sizeof(max_lat_end));
   1351
   1352	printf("|%11.3f ms |%9" PRIu64 " | avg:%8.3f ms | max:%8.3f ms | max start: %12s s | max end: %12s s\n",
   1353	      (double)work_list->total_runtime / NSEC_PER_MSEC,
   1354		 work_list->nb_atoms, (double)avg / NSEC_PER_MSEC,
   1355		 (double)work_list->max_lat / NSEC_PER_MSEC,
   1356		 max_lat_start, max_lat_end);
   1357}
   1358
   1359static int pid_cmp(struct work_atoms *l, struct work_atoms *r)
   1360{
   1361	if (l->thread == r->thread)
   1362		return 0;
   1363	if (l->thread->tid < r->thread->tid)
   1364		return -1;
   1365	if (l->thread->tid > r->thread->tid)
   1366		return 1;
   1367	return (int)(l->thread - r->thread);
   1368}
   1369
   1370static int avg_cmp(struct work_atoms *l, struct work_atoms *r)
   1371{
   1372	u64 avgl, avgr;
   1373
   1374	if (!l->nb_atoms)
   1375		return -1;
   1376
   1377	if (!r->nb_atoms)
   1378		return 1;
   1379
   1380	avgl = l->total_lat / l->nb_atoms;
   1381	avgr = r->total_lat / r->nb_atoms;
   1382
   1383	if (avgl < avgr)
   1384		return -1;
   1385	if (avgl > avgr)
   1386		return 1;
   1387
   1388	return 0;
   1389}
   1390
   1391static int max_cmp(struct work_atoms *l, struct work_atoms *r)
   1392{
   1393	if (l->max_lat < r->max_lat)
   1394		return -1;
   1395	if (l->max_lat > r->max_lat)
   1396		return 1;
   1397
   1398	return 0;
   1399}
   1400
   1401static int switch_cmp(struct work_atoms *l, struct work_atoms *r)
   1402{
   1403	if (l->nb_atoms < r->nb_atoms)
   1404		return -1;
   1405	if (l->nb_atoms > r->nb_atoms)
   1406		return 1;
   1407
   1408	return 0;
   1409}
   1410
   1411static int runtime_cmp(struct work_atoms *l, struct work_atoms *r)
   1412{
   1413	if (l->total_runtime < r->total_runtime)
   1414		return -1;
   1415	if (l->total_runtime > r->total_runtime)
   1416		return 1;
   1417
   1418	return 0;
   1419}
   1420
   1421static int sort_dimension__add(const char *tok, struct list_head *list)
   1422{
   1423	size_t i;
   1424	static struct sort_dimension avg_sort_dimension = {
   1425		.name = "avg",
   1426		.cmp  = avg_cmp,
   1427	};
   1428	static struct sort_dimension max_sort_dimension = {
   1429		.name = "max",
   1430		.cmp  = max_cmp,
   1431	};
   1432	static struct sort_dimension pid_sort_dimension = {
   1433		.name = "pid",
   1434		.cmp  = pid_cmp,
   1435	};
   1436	static struct sort_dimension runtime_sort_dimension = {
   1437		.name = "runtime",
   1438		.cmp  = runtime_cmp,
   1439	};
   1440	static struct sort_dimension switch_sort_dimension = {
   1441		.name = "switch",
   1442		.cmp  = switch_cmp,
   1443	};
   1444	struct sort_dimension *available_sorts[] = {
   1445		&pid_sort_dimension,
   1446		&avg_sort_dimension,
   1447		&max_sort_dimension,
   1448		&switch_sort_dimension,
   1449		&runtime_sort_dimension,
   1450	};
   1451
   1452	for (i = 0; i < ARRAY_SIZE(available_sorts); i++) {
   1453		if (!strcmp(available_sorts[i]->name, tok)) {
   1454			list_add_tail(&available_sorts[i]->list, list);
   1455
   1456			return 0;
   1457		}
   1458	}
   1459
   1460	return -1;
   1461}
   1462
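       /*
        * drain atom_root (and then merged_atom_root) into sorted_atom_root,
        * re-inserting each entry according to the configured sort keys
        */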
   1463static void perf_sched__sort_lat(struct perf_sched *sched)
   1464{
   1465	struct rb_node *node;
   1466	struct rb_root_cached *root = &sched->atom_root;
   1467again:
   1468	for (;;) {
   1469		struct work_atoms *data;
   1470		node = rb_first_cached(root);
   1471		if (!node)
   1472			break;
   1473
   1474		rb_erase_cached(node, root);
   1475		data = rb_entry(node, struct work_atoms, node);
   1476		__thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list);
   1477	}
   1478	if (root == &sched->atom_root) {
   1479		root = &sched->merged_atom_root;
   1480		goto again;
   1481	}
   1482}
   1483
   1484static int process_sched_wakeup_event(struct perf_tool *tool,
   1485				      struct evsel *evsel,
   1486				      struct perf_sample *sample,
   1487				      struct machine *machine)
   1488{
   1489	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
   1490
   1491	if (sched->tp_handler->wakeup_event)
   1492		return sched->tp_handler->wakeup_event(sched, evsel, sample, machine);
   1493
   1494	return 0;
   1495}
   1496
   1497union map_priv {
   1498	void	*ptr;
   1499	bool	 color;
   1500};
   1501
   1502static bool thread__has_color(struct thread *thread)
   1503{
   1504	union map_priv priv = {
   1505		.ptr = thread__priv(thread),
   1506	};
   1507
   1508	return priv.color;
   1509}
   1510
   1511static struct thread*
   1512map__findnew_thread(struct perf_sched *sched, struct machine *machine, pid_t pid, pid_t tid)
   1513{
   1514	struct thread *thread = machine__findnew_thread(machine, pid, tid);
   1515	union map_priv priv = {
   1516		.color = false,
   1517	};
   1518
   1519	if (!sched->map.color_pids || !thread || thread__priv(thread))
   1520		return thread;
   1521
   1522	if (thread_map__has(sched->map.color_pids, tid))
   1523		priv.color = true;
   1524
   1525	thread__set_priv(thread, priv.ptr);
   1526	return thread;
   1527}
   1528
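       /*
        * 'perf sched map' switch handler: assign the incoming task a
        * two-character shortname if needed and print one row showing what
        * runs on each CPU
        */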
   1529static int map_switch_event(struct perf_sched *sched, struct evsel *evsel,
   1530			    struct perf_sample *sample, struct machine *machine)
   1531{
   1532	const u32 next_pid = evsel__intval(evsel, sample, "next_pid");
   1533	struct thread *sched_in;
   1534	struct thread_runtime *tr;
   1535	int new_shortname;
   1536	u64 timestamp0, timestamp = sample->time;
   1537	s64 delta;
   1538	int i;
   1539	struct perf_cpu this_cpu = {
   1540		.cpu = sample->cpu,
   1541	};
   1542	int cpus_nr;
   1543	bool new_cpu = false;
   1544	const char *color = PERF_COLOR_NORMAL;
   1545	char stimestamp[32];
   1546
   1547	BUG_ON(this_cpu.cpu >= MAX_CPUS || this_cpu.cpu < 0);
   1548
   1549	if (this_cpu.cpu > sched->max_cpu.cpu)
   1550		sched->max_cpu = this_cpu;
   1551
   1552	if (sched->map.comp) {
   1553		cpus_nr = bitmap_weight(sched->map.comp_cpus_mask, MAX_CPUS);
   1554		if (!test_and_set_bit(this_cpu.cpu, sched->map.comp_cpus_mask)) {
   1555			sched->map.comp_cpus[cpus_nr++] = this_cpu;
   1556			new_cpu = true;
   1557		}
   1558	} else
   1559		cpus_nr = sched->max_cpu.cpu;
   1560
   1561	timestamp0 = sched->cpu_last_switched[this_cpu.cpu];
   1562	sched->cpu_last_switched[this_cpu.cpu] = timestamp;
   1563	if (timestamp0)
   1564		delta = timestamp - timestamp0;
   1565	else
   1566		delta = 0;
   1567
   1568	if (delta < 0) {
   1569		pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
   1570		return -1;
   1571	}
   1572
   1573	sched_in = map__findnew_thread(sched, machine, -1, next_pid);
   1574	if (sched_in == NULL)
   1575		return -1;
   1576
   1577	tr = thread__get_runtime(sched_in);
   1578	if (tr == NULL) {
   1579		thread__put(sched_in);
   1580		return -1;
   1581	}
   1582
   1583	sched->curr_thread[this_cpu.cpu] = thread__get(sched_in);
   1584
   1585	printf("  ");
   1586
   1587	new_shortname = 0;
   1588	if (!tr->shortname[0]) {
   1589		if (!strcmp(thread__comm_str(sched_in), "swapper")) {
   1590			/*
   1591			 * Don't allocate a letter-number for swapper:0
   1592			 * as a shortname. Instead, we use '.' for it.
   1593			 */
   1594			tr->shortname[0] = '.';
   1595			tr->shortname[1] = ' ';
   1596		} else {
   1597			tr->shortname[0] = sched->next_shortname1;
   1598			tr->shortname[1] = sched->next_shortname2;
   1599
   1600			if (sched->next_shortname1 < 'Z') {
   1601				sched->next_shortname1++;
   1602			} else {
   1603				sched->next_shortname1 = 'A';
   1604				if (sched->next_shortname2 < '9')
   1605					sched->next_shortname2++;
   1606				else
   1607					sched->next_shortname2 = '0';
   1608			}
   1609		}
   1610		new_shortname = 1;
   1611	}
   1612
   1613	for (i = 0; i < cpus_nr; i++) {
   1614		struct perf_cpu cpu = {
   1615			.cpu = sched->map.comp ? sched->map.comp_cpus[i].cpu : i,
   1616		};
   1617		struct thread *curr_thread = sched->curr_thread[cpu.cpu];
   1618		struct thread_runtime *curr_tr;
   1619		const char *pid_color = color;
   1620		const char *cpu_color = color;
   1621
   1622		if (curr_thread && thread__has_color(curr_thread))
   1623			pid_color = COLOR_PIDS;
   1624
   1625		if (sched->map.cpus && !perf_cpu_map__has(sched->map.cpus, cpu))
   1626			continue;
   1627
   1628		if (sched->map.color_cpus && perf_cpu_map__has(sched->map.color_cpus, cpu))
   1629			cpu_color = COLOR_CPUS;
   1630
   1631		if (cpu.cpu != this_cpu.cpu)
   1632			color_fprintf(stdout, color, " ");
   1633		else
   1634			color_fprintf(stdout, cpu_color, "*");
   1635
   1636		if (sched->curr_thread[cpu.cpu]) {
   1637			curr_tr = thread__get_runtime(sched->curr_thread[cpu.cpu]);
   1638			if (curr_tr == NULL) {
   1639				thread__put(sched_in);
   1640				return -1;
   1641			}
   1642			color_fprintf(stdout, pid_color, "%2s ", curr_tr->shortname);
   1643		} else
   1644			color_fprintf(stdout, color, "   ");
   1645	}
   1646
   1647	if (sched->map.cpus && !perf_cpu_map__has(sched->map.cpus, this_cpu))
   1648		goto out;
   1649
   1650	timestamp__scnprintf_usec(timestamp, stimestamp, sizeof(stimestamp));
   1651	color_fprintf(stdout, color, "  %12s secs ", stimestamp);
   1652	if (new_shortname || tr->comm_changed || (verbose > 0 && sched_in->tid)) {
   1653		const char *pid_color = color;
   1654
   1655		if (thread__has_color(sched_in))
   1656			pid_color = COLOR_PIDS;
   1657
   1658		color_fprintf(stdout, pid_color, "%s => %s:%d",
   1659		       tr->shortname, thread__comm_str(sched_in), sched_in->tid);
   1660		tr->comm_changed = false;
   1661	}
   1662
   1663	if (sched->map.comp && new_cpu)
   1664		color_fprintf(stdout, color, " (CPU %d)", this_cpu.cpu);
   1665
   1666out:
   1667	color_fprintf(stdout, color, "\n");
   1668
   1669	thread__put(sched_in);
   1670
   1671	return 0;
   1672}
   1673
   1674static int process_sched_switch_event(struct perf_tool *tool,
   1675				      struct evsel *evsel,
   1676				      struct perf_sample *sample,
   1677				      struct machine *machine)
   1678{
   1679	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
   1680	int this_cpu = sample->cpu, err = 0;
   1681	u32 prev_pid = evsel__intval(evsel, sample, "prev_pid"),
   1682	    next_pid = evsel__intval(evsel, sample, "next_pid");
   1683
   1684	if (sched->curr_pid[this_cpu] != (u32)-1) {
   1685		/*
   1686		 * Are we trying to switch away a PID that is
   1687		 * not current?
   1688		 */
   1689		if (sched->curr_pid[this_cpu] != prev_pid)
   1690			sched->nr_context_switch_bugs++;
   1691	}
   1692
   1693	if (sched->tp_handler->switch_event)
   1694		err = sched->tp_handler->switch_event(sched, evsel, sample, machine);
   1695
   1696	sched->curr_pid[this_cpu] = next_pid;
   1697	return err;
   1698}
   1699
   1700static int process_sched_runtime_event(struct perf_tool *tool,
   1701				       struct evsel *evsel,
   1702				       struct perf_sample *sample,
   1703				       struct machine *machine)
   1704{
   1705	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
   1706
   1707	if (sched->tp_handler->runtime_event)
   1708		return sched->tp_handler->runtime_event(sched, evsel, sample, machine);
   1709
   1710	return 0;
   1711}
   1712
   1713static int perf_sched__process_fork_event(struct perf_tool *tool,
   1714					  union perf_event *event,
   1715					  struct perf_sample *sample,
   1716					  struct machine *machine)
   1717{
   1718	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
   1719
   1720	/* run the fork event through the perf machinery */
   1721	perf_event__process_fork(tool, event, sample, machine);
   1722
   1723	/* and then run additional processing needed for this command */
   1724	if (sched->tp_handler->fork_event)
   1725		return sched->tp_handler->fork_event(sched, event, machine);
   1726
   1727	return 0;
   1728}
   1729
   1730static int process_sched_migrate_task_event(struct perf_tool *tool,
   1731					    struct evsel *evsel,
   1732					    struct perf_sample *sample,
   1733					    struct machine *machine)
   1734{
   1735	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
   1736
   1737	if (sched->tp_handler->migrate_task_event)
   1738		return sched->tp_handler->migrate_task_event(sched, evsel, sample, machine);
   1739
   1740	return 0;
   1741}
   1742
   1743typedef int (*tracepoint_handler)(struct perf_tool *tool,
   1744				  struct evsel *evsel,
   1745				  struct perf_sample *sample,
   1746				  struct machine *machine);
   1747
   1748static int perf_sched__process_tracepoint_sample(struct perf_tool *tool __maybe_unused,
   1749						 union perf_event *event __maybe_unused,
   1750						 struct perf_sample *sample,
   1751						 struct evsel *evsel,
   1752						 struct machine *machine)
   1753{
   1754	int err = 0;
   1755
   1756	if (evsel->handler != NULL) {
   1757		tracepoint_handler f = evsel->handler;
   1758		err = f(tool, evsel, sample, machine);
   1759	}
   1760
   1761	return err;
   1762}
   1763
   1764static int perf_sched__process_comm(struct perf_tool *tool __maybe_unused,
   1765				    union perf_event *event,
   1766				    struct perf_sample *sample,
   1767				    struct machine *machine)
   1768{
   1769	struct thread *thread;
   1770	struct thread_runtime *tr;
   1771	int err;
   1772
   1773	err = perf_event__process_comm(tool, event, sample, machine);
   1774	if (err)
   1775		return err;
   1776
   1777	thread = machine__find_thread(machine, sample->pid, sample->tid);
   1778	if (!thread) {
   1779		pr_err("Internal error: can't find thread\n");
   1780		return -1;
   1781	}
   1782
   1783	tr = thread__get_runtime(thread);
   1784	if (tr == NULL) {
   1785		thread__put(thread);
   1786		return -1;
   1787	}
   1788
   1789	tr->comm_changed = true;
   1790	thread__put(thread);
   1791
   1792	return 0;
   1793}
   1794
   1795static int perf_sched__read_events(struct perf_sched *sched)
   1796{
   1797	const struct evsel_str_handler handlers[] = {
   1798		{ "sched:sched_switch",	      process_sched_switch_event, },
   1799		{ "sched:sched_stat_runtime", process_sched_runtime_event, },
   1800		{ "sched:sched_wakeup",	      process_sched_wakeup_event, },
   1801		{ "sched:sched_wakeup_new",   process_sched_wakeup_event, },
   1802		{ "sched:sched_migrate_task", process_sched_migrate_task_event, },
   1803	};
   1804	struct perf_session *session;
   1805	struct perf_data data = {
   1806		.path  = input_name,
   1807		.mode  = PERF_DATA_MODE_READ,
   1808		.force = sched->force,
   1809	};
   1810	int rc = -1;
   1811
   1812	session = perf_session__new(&data, &sched->tool);
   1813	if (IS_ERR(session)) {
   1814		pr_debug("Error creating perf session");
   1815		return PTR_ERR(session);
   1816	}
   1817
   1818	symbol__init(&session->header.env);
   1819
   1820	if (perf_session__set_tracepoints_handlers(session, handlers))
   1821		goto out_delete;
   1822
   1823	if (perf_session__has_traces(session, "record -R")) {
   1824		int err = perf_session__process_events(session);
   1825		if (err) {
   1826			pr_err("Failed to process events, error %d", err);
   1827			goto out_delete;
   1828		}
   1829
   1830		sched->nr_events      = session->evlist->stats.nr_events[0];
   1831		sched->nr_lost_events = session->evlist->stats.total_lost;
   1832		sched->nr_lost_chunks = session->evlist->stats.nr_events[PERF_RECORD_LOST];
   1833	}
   1834
   1835	rc = 0;
   1836out_delete:
   1837	perf_session__delete(session);
   1838	return rc;
   1839}
   1840
   1841/*
   1842 * scheduling times are printed as msec.usec
   1843 */
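        /* e.g. print_sched_time(1234567, 6) prints "     1.234 " */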
   1844static inline void print_sched_time(unsigned long long nsecs, int width)
   1845{
   1846	unsigned long msecs;
   1847	unsigned long usecs;
   1848
   1849	msecs  = nsecs / NSEC_PER_MSEC;
   1850	nsecs -= msecs * NSEC_PER_MSEC;
   1851	usecs  = nsecs / NSEC_PER_USEC;
   1852	printf("%*lu.%03lu ", width, msecs, usecs);
   1853}
   1854
   1855/*
   1856 * returns runtime data for event, allocating memory for it the
   1857 * first time it is used.
   1858 */
   1859static struct evsel_runtime *evsel__get_runtime(struct evsel *evsel)
   1860{
   1861	struct evsel_runtime *r = evsel->priv;
   1862
   1863	if (r == NULL) {
   1864		r = zalloc(sizeof(struct evsel_runtime));
   1865		evsel->priv = r;
   1866	}
   1867
   1868	return r;
   1869}
   1870
   1871/*
   1872 * save last time event was seen per cpu
   1873 */
   1874static void evsel__save_time(struct evsel *evsel, u64 timestamp, u32 cpu)
   1875{
   1876	struct evsel_runtime *r = evsel__get_runtime(evsel);
   1877
   1878	if (r == NULL)
   1879		return;
   1880
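        	/*
        	 * Grow the per-cpu last_time array to the next power of two
        	 * >= cpu+1 (e.g. cpu 5 -> 8 slots), zero-filling the new entries.
        	 */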
   1881	if ((cpu >= r->ncpu) || (r->last_time == NULL)) {
   1882		int i, n = __roundup_pow_of_two(cpu+1);
   1883		void *p = r->last_time;
   1884
   1885		p = realloc(r->last_time, n * sizeof(u64));
   1886		if (!p)
   1887			return;
   1888
   1889		r->last_time = p;
   1890		for (i = r->ncpu; i < n; ++i)
   1891			r->last_time[i] = (u64) 0;
   1892
   1893		r->ncpu = n;
   1894	}
   1895
   1896	r->last_time[cpu] = timestamp;
   1897}
   1898
   1899/* returns last time this event was seen on the given cpu */
   1900static u64 evsel__get_time(struct evsel *evsel, u32 cpu)
   1901{
   1902	struct evsel_runtime *r = evsel__get_runtime(evsel);
   1903
   1904	if ((r == NULL) || (r->last_time == NULL) || (cpu >= r->ncpu))
   1905		return 0;
   1906
   1907	return r->last_time[cpu];
   1908}
   1909
   1910static int comm_width = 30;
   1911
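        /*
         * Format a task name for display: bare "comm" for the idle task
         * (pid 0), "comm[tid]" for a single-threaded task and
         * "comm[tid/pid]" for a thread of a multi-threaded process,
         * e.g. "bash[1235/1234]"; comm_width grows to fit the longest
         * string seen so far.
         */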
   1912static char *timehist_get_commstr(struct thread *thread)
   1913{
   1914	static char str[32];
   1915	const char *comm = thread__comm_str(thread);
   1916	pid_t tid = thread->tid;
   1917	pid_t pid = thread->pid_;
   1918	int n;
   1919
   1920	if (pid == 0)
   1921		n = scnprintf(str, sizeof(str), "%s", comm);
   1922
   1923	else if (tid != pid)
   1924		n = scnprintf(str, sizeof(str), "%s[%d/%d]", comm, tid, pid);
   1925
   1926	else
   1927		n = scnprintf(str, sizeof(str), "%s[%d]", comm, tid);
   1928
   1929	if (n > comm_width)
   1930		comm_width = n;
   1931
   1932	return str;
   1933}
   1934
   1935static void timehist_header(struct perf_sched *sched)
   1936{
   1937	u32 ncpus = sched->max_cpu.cpu + 1;
   1938	u32 i, j;
   1939
   1940	printf("%15s %6s ", "time", "cpu");
   1941
   1942	if (sched->show_cpu_visual) {
   1943		printf(" ");
   1944		for (i = 0, j = 0; i < ncpus; ++i) {
   1945			printf("%x", j++);
   1946			if (j > 15)
   1947				j = 0;
   1948		}
   1949		printf(" ");
   1950	}
   1951
   1952	printf(" %-*s  %9s  %9s  %9s", comm_width,
   1953		"task name", "wait time", "sch delay", "run time");
   1954
   1955	if (sched->show_state)
   1956		printf("  %s", "state");
   1957
   1958	printf("\n");
   1959
   1960	/*
   1961	 * units row
   1962	 */
   1963	printf("%15s %-6s ", "", "");
   1964
   1965	if (sched->show_cpu_visual)
   1966		printf(" %*s ", ncpus, "");
   1967
   1968	printf(" %-*s  %9s  %9s  %9s", comm_width,
   1969	       "[tid/pid]", "(msec)", "(msec)", "(msec)");
   1970
   1971	if (sched->show_state)
   1972		printf("  %5s", "");
   1973
   1974	printf("\n");
   1975
   1976	/*
   1977	 * separator
   1978	 */
   1979	printf("%.15s %.6s ", graph_dotted_line, graph_dotted_line);
   1980
   1981	if (sched->show_cpu_visual)
   1982		printf(" %.*s ", ncpus, graph_dotted_line);
   1983
   1984	printf(" %.*s  %.9s  %.9s  %.9s", comm_width,
   1985		graph_dotted_line, graph_dotted_line, graph_dotted_line,
   1986		graph_dotted_line);
   1987
   1988	if (sched->show_state)
   1989		printf("  %.5s", graph_dotted_line);
   1990
   1991	printf("\n");
   1992}
   1993
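        /*
         * Map the sched_switch prev_state bitmask to a single letter via
         * TASK_STATE_TO_CHAR_STR, e.g. 0 -> 'R', TASK_INTERRUPTIBLE (1) -> 'S',
         * TASK_UNINTERRUPTIBLE (2) -> 'D'; the idle task (tid 0) is always
         * shown as 'I'.
         */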
   1994static char task_state_char(struct thread *thread, int state)
   1995{
   1996	static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
   1997	unsigned bit = state ? ffs(state) : 0;
   1998
   1999	/* 'I' for idle */
   2000	if (thread->tid == 0)
   2001		return 'I';
   2002
   2003	return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
   2004}
   2005
   2006static void timehist_print_sample(struct perf_sched *sched,
   2007				  struct evsel *evsel,
   2008				  struct perf_sample *sample,
   2009				  struct addr_location *al,
   2010				  struct thread *thread,
   2011				  u64 t, int state)
   2012{
   2013	struct thread_runtime *tr = thread__priv(thread);
   2014	const char *next_comm = evsel__strval(evsel, sample, "next_comm");
   2015	const u32 next_pid = evsel__intval(evsel, sample, "next_pid");
   2016	u32 max_cpus = sched->max_cpu.cpu + 1;
   2017	char tstr[64];
   2018	char nstr[30];
   2019	u64 wait_time;
   2020
   2021	if (cpu_list && !test_bit(sample->cpu, cpu_bitmap))
   2022		return;
   2023
   2024	timestamp__scnprintf_usec(t, tstr, sizeof(tstr));
   2025	printf("%15s [%04d] ", tstr, sample->cpu);
   2026
   2027	if (sched->show_cpu_visual) {
   2028		u32 i;
   2029		char c;
   2030
   2031		printf(" ");
   2032		for (i = 0; i < max_cpus; ++i) {
   2033			/* flag idle times with 'i'; others are sched events */
   2034			if (i == sample->cpu)
   2035				c = (thread->tid == 0) ? 'i' : 's';
   2036			else
   2037				c = ' ';
   2038			printf("%c", c);
   2039		}
   2040		printf(" ");
   2041	}
   2042
   2043	printf(" %-*s ", comm_width, timehist_get_commstr(thread));
   2044
   2045	wait_time = tr->dt_sleep + tr->dt_iowait + tr->dt_preempt;
   2046	print_sched_time(wait_time, 6);
   2047
   2048	print_sched_time(tr->dt_delay, 6);
   2049	print_sched_time(tr->dt_run, 6);
   2050
   2051	if (sched->show_state)
   2052		printf(" %5c ", task_state_char(thread, state));
   2053
   2054	if (sched->show_next) {
   2055		snprintf(nstr, sizeof(nstr), "next: %s[%d]", next_comm, next_pid);
   2056		printf(" %-*s", comm_width, nstr);
   2057	}
   2058
   2059	if (sched->show_wakeups && !sched->show_next)
   2060		printf("  %-*s", comm_width, "");
   2061
   2062	if (thread->tid == 0)
   2063		goto out;
   2064
   2065	if (sched->show_callchain)
   2066		printf("  ");
   2067
   2068	sample__fprintf_sym(sample, al, 0,
   2069			    EVSEL__PRINT_SYM | EVSEL__PRINT_ONELINE |
   2070			    EVSEL__PRINT_CALLCHAIN_ARROW |
   2071			    EVSEL__PRINT_SKIP_IGNORED,
   2072			    &callchain_cursor, symbol_conf.bt_stop_list,  stdout);
   2073
   2074out:
   2075	printf("\n");
   2076}
   2077
   2078/*
   2079 * Explanation of delta-time stats:
   2080 *
   2081 *            t = time of current schedule out event
   2082 *        tprev = time of previous sched out event
   2083 *                also time of schedule-in event for current task
   2084 *    last_time = time of last sched change event for current task
    2085 *                (i.e., time the process was last scheduled out)
   2086 * ready_to_run = time of wakeup for current task
   2087 *
   2088 * -----|------------|------------|------------|------
   2089 *    last         ready        tprev          t
   2090 *    time         to run
   2091 *
   2092 *      |-------- dt_wait --------|
   2093 *                   |- dt_delay -|-- dt_run --|
   2094 *
   2095 *   dt_run = run time of current task
   2096 *  dt_wait = time between last schedule out event for task and tprev
   2097 *            represents time spent off the cpu
   2098 * dt_delay = time between wakeup and schedule-in of task
   2099 */
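        /*
         * Worked example (all times in ns): with last_time = 100,
         * ready_to_run = 400, tprev = 700 and t = 1000, the task gets
         * dt_run = 300, dt_delay = 300, and a 600 ns wait that is booked
         * as preempt, iowait or sleep time depending on last_state.
         */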
   2100
   2101static void timehist_update_runtime_stats(struct thread_runtime *r,
   2102					 u64 t, u64 tprev)
   2103{
   2104	r->dt_delay   = 0;
   2105	r->dt_sleep   = 0;
   2106	r->dt_iowait  = 0;
   2107	r->dt_preempt = 0;
   2108	r->dt_run     = 0;
   2109
   2110	if (tprev) {
   2111		r->dt_run = t - tprev;
   2112		if (r->ready_to_run) {
   2113			if (r->ready_to_run > tprev)
   2114				pr_debug("time travel: wakeup time for task > previous sched_switch event\n");
   2115			else
   2116				r->dt_delay = tprev - r->ready_to_run;
   2117		}
   2118
   2119		if (r->last_time > tprev)
   2120			pr_debug("time travel: last sched out time for task > previous sched_switch event\n");
   2121		else if (r->last_time) {
   2122			u64 dt_wait = tprev - r->last_time;
   2123
   2124			if (r->last_state == TASK_RUNNING)
   2125				r->dt_preempt = dt_wait;
   2126			else if (r->last_state == TASK_UNINTERRUPTIBLE)
   2127				r->dt_iowait = dt_wait;
   2128			else
   2129				r->dt_sleep = dt_wait;
   2130		}
   2131	}
   2132
   2133	update_stats(&r->run_stats, r->dt_run);
   2134
   2135	r->total_run_time     += r->dt_run;
   2136	r->total_delay_time   += r->dt_delay;
   2137	r->total_sleep_time   += r->dt_sleep;
   2138	r->total_iowait_time  += r->dt_iowait;
   2139	r->total_preempt_time += r->dt_preempt;
   2140}
   2141
   2142static bool is_idle_sample(struct perf_sample *sample,
   2143			   struct evsel *evsel)
   2144{
   2145	/* pid 0 == swapper == idle task */
   2146	if (strcmp(evsel__name(evsel), "sched:sched_switch") == 0)
   2147		return evsel__intval(evsel, sample, "prev_pid") == 0;
   2148
   2149	return sample->pid == 0;
   2150}
   2151
   2152static void save_task_callchain(struct perf_sched *sched,
   2153				struct perf_sample *sample,
   2154				struct evsel *evsel,
   2155				struct machine *machine)
   2156{
   2157	struct callchain_cursor *cursor = &callchain_cursor;
   2158	struct thread *thread;
   2159
   2160	/* want main thread for process - has maps */
   2161	thread = machine__findnew_thread(machine, sample->pid, sample->pid);
   2162	if (thread == NULL) {
   2163		pr_debug("Failed to get thread for pid %d.\n", sample->pid);
   2164		return;
   2165	}
   2166
   2167	if (!sched->show_callchain || sample->callchain == NULL)
   2168		return;
   2169
   2170	if (thread__resolve_callchain(thread, cursor, evsel, sample,
   2171				      NULL, NULL, sched->max_stack + 2) != 0) {
   2172		if (verbose > 0)
   2173			pr_err("Failed to resolve callchain. Skipping\n");
   2174
   2175		return;
   2176	}
   2177
   2178	callchain_cursor_commit(cursor);
   2179
   2180	while (true) {
   2181		struct callchain_cursor_node *node;
   2182		struct symbol *sym;
   2183
   2184		node = callchain_cursor_current(cursor);
   2185		if (node == NULL)
   2186			break;
   2187
   2188		sym = node->ms.sym;
   2189		if (sym) {
   2190			if (!strcmp(sym->name, "schedule") ||
   2191			    !strcmp(sym->name, "__schedule") ||
   2192			    !strcmp(sym->name, "preempt_schedule"))
   2193				sym->ignore = 1;
   2194		}
   2195
   2196		callchain_cursor_advance(cursor);
   2197	}
   2198}
   2199
   2200static int init_idle_thread(struct thread *thread)
   2201{
   2202	struct idle_thread_runtime *itr;
   2203
   2204	thread__set_comm(thread, idle_comm, 0);
   2205
   2206	itr = zalloc(sizeof(*itr));
   2207	if (itr == NULL)
   2208		return -ENOMEM;
   2209
   2210	init_stats(&itr->tr.run_stats);
   2211	callchain_init(&itr->callchain);
   2212	callchain_cursor_reset(&itr->cursor);
   2213	thread__set_priv(thread, itr);
   2214
   2215	return 0;
   2216}
   2217
   2218/*
   2219 * Track idle stats per cpu by maintaining a local thread
   2220 * struct for the idle task on each cpu.
   2221 */
   2222static int init_idle_threads(int ncpu)
   2223{
   2224	int i, ret;
   2225
   2226	idle_threads = zalloc(ncpu * sizeof(struct thread *));
   2227	if (!idle_threads)
   2228		return -ENOMEM;
   2229
   2230	idle_max_cpu = ncpu;
   2231
    2232	/* allocate and initialize a thread struct for each cpu */
   2233	for (i = 0; i < ncpu; ++i) {
   2234		idle_threads[i] = thread__new(0, 0);
   2235		if (idle_threads[i] == NULL)
   2236			return -ENOMEM;
   2237
   2238		ret = init_idle_thread(idle_threads[i]);
   2239		if (ret < 0)
   2240			return ret;
   2241	}
   2242
   2243	return 0;
   2244}
   2245
   2246static void free_idle_threads(void)
   2247{
   2248	int i;
   2249
   2250	if (idle_threads == NULL)
   2251		return;
   2252
   2253	for (i = 0; i < idle_max_cpu; ++i) {
   2254		if ((idle_threads[i]))
   2255			thread__delete(idle_threads[i]);
   2256	}
   2257
   2258	free(idle_threads);
   2259}
   2260
   2261static struct thread *get_idle_thread(int cpu)
   2262{
   2263	/*
   2264	 * expand/allocate array of pointers to local thread
   2265	 * structs if needed
   2266	 */
   2267	if ((cpu >= idle_max_cpu) || (idle_threads == NULL)) {
   2268		int i, j = __roundup_pow_of_two(cpu+1);
   2269		void *p;
   2270
   2271		p = realloc(idle_threads, j * sizeof(struct thread *));
   2272		if (!p)
   2273			return NULL;
   2274
   2275		idle_threads = (struct thread **) p;
   2276		for (i = idle_max_cpu; i < j; ++i)
   2277			idle_threads[i] = NULL;
   2278
   2279		idle_max_cpu = j;
   2280	}
   2281
   2282	/* allocate a new thread struct if needed */
   2283	if (idle_threads[cpu] == NULL) {
   2284		idle_threads[cpu] = thread__new(0, 0);
   2285
   2286		if (idle_threads[cpu]) {
   2287			if (init_idle_thread(idle_threads[cpu]) < 0)
   2288				return NULL;
   2289		}
   2290	}
   2291
   2292	return idle_threads[cpu];
   2293}
   2294
   2295static void save_idle_callchain(struct perf_sched *sched,
   2296				struct idle_thread_runtime *itr,
   2297				struct perf_sample *sample)
   2298{
   2299	if (!sched->show_callchain || sample->callchain == NULL)
   2300		return;
   2301
   2302	callchain_cursor__copy(&itr->cursor, &callchain_cursor);
   2303}
   2304
   2305static struct thread *timehist_get_thread(struct perf_sched *sched,
   2306					  struct perf_sample *sample,
   2307					  struct machine *machine,
   2308					  struct evsel *evsel)
   2309{
   2310	struct thread *thread;
   2311
   2312	if (is_idle_sample(sample, evsel)) {
   2313		thread = get_idle_thread(sample->cpu);
   2314		if (thread == NULL)
   2315			pr_err("Failed to get idle thread for cpu %d.\n", sample->cpu);
   2316
   2317	} else {
   2318		/* there were samples with tid 0 but non-zero pid */
   2319		thread = machine__findnew_thread(machine, sample->pid,
   2320						 sample->tid ?: sample->pid);
   2321		if (thread == NULL) {
   2322			pr_debug("Failed to get thread for tid %d. skipping sample.\n",
   2323				 sample->tid);
   2324		}
   2325
   2326		save_task_callchain(sched, sample, evsel, machine);
   2327		if (sched->idle_hist) {
   2328			struct thread *idle;
   2329			struct idle_thread_runtime *itr;
   2330
   2331			idle = get_idle_thread(sample->cpu);
   2332			if (idle == NULL) {
   2333				pr_err("Failed to get idle thread for cpu %d.\n", sample->cpu);
   2334				return NULL;
   2335			}
   2336
   2337			itr = thread__priv(idle);
   2338			if (itr == NULL)
   2339				return NULL;
   2340
   2341			itr->last_thread = thread;
   2342
    2343			/* copy task callchain when entering idle */
   2344			if (evsel__intval(evsel, sample, "next_pid") == 0)
   2345				save_idle_callchain(sched, itr, sample);
   2346		}
   2347	}
   2348
   2349	return thread;
   2350}
   2351
   2352static bool timehist_skip_sample(struct perf_sched *sched,
   2353				 struct thread *thread,
   2354				 struct evsel *evsel,
   2355				 struct perf_sample *sample)
   2356{
   2357	bool rc = false;
   2358
   2359	if (thread__is_filtered(thread)) {
   2360		rc = true;
   2361		sched->skipped_samples++;
   2362	}
   2363
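        	/*
        	 * For the idle histogram only sched_switch events that involve
        	 * the idle task (prev_pid or next_pid of 0) are of interest;
        	 * everything else is skipped.
        	 */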
   2364	if (sched->idle_hist) {
   2365		if (strcmp(evsel__name(evsel), "sched:sched_switch"))
   2366			rc = true;
   2367		else if (evsel__intval(evsel, sample, "prev_pid") != 0 &&
   2368			 evsel__intval(evsel, sample, "next_pid") != 0)
   2369			rc = true;
   2370	}
   2371
   2372	return rc;
   2373}
   2374
   2375static void timehist_print_wakeup_event(struct perf_sched *sched,
   2376					struct evsel *evsel,
   2377					struct perf_sample *sample,
   2378					struct machine *machine,
   2379					struct thread *awakened)
   2380{
   2381	struct thread *thread;
   2382	char tstr[64];
   2383
   2384	thread = machine__findnew_thread(machine, sample->pid, sample->tid);
   2385	if (thread == NULL)
   2386		return;
   2387
    2388	/* show the wakeup unless both the wakee and the waker are filtered */
   2389	if (timehist_skip_sample(sched, thread, evsel, sample) &&
   2390	    timehist_skip_sample(sched, awakened, evsel, sample)) {
   2391		return;
   2392	}
   2393
   2394	timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr));
   2395	printf("%15s [%04d] ", tstr, sample->cpu);
   2396	if (sched->show_cpu_visual)
   2397		printf(" %*s ", sched->max_cpu.cpu + 1, "");
   2398
   2399	printf(" %-*s ", comm_width, timehist_get_commstr(thread));
   2400
   2401	/* dt spacer */
   2402	printf("  %9s  %9s  %9s ", "", "", "");
   2403
   2404	printf("awakened: %s", timehist_get_commstr(awakened));
   2405
   2406	printf("\n");
   2407}
   2408
   2409static int timehist_sched_wakeup_ignore(struct perf_tool *tool __maybe_unused,
   2410					union perf_event *event __maybe_unused,
   2411					struct evsel *evsel __maybe_unused,
   2412					struct perf_sample *sample __maybe_unused,
   2413					struct machine *machine __maybe_unused)
   2414{
   2415	return 0;
   2416}
   2417
   2418static int timehist_sched_wakeup_event(struct perf_tool *tool,
   2419				       union perf_event *event __maybe_unused,
   2420				       struct evsel *evsel,
   2421				       struct perf_sample *sample,
   2422				       struct machine *machine)
   2423{
   2424	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
   2425	struct thread *thread;
   2426	struct thread_runtime *tr = NULL;
   2427	/* want pid of awakened task not pid in sample */
   2428	const u32 pid = evsel__intval(evsel, sample, "pid");
   2429
   2430	thread = machine__findnew_thread(machine, 0, pid);
   2431	if (thread == NULL)
   2432		return -1;
   2433
   2434	tr = thread__get_runtime(thread);
   2435	if (tr == NULL)
   2436		return -1;
   2437
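        	/*
        	 * Record only the first wakeup time; it becomes ready_to_run,
        	 * which later feeds dt_delay in timehist_update_runtime_stats().
        	 */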
   2438	if (tr->ready_to_run == 0)
   2439		tr->ready_to_run = sample->time;
   2440
   2441	/* show wakeups if requested */
   2442	if (sched->show_wakeups &&
   2443	    !perf_time__skip_sample(&sched->ptime, sample->time))
   2444		timehist_print_wakeup_event(sched, evsel, sample, machine, thread);
   2445
   2446	return 0;
   2447}
   2448
   2449static void timehist_print_migration_event(struct perf_sched *sched,
   2450					struct evsel *evsel,
   2451					struct perf_sample *sample,
   2452					struct machine *machine,
   2453					struct thread *migrated)
   2454{
   2455	struct thread *thread;
   2456	char tstr[64];
   2457	u32 max_cpus;
   2458	u32 ocpu, dcpu;
   2459
   2460	if (sched->summary_only)
   2461		return;
   2462
   2463	max_cpus = sched->max_cpu.cpu + 1;
   2464	ocpu = evsel__intval(evsel, sample, "orig_cpu");
   2465	dcpu = evsel__intval(evsel, sample, "dest_cpu");
   2466
   2467	thread = machine__findnew_thread(machine, sample->pid, sample->tid);
   2468	if (thread == NULL)
   2469		return;
   2470
   2471	if (timehist_skip_sample(sched, thread, evsel, sample) &&
   2472	    timehist_skip_sample(sched, migrated, evsel, sample)) {
   2473		return;
   2474	}
   2475
   2476	timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr));
   2477	printf("%15s [%04d] ", tstr, sample->cpu);
   2478
   2479	if (sched->show_cpu_visual) {
   2480		u32 i;
   2481		char c;
   2482
   2483		printf("  ");
   2484		for (i = 0; i < max_cpus; ++i) {
   2485			c = (i == sample->cpu) ? 'm' : ' ';
   2486			printf("%c", c);
   2487		}
   2488		printf("  ");
   2489	}
   2490
   2491	printf(" %-*s ", comm_width, timehist_get_commstr(thread));
   2492
   2493	/* dt spacer */
   2494	printf("  %9s  %9s  %9s ", "", "", "");
   2495
   2496	printf("migrated: %s", timehist_get_commstr(migrated));
   2497	printf(" cpu %d => %d", ocpu, dcpu);
   2498
   2499	printf("\n");
   2500}
   2501
   2502static int timehist_migrate_task_event(struct perf_tool *tool,
   2503				       union perf_event *event __maybe_unused,
   2504				       struct evsel *evsel,
   2505				       struct perf_sample *sample,
   2506				       struct machine *machine)
   2507{
   2508	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
   2509	struct thread *thread;
   2510	struct thread_runtime *tr = NULL;
   2511	/* want pid of migrated task not pid in sample */
   2512	const u32 pid = evsel__intval(evsel, sample, "pid");
   2513
   2514	thread = machine__findnew_thread(machine, 0, pid);
   2515	if (thread == NULL)
   2516		return -1;
   2517
   2518	tr = thread__get_runtime(thread);
   2519	if (tr == NULL)
   2520		return -1;
   2521
   2522	tr->migrations++;
   2523
   2524	/* show migrations if requested */
   2525	timehist_print_migration_event(sched, evsel, sample, machine, thread);
   2526
   2527	return 0;
   2528}
   2529
   2530static int timehist_sched_change_event(struct perf_tool *tool,
   2531				       union perf_event *event,
   2532				       struct evsel *evsel,
   2533				       struct perf_sample *sample,
   2534				       struct machine *machine)
   2535{
   2536	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
   2537	struct perf_time_interval *ptime = &sched->ptime;
   2538	struct addr_location al;
   2539	struct thread *thread;
   2540	struct thread_runtime *tr = NULL;
   2541	u64 tprev, t = sample->time;
   2542	int rc = 0;
   2543	int state = evsel__intval(evsel, sample, "prev_state");
   2544
   2545	if (machine__resolve(machine, &al, sample) < 0) {
   2546		pr_err("problem processing %d event. skipping it\n",
   2547		       event->header.type);
   2548		rc = -1;
   2549		goto out;
   2550	}
   2551
   2552	thread = timehist_get_thread(sched, sample, machine, evsel);
   2553	if (thread == NULL) {
   2554		rc = -1;
   2555		goto out;
   2556	}
   2557
   2558	if (timehist_skip_sample(sched, thread, evsel, sample))
   2559		goto out;
   2560
   2561	tr = thread__get_runtime(thread);
   2562	if (tr == NULL) {
   2563		rc = -1;
   2564		goto out;
   2565	}
   2566
   2567	tprev = evsel__get_time(evsel, sample->cpu);
   2568
   2569	/*
    2570	 * If a start time was given:
    2571	 * - sample time is before the window of interest - skip the sample
    2572	 * - tprev is before the window of interest - reset it to the start of the window
   2573	 */
   2574	if (ptime->start && ptime->start > t)
   2575		goto out;
   2576
   2577	if (tprev && ptime->start > tprev)
   2578		tprev = ptime->start;
   2579
   2580	/*
    2581	 * If an end time was given:
    2582	 * - the previous sched event is past the window - we are done
    2583	 * - sample time is beyond the window of interest - clamp it to the
    2584	 *   end of the window to close out the stats for the window
   2585	 */
   2586	if (ptime->end) {
   2587		if (tprev > ptime->end)
   2588			goto out;
   2589
   2590		if (t > ptime->end)
   2591			t = ptime->end;
   2592	}
   2593
   2594	if (!sched->idle_hist || thread->tid == 0) {
   2595		if (!cpu_list || test_bit(sample->cpu, cpu_bitmap))
   2596			timehist_update_runtime_stats(tr, t, tprev);
   2597
   2598		if (sched->idle_hist) {
   2599			struct idle_thread_runtime *itr = (void *)tr;
   2600			struct thread_runtime *last_tr;
   2601
   2602			BUG_ON(thread->tid != 0);
   2603
   2604			if (itr->last_thread == NULL)
   2605				goto out;
   2606
   2607			/* add current idle time as last thread's runtime */
   2608			last_tr = thread__get_runtime(itr->last_thread);
   2609			if (last_tr == NULL)
   2610				goto out;
   2611
   2612			timehist_update_runtime_stats(last_tr, t, tprev);
   2613			/*
    2614			 * Reset the delta times of the last thread as they are
    2615			 * not updated here and would otherwise show stale values
    2616			 * next time.  Only the total run time and run stats matter.
   2617			 */
   2618			last_tr->dt_run = 0;
   2619			last_tr->dt_delay = 0;
   2620			last_tr->dt_sleep = 0;
   2621			last_tr->dt_iowait = 0;
   2622			last_tr->dt_preempt = 0;
   2623
   2624			if (itr->cursor.nr)
   2625				callchain_append(&itr->callchain, &itr->cursor, t - tprev);
   2626
   2627			itr->last_thread = NULL;
   2628		}
   2629	}
   2630
   2631	if (!sched->summary_only)
   2632		timehist_print_sample(sched, evsel, sample, &al, thread, t, state);
   2633
   2634out:
   2635	if (sched->hist_time.start == 0 && t >= ptime->start)
   2636		sched->hist_time.start = t;
   2637	if (ptime->end == 0 || t <= ptime->end)
   2638		sched->hist_time.end = t;
   2639
   2640	if (tr) {
   2641		/* time of this sched_switch event becomes last time task seen */
   2642		tr->last_time = sample->time;
   2643
   2644		/* last state is used to determine where to account wait time */
   2645		tr->last_state = state;
   2646
   2647		/* sched out event for task so reset ready to run time */
   2648		tr->ready_to_run = 0;
   2649	}
   2650
   2651	evsel__save_time(evsel, sample->time, sample->cpu);
   2652
   2653	return rc;
   2654}
   2655
   2656static int timehist_sched_switch_event(struct perf_tool *tool,
   2657			     union perf_event *event,
   2658			     struct evsel *evsel,
   2659			     struct perf_sample *sample,
   2660			     struct machine *machine __maybe_unused)
   2661{
   2662	return timehist_sched_change_event(tool, event, evsel, sample, machine);
   2663}
   2664
   2665static int process_lost(struct perf_tool *tool __maybe_unused,
   2666			union perf_event *event,
   2667			struct perf_sample *sample,
   2668			struct machine *machine __maybe_unused)
   2669{
   2670	char tstr[64];
   2671
   2672	timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr));
   2673	printf("%15s ", tstr);
   2674	printf("lost %" PRI_lu64 " events on cpu %d\n", event->lost.lost, sample->cpu);
   2675
   2676	return 0;
   2677}
   2678
   2679
   2680static void print_thread_runtime(struct thread *t,
   2681				 struct thread_runtime *r)
   2682{
   2683	double mean = avg_stats(&r->run_stats);
   2684	float stddev;
   2685
   2686	printf("%*s   %5d  %9" PRIu64 " ",
   2687	       comm_width, timehist_get_commstr(t), t->ppid,
   2688	       (u64) r->run_stats.n);
   2689
   2690	print_sched_time(r->total_run_time, 8);
   2691	stddev = rel_stddev_stats(stddev_stats(&r->run_stats), mean);
   2692	print_sched_time(r->run_stats.min, 6);
   2693	printf(" ");
   2694	print_sched_time((u64) mean, 6);
   2695	printf(" ");
   2696	print_sched_time(r->run_stats.max, 6);
   2697	printf("  ");
   2698	printf("%5.2f", stddev);
   2699	printf("   %5" PRIu64, r->migrations);
   2700	printf("\n");
   2701}
   2702
   2703static void print_thread_waittime(struct thread *t,
   2704				  struct thread_runtime *r)
   2705{
   2706	printf("%*s   %5d  %9" PRIu64 " ",
   2707	       comm_width, timehist_get_commstr(t), t->ppid,
   2708	       (u64) r->run_stats.n);
   2709
   2710	print_sched_time(r->total_run_time, 8);
   2711	print_sched_time(r->total_sleep_time, 6);
   2712	printf(" ");
   2713	print_sched_time(r->total_iowait_time, 6);
   2714	printf(" ");
   2715	print_sched_time(r->total_preempt_time, 6);
   2716	printf(" ");
   2717	print_sched_time(r->total_delay_time, 6);
   2718	printf("\n");
   2719}
   2720
   2721struct total_run_stats {
   2722	struct perf_sched *sched;
   2723	u64  sched_count;
   2724	u64  task_count;
   2725	u64  total_run_time;
   2726};
   2727
   2728static int __show_thread_runtime(struct thread *t, void *priv)
   2729{
   2730	struct total_run_stats *stats = priv;
   2731	struct thread_runtime *r;
   2732
   2733	if (thread__is_filtered(t))
   2734		return 0;
   2735
   2736	r = thread__priv(t);
   2737	if (r && r->run_stats.n) {
   2738		stats->task_count++;
   2739		stats->sched_count += r->run_stats.n;
   2740		stats->total_run_time += r->total_run_time;
   2741
   2742		if (stats->sched->show_state)
   2743			print_thread_waittime(t, r);
   2744		else
   2745			print_thread_runtime(t, r);
   2746	}
   2747
   2748	return 0;
   2749}
   2750
   2751static int show_thread_runtime(struct thread *t, void *priv)
   2752{
   2753	if (t->dead)
   2754		return 0;
   2755
   2756	return __show_thread_runtime(t, priv);
   2757}
   2758
   2759static int show_deadthread_runtime(struct thread *t, void *priv)
   2760{
   2761	if (!t->dead)
   2762		return 0;
   2763
   2764	return __show_thread_runtime(t, priv);
   2765}
   2766
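        /*
         * Print a callchain node and its ancestors as one " <- " separated
         * line, skipping PERF_CONTEXT markers and ignored symbols (e.g. the
         * schedule functions flagged in save_task_callchain()).
         */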
   2767static size_t callchain__fprintf_folded(FILE *fp, struct callchain_node *node)
   2768{
   2769	const char *sep = " <- ";
   2770	struct callchain_list *chain;
   2771	size_t ret = 0;
   2772	char bf[1024];
   2773	bool first;
   2774
   2775	if (node == NULL)
   2776		return 0;
   2777
   2778	ret = callchain__fprintf_folded(fp, node->parent);
   2779	first = (ret == 0);
   2780
   2781	list_for_each_entry(chain, &node->val, list) {
   2782		if (chain->ip >= PERF_CONTEXT_MAX)
   2783			continue;
   2784		if (chain->ms.sym && chain->ms.sym->ignore)
   2785			continue;
   2786		ret += fprintf(fp, "%s%s", first ? "" : sep,
   2787			       callchain_list__sym_name(chain, bf, sizeof(bf),
   2788							false));
   2789		first = false;
   2790	}
   2791
   2792	return ret;
   2793}
   2794
   2795static size_t timehist_print_idlehist_callchain(struct rb_root_cached *root)
   2796{
   2797	size_t ret = 0;
   2798	FILE *fp = stdout;
   2799	struct callchain_node *chain;
   2800	struct rb_node *rb_node = rb_first_cached(root);
   2801
   2802	printf("  %16s  %8s  %s\n", "Idle time (msec)", "Count", "Callchains");
   2803	printf("  %.16s  %.8s  %.50s\n", graph_dotted_line, graph_dotted_line,
   2804	       graph_dotted_line);
   2805
   2806	while (rb_node) {
   2807		chain = rb_entry(rb_node, struct callchain_node, rb_node);
   2808		rb_node = rb_next(rb_node);
   2809
   2810		ret += fprintf(fp, "  ");
   2811		print_sched_time(chain->hit, 12);
    2812		ret += 16;  /* print_sched_time() prints its width argument + 4 chars */
   2813		ret += fprintf(fp, " %8d  ", chain->count);
   2814		ret += callchain__fprintf_folded(fp, chain);
   2815		ret += fprintf(fp, "\n");
   2816	}
   2817
   2818	return ret;
   2819}
   2820
   2821static void timehist_print_summary(struct perf_sched *sched,
   2822				   struct perf_session *session)
   2823{
   2824	struct machine *m = &session->machines.host;
   2825	struct total_run_stats totals;
   2826	u64 task_count;
   2827	struct thread *t;
   2828	struct thread_runtime *r;
   2829	int i;
   2830	u64 hist_time = sched->hist_time.end - sched->hist_time.start;
   2831
   2832	memset(&totals, 0, sizeof(totals));
   2833	totals.sched = sched;
   2834
   2835	if (sched->idle_hist) {
   2836		printf("\nIdle-time summary\n");
   2837		printf("%*s  parent  sched-out  ", comm_width, "comm");
   2838		printf("  idle-time   min-idle    avg-idle    max-idle  stddev  migrations\n");
   2839	} else if (sched->show_state) {
   2840		printf("\nWait-time summary\n");
   2841		printf("%*s  parent   sched-in  ", comm_width, "comm");
   2842		printf("   run-time      sleep      iowait     preempt       delay\n");
   2843	} else {
   2844		printf("\nRuntime summary\n");
   2845		printf("%*s  parent   sched-in  ", comm_width, "comm");
   2846		printf("   run-time    min-run     avg-run     max-run  stddev  migrations\n");
   2847	}
   2848	printf("%*s            (count)  ", comm_width, "");
   2849	printf("     (msec)     (msec)      (msec)      (msec)       %s\n",
   2850	       sched->show_state ? "(msec)" : "%");
   2851	printf("%.117s\n", graph_dotted_line);
   2852
   2853	machine__for_each_thread(m, show_thread_runtime, &totals);
   2854	task_count = totals.task_count;
   2855	if (!task_count)
   2856		printf("<no still running tasks>\n");
   2857
   2858	printf("\nTerminated tasks:\n");
   2859	machine__for_each_thread(m, show_deadthread_runtime, &totals);
   2860	if (task_count == totals.task_count)
   2861		printf("<no terminated tasks>\n");
   2862
   2863	/* CPU idle stats not tracked when samples were skipped */
   2864	if (sched->skipped_samples && !sched->idle_hist)
   2865		return;
   2866
   2867	printf("\nIdle stats:\n");
   2868	for (i = 0; i < idle_max_cpu; ++i) {
   2869		if (cpu_list && !test_bit(i, cpu_bitmap))
   2870			continue;
   2871
   2872		t = idle_threads[i];
   2873		if (!t)
   2874			continue;
   2875
   2876		r = thread__priv(t);
   2877		if (r && r->run_stats.n) {
   2878			totals.sched_count += r->run_stats.n;
   2879			printf("    CPU %2d idle for ", i);
   2880			print_sched_time(r->total_run_time, 6);
   2881			printf(" msec  (%6.2f%%)\n", 100.0 * r->total_run_time / hist_time);
   2882		} else
   2883			printf("    CPU %2d idle entire time window\n", i);
   2884	}
   2885
   2886	if (sched->idle_hist && sched->show_callchain) {
   2887		callchain_param.mode  = CHAIN_FOLDED;
   2888		callchain_param.value = CCVAL_PERIOD;
   2889
   2890		callchain_register_param(&callchain_param);
   2891
   2892		printf("\nIdle stats by callchain:\n");
   2893		for (i = 0; i < idle_max_cpu; ++i) {
   2894			struct idle_thread_runtime *itr;
   2895
   2896			t = idle_threads[i];
   2897			if (!t)
   2898				continue;
   2899
   2900			itr = thread__priv(t);
   2901			if (itr == NULL)
   2902				continue;
   2903
   2904			callchain_param.sort(&itr->sorted_root.rb_root, &itr->callchain,
   2905					     0, &callchain_param);
   2906
   2907			printf("  CPU %2d:", i);
   2908			print_sched_time(itr->tr.total_run_time, 6);
   2909			printf(" msec\n");
   2910			timehist_print_idlehist_callchain(&itr->sorted_root);
   2911			printf("\n");
   2912		}
   2913	}
   2914
   2915	printf("\n"
   2916	       "    Total number of unique tasks: %" PRIu64 "\n"
   2917	       "Total number of context switches: %" PRIu64 "\n",
   2918	       totals.task_count, totals.sched_count);
   2919
   2920	printf("           Total run time (msec): ");
   2921	print_sched_time(totals.total_run_time, 2);
   2922	printf("\n");
   2923
   2924	printf("    Total scheduling time (msec): ");
   2925	print_sched_time(hist_time, 2);
   2926	printf(" (x %d)\n", sched->max_cpu.cpu);
   2927}
   2928
   2929typedef int (*sched_handler)(struct perf_tool *tool,
   2930			  union perf_event *event,
   2931			  struct evsel *evsel,
   2932			  struct perf_sample *sample,
   2933			  struct machine *machine);
   2934
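        /*
         * Per-sample dispatcher for timehist: track the highest CPU number
         * seen (used for the CPU visual and idle stats) and forward the
         * sample to the per-tracepoint handler stored in evsel->handler.
         */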
   2935static int perf_timehist__process_sample(struct perf_tool *tool,
   2936					 union perf_event *event,
   2937					 struct perf_sample *sample,
   2938					 struct evsel *evsel,
   2939					 struct machine *machine)
   2940{
   2941	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
   2942	int err = 0;
   2943	struct perf_cpu this_cpu = {
   2944		.cpu = sample->cpu,
   2945	};
   2946
   2947	if (this_cpu.cpu > sched->max_cpu.cpu)
   2948		sched->max_cpu = this_cpu;
   2949
   2950	if (evsel->handler != NULL) {
   2951		sched_handler f = evsel->handler;
   2952
   2953		err = f(tool, event, evsel, sample, machine);
   2954	}
   2955
   2956	return err;
   2957}
   2958
   2959static int timehist_check_attr(struct perf_sched *sched,
   2960			       struct evlist *evlist)
   2961{
   2962	struct evsel *evsel;
   2963	struct evsel_runtime *er;
   2964
   2965	list_for_each_entry(evsel, &evlist->core.entries, core.node) {
   2966		er = evsel__get_runtime(evsel);
   2967		if (er == NULL) {
   2968			pr_err("Failed to allocate memory for evsel runtime data\n");
   2969			return -1;
   2970		}
   2971
   2972		if (sched->show_callchain && !evsel__has_callchain(evsel)) {
   2973			pr_info("Samples do not have callchains.\n");
   2974			sched->show_callchain = 0;
   2975			symbol_conf.use_callchain = 0;
   2976		}
   2977	}
   2978
   2979	return 0;
   2980}
   2981
   2982static int perf_sched__timehist(struct perf_sched *sched)
   2983{
   2984	struct evsel_str_handler handlers[] = {
   2985		{ "sched:sched_switch",       timehist_sched_switch_event, },
   2986		{ "sched:sched_wakeup",	      timehist_sched_wakeup_event, },
   2987		{ "sched:sched_waking",       timehist_sched_wakeup_event, },
   2988		{ "sched:sched_wakeup_new",   timehist_sched_wakeup_event, },
   2989	};
   2990	const struct evsel_str_handler migrate_handlers[] = {
   2991		{ "sched:sched_migrate_task", timehist_migrate_task_event, },
   2992	};
   2993	struct perf_data data = {
   2994		.path  = input_name,
   2995		.mode  = PERF_DATA_MODE_READ,
   2996		.force = sched->force,
   2997	};
   2998
   2999	struct perf_session *session;
   3000	struct evlist *evlist;
   3001	int err = -1;
   3002
   3003	/*
   3004	 * event handlers for timehist option
   3005	 */
   3006	sched->tool.sample	 = perf_timehist__process_sample;
   3007	sched->tool.mmap	 = perf_event__process_mmap;
   3008	sched->tool.comm	 = perf_event__process_comm;
   3009	sched->tool.exit	 = perf_event__process_exit;
   3010	sched->tool.fork	 = perf_event__process_fork;
   3011	sched->tool.lost	 = process_lost;
   3012	sched->tool.attr	 = perf_event__process_attr;
   3013	sched->tool.tracing_data = perf_event__process_tracing_data;
   3014	sched->tool.build_id	 = perf_event__process_build_id;
   3015
   3016	sched->tool.ordered_events = true;
   3017	sched->tool.ordering_requires_timestamps = true;
   3018
   3019	symbol_conf.use_callchain = sched->show_callchain;
   3020
   3021	session = perf_session__new(&data, &sched->tool);
   3022	if (IS_ERR(session))
   3023		return PTR_ERR(session);
   3024
   3025	if (cpu_list) {
   3026		err = perf_session__cpu_bitmap(session, cpu_list, cpu_bitmap);
   3027		if (err < 0)
   3028			goto out;
   3029	}
   3030
   3031	evlist = session->evlist;
   3032
   3033	symbol__init(&session->header.env);
   3034
   3035	if (perf_time__parse_str(&sched->ptime, sched->time_str) != 0) {
   3036		pr_err("Invalid time string\n");
    3037		err = -EINVAL; goto out;
   3038	}
   3039
   3040	if (timehist_check_attr(sched, evlist) != 0)
   3041		goto out;
   3042
   3043	setup_pager();
   3044
   3045	/* prefer sched_waking if it is captured */
   3046	if (evlist__find_tracepoint_by_name(session->evlist, "sched:sched_waking"))
   3047		handlers[1].handler = timehist_sched_wakeup_ignore;
   3048
   3049	/* setup per-evsel handlers */
   3050	if (perf_session__set_tracepoints_handlers(session, handlers))
   3051		goto out;
   3052
   3053	/* sched_switch event at a minimum needs to exist */
   3054	if (!evlist__find_tracepoint_by_name(session->evlist, "sched:sched_switch")) {
   3055		pr_err("No sched_switch events found. Have you run 'perf sched record'?\n");
   3056		goto out;
   3057	}
   3058
   3059	if (sched->show_migrations &&
   3060	    perf_session__set_tracepoints_handlers(session, migrate_handlers))
   3061		goto out;
   3062
   3063	/* pre-allocate struct for per-CPU idle stats */
   3064	sched->max_cpu.cpu = session->header.env.nr_cpus_online;
   3065	if (sched->max_cpu.cpu == 0)
   3066		sched->max_cpu.cpu = 4;
   3067	if (init_idle_threads(sched->max_cpu.cpu))
   3068		goto out;
   3069
   3070	/* summary_only implies summary option, but don't overwrite summary if set */
   3071	if (sched->summary_only)
   3072		sched->summary = sched->summary_only;
   3073
   3074	if (!sched->summary_only)
   3075		timehist_header(sched);
   3076
   3077	err = perf_session__process_events(session);
   3078	if (err) {
   3079		pr_err("Failed to process events, error %d", err);
   3080		goto out;
   3081	}
   3082
   3083	sched->nr_events      = evlist->stats.nr_events[0];
   3084	sched->nr_lost_events = evlist->stats.total_lost;
   3085	sched->nr_lost_chunks = evlist->stats.nr_events[PERF_RECORD_LOST];
   3086
   3087	if (sched->summary)
   3088		timehist_print_summary(sched, session);
   3089
   3090out:
   3091	free_idle_threads();
   3092	perf_session__delete(session);
   3093
   3094	return err;
   3095}
   3096
   3097
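        /*
         * Summarize data-quality problems seen while processing the trace:
         * unordered timestamps, lost event chunks, and sched_switch events
         * whose prev_pid does not match the task last seen on that CPU.
         */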
   3098static void print_bad_events(struct perf_sched *sched)
   3099{
   3100	if (sched->nr_unordered_timestamps && sched->nr_timestamps) {
   3101		printf("  INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
   3102			(double)sched->nr_unordered_timestamps/(double)sched->nr_timestamps*100.0,
   3103			sched->nr_unordered_timestamps, sched->nr_timestamps);
   3104	}
   3105	if (sched->nr_lost_events && sched->nr_events) {
   3106		printf("  INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
   3107			(double)sched->nr_lost_events/(double)sched->nr_events * 100.0,
   3108			sched->nr_lost_events, sched->nr_events, sched->nr_lost_chunks);
   3109	}
   3110	if (sched->nr_context_switch_bugs && sched->nr_timestamps) {
   3111		printf("  INFO: %.3f%% context switch bugs (%ld out of %ld)",
   3112			(double)sched->nr_context_switch_bugs/(double)sched->nr_timestamps*100.0,
   3113			sched->nr_context_switch_bugs, sched->nr_timestamps);
   3114		if (sched->nr_lost_events)
   3115			printf(" (due to lost events?)");
   3116		printf("\n");
   3117	}
   3118}
   3119
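        /*
         * Merge a work_atoms entry into a tree keyed by comm: if a node with
         * the same comm already exists, fold the runtime, latency and atom
         * counts into it and free the entry; otherwise link it into the
         * cached rbtree, tracking whether it is the new leftmost node.
         */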
   3120static void __merge_work_atoms(struct rb_root_cached *root, struct work_atoms *data)
   3121{
   3122	struct rb_node **new = &(root->rb_root.rb_node), *parent = NULL;
   3123	struct work_atoms *this;
   3124	const char *comm = thread__comm_str(data->thread), *this_comm;
   3125	bool leftmost = true;
   3126
   3127	while (*new) {
   3128		int cmp;
   3129
   3130		this = container_of(*new, struct work_atoms, node);
   3131		parent = *new;
   3132
   3133		this_comm = thread__comm_str(this->thread);
   3134		cmp = strcmp(comm, this_comm);
   3135		if (cmp > 0) {
   3136			new = &((*new)->rb_left);
   3137		} else if (cmp < 0) {
   3138			new = &((*new)->rb_right);
   3139			leftmost = false;
   3140		} else {
   3141			this->num_merged++;
   3142			this->total_runtime += data->total_runtime;
   3143			this->nb_atoms += data->nb_atoms;
   3144			this->total_lat += data->total_lat;
   3145			list_splice(&data->work_list, &this->work_list);
   3146			if (this->max_lat < data->max_lat) {
   3147				this->max_lat = data->max_lat;
   3148				this->max_lat_start = data->max_lat_start;
   3149				this->max_lat_end = data->max_lat_end;
   3150			}
   3151			zfree(&data);
   3152			return;
   3153		}
   3154	}
   3155
   3156	data->num_merged++;
   3157	rb_link_node(&data->node, parent, new);
   3158	rb_insert_color_cached(&data->node, root, leftmost);
   3159}
   3160
   3161static void perf_sched__merge_lat(struct perf_sched *sched)
   3162{
   3163	struct work_atoms *data;
   3164	struct rb_node *node;
   3165
   3166	if (sched->skip_merge)
   3167		return;
   3168
   3169	while ((node = rb_first_cached(&sched->atom_root))) {
   3170		rb_erase_cached(node, &sched->atom_root);
   3171		data = rb_entry(node, struct work_atoms, node);
   3172		__merge_work_atoms(&sched->merged_atom_root, data);
   3173	}
   3174}
   3175
   3176static int perf_sched__lat(struct perf_sched *sched)
   3177{
   3178	struct rb_node *next;
   3179
   3180	setup_pager();
   3181
   3182	if (perf_sched__read_events(sched))
   3183		return -1;
   3184
   3185	perf_sched__merge_lat(sched);
   3186	perf_sched__sort_lat(sched);
   3187
   3188	printf("\n -------------------------------------------------------------------------------------------------------------------------------------------\n");
   3189	printf("  Task                  |   Runtime ms  | Switches | Avg delay ms    | Max delay ms    | Max delay start           | Max delay end          |\n");
   3190	printf(" -------------------------------------------------------------------------------------------------------------------------------------------\n");
   3191
   3192	next = rb_first_cached(&sched->sorted_atom_root);
   3193
   3194	while (next) {
   3195		struct work_atoms *work_list;
   3196
   3197		work_list = rb_entry(next, struct work_atoms, node);
   3198		output_lat_thread(sched, work_list);
   3199		next = rb_next(next);
   3200		thread__zput(work_list->thread);
   3201	}
   3202
   3203	printf(" -----------------------------------------------------------------------------------------------------------------\n");
   3204	printf("  TOTAL:                |%11.3f ms |%9" PRIu64 " |\n",
   3205		(double)sched->all_runtime / NSEC_PER_MSEC, sched->all_count);
   3206
   3207	printf(" ---------------------------------------------------\n");
   3208
   3209	print_bad_events(sched);
   3210	printf("\n");
   3211
   3212	return 0;
   3213}
   3214
   3215static int setup_map_cpus(struct perf_sched *sched)
   3216{
   3217	struct perf_cpu_map *map;
   3218
   3219	sched->max_cpu.cpu  = sysconf(_SC_NPROCESSORS_CONF);
   3220
   3221	if (sched->map.comp) {
   3222		sched->map.comp_cpus = zalloc(sched->max_cpu.cpu * sizeof(int));
   3223		if (!sched->map.comp_cpus)
   3224			return -1;
   3225	}
   3226
   3227	if (!sched->map.cpus_str)
   3228		return 0;
   3229
   3230	map = perf_cpu_map__new(sched->map.cpus_str);
   3231	if (!map) {
   3232		pr_err("failed to get cpus map from %s\n", sched->map.cpus_str);
   3233		return -1;
   3234	}
   3235
   3236	sched->map.cpus = map;
   3237	return 0;
   3238}
   3239
   3240static int setup_color_pids(struct perf_sched *sched)
   3241{
   3242	struct perf_thread_map *map;
   3243
   3244	if (!sched->map.color_pids_str)
   3245		return 0;
   3246
   3247	map = thread_map__new_by_tid_str(sched->map.color_pids_str);
   3248	if (!map) {
   3249		pr_err("failed to get thread map from %s\n", sched->map.color_pids_str);
   3250		return -1;
   3251	}
   3252
   3253	sched->map.color_pids = map;
   3254	return 0;
   3255}
   3256
   3257static int setup_color_cpus(struct perf_sched *sched)
   3258{
   3259	struct perf_cpu_map *map;
   3260
   3261	if (!sched->map.color_cpus_str)
   3262		return 0;
   3263
   3264	map = perf_cpu_map__new(sched->map.color_cpus_str);
   3265	if (!map) {
    3266		pr_err("failed to get cpus map from %s\n", sched->map.color_cpus_str);
   3267		return -1;
   3268	}
   3269
   3270	sched->map.color_cpus = map;
   3271	return 0;
   3272}
   3273
   3274static int perf_sched__map(struct perf_sched *sched)
   3275{
   3276	if (setup_map_cpus(sched))
   3277		return -1;
   3278
   3279	if (setup_color_pids(sched))
   3280		return -1;
   3281
   3282	if (setup_color_cpus(sched))
   3283		return -1;
   3284
   3285	setup_pager();
   3286	if (perf_sched__read_events(sched))
   3287		return -1;
   3288	print_bad_events(sched);
   3289	return 0;
   3290}
   3291
   3292static int perf_sched__replay(struct perf_sched *sched)
   3293{
   3294	unsigned long i;
   3295
   3296	calibrate_run_measurement_overhead(sched);
   3297	calibrate_sleep_measurement_overhead(sched);
   3298
   3299	test_calibrations(sched);
   3300
   3301	if (perf_sched__read_events(sched))
   3302		return -1;
   3303
   3304	printf("nr_run_events:        %ld\n", sched->nr_run_events);
   3305	printf("nr_sleep_events:      %ld\n", sched->nr_sleep_events);
   3306	printf("nr_wakeup_events:     %ld\n", sched->nr_wakeup_events);
   3307
   3308	if (sched->targetless_wakeups)
   3309		printf("target-less wakeups:  %ld\n", sched->targetless_wakeups);
   3310	if (sched->multitarget_wakeups)
   3311		printf("multi-target wakeups: %ld\n", sched->multitarget_wakeups);
   3312	if (sched->nr_run_events_optimized)
   3313		printf("run atoms optimized: %ld\n",
   3314			sched->nr_run_events_optimized);
   3315
   3316	print_task_traces(sched);
   3317	add_cross_task_wakeups(sched);
   3318
   3319	create_tasks(sched);
   3320	printf("------------------------------------------------------------\n");
   3321	for (i = 0; i < sched->replay_repeat; i++)
   3322		run_one_test(sched);
   3323
   3324	return 0;
   3325}
   3326
   3327static void setup_sorting(struct perf_sched *sched, const struct option *options,
   3328			  const char * const usage_msg[])
   3329{
   3330	char *tmp, *tok, *str = strdup(sched->sort_order);
   3331
   3332	for (tok = strtok_r(str, ", ", &tmp);
   3333			tok; tok = strtok_r(NULL, ", ", &tmp)) {
   3334		if (sort_dimension__add(tok, &sched->sort_list) < 0) {
   3335			usage_with_options_msg(usage_msg, options,
   3336					"Unknown --sort key: `%s'", tok);
   3337		}
   3338	}
   3339
   3340	free(str);
   3341
   3342	sort_dimension__add("pid", &sched->cmp_pid);
   3343}
   3344
   3345static bool schedstat_events_exposed(void)
   3346{
   3347	/*
   3348	 * Select "sched:sched_stat_wait" event to check
   3349	 * whether schedstat tracepoints are exposed.
   3350	 */
   3351	return IS_ERR(trace_event__tp_format("sched", "sched_stat_wait")) ?
   3352		false : true;
   3353}
   3354
   3355static int __cmd_record(int argc, const char **argv)
   3356{
   3357	unsigned int rec_argc, i, j;
   3358	const char **rec_argv;
   3359	const char * const record_args[] = {
   3360		"record",
   3361		"-a",
   3362		"-R",
   3363		"-m", "1024",
   3364		"-c", "1",
   3365		"-e", "sched:sched_switch",
   3366		"-e", "sched:sched_stat_runtime",
   3367		"-e", "sched:sched_process_fork",
   3368		"-e", "sched:sched_wakeup_new",
   3369		"-e", "sched:sched_migrate_task",
   3370	};
   3371
   3372	/*
   3373	 * The tracepoints trace_sched_stat_{wait, sleep, iowait}
    3374	 * are not exposed to userspace if CONFIG_SCHEDSTATS is not set.
    3375	 * To keep "perf sched record" from failing, only request the
    3376	 * schedstat events when they are actually available.
   3377	 */
   3378	const char * const schedstat_args[] = {
   3379		"-e", "sched:sched_stat_wait",
   3380		"-e", "sched:sched_stat_sleep",
   3381		"-e", "sched:sched_stat_iowait",
   3382	};
   3383	unsigned int schedstat_argc = schedstat_events_exposed() ?
   3384		ARRAY_SIZE(schedstat_args) : 0;
   3385
   3386	struct tep_event *waking_event;
   3387
   3388	/*
   3389	 * +2 for either "-e", "sched:sched_wakeup" or
   3390	 * "-e", "sched:sched_waking"
   3391	 */
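        	/*
        	 * The assembled command is roughly:
        	 *   perf record -a -R -m 1024 -c 1 -e sched:sched_switch ...
        	 *        -e sched:sched_waking <user args>
        	 * with sched:sched_wakeup used instead when sched_waking is
        	 * not available in the running kernel.
        	 */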
   3392	rec_argc = ARRAY_SIZE(record_args) + 2 + schedstat_argc + argc - 1;
   3393	rec_argv = calloc(rec_argc + 1, sizeof(char *));
   3394
   3395	if (rec_argv == NULL)
   3396		return -ENOMEM;
   3397
   3398	for (i = 0; i < ARRAY_SIZE(record_args); i++)
   3399		rec_argv[i] = strdup(record_args[i]);
   3400
   3401	rec_argv[i++] = "-e";
   3402	waking_event = trace_event__tp_format("sched", "sched_waking");
   3403	if (!IS_ERR(waking_event))
   3404		rec_argv[i++] = strdup("sched:sched_waking");
   3405	else
   3406		rec_argv[i++] = strdup("sched:sched_wakeup");
   3407
   3408	for (j = 0; j < schedstat_argc; j++)
   3409		rec_argv[i++] = strdup(schedstat_args[j]);
   3410
   3411	for (j = 1; j < (unsigned int)argc; j++, i++)
   3412		rec_argv[i] = argv[j];
   3413
   3414	BUG_ON(i != rec_argc);
   3415
   3416	return cmd_record(i, rec_argv);
   3417}
   3418
   3419int cmd_sched(int argc, const char **argv)
   3420{
   3421	static const char default_sort_order[] = "avg, max, switch, runtime";
   3422	struct perf_sched sched = {
   3423		.tool = {
   3424			.sample		 = perf_sched__process_tracepoint_sample,
   3425			.comm		 = perf_sched__process_comm,
   3426			.namespaces	 = perf_event__process_namespaces,
   3427			.lost		 = perf_event__process_lost,
   3428			.fork		 = perf_sched__process_fork_event,
   3429			.ordered_events = true,
   3430		},
   3431		.cmp_pid	      = LIST_HEAD_INIT(sched.cmp_pid),
   3432		.sort_list	      = LIST_HEAD_INIT(sched.sort_list),
   3433		.start_work_mutex     = PTHREAD_MUTEX_INITIALIZER,
   3434		.work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER,
   3435		.sort_order	      = default_sort_order,
   3436		.replay_repeat	      = 10,
   3437		.profile_cpu	      = -1,
   3438		.next_shortname1      = 'A',
   3439		.next_shortname2      = '0',
   3440		.skip_merge           = 0,
   3441		.show_callchain	      = 1,
   3442		.max_stack            = 5,
   3443	};
   3444	const struct option sched_options[] = {
   3445	OPT_STRING('i', "input", &input_name, "file",
   3446		    "input file name"),
   3447	OPT_INCR('v', "verbose", &verbose,
   3448		    "be more verbose (show symbol address, etc)"),
   3449	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
   3450		    "dump raw trace in ASCII"),
   3451	OPT_BOOLEAN('f', "force", &sched.force, "don't complain, do it"),
   3452	OPT_END()
   3453	};
   3454	const struct option latency_options[] = {
   3455	OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",
   3456		   "sort by key(s): runtime, switch, avg, max"),
   3457	OPT_INTEGER('C', "CPU", &sched.profile_cpu,
   3458		    "CPU to profile on"),
   3459	OPT_BOOLEAN('p', "pids", &sched.skip_merge,
   3460		    "latency stats per pid instead of per comm"),
   3461	OPT_PARENT(sched_options)
   3462	};
   3463	const struct option replay_options[] = {
   3464	OPT_UINTEGER('r', "repeat", &sched.replay_repeat,
   3465		     "repeat the workload replay N times (-1: infinite)"),
   3466	OPT_PARENT(sched_options)
   3467	};
   3468	const struct option map_options[] = {
   3469	OPT_BOOLEAN(0, "compact", &sched.map.comp,
   3470		    "map output in compact mode"),
   3471	OPT_STRING(0, "color-pids", &sched.map.color_pids_str, "pids",
   3472		   "highlight given pids in map"),
   3473	OPT_STRING(0, "color-cpus", &sched.map.color_cpus_str, "cpus",
   3474                    "highlight given CPUs in map"),
   3475	OPT_STRING(0, "cpus", &sched.map.cpus_str, "cpus",
   3476                    "display given CPUs in map"),
   3477	OPT_PARENT(sched_options)
   3478	};
   3479	const struct option timehist_options[] = {
   3480	OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
   3481		   "file", "vmlinux pathname"),
   3482	OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name,
   3483		   "file", "kallsyms pathname"),
   3484	OPT_BOOLEAN('g', "call-graph", &sched.show_callchain,
   3485		    "Display call chains if present (default on)"),
   3486	OPT_UINTEGER(0, "max-stack", &sched.max_stack,
   3487		   "Maximum number of functions to display backtrace."),
   3488	OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
   3489		    "Look for files with symbols relative to this directory"),
    3490	OPT_BOOLEAN('s', "summary", &sched.summary_only,
    3491		    "Show only a summary of scheduling events with statistics"),
    3492	OPT_BOOLEAN('S', "with-summary", &sched.summary,
    3493		    "Show all scheduling events and a summary with statistics"),
   3494	OPT_BOOLEAN('w', "wakeups", &sched.show_wakeups, "Show wakeup events"),
   3495	OPT_BOOLEAN('n', "next", &sched.show_next, "Show next task"),
   3496	OPT_BOOLEAN('M', "migrations", &sched.show_migrations, "Show migration events"),
   3497	OPT_BOOLEAN('V', "cpu-visual", &sched.show_cpu_visual, "Add CPU visual"),
   3498	OPT_BOOLEAN('I', "idle-hist", &sched.idle_hist, "Show idle events only"),
   3499	OPT_STRING(0, "time", &sched.time_str, "str",
   3500		   "Time span for analysis (start,stop)"),
   3501	OPT_BOOLEAN(0, "state", &sched.show_state, "Show task state when sched-out"),
   3502	OPT_STRING('p', "pid", &symbol_conf.pid_list_str, "pid[,pid...]",
   3503		   "analyze events only for given process id(s)"),
   3504	OPT_STRING('t', "tid", &symbol_conf.tid_list_str, "tid[,tid...]",
   3505		   "analyze events only for given thread id(s)"),
   3506	OPT_STRING('C', "cpu", &cpu_list, "cpu", "list of cpus to profile"),
   3507	OPT_PARENT(sched_options)
   3508	};
   3509
   3510	const char * const latency_usage[] = {
   3511		"perf sched latency [<options>]",
   3512		NULL
   3513	};
   3514	const char * const replay_usage[] = {
   3515		"perf sched replay [<options>]",
   3516		NULL
   3517	};
   3518	const char * const map_usage[] = {
   3519		"perf sched map [<options>]",
   3520		NULL
   3521	};
   3522	const char * const timehist_usage[] = {
   3523		"perf sched timehist [<options>]",
   3524		NULL
   3525	};
   3526	const char *const sched_subcommands[] = { "record", "latency", "map",
   3527						  "replay", "script",
   3528						  "timehist", NULL };
   3529	const char *sched_usage[] = {
   3530		NULL,
   3531		NULL
   3532	};
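	/* tracepoint handlers wired up per subcommand via sched.tp_handler */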
   3533	struct trace_sched_handler lat_ops  = {
   3534		.wakeup_event	    = latency_wakeup_event,
   3535		.switch_event	    = latency_switch_event,
   3536		.runtime_event	    = latency_runtime_event,
   3537		.migrate_task_event = latency_migrate_task_event,
   3538	};
   3539	struct trace_sched_handler map_ops  = {
   3540		.switch_event	    = map_switch_event,
   3541	};
   3542	struct trace_sched_handler replay_ops  = {
   3543		.wakeup_event	    = replay_wakeup_event,
   3544		.switch_event	    = replay_switch_event,
   3545		.fork_event	    = replay_fork_event,
   3546	};
   3547	unsigned int i;
   3548	int ret;
   3549
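	/* no task is known to be current on any CPU yet */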
   3550	for (i = 0; i < ARRAY_SIZE(sched.curr_pid); i++)
   3551		sched.curr_pid[i] = -1;
   3552
   3553	argc = parse_options_subcommand(argc, argv, sched_options, sched_subcommands,
   3554					sched_usage, PARSE_OPT_STOP_AT_NON_OPTION);
   3555	if (!argc)
   3556		usage_with_options(sched_usage, sched_options);
   3557
   3558	/*
   3559	 * Aliased to 'perf script' for now:
   3560	 */
   3561	if (!strcmp(argv[0], "script"))
   3562		return cmd_script(argc, argv);
   3563
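	/* dispatch on the subcommand name; abbreviated forms of "record", "latency" and "replay" are accepted */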
   3564	if (strlen(argv[0]) > 2 && strstarts("record", argv[0])) {
   3565		return __cmd_record(argc, argv);
   3566	} else if (!strncmp(argv[0], "lat", 3)) {
   3567		sched.tp_handler = &lat_ops;
   3568		if (argc > 1) {
   3569			argc = parse_options(argc, argv, latency_options, latency_usage, 0);
   3570			if (argc)
   3571				usage_with_options(latency_usage, latency_options);
   3572		}
   3573		setup_sorting(&sched, latency_options, latency_usage);
   3574		return perf_sched__lat(&sched);
   3575	} else if (!strcmp(argv[0], "map")) {
   3576		if (argc) {
   3577			argc = parse_options(argc, argv, map_options, map_usage, 0);
   3578			if (argc)
   3579				usage_with_options(map_usage, map_options);
   3580		}
   3581		sched.tp_handler = &map_ops;
   3582		setup_sorting(&sched, latency_options, latency_usage);
   3583		return perf_sched__map(&sched);
   3584	} else if (strlen(argv[0]) > 2 && strstarts("replay", argv[0])) {
   3585		sched.tp_handler = &replay_ops;
   3586		if (argc) {
   3587			argc = parse_options(argc, argv, replay_options, replay_usage, 0);
   3588			if (argc)
   3589				usage_with_options(replay_usage, replay_options);
   3590		}
   3591		return perf_sched__replay(&sched);
   3592	} else if (!strcmp(argv[0], "timehist")) {
   3593		if (argc) {
   3594			argc = parse_options(argc, argv, timehist_options,
   3595					     timehist_usage, 0);
   3596			if (argc)
   3597				usage_with_options(timehist_usage, timehist_options);
   3598		}
   3599		if ((sched.show_wakeups || sched.show_next) &&
   3600		    sched.summary_only) {
   3601			pr_err(" Error: -s and -[n|w] are mutually exclusive.\n");
   3602			parse_options_usage(timehist_usage, timehist_options, "s", true);
   3603			if (sched.show_wakeups)
   3604				parse_options_usage(NULL, timehist_options, "w", true);
   3605			if (sched.show_next)
   3606				parse_options_usage(NULL, timehist_options, "n", true);
   3607			return -EINVAL;
   3608		}
   3609		ret = symbol__validate_sym_arguments();
   3610		if (ret)
   3611			return ret;
   3612
   3613		return perf_sched__timehist(&sched);
   3614	} else {
   3615		usage_with_options(sched_usage, sched_options);
   3616	}
   3617
   3618	return 0;
   3619}