sched.h
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_H
#define _LINUX_SCHED_H

/*
 * Define 'struct task_struct' and provide the main scheduler
 * APIs (schedule(), wakeup variants, etc.)
 */

#include <uapi/linux/sched.h>

#include <asm/current.h>

#include <linux/pid.h>
#include <linux/sem.h>
#include <linux/shm.h>
#include <linux/mutex.h>
#include <linux/plist.h>
#include <linux/hrtimer.h>
#include <linux/irqflags.h>
#include <linux/seccomp.h>
#include <linux/nodemask.h>
#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/resource.h>
#include <linux/latencytop.h>
#include <linux/sched/prio.h>
#include <linux/sched/types.h>
#include <linux/signal_types.h>
#include <linux/syscall_user_dispatch.h>
#include <linux/mm_types_task.h>
#include <linux/task_io_accounting.h>
#include <linux/posix-timers.h>
#include <linux/rseq.h>
#include <linux/seqlock.h>
#include <linux/kcsan.h>
#include <asm/kmap_size.h>

/* task_struct member predeclarations (sorted alphabetically): */
struct audit_context;
struct backing_dev_info;
struct bio_list;
struct blk_plug;
struct bpf_local_storage;
struct bpf_run_ctx;
struct capture_control;
struct cfs_rq;
struct fs_struct;
struct futex_pi_state;
struct io_context;
struct io_uring_task;
struct mempolicy;
struct nameidata;
struct nsproxy;
struct perf_event_context;
struct pid_namespace;
struct pipe_inode_info;
struct rcu_node;
struct reclaim_state;
struct robust_list_head;
struct root_domain;
struct rq;
struct sched_attr;
struct sched_param;
struct seq_file;
struct sighand_struct;
struct signal_struct;
struct task_delay_info;
struct task_group;

/*
 * Task state bitmask. NOTE! These bits are also
 * encoded in fs/proc/array.c: get_task_state().
 *
 * We have two separate sets of flags: task->state
 * is about runnability, while task->exit_state is
 * about the task exiting. Confusing, but this way
 * modifying one set can't modify the other one by
 * mistake.
 */

/* Used in tsk->state: */
#define TASK_RUNNING 0x0000
#define TASK_INTERRUPTIBLE 0x0001
#define TASK_UNINTERRUPTIBLE 0x0002
#define __TASK_STOPPED 0x0004
#define __TASK_TRACED 0x0008
/* Used in tsk->exit_state: */
#define EXIT_DEAD 0x0010
#define EXIT_ZOMBIE 0x0020
#define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD)
/* Used in tsk->state again: */
#define TASK_PARKED 0x0040
#define TASK_DEAD 0x0080
#define TASK_WAKEKILL 0x0100
#define TASK_WAKING 0x0200
#define TASK_NOLOAD 0x0400
#define TASK_NEW 0x0800
/* RT specific auxiliary flag to mark RT lock waiters */
#define TASK_RTLOCK_WAIT 0x1000
#define TASK_STATE_MAX 0x2000

/* Convenience macros for the sake of set_current_state: */
#define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
#define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED)
#define TASK_TRACED __TASK_TRACED

#define TASK_IDLE (TASK_UNINTERRUPTIBLE | TASK_NOLOAD)

/* Convenience macros for the sake of wake_up(): */
#define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)

/* get_task_state(): */
#define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \
		     TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
		     __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
		     TASK_PARKED)

#define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING)

#define task_is_traced(task) ((READ_ONCE(task->jobctl) & JOBCTL_TRACED) != 0)
#define task_is_stopped(task) ((READ_ONCE(task->jobctl) & JOBCTL_STOPPED) != 0)
#define task_is_stopped_or_traced(task) ((READ_ONCE(task->jobctl) & (JOBCTL_STOPPED | JOBCTL_TRACED)) != 0)

/*
 * Special states are those that do not use the normal wait-loop pattern. See
 * the comment with set_special_state().
 */
#define is_special_task_state(state) \
	((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD))

#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
# define debug_normal_state_change(state_value) \
	do { \
		WARN_ON_ONCE(is_special_task_state(state_value)); \
		current->task_state_change = _THIS_IP_; \
	} while (0)

# define debug_special_state_change(state_value) \
	do { \
		WARN_ON_ONCE(!is_special_task_state(state_value)); \
		current->task_state_change = _THIS_IP_; \
	} while (0)

# define debug_rtlock_wait_set_state() \
	do { \
		current->saved_state_change = current->task_state_change; \
		current->task_state_change = _THIS_IP_; \
	} while (0)

# define debug_rtlock_wait_restore_state() \
	do { \
		current->task_state_change = current->saved_state_change; \
	} while (0)

#else
# define debug_normal_state_change(cond) do { } while (0)
# define debug_special_state_change(cond) do { } while (0)
# define debug_rtlock_wait_set_state() do { } while (0)
# define debug_rtlock_wait_restore_state() do { } while (0)
#endif

/*
 * set_current_state() includes a barrier so that the write of current->state
 * is correctly serialised wrt the caller's subsequent test of whether to
 * actually sleep:
 *
 *	for (;;) {
 *		set_current_state(TASK_UNINTERRUPTIBLE);
 *		if (CONDITION)
 *			break;
 *
 *		schedule();
 *	}
 *	__set_current_state(TASK_RUNNING);
 *
 * If the caller does not need such serialisation (because, for instance, the
 * CONDITION test and condition change and wakeup are under the same lock) then
 * use __set_current_state().
 *
 * The above is typically ordered against the wakeup, which does:
 *
 *	CONDITION = 1;
 *	wake_up_state(p, TASK_UNINTERRUPTIBLE);
 *
 * where wake_up_state()/try_to_wake_up() executes a full memory barrier before
 * accessing p->state.
 *
 * Wakeup will do: if (@state & p->state) p->state = TASK_RUNNING, that is,
 * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a
 * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING).
 *
 * However, with slightly different timing the wakeup TASK_RUNNING store can
 * also collide with the TASK_UNINTERRUPTIBLE store. Losing that store is not
 * a problem either because that will result in one extra go around the loop
 * and our @cond test will save the day.
 *
 * Also see the comments of try_to_wake_up().
 */
#define __set_current_state(state_value) \
	do { \
		debug_normal_state_change((state_value)); \
		WRITE_ONCE(current->__state, (state_value)); \
	} while (0)

#define set_current_state(state_value) \
	do { \
		debug_normal_state_change((state_value)); \
		smp_store_mb(current->__state, (state_value)); \
	} while (0)

/*
 * set_special_state() should be used for those states when the blocking task
 * can not use the regular condition based wait-loop. In that case we must
 * serialize against wakeups such that any possible in-flight TASK_RUNNING
 * stores will not collide with our state change.
 */
#define set_special_state(state_value) \
	do { \
		unsigned long flags; /* may shadow */ \
		\
		raw_spin_lock_irqsave(&current->pi_lock, flags); \
		debug_special_state_change((state_value)); \
		WRITE_ONCE(current->__state, (state_value)); \
		raw_spin_unlock_irqrestore(&current->pi_lock, flags); \
	} while (0)
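
/*
 * Usage sketch (editor's illustration, not part of this header): special
 * states are written under ->pi_lock instead of via the wait-loop pattern.
 * A parking kthread, for example, does in essence:
 *
 *	set_special_state(TASK_PARKED);
 *	schedule();
 */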

/*
 * PREEMPT_RT specific variants for "sleeping" spin/rwlocks
 *
 * RT's spin/rwlock substitutions are state preserving. The state of the
 * task when blocking on the lock is saved in task_struct::saved_state and
 * restored after the lock has been acquired. These operations are
 * serialized by task_struct::pi_lock against try_to_wake_up(). Any non RT
 * lock related wakeups while the task is blocked on the lock are
 * redirected to operate on task_struct::saved_state to ensure that these
 * are not dropped. On restore task_struct::saved_state is set to
 * TASK_RUNNING so any wakeup attempt redirected to saved_state will fail.
 *
 * The lock operation looks like this:
 *
 *	current_save_and_set_rtlock_wait_state();
 *	for (;;) {
 *		if (try_lock())
 *			break;
 *		raw_spin_unlock_irq(&lock->wait_lock);
 *		schedule_rtlock();
 *		raw_spin_lock_irq(&lock->wait_lock);
 *		set_current_state(TASK_RTLOCK_WAIT);
 *	}
 *	current_restore_rtlock_saved_state();
 */
#define current_save_and_set_rtlock_wait_state() \
	do { \
		lockdep_assert_irqs_disabled(); \
		raw_spin_lock(&current->pi_lock); \
		current->saved_state = current->__state; \
		debug_rtlock_wait_set_state(); \
		WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT); \
		raw_spin_unlock(&current->pi_lock); \
	} while (0)

#define current_restore_rtlock_saved_state() \
	do { \
		lockdep_assert_irqs_disabled(); \
		raw_spin_lock(&current->pi_lock); \
		debug_rtlock_wait_restore_state(); \
		WRITE_ONCE(current->__state, current->saved_state); \
		current->saved_state = TASK_RUNNING; \
		raw_spin_unlock(&current->pi_lock); \
	} while (0)

#define get_current_state() READ_ONCE(current->__state)

/*
 * Define the task command name length as an enum so that it is visible to
 * BPF programs.
 */
enum {
	TASK_COMM_LEN = 16,
};

extern void scheduler_tick(void);

#define MAX_SCHEDULE_TIMEOUT LONG_MAX

extern long schedule_timeout(long timeout);
extern long schedule_timeout_interruptible(long timeout);
extern long schedule_timeout_killable(long timeout);
extern long schedule_timeout_uninterruptible(long timeout);
extern long schedule_timeout_idle(long timeout);
asmlinkage void schedule(void);
extern void schedule_preempt_disabled(void);
asmlinkage void preempt_schedule_irq(void);
#ifdef CONFIG_PREEMPT_RT
extern void schedule_rtlock(void);
#endif

extern int __must_check io_schedule_prepare(void);
extern void io_schedule_finish(int token);
extern long io_schedule_timeout(long timeout);
extern void io_schedule(void);
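
/*
 * Usage sketch (editor's illustration, not part of this header): the
 * schedule_timeout() family expects the caller to have set a non-running
 * state first, e.g. to sleep for about one second while remaining killable:
 *
 *	set_current_state(TASK_KILLABLE);
 *	remaining = schedule_timeout(HZ);
 *
 * The schedule_timeout_killable() variant above performs both steps.
 */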

/**
 * struct prev_cputime - snapshot of system and user cputime
 * @utime: time spent in user mode
 * @stime: time spent in system mode
 * @lock: protects the above two fields
 *
 * Stores previous user/system time values such that we can guarantee
 * monotonicity.
 */
struct prev_cputime {
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
	u64 utime;
	u64 stime;
	raw_spinlock_t lock;
#endif
};

enum vtime_state {
	/* Task is sleeping or running in a CPU with VTIME inactive: */
	VTIME_INACTIVE = 0,
	/* Task is idle */
	VTIME_IDLE,
	/* Task runs in kernelspace in a CPU with VTIME active: */
	VTIME_SYS,
	/* Task runs in userspace in a CPU with VTIME active: */
	VTIME_USER,
	/* Task runs as a guest in a CPU with VTIME active: */
	VTIME_GUEST,
};

struct vtime {
	seqcount_t seqcount;
	unsigned long long starttime;
	enum vtime_state state;
	unsigned int cpu;
	u64 utime;
	u64 stime;
	u64 gtime;
};

/*
 * Utilization clamp constraints.
 * @UCLAMP_MIN: Minimum utilization
 * @UCLAMP_MAX: Maximum utilization
 * @UCLAMP_CNT: Utilization clamp constraints count
 */
enum uclamp_id {
	UCLAMP_MIN = 0,
	UCLAMP_MAX,
	UCLAMP_CNT
};

#ifdef CONFIG_SMP
extern struct root_domain def_root_domain;
extern struct mutex sched_domains_mutex;
#endif

struct sched_info {
#ifdef CONFIG_SCHED_INFO
	/* Cumulative counters: */

	/* # of times we have run on this CPU: */
	unsigned long pcount;

	/* Time spent waiting on a runqueue: */
	unsigned long long run_delay;

	/* Timestamps: */

	/* When did we last run on a CPU? */
	unsigned long long last_arrival;

	/* When were we last queued to run? */
	unsigned long long last_queued;

#endif /* CONFIG_SCHED_INFO */
};

/*
 * Integer metrics need fixed point arithmetic, e.g., sched/fair
 * has a few: load, load_avg, util_avg, freq, and capacity.
 *
 * We define a basic fixed point arithmetic range, and then formalize
 * all these metrics based on that basic range.
 */
# define SCHED_FIXEDPOINT_SHIFT 10
# define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT)

/* Increase resolution of cpu_capacity calculations */
# define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT
# define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT)
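
/*
 * Worked example (editor's note): with SCHED_FIXEDPOINT_SHIFT == 10, the
 * value 1.0 is stored as 1 << 10 == 1024, so 50% of SCHED_CAPACITY_SCALE
 * is 512. Multiplying two fixed-point values needs one corrective shift:
 *
 *	result = (a * b) >> SCHED_FIXEDPOINT_SHIFT;
 */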

struct load_weight {
	unsigned long weight;
	u32 inv_weight;
};

/**
 * struct util_est - Estimated utilization of FAIR tasks
 * @enqueued: instantaneous estimated utilization of a task/cpu
 * @ewma: the Exponential Weighted Moving Average (EWMA)
 *	  utilization of a task
 *
 * Support data structure to track an Exponential Weighted Moving Average
 * (EWMA) of a FAIR task's utilization. New samples are added to the moving
 * average each time a task completes an activation. The sample's weight is
 * chosen so that the EWMA will be relatively insensitive to transient changes
 * to the task's workload.
 *
 * The enqueued attribute has a slightly different meaning for tasks and cpus:
 * - task: the task's util_avg at last task dequeue time
 * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU
 * Thus, the util_est.enqueued of a task represents the contribution to the
 * estimated utilization of the CPU where that task is currently enqueued.
 *
 * Only for tasks do we track a moving average of the past instantaneous
 * estimated utilization. This allows us to absorb sporadic drops in
 * utilization of an otherwise almost periodic task.
 *
 * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
 * updates. When a task is dequeued, its util_est should not be updated if its
 * util_avg has not been updated in the meantime.
 * This information is mapped into the MSB of util_est.enqueued at dequeue
 * time. Since the max value of util_est.enqueued for a task is 1024 (the PELT
 * util_avg for a task), the MSB is safe to use.
 */
struct util_est {
	unsigned int enqueued;
	unsigned int ewma;
#define UTIL_EST_WEIGHT_SHIFT 2
#define UTIL_AVG_UNCHANGED 0x80000000
} __attribute__((__aligned__(sizeof(u64))));
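
/*
 * Editor's sketch of the MSB encoding described above: since the flag
 * shares the enqueued field, readers of the estimate mask it out, e.g.:
 *
 *	unsigned int enqueued = READ_ONCE(p->se.avg.util_est.enqueued);
 *	unsigned int util = enqueued & ~UTIL_AVG_UNCHANGED;
 */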

/*
 * The load/runnable/util_avg accumulates an infinite geometric series
 * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
 *
 * [load_avg definition]
 *
 *	load_avg = runnable% * scale_load_down(load)
 *
 * [runnable_avg definition]
 *
 *	runnable_avg = runnable% * SCHED_CAPACITY_SCALE
 *
 * [util_avg definition]
 *
 *	util_avg = running% * SCHED_CAPACITY_SCALE
 *
 * where runnable% is the time ratio that a sched_entity is runnable and
 * running% the time ratio that a sched_entity is running.
 *
 * For cfs_rq, they are the aggregated values of all runnable and blocked
 * sched_entities.
 *
 * The load/runnable/util_avg doesn't directly factor frequency scaling and CPU
 * capacity scaling. The scaling is done through the rq_clock_pelt that is used
 * for computing those signals (see update_rq_clock_pelt()).
 *
 * N.B., the above ratios (runnable% and running%) themselves are in the
 * range of [0, 1]. To do fixed point arithmetics, we therefore scale them
 * to as large a range as necessary. This is for example reflected by
 * util_avg's SCHED_CAPACITY_SCALE.
 *
 * [Overflow issue]
 *
 * The 64-bit load_sum can have 4353082796 (=2^64/47742/88761) entities
 * with the highest load (=88761), always runnable on a single cfs_rq,
 * and should not overflow as the number already hits PID_MAX_LIMIT.
 *
 * For all other cases (including 32-bit kernels), struct load_weight's
 * weight will overflow first before we do, because:
 *
 *	Max(load_avg) <= Max(load.weight)
 *
 * Then it is the load_weight's responsibility to consider overflow
 * issues.
 */
struct sched_avg {
	u64 last_update_time;
	u64 load_sum;
	u64 runnable_sum;
	u32 util_sum;
	u32 period_contrib;
	unsigned long load_avg;
	unsigned long runnable_avg;
	unsigned long util_avg;
	struct util_est util_est;
} ____cacheline_aligned;

struct sched_statistics {
#ifdef CONFIG_SCHEDSTATS
	u64 wait_start;
	u64 wait_max;
	u64 wait_count;
	u64 wait_sum;
	u64 iowait_count;
	u64 iowait_sum;

	u64 sleep_start;
	u64 sleep_max;
	s64 sum_sleep_runtime;

	u64 block_start;
	u64 block_max;
	s64 sum_block_runtime;

	u64 exec_max;
	u64 slice_max;

	u64 nr_migrations_cold;
	u64 nr_failed_migrations_affine;
	u64 nr_failed_migrations_running;
	u64 nr_failed_migrations_hot;
	u64 nr_forced_migrations;

	u64 nr_wakeups;
	u64 nr_wakeups_sync;
	u64 nr_wakeups_migrate;
	u64 nr_wakeups_local;
	u64 nr_wakeups_remote;
	u64 nr_wakeups_affine;
	u64 nr_wakeups_affine_attempts;
	u64 nr_wakeups_passive;
	u64 nr_wakeups_idle;

#ifdef CONFIG_SCHED_CORE
	u64 core_forceidle_sum;
#endif
#endif /* CONFIG_SCHEDSTATS */
} ____cacheline_aligned;

struct sched_entity {
	/* For load-balancing: */
	struct load_weight load;
	struct rb_node run_node;
	struct list_head group_node;
	unsigned int on_rq;

	u64 exec_start;
	u64 sum_exec_runtime;
	u64 vruntime;
	u64 prev_sum_exec_runtime;

	u64 nr_migrations;

#ifdef CONFIG_FAIR_GROUP_SCHED
	int depth;
	struct sched_entity *parent;
	/* rq on which this entity is (to be) queued: */
	struct cfs_rq *cfs_rq;
	/* rq "owned" by this entity/group: */
	struct cfs_rq *my_q;
	/* cached value of my_q->h_nr_running */
	unsigned long runnable_weight;
#endif

#ifdef CONFIG_SMP
	/*
	 * Per entity load average tracking.
	 *
	 * Put into separate cache line so it does not
	 * collide with read-mostly values above.
	 */
	struct sched_avg avg;
#endif
};

struct sched_rt_entity {
	struct list_head run_list;
	unsigned long timeout;
	unsigned long watchdog_stamp;
	unsigned int time_slice;
	unsigned short on_rq;
	unsigned short on_list;

	struct sched_rt_entity *back;
#ifdef CONFIG_RT_GROUP_SCHED
	struct sched_rt_entity *parent;
	/* rq on which this entity is (to be) queued: */
	struct rt_rq *rt_rq;
	/* rq "owned" by this entity/group: */
	struct rt_rq *my_q;
#endif
} __randomize_layout;

struct sched_dl_entity {
	struct rb_node rb_node;

	/*
	 * Original scheduling parameters. Copied here from sched_attr
	 * during sched_setattr(), they will remain the same until
	 * the next sched_setattr().
	 */
	u64 dl_runtime;		/* Maximum runtime for each instance */
	u64 dl_deadline;	/* Relative deadline of each instance */
	u64 dl_period;		/* Separation of two instances (period) */
	u64 dl_bw;		/* dl_runtime / dl_period */
	u64 dl_density;		/* dl_runtime / dl_deadline */

	/*
	 * Actual scheduling parameters. Initialized with the values above,
	 * they are continuously updated during task execution. Note that
	 * the remaining runtime could be < 0 in case we are in overrun.
	 */
	s64 runtime;		/* Remaining runtime for this instance */
	u64 deadline;		/* Absolute deadline for this instance */
	unsigned int flags;	/* Specifying the scheduler behaviour */

	/*
	 * Some bool flags:
	 *
	 * @dl_throttled tells if we exhausted the runtime. If so, the
	 * task has to wait for a replenishment to be performed at the
	 * next firing of dl_timer.
	 *
	 * @dl_yielded tells if task gave up the CPU before consuming
	 * all its available runtime during the last job.
	 *
	 * @dl_non_contending tells if the task is inactive while still
	 * contributing to the active utilization. In other words, it
	 * indicates if the inactive timer has been armed and its handler
	 * has not been executed yet. This flag is useful to avoid race
	 * conditions between the inactive timer handler and the wakeup
	 * code.
	 *
	 * @dl_overrun tells if the task asked to be informed about runtime
	 * overruns.
	 */
	unsigned int dl_throttled : 1;
	unsigned int dl_yielded : 1;
	unsigned int dl_non_contending : 1;
	unsigned int dl_overrun : 1;

	/*
	 * Bandwidth enforcement timer. Each -deadline task has its
	 * own bandwidth to be enforced, thus we need one timer per task.
	 */
	struct hrtimer dl_timer;

	/*
	 * Inactive timer, responsible for decreasing the active utilization
	 * at the "0-lag time". When a -deadline task blocks, it contributes
	 * to GRUB's active utilization until the "0-lag time", hence a
	 * timer is needed to decrease the active utilization at the correct
	 * time.
	 */
	struct hrtimer inactive_timer;

#ifdef CONFIG_RT_MUTEXES
	/*
	 * Priority Inheritance. When a DEADLINE scheduling entity is boosted
	 * pi_se points to the donor, otherwise points to the dl_se it belongs
	 * to (the original one/itself).
	 */
	struct sched_dl_entity *pi_se;
#endif
};
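
/*
 * Worked example (editor's note, from the field comments above): a task
 * admitted with dl_runtime = 10ms and dl_period = 100ms reserves
 * dl_bw = dl_runtime / dl_period = 10% of a CPU; with dl_deadline = 50ms
 * its dl_density = dl_runtime / dl_deadline is 20%.
 */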

#ifdef CONFIG_UCLAMP_TASK
/* Number of utilization clamp buckets (shorter alias) */
#define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT

/*
 * Utilization clamp for a scheduling entity
 * @value: clamp value "assigned" to a se
 * @bucket_id: bucket index corresponding to the "assigned" value
 * @active: the se is currently refcounted in a rq's bucket
 * @user_defined: the requested clamp value comes from user-space
 *
 * The bucket_id is the index of the clamp bucket matching the clamp value
 * which is pre-computed and stored to avoid expensive integer divisions from
 * the fast path.
 *
 * The active bit is set whenever a task has got an "effective" value assigned,
 * which can be different from the clamp value "requested" from user-space.
 * This allows us to know that a task is refcounted in the rq's bucket
 * corresponding to the "effective" bucket_id.
 *
 * The user_defined bit is set whenever a task has got a task-specific clamp
 * value requested from userspace, i.e. the system defaults apply to this task
 * just as a restriction. This allows us to relax default clamps when a less
 * restrictive task-specific value has been requested, thus allowing us to
 * implement a "nice" semantic. For example, a task running with a 20%
 * default boost can still drop its own boosting to 0%.
 */
struct uclamp_se {
	unsigned int value : bits_per(SCHED_CAPACITY_SCALE);
	unsigned int bucket_id : bits_per(UCLAMP_BUCKETS);
	unsigned int active : 1;
	unsigned int user_defined : 1;
};
#endif /* CONFIG_UCLAMP_TASK */

union rcu_special {
	struct {
		u8 blocked;
		u8 need_qs;
		u8 exp_hint; /* Hint for performance. */
		u8 need_mb; /* Readers need smp_mb(). */
	} b; /* Bits. */
	u32 s; /* Set of bits. */
};

enum perf_event_task_context {
	perf_invalid_context = -1,
	perf_hw_context = 0,
	perf_sw_context,
	perf_nr_task_contexts,
};

struct wake_q_node {
	struct wake_q_node *next;
};

struct kmap_ctrl {
#ifdef CONFIG_KMAP_LOCAL
	int idx;
	pte_t pteval[KM_MAX_IDX];
#endif
};

struct task_struct {
#ifdef CONFIG_THREAD_INFO_IN_TASK
	/*
	 * For reasons of header soup (see current_thread_info()), this
	 * must be the first element of task_struct.
	 */
	struct thread_info thread_info;
#endif
	unsigned int __state;

#ifdef CONFIG_PREEMPT_RT
	/* saved state for "spinlock sleepers" */
	unsigned int saved_state;
#endif

	/*
	 * This begins the randomizable portion of task_struct. Only
	 * scheduling-critical items should be added above here.
	 */
	randomized_struct_fields_start

	void *stack;
	refcount_t usage;
	/* Per task flags (PF_*), defined further below: */
	unsigned int flags;
	unsigned int ptrace;

#ifdef CONFIG_SMP
	int on_cpu;
	struct __call_single_node wake_entry;
	unsigned int wakee_flips;
	unsigned long wakee_flip_decay_ts;
	struct task_struct *last_wakee;

	/*
	 * recent_used_cpu is initially set as the last CPU used by a task
	 * that wakes affine another task. Waker/wakee relationships can
	 * push tasks around a CPU where each wakeup moves to the next one.
	 * Tracking a recently used CPU allows a quick search for a recently
	 * used CPU that may be idle.
	 */
	int recent_used_cpu;
	int wake_cpu;
#endif
	int on_rq;

	int prio;
	int static_prio;
	int normal_prio;
	unsigned int rt_priority;

	struct sched_entity se;
	struct sched_rt_entity rt;
	struct sched_dl_entity dl;
	const struct sched_class *sched_class;

#ifdef CONFIG_SCHED_CORE
	struct rb_node core_node;
	unsigned long core_cookie;
	unsigned int core_occupation;
#endif

#ifdef CONFIG_CGROUP_SCHED
	struct task_group *sched_task_group;
#endif

#ifdef CONFIG_UCLAMP_TASK
	/*
	 * Clamp values requested for a scheduling entity.
	 * Must be updated with task_rq_lock() held.
	 */
	struct uclamp_se uclamp_req[UCLAMP_CNT];
	/*
	 * Effective clamp values used for a scheduling entity.
	 * Must be updated with task_rq_lock() held.
	 */
	struct uclamp_se uclamp[UCLAMP_CNT];
#endif

	struct sched_statistics stats;

#ifdef CONFIG_PREEMPT_NOTIFIERS
	/* List of struct preempt_notifier: */
	struct hlist_head preempt_notifiers;
#endif

#ifdef CONFIG_BLK_DEV_IO_TRACE
	unsigned int btrace_seq;
#endif

	unsigned int policy;
	int nr_cpus_allowed;
	const cpumask_t *cpus_ptr;
	cpumask_t *user_cpus_ptr;
	cpumask_t cpus_mask;
	void *migration_pending;
#ifdef CONFIG_SMP
	unsigned short migration_disabled;
#endif
	unsigned short migration_flags;

#ifdef CONFIG_PREEMPT_RCU
	int rcu_read_lock_nesting;
	union rcu_special rcu_read_unlock_special;
	struct list_head rcu_node_entry;
	struct rcu_node *rcu_blocked_node;
#endif /* #ifdef CONFIG_PREEMPT_RCU */

#ifdef CONFIG_TASKS_RCU
	unsigned long rcu_tasks_nvcsw;
	u8 rcu_tasks_holdout;
	u8 rcu_tasks_idx;
	int rcu_tasks_idle_cpu;
	struct list_head rcu_tasks_holdout_list;
#endif /* #ifdef CONFIG_TASKS_RCU */

#ifdef CONFIG_TASKS_TRACE_RCU
	int trc_reader_nesting;
	int trc_ipi_to_cpu;
	union rcu_special trc_reader_special;
	bool trc_reader_checked;
	struct list_head trc_holdout_list;
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */

	struct sched_info sched_info;

	struct list_head tasks;
#ifdef CONFIG_SMP
	struct plist_node pushable_tasks;
	struct rb_node pushable_dl_tasks;
#endif

	struct mm_struct *mm;
	struct mm_struct *active_mm;

	/* Per-thread vma caching: */
	struct vmacache vmacache;

#ifdef SPLIT_RSS_COUNTING
	struct task_rss_stat rss_stat;
#endif
	int exit_state;
	int exit_code;
	int exit_signal;
	/* The signal sent when the parent dies: */
	int pdeath_signal;
	/* JOBCTL_*, siglock protected: */
	unsigned long jobctl;

	/* Used for emulating ABI behavior of previous Linux versions: */
	unsigned int personality;

	/* Scheduler bits, serialized by scheduler locks: */
	unsigned sched_reset_on_fork:1;
	unsigned sched_contributes_to_load:1;
	unsigned sched_migrated:1;
#ifdef CONFIG_PSI
	unsigned sched_psi_wake_requeue:1;
#endif

	/* Force alignment to the next boundary: */
	unsigned :0;

	/* Unserialized, strictly 'current' */

	/*
	 * This field must not be in the scheduler word above due to wakelist
	 * queueing no longer being serialized by p->on_cpu. However:
	 *
	 * p->XXX = X;			ttwu()
	 * schedule()			  if (p->on_rq && ..) // false
	 *   smp_mb__after_spinlock();	  if (smp_load_acquire(&p->on_cpu) && //true
	 *   deactivate_task()		      ttwu_queue_wakelist())
	 *     p->on_rq = 0;			p->sched_remote_wakeup = Y;
	 *
	 * guarantees all stores of 'current' are visible before
	 * ->sched_remote_wakeup gets used, so it can be in this word.
	 */
	unsigned sched_remote_wakeup:1;

	/* Bit to tell LSMs we're in execve(): */
	unsigned in_execve:1;
	unsigned in_iowait:1;
#ifndef TIF_RESTORE_SIGMASK
	unsigned restore_sigmask:1;
#endif
#ifdef CONFIG_MEMCG
	unsigned in_user_fault:1;
#endif
#ifdef CONFIG_COMPAT_BRK
	unsigned brk_randomized:1;
#endif
#ifdef CONFIG_CGROUPS
	/* disallow userland-initiated cgroup migration */
	unsigned no_cgroup_migration:1;
	/* task is frozen/stopped (used by the cgroup freezer) */
	unsigned frozen:1;
#endif
#ifdef CONFIG_BLK_CGROUP
	unsigned use_memdelay:1;
#endif
#ifdef CONFIG_PSI
	/* Stalled due to lack of memory */
	unsigned in_memstall:1;
#endif
#ifdef CONFIG_PAGE_OWNER
	/* Used by page_owner=on to detect recursion in page tracking. */
	unsigned in_page_owner:1;
#endif
#ifdef CONFIG_EVENTFD
	/* Recursion prevention for eventfd_signal() */
	unsigned in_eventfd_signal:1;
#endif
#ifdef CONFIG_IOMMU_SVA
	unsigned pasid_activated:1;
#endif
#ifdef CONFIG_CPU_SUP_INTEL
	unsigned reported_split_lock:1;
#endif

	unsigned long atomic_flags; /* Flags requiring atomic access. */

	struct restart_block restart_block;

	pid_t pid;
	pid_t tgid;

#ifdef CONFIG_STACKPROTECTOR
	/* Canary value for the -fstack-protector GCC feature: */
	unsigned long stack_canary;
#endif
	/*
	 * Pointers to the (original) parent process, youngest child, younger sibling,
	 * older sibling, respectively. (p->father can be replaced with
	 * p->real_parent->pid)
	 */

	/* Real parent process: */
	struct task_struct __rcu *real_parent;

	/* Recipient of SIGCHLD, wait4() reports: */
	struct task_struct __rcu *parent;

	/*
	 * Children/sibling form the list of natural children:
	 */
	struct list_head children;
	struct list_head sibling;
	struct task_struct *group_leader;

	/*
	 * 'ptraced' is the list of tasks this task is using ptrace() on.
	 *
	 * This includes both natural children and PTRACE_ATTACH targets.
	 * 'ptrace_entry' is this task's link on the p->parent->ptraced list.
	 */
	struct list_head ptraced;
	struct list_head ptrace_entry;

	/* PID/PID hash table linkage. */
	struct pid *thread_pid;
	struct hlist_node pid_links[PIDTYPE_MAX];
	struct list_head thread_group;
	struct list_head thread_node;

	struct completion *vfork_done;

	/* CLONE_CHILD_SETTID: */
	int __user *set_child_tid;

	/* CLONE_CHILD_CLEARTID: */
	int __user *clear_child_tid;

	/* PF_KTHREAD | PF_IO_WORKER */
	void *worker_private;

	u64 utime;
	u64 stime;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
	u64 utimescaled;
	u64 stimescaled;
#endif
	u64 gtime;
	struct prev_cputime prev_cputime;
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
	struct vtime vtime;
#endif

#ifdef CONFIG_NO_HZ_FULL
	atomic_t tick_dep_mask;
#endif
	/* Context switch counts: */
	unsigned long nvcsw;
	unsigned long nivcsw;

	/* Monotonic time in nsecs: */
	u64 start_time;

	/* Boot based time in nsecs: */
	u64 start_boottime;

	/* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */
	unsigned long min_flt;
	unsigned long maj_flt;

	/* Empty if CONFIG_POSIX_CPUTIMERS=n */
	struct posix_cputimers posix_cputimers;

#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
	struct posix_cputimers_work posix_cputimers_work;
#endif

	/* Process credentials: */

	/* Tracer's credentials at attach: */
	const struct cred __rcu *ptracer_cred;

	/* Objective and real subjective task credentials (COW): */
	const struct cred __rcu *real_cred;

	/* Effective (overridable) subjective task credentials (COW): */
	const struct cred __rcu *cred;

#ifdef CONFIG_KEYS
	/* Cached requested key. */
	struct key *cached_requested_key;
#endif

	/*
	 * executable name, excluding path.
	 *
	 * - normally initialized by setup_new_exec()
	 * - access it with [gs]et_task_comm()
	 * - lock it with task_lock()
	 */
	char comm[TASK_COMM_LEN];

	struct nameidata *nameidata;

#ifdef CONFIG_SYSVIPC
	struct sysv_sem sysvsem;
	struct sysv_shm sysvshm;
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
	unsigned long last_switch_count;
	unsigned long last_switch_time;
#endif
	/* Filesystem information: */
	struct fs_struct *fs;

	/* Open file information: */
	struct files_struct *files;

#ifdef CONFIG_IO_URING
	struct io_uring_task *io_uring;
#endif

	/* Namespaces: */
	struct nsproxy *nsproxy;

	/* Signal handlers: */
	struct signal_struct *signal;
	struct sighand_struct __rcu *sighand;
	sigset_t blocked;
	sigset_t real_blocked;
	/* Restored if set_restore_sigmask() was used: */
	sigset_t saved_sigmask;
	struct sigpending pending;
	unsigned long sas_ss_sp;
	size_t sas_ss_size;
	unsigned int sas_ss_flags;

	struct callback_head *task_works;

#ifdef CONFIG_AUDIT
#ifdef CONFIG_AUDITSYSCALL
	struct audit_context *audit_context;
#endif
	kuid_t loginuid;
	unsigned int sessionid;
#endif
	struct seccomp seccomp;
	struct syscall_user_dispatch syscall_dispatch;

	/* Thread group tracking: */
	u64 parent_exec_id;
	u64 self_exec_id;

	/* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */
	spinlock_t alloc_lock;

	/* Protection of the PI data structures: */
	raw_spinlock_t pi_lock;

	struct wake_q_node wake_q;

#ifdef CONFIG_RT_MUTEXES
	/* PI waiters blocked on a rt_mutex held by this task: */
	struct rb_root_cached pi_waiters;
	/* Updated under owner's pi_lock and rq lock */
	struct task_struct *pi_top_task;
	/* Deadlock detection and priority inheritance handling: */
	struct rt_mutex_waiter *pi_blocked_on;
#endif

#ifdef CONFIG_DEBUG_MUTEXES
	/* Mutex deadlock detection: */
	struct mutex_waiter *blocked_on;
#endif

#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
	int non_block_count;
#endif

#ifdef CONFIG_TRACE_IRQFLAGS
	struct irqtrace_events irqtrace;
	unsigned int hardirq_threaded;
	u64 hardirq_chain_key;
	int softirqs_enabled;
	int softirq_context;
	int irq_config;
#endif
#ifdef CONFIG_PREEMPT_RT
	int softirq_disable_cnt;
#endif

#ifdef CONFIG_LOCKDEP
# define MAX_LOCK_DEPTH 48UL
	u64 curr_chain_key;
	int lockdep_depth;
	unsigned int lockdep_recursion;
	struct held_lock held_locks[MAX_LOCK_DEPTH];
#endif

#if defined(CONFIG_UBSAN) && !defined(CONFIG_UBSAN_TRAP)
	unsigned int in_ubsan;
#endif

	/* Journalling filesystem info: */
	void *journal_info;

	/* Stacked block device info: */
	struct bio_list *bio_list;

	/* Stack plugging: */
	struct blk_plug *plug;

	/* VM state: */
	struct reclaim_state *reclaim_state;

	struct backing_dev_info *backing_dev_info;

	struct io_context *io_context;

#ifdef CONFIG_COMPACTION
	struct capture_control *capture_control;
#endif
	/* Ptrace state: */
	unsigned long ptrace_message;
	kernel_siginfo_t *last_siginfo;

	struct task_io_accounting ioac;
#ifdef CONFIG_PSI
	/* Pressure stall state */
	unsigned int psi_flags;
#endif
#ifdef CONFIG_TASK_XACCT
	/* Accumulated RSS usage: */
	u64 acct_rss_mem1;
	/* Accumulated virtual memory usage: */
	u64 acct_vm_mem1;
	/* stime + utime since last update: */
	u64 acct_timexpd;
#endif
#ifdef CONFIG_CPUSETS
	/* Protected by ->alloc_lock: */
	nodemask_t mems_allowed;
	/* Sequence number to catch updates: */
	seqcount_spinlock_t mems_allowed_seq;
	int cpuset_mem_spread_rotor;
	int cpuset_slab_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
	/* Control Group info protected by css_set_lock: */
	struct css_set __rcu *cgroups;
	/* cg_list protected by css_set_lock and tsk->alloc_lock: */
	struct list_head cg_list;
#endif
#ifdef CONFIG_X86_CPU_RESCTRL
	u32 closid;
	u32 rmid;
#endif
#ifdef CONFIG_FUTEX
	struct robust_list_head __user *robust_list;
#ifdef CONFIG_COMPAT
	struct compat_robust_list_head __user *compat_robust_list;
#endif
	struct list_head pi_state_list;
	struct futex_pi_state *pi_state_cache;
	struct mutex futex_exit_mutex;
	unsigned int futex_state;
#endif
#ifdef CONFIG_PERF_EVENTS
	struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
	struct mutex perf_event_mutex;
	struct list_head perf_event_list;
#endif
#ifdef CONFIG_DEBUG_PREEMPT
	unsigned long preempt_disable_ip;
#endif
#ifdef CONFIG_NUMA
	/* Protected by alloc_lock: */
	struct mempolicy *mempolicy;
	short il_prev;
	short pref_node_fork;
#endif
#ifdef CONFIG_NUMA_BALANCING
	int numa_scan_seq;
	unsigned int numa_scan_period;
	unsigned int numa_scan_period_max;
	int numa_preferred_nid;
	unsigned long numa_migrate_retry;
	/* Migration stamp: */
	u64 node_stamp;
	u64 last_task_numa_placement;
	u64 last_sum_exec_runtime;
	struct callback_head numa_work;

	/*
	 * This pointer is only modified for current in syscall and
	 * pagefault context (and for tasks being destroyed), so it can be read
	 * from any of the following contexts:
	 *  - RCU read-side critical section
	 *  - current->numa_group from everywhere
	 *  - task's runqueue locked, task not running
	 */
	struct numa_group __rcu *numa_group;

	/*
	 * numa_faults is an array split into four regions:
	 * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer
	 * in this precise order.
	 *
	 * faults_memory: Exponential decaying average of faults on a per-node
	 * basis. Scheduling placement decisions are made based on these
	 * counts. The values remain static for the duration of a PTE scan.
	 * faults_cpu: Track the nodes the process was running on when a NUMA
	 * hinting fault was incurred.
	 * faults_memory_buffer and faults_cpu_buffer: Record faults per node
	 * during the current scan window. When the scan completes, the counts
	 * in faults_memory and faults_cpu decay and these values are copied.
	 */
	unsigned long *numa_faults;
	unsigned long total_numa_faults;

	/*
	 * numa_faults_locality tracks if faults recorded during the last
	 * scan window were remote/local or failed to migrate. The task scan
	 * period is adapted based on the locality of the faults with different
	 * weights depending on whether they were shared or private faults.
	 */
	unsigned long numa_faults_locality[3];

	unsigned long numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */

#ifdef CONFIG_RSEQ
	struct rseq __user *rseq;
	u32 rseq_sig;
	/*
	 * RmW on rseq_event_mask must be performed atomically
	 * with respect to preemption.
	 */
	unsigned long rseq_event_mask;
#endif

	struct tlbflush_unmap_batch tlb_ubc;

	union {
		refcount_t rcu_users;
		struct rcu_head rcu;
	};

	/* Cache last used pipe for splice(): */
	struct pipe_inode_info *splice_pipe;

	struct page_frag task_frag;

#ifdef CONFIG_TASK_DELAY_ACCT
	struct task_delay_info *delays;
#endif

#ifdef CONFIG_FAULT_INJECTION
	int make_it_fail;
	unsigned int fail_nth;
#endif
	/*
	 * When (nr_dirtied >= nr_dirtied_pause), it's time to call
	 * balance_dirty_pages() for a dirty throttling pause:
	 */
	int nr_dirtied;
	int nr_dirtied_pause;
	/* Start of a write-and-pause period: */
	unsigned long dirty_paused_when;

#ifdef CONFIG_LATENCYTOP
	int latency_record_count;
	struct latency_record latency_record[LT_SAVECOUNT];
#endif
	/*
	 * Time slack values; these are used to round up poll() and
	 * select() etc timeout values. These are in nanoseconds.
	 */
	u64 timer_slack_ns;
	u64 default_timer_slack_ns;

#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
	unsigned int kasan_depth;
#endif

#ifdef CONFIG_KCSAN
	struct kcsan_ctx kcsan_ctx;
#ifdef CONFIG_TRACE_IRQFLAGS
	struct irqtrace_events kcsan_save_irqtrace;
#endif
#ifdef CONFIG_KCSAN_WEAK_MEMORY
	int kcsan_stack_depth;
#endif
#endif

#if IS_ENABLED(CONFIG_KUNIT)
	struct kunit *kunit_test;
#endif

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
	/* Index of current stored address in ret_stack: */
	int curr_ret_stack;
	int curr_ret_depth;

	/* Stack of return addresses for return function tracing: */
	struct ftrace_ret_stack *ret_stack;

	/* Timestamp for last schedule: */
	unsigned long long ftrace_timestamp;

	/*
	 * Number of functions that haven't been traced
	 * because of depth overrun:
	 */
	atomic_t trace_overrun;

	/* Pause tracing: */
	atomic_t tracing_graph_pause;
#endif

#ifdef CONFIG_TRACING
	/* State flags for use by tracers: */
	unsigned long trace;

	/* Bitmask and counter of trace recursion: */
	unsigned long trace_recursion;
#endif /* CONFIG_TRACING */

#ifdef CONFIG_KCOV
	/* See kernel/kcov.c for more details. */

	/* Coverage collection mode enabled for this task (0 if disabled): */
	unsigned int kcov_mode;

	/* Size of the kcov_area: */
	unsigned int kcov_size;

	/* Buffer for coverage collection: */
	void *kcov_area;

	/* KCOV descriptor wired with this task or NULL: */
	struct kcov *kcov;

	/* KCOV common handle for remote coverage collection: */
	u64 kcov_handle;

	/* KCOV sequence number: */
	int kcov_sequence;

	/* Collect coverage from softirq context: */
	unsigned int kcov_softirq;
#endif

#ifdef CONFIG_MEMCG
	struct mem_cgroup *memcg_in_oom;
	gfp_t memcg_oom_gfp_mask;
	int memcg_oom_order;

	/* Number of pages to reclaim on returning to userland: */
	unsigned int memcg_nr_pages_over_high;

	/* Used by memcontrol for targeted memcg charge: */
	struct mem_cgroup *active_memcg;
#endif

#ifdef CONFIG_BLK_CGROUP
	struct request_queue *throttle_queue;
#endif

#ifdef CONFIG_UPROBES
	struct uprobe_task *utask;
#endif
#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
	unsigned int sequential_io;
	unsigned int sequential_io_avg;
#endif
	struct kmap_ctrl kmap_ctrl;
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
	unsigned long task_state_change;
# ifdef CONFIG_PREEMPT_RT
	unsigned long saved_state_change;
# endif
#endif
	int pagefault_disabled;
#ifdef CONFIG_MMU
	struct task_struct *oom_reaper_list;
	struct timer_list oom_reaper_timer;
#endif
#ifdef CONFIG_VMAP_STACK
	struct vm_struct *stack_vm_area;
#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
	/* A live task holds one reference: */
	refcount_t stack_refcount;
#endif
#ifdef CONFIG_LIVEPATCH
	int patch_state;
#endif
#ifdef CONFIG_SECURITY
	/* Used by LSM modules for access restriction: */
	void *security;
#endif
#ifdef CONFIG_BPF_SYSCALL
	/* Used by BPF task local storage */
	struct bpf_local_storage __rcu *bpf_storage;
	/* Used for BPF run context */
	struct bpf_run_ctx *bpf_ctx;
#endif

#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
	unsigned long lowest_stack;
	unsigned long prev_lowest_stack;
#endif

#ifdef CONFIG_X86_MCE
	void __user *mce_vaddr;
	__u64 mce_kflags;
	u64 mce_addr;
	__u64 mce_ripv : 1,
	      mce_whole_page : 1,
	      __mce_reserved : 62;
	struct callback_head mce_kill_me;
	int mce_count;
#endif

#ifdef CONFIG_KRETPROBES
	struct llist_head kretprobe_instances;
#endif
#ifdef CONFIG_RETHOOK
	struct llist_head rethooks;
#endif

#ifdef CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH
	/*
	 * If L1D flush is supported on mm context switch
	 * then we use this callback head to queue kill work
	 * to kill tasks that are not running on SMT disabled
	 * cores
	 */
	struct callback_head l1d_flush_kill;
#endif

	/*
	 * New fields for task_struct should be added above here, so that
	 * they are included in the randomized portion of task_struct.
	 */
	randomized_struct_fields_end

	/* CPU-specific state of this task: */
	struct thread_struct thread;

	/*
	 * WARNING: on x86, 'thread_struct' contains a variable-sized
	 * structure. It *MUST* be at the end of 'task_struct'.
	 *
	 * Do not put anything below here!
	 */
};

static inline struct pid *task_pid(struct task_struct *task)
{
	return task->thread_pid;
}

/*
 * the helpers to get the task's different pids as they are seen
 * from various namespaces
 *
 * task_xid_nr()    : global id, i.e. the id seen from the init namespace;
 * task_xid_vnr()   : virtual id, i.e. the id seen from the pid namespace of
 *                    current.
 * task_xid_nr_ns() : id seen from the ns specified;
 *
 * see also pid_nr() etc in include/linux/pid.h
 */
pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns);

static inline pid_t task_pid_nr(struct task_struct *tsk)
{
	return tsk->pid;
}

static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
	return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);
}

static inline pid_t task_pid_vnr(struct task_struct *tsk)
{
	return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);
}

static inline pid_t task_tgid_nr(struct task_struct *tsk)
{
	return tsk->tgid;
}

/**
 * pid_alive - check that a task structure is not stale
 * @p: Task structure to be checked.
 *
 * Test if a process is not yet dead (at most zombie state).
 * If pid_alive fails, then pointers within the task structure
 * can be stale and must not be dereferenced.
 *
 * Return: 1 if the process is alive. 0 otherwise.
 */
static inline int pid_alive(const struct task_struct *p)
{
	return p->thread_pid != NULL;
}

static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
	return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
}

static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
{
	return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);
}

static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
	return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
}

static inline pid_t task_session_vnr(struct task_struct *tsk)
{
	return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
}

static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
	return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns);
}

static inline pid_t task_tgid_vnr(struct task_struct *tsk)
{
	return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL);
}

static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns)
{
	pid_t pid = 0;

	rcu_read_lock();
	if (pid_alive(tsk))
		pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns);
	rcu_read_unlock();

	return pid;
}

static inline pid_t task_ppid_nr(const struct task_struct *tsk)
{
	return task_ppid_nr_ns(tsk, &init_pid_ns);
}

/* Obsolete, do not use: */
static inline pid_t task_pgrp_nr(struct task_struct *tsk)
{
	return task_pgrp_nr_ns(tsk, &init_pid_ns);
}

#define TASK_REPORT_IDLE (TASK_REPORT + 1)
#define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1)

static inline unsigned int __task_state_index(unsigned int tsk_state,
					      unsigned int tsk_exit_state)
{
	unsigned int state = (tsk_state | tsk_exit_state) & TASK_REPORT;

	BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX);

	if (tsk_state == TASK_IDLE)
		state = TASK_REPORT_IDLE;

	/*
	 * We're lying here, but rather than expose a completely new task state
	 * to userspace, we can make this appear as if the task has gone through
	 * a regular rt_mutex_lock() call.
	 */
	if (tsk_state == TASK_RTLOCK_WAIT)
		state = TASK_UNINTERRUPTIBLE;

	return fls(state);
}

static inline unsigned int task_state_index(struct task_struct *tsk)
{
	return __task_state_index(READ_ONCE(tsk->__state), tsk->exit_state);
}

static inline char task_index_to_char(unsigned int state)
{
	static const char state_char[] = "RSDTtXZPI";

	BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != sizeof(state_char) - 1);

	return state_char[state];
}

static inline char task_state_to_char(struct task_struct *tsk)
{
	return task_index_to_char(task_state_index(tsk));
}
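
/*
 * Editor's note: the "RSDTtXZPI" table above yields the single-letter
 * state shown in /proc/<pid>/stat and by ps(1), e.g.:
 *
 *	task_state_to_char(p);	// 'R' running, 'S' sleeping, 'D' disk sleep,
 *				// 'T' stopped, 't' tracing stop, 'X' dead,
 *				// 'Z' zombie, 'P' parked, 'I' idle
 */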

/**
 * is_global_init - check if a task structure is init. Since init
 * is free to have sub-threads we need to check tgid.
 * @tsk: Task structure to be checked.
 *
 * Check if a task structure is the first user space task the kernel created.
 *
 * Return: 1 if the task structure is init. 0 otherwise.
 */
static inline int is_global_init(struct task_struct *tsk)
{
	return task_tgid_nr(tsk) == 1;
}

extern struct pid *cad_pid;

/*
 * Per process flags
 */
#define PF_VCPU 0x00000001 /* I'm a virtual CPU */
#define PF_IDLE 0x00000002 /* I am an IDLE thread */
#define PF_EXITING 0x00000004 /* Getting shut down */
#define PF_POSTCOREDUMP 0x00000008 /* Coredumps should ignore this task */
#define PF_IO_WORKER 0x00000010 /* Task is an IO worker */
#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */
#define PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */
#define PF_MCE_PROCESS 0x00000080 /* Process policy on mce errors */
#define PF_SUPERPRIV 0x00000100 /* Used super-user privileges */
#define PF_DUMPCORE 0x00000200 /* Dumped core */
#define PF_SIGNALED 0x00000400 /* Killed by a signal */
#define PF_MEMALLOC 0x00000800 /* Allocating memory */
#define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */
#define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */
#define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */
#define PF_FROZEN 0x00010000 /* Frozen for system suspend */
#define PF_KSWAPD 0x00020000 /* I am kswapd */
#define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */
#define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */
#define PF_LOCAL_THROTTLE 0x00100000 /* Throttle writes only against the bdi I write to,
				      * I am cleaning dirty pages from some other bdi. */
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
#define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
#define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */
#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
#define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */

/*
 * Only the _current_ task can read/write to tsk->flags, but other
 * tasks can access tsk->flags in readonly mode for example
 * with tsk_used_math (like during threaded core dumping).
 * There is however an exception to this rule during ptrace
 * or during fork: the ptracer task is allowed to write to the
 * child->flags of its traced child (same goes for fork, the parent
 * can write to the child->flags), because we're guaranteed the
 * child is not running and in turn not changing child->flags
 * at the same time the parent does it.
 */
#define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0)
#define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0)
#define clear_used_math() clear_stopped_child_used_math(current)
#define set_used_math() set_stopped_child_used_math(current)

#define conditional_stopped_child_used_math(condition, child) \
	do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0)

#define conditional_used_math(condition) conditional_stopped_child_used_math(condition, current)

#define copy_to_stopped_child_used_math(child) \
	do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0)

/* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */
#define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
#define used_math() tsk_used_math(current)

static __always_inline bool is_percpu_thread(void)
{
#ifdef CONFIG_SMP
	return (current->flags & PF_NO_SETAFFINITY) &&
	       (current->nr_cpus_allowed == 1);
#else
	return true;
#endif
}

/* Per-process atomic flags. */
#define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. */
#define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */
#define PFA_SPREAD_SLAB 2 /* Spread some slab caches over cpuset */
#define PFA_SPEC_SSB_DISABLE 3 /* Speculative Store Bypass disabled */
#define PFA_SPEC_SSB_FORCE_DISABLE 4 /* Speculative Store Bypass force disabled */
#define PFA_SPEC_IB_DISABLE 5 /* Indirect branch speculation restricted */
#define PFA_SPEC_IB_FORCE_DISABLE 6 /* Indirect branch speculation permanently restricted */
#define PFA_SPEC_SSB_NOEXEC 7 /* Speculative Store Bypass clear on execve() */

#define TASK_PFA_TEST(name, func) \
	static inline bool task_##func(struct task_struct *p) \
	{ return test_bit(PFA_##name, &p->atomic_flags); }

#define TASK_PFA_SET(name, func) \
	static inline void task_set_##func(struct task_struct *p) \
	{ set_bit(PFA_##name, &p->atomic_flags); }

#define TASK_PFA_CLEAR(name, func) \
	static inline void task_clear_##func(struct task_struct *p) \
	{ clear_bit(PFA_##name, &p->atomic_flags); }

TASK_PFA_TEST(NO_NEW_PRIVS, no_new_privs)
TASK_PFA_SET(NO_NEW_PRIVS, no_new_privs)

TASK_PFA_TEST(SPREAD_PAGE, spread_page)
TASK_PFA_SET(SPREAD_PAGE, spread_page)
TASK_PFA_CLEAR(SPREAD_PAGE, spread_page)

TASK_PFA_TEST(SPREAD_SLAB, spread_slab)
TASK_PFA_SET(SPREAD_SLAB, spread_slab)
TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab)

TASK_PFA_TEST(SPEC_SSB_DISABLE, spec_ssb_disable)
TASK_PFA_SET(SPEC_SSB_DISABLE, spec_ssb_disable)
TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable)

TASK_PFA_TEST(SPEC_SSB_NOEXEC, spec_ssb_noexec)
TASK_PFA_SET(SPEC_SSB_NOEXEC, spec_ssb_noexec)
TASK_PFA_CLEAR(SPEC_SSB_NOEXEC, spec_ssb_noexec)

TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable)
TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable)

TASK_PFA_TEST(SPEC_IB_DISABLE, spec_ib_disable)
TASK_PFA_SET(SPEC_IB_DISABLE, spec_ib_disable)
TASK_PFA_CLEAR(SPEC_IB_DISABLE, spec_ib_disable)

TASK_PFA_TEST(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable)
TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable)
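
/*
 * Editor's sketch: each TASK_PFA_*() instantiation above expands to a tiny
 * inline helper; TASK_PFA_TEST(NO_NEW_PRIVS, no_new_privs), for example,
 * produces:
 *
 *	static inline bool task_no_new_privs(struct task_struct *p)
 *	{ return test_bit(PFA_NO_NEW_PRIVS, &p->atomic_flags); }
 */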

static inline void
current_restore_flags(unsigned long orig_flags, unsigned long flags)
{
	current->flags &= ~flags;
	current->flags |= orig_flags & flags;
}
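
/*
 * Usage sketch (editor's illustration, mirroring the memalloc_*_save()
 * pattern built on this helper): save the flag bits you are about to set,
 * then restore exactly those bits:
 *
 *	unsigned long orig = current->flags & PF_MEMALLOC_NOFS;
 *	current->flags |= PF_MEMALLOC_NOFS;
 *	...
 *	current_restore_flags(orig, PF_MEMALLOC_NOFS);
 */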
extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
extern int task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed);
#ifdef CONFIG_SMP
extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask);
extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node);
extern void release_user_cpus_ptr(struct task_struct *p);
extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask);
extern void force_compatible_cpus_allowed_ptr(struct task_struct *p);
extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p);
#else
static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
}
static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
	if (!cpumask_test_cpu(0, new_mask))
		return -EINVAL;
	return 0;
}
static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node)
{
	if (src->user_cpus_ptr)
		return -EINVAL;
	return 0;
}
static inline void release_user_cpus_ptr(struct task_struct *p)
{
	WARN_ON(p->user_cpus_ptr);
}

static inline int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
{
	return 0;
}
#endif

extern int yield_to(struct task_struct *p, bool preempt);
extern void set_user_nice(struct task_struct *p, long nice);
extern int task_prio(const struct task_struct *p);

/**
 * task_nice - return the nice value of a given task.
 * @p: the task in question.
 *
 * Return: The nice value [ -20 ... 0 ... 19 ].
 */
static inline int task_nice(const struct task_struct *p)
{
	return PRIO_TO_NICE((p)->static_prio);
}

extern int can_nice(const struct task_struct *p, const int nice);
extern int task_curr(const struct task_struct *p);
extern int idle_cpu(int cpu);
extern int available_idle_cpu(int cpu);
extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *);
extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *);
extern void sched_set_fifo(struct task_struct *p);
extern void sched_set_fifo_low(struct task_struct *p);
extern void sched_set_normal(struct task_struct *p, int nice);
extern int sched_setattr(struct task_struct *, const struct sched_attr *);
extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *);
extern struct task_struct *idle_task(int cpu);

/**
 * is_idle_task - is the specified task an idle task?
 * @p: the task in question.
 *
 * Return: 1 if @p is an idle task. 0 otherwise.
 */
static __always_inline bool is_idle_task(const struct task_struct *p)
{
	return !!(p->flags & PF_IDLE);
}

extern struct task_struct *curr_task(int cpu);
extern void ia64_set_curr_task(int cpu, struct task_struct *p);

void yield(void);

union thread_union {
#ifndef CONFIG_ARCH_TASK_STRUCT_ON_STACK
	struct task_struct task;
#endif
#ifndef CONFIG_THREAD_INFO_IN_TASK
	struct thread_info thread_info;
#endif
	unsigned long stack[THREAD_SIZE/sizeof(long)];
};

#ifndef CONFIG_THREAD_INFO_IN_TASK
extern struct thread_info init_thread_info;
#endif

extern unsigned long init_stack[THREAD_SIZE / sizeof(unsigned long)];

#ifdef CONFIG_THREAD_INFO_IN_TASK
# define task_thread_info(task)	(&(task)->thread_info)
#elif !defined(__HAVE_THREAD_FUNCTIONS)
# define task_thread_info(task)	((struct thread_info *)(task)->stack)
#endif

/*
 * find a task by one of its numerical ids
 *
 * find_task_by_pid_ns():
 *      finds a task by its pid in the specified namespace
 * find_task_by_vpid():
 *      finds a task by its virtual pid
 *
 * see also find_vpid() etc in include/linux/pid.h
 */

extern struct task_struct *find_task_by_vpid(pid_t nr);
extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns);

/*
 * find a task by its virtual pid and get the task struct
 */
extern struct task_struct *find_get_task_by_vpid(pid_t nr);

extern int wake_up_state(struct task_struct *tsk, unsigned int state);
extern int wake_up_process(struct task_struct *tsk);
extern void wake_up_new_task(struct task_struct *tsk);
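/*
 * Example (illustrative sketch, not part of this header): the waker side
 * of the canonical wait loop.  Publish the condition first, then wake the
 * sleeper; wake_up_process() provides the ordering that pairs with the
 * sleeper's set_current_state().  'req' and its fields are hypothetical.
 *
 *	req->done = 1;
 *	wake_up_process(req->waiter);	// returns 1 if the task was woken
 *
 * find_get_task_by_vpid() above takes a reference on the returned task,
 * which must be dropped with put_task_struct(); the plain
 * find_task_by_vpid() variant takes no reference and is only safe under
 * rcu_read_lock().
 */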
#ifdef CONFIG_SMP
extern void kick_process(struct task_struct *tsk);
#else
static inline void kick_process(struct task_struct *tsk) { }
#endif

extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec);

static inline void set_task_comm(struct task_struct *tsk, const char *from)
{
	__set_task_comm(tsk, from, false);
}

extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk);
#define get_task_comm(buf, tsk) ({			\
	BUILD_BUG_ON(sizeof(buf) != TASK_COMM_LEN);	\
	__get_task_comm(buf, sizeof(buf), tsk);		\
})

#ifdef CONFIG_SMP
static __always_inline void scheduler_ipi(void)
{
	/*
	 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
	 * TIF_NEED_RESCHED remotely (for the first time) will also send
	 * this IPI.
	 */
	preempt_fold_need_resched();
}
extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state);
#else
static inline void scheduler_ipi(void) { }
static inline unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
{
	return 1;
}
#endif

/*
 * Set thread flags in another task's structure.
 * See asm/thread_info.h for the TIF_xxxx flags available:
 */
static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag)
{
	set_ti_thread_flag(task_thread_info(tsk), flag);
}

static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag)
{
	clear_ti_thread_flag(task_thread_info(tsk), flag);
}

static inline void update_tsk_thread_flag(struct task_struct *tsk, int flag,
					  bool value)
{
	update_ti_thread_flag(task_thread_info(tsk), flag, value);
}

static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag)
{
	return test_and_set_ti_thread_flag(task_thread_info(tsk), flag);
}

static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag)
{
	return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag);
}

static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
{
	return test_ti_thread_flag(task_thread_info(tsk), flag);
}

static inline void set_tsk_need_resched(struct task_struct *tsk)
{
	set_tsk_thread_flag(tsk, TIF_NEED_RESCHED);
}

static inline void clear_tsk_need_resched(struct task_struct *tsk)
{
	clear_tsk_thread_flag(tsk, TIF_NEED_RESCHED);
}

static inline int test_tsk_need_resched(struct task_struct *tsk)
{
	return unlikely(test_tsk_thread_flag(tsk, TIF_NEED_RESCHED));
}
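/*
 * Example (illustrative sketch, not part of this header): get_task_comm()
 * must be passed a real array of TASK_COMM_LEN bytes.  The BUILD_BUG_ON
 * above rejects pointers at compile time, since sizeof(buf) would then be
 * the pointer size rather than the buffer size:
 *
 *	char comm[TASK_COMM_LEN];
 *
 *	get_task_comm(comm, current);
 *	pr_info("current task: %s\n", comm);
 */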
/*
 * cond_resched() and cond_resched_lock(): latency reduction via
 * explicit rescheduling in places that are safe.  The return
 * value indicates whether a reschedule was actually done.
 * cond_resched_lock() will drop the spinlock before scheduling and
 * reacquire it afterwards.
 */
#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
extern int __cond_resched(void);

#if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)

DECLARE_STATIC_CALL(cond_resched, __cond_resched);

static __always_inline int _cond_resched(void)
{
	return static_call_mod(cond_resched)();
}

#elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
extern int dynamic_cond_resched(void);

static __always_inline int _cond_resched(void)
{
	return dynamic_cond_resched();
}

#else

static inline int _cond_resched(void)
{
	return __cond_resched();
}

#endif /* CONFIG_PREEMPT_DYNAMIC */

#else

static inline int _cond_resched(void) { return 0; }

#endif /* !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) */

#define cond_resched() ({				\
	__might_resched(__FILE__, __LINE__, 0);		\
	_cond_resched();				\
})

extern int __cond_resched_lock(spinlock_t *lock);
extern int __cond_resched_rwlock_read(rwlock_t *lock);
extern int __cond_resched_rwlock_write(rwlock_t *lock);

#define MIGHT_RESCHED_RCU_SHIFT		8
#define MIGHT_RESCHED_PREEMPT_MASK	((1U << MIGHT_RESCHED_RCU_SHIFT) - 1)

#ifndef CONFIG_PREEMPT_RT
/*
 * Non-RT kernels have an elevated preempt count due to the held lock,
 * but are not allowed to be inside an RCU read-side critical section.
 */
# define PREEMPT_LOCK_RESCHED_OFFSETS	PREEMPT_LOCK_OFFSET
#else
/*
 * spin/rw_lock() on RT implies rcu_read_lock().  The might_sleep() check in
 * cond_resched*lock() has to take that into account because it checks for
 * preempt_count() and rcu_preempt_depth().
 */
# define PREEMPT_LOCK_RESCHED_OFFSETS	\
	(PREEMPT_LOCK_OFFSET + (1U << MIGHT_RESCHED_RCU_SHIFT))
#endif

#define cond_resched_lock(lock) ({						\
	__might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS);	\
	__cond_resched_lock(lock);						\
})

#define cond_resched_rwlock_read(lock) ({					\
	__might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS);	\
	__cond_resched_rwlock_read(lock);					\
})

#define cond_resched_rwlock_write(lock) ({					\
	__might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS);	\
	__cond_resched_rwlock_write(lock);					\
})

static inline void cond_resched_rcu(void)
{
#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
	rcu_read_unlock();
	cond_resched();
	rcu_read_lock();
#endif
}

#ifdef CONFIG_PREEMPT_DYNAMIC

extern bool preempt_model_none(void);
extern bool preempt_model_voluntary(void);
extern bool preempt_model_full(void);

#else

static inline bool preempt_model_none(void)
{
	return IS_ENABLED(CONFIG_PREEMPT_NONE);
}
static inline bool preempt_model_voluntary(void)
{
	return IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY);
}
static inline bool preempt_model_full(void)
{
	return IS_ENABLED(CONFIG_PREEMPT);
}

#endif

static inline bool preempt_model_rt(void)
{
	return IS_ENABLED(CONFIG_PREEMPT_RT);
}
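/*
 * Example (illustrative sketch, not part of this header): a long-running
 * loop in process context offering a voluntary preemption point on each
 * iteration; process_item() is a hypothetical helper.
 *
 *	for (i = 0; i < nr_items; i++) {
 *		process_item(i);
 *		cond_resched();		// no-op on fully preemptible kernels
 *	}
 *
 * cond_resched_lock(&lock) serves the same purpose inside a spinlocked
 * region: it only drops and reacquires the lock when a reschedule or a
 * lock break is actually needed.
 */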
/*
 * Does the preemption model allow non-cooperative preemption?
 *
 * For !CONFIG_PREEMPT_DYNAMIC kernels this is an exact match with
 * CONFIG_PREEMPTION; for CONFIG_PREEMPT_DYNAMIC this doesn't work as the
 * kernel is *built* with CONFIG_PREEMPTION=y but may run with e.g. the
 * PREEMPT_NONE model.
 */
static inline bool preempt_model_preemptible(void)
{
	return preempt_model_full() || preempt_model_rt();
}

/*
 * Does a critical section need to be broken due to another task waiting?
 * (Technically this does not depend on CONFIG_PREEMPTION, but expresses a
 * general need for low latency.)
 */
static inline int spin_needbreak(spinlock_t *lock)
{
#ifdef CONFIG_PREEMPTION
	return spin_is_contended(lock);
#else
	return 0;
#endif
}

/*
 * Check if a rwlock is contended.
 * Returns non-zero if there is another task waiting on the rwlock.
 * Returns zero if the lock is not contended or the system / underlying
 * rwlock implementation does not support contention detection.
 * Technically this does not depend on CONFIG_PREEMPTION, but expresses a
 * general need for low latency.
 */
static inline int rwlock_needbreak(rwlock_t *lock)
{
#ifdef CONFIG_PREEMPTION
	return rwlock_is_contended(lock);
#else
	return 0;
#endif
}

static __always_inline bool need_resched(void)
{
	return unlikely(tif_need_resched());
}

/*
 * Wrappers for p->thread_info->cpu access. No-op on UP.
 */
#ifdef CONFIG_SMP

static inline unsigned int task_cpu(const struct task_struct *p)
{
	return READ_ONCE(task_thread_info(p)->cpu);
}

extern void set_task_cpu(struct task_struct *p, unsigned int cpu);

#else

static inline unsigned int task_cpu(const struct task_struct *p)
{
	return 0;
}

static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
{
}

#endif /* CONFIG_SMP */

extern bool sched_task_on_rq(struct task_struct *p);
extern unsigned long get_wchan(struct task_struct *p);

/*
 * In order to reduce various lock holder preemption latencies provide an
 * interface to see if a vCPU is currently running or not.
 *
 * This allows us to terminate optimistic spin loops and block, analogous to
 * the native optimistic spin heuristic of testing if the lock owner task is
 * running or not.
 */
#ifndef vcpu_is_preempted
static inline bool vcpu_is_preempted(int cpu)
{
	return false;
}
#endif

extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
extern long sched_getaffinity(pid_t pid, struct cpumask *mask);

#ifndef TASK_SIZE_OF
#define TASK_SIZE_OF(tsk)	TASK_SIZE
#endif

#ifdef CONFIG_SMP
static inline bool owner_on_cpu(struct task_struct *owner)
{
	/*
	 * Due to the lock holder preemption issue, we skip spinning if the
	 * owner is not on a CPU or its CPU has been preempted.
	 */
	return READ_ONCE(owner->on_cpu) && !vcpu_is_preempted(task_cpu(owner));
}

/* Returns effective CPU energy utilization, as seen by the scheduler */
unsigned long sched_cpu_util(int cpu, unsigned long max);
#endif /* CONFIG_SMP */
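/*
 * Example (illustrative sketch, not part of this header, loosely modelled
 * on the mutex/rwsem optimistic spin-wait): keep spinning on a held lock
 * only while its owner is actually running somewhere; 'lock->owner' is a
 * hypothetical field.
 *
 *	struct task_struct *owner = READ_ONCE(lock->owner);
 *
 *	while (owner && READ_ONCE(lock->owner) == owner) {
 *		if (!owner_on_cpu(owner) || need_resched())
 *			break;		// stop spinning and block instead
 *		cpu_relax();
 *	}
 */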
#ifdef CONFIG_RSEQ

/*
 * Map the event mask on the user-space ABI enum rseq_cs_flags
 * for direct mask checks.
 */
enum rseq_event_mask_bits {
	RSEQ_EVENT_PREEMPT_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT,
	RSEQ_EVENT_SIGNAL_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT,
	RSEQ_EVENT_MIGRATE_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT,
};

enum rseq_event_mask {
	RSEQ_EVENT_PREEMPT	= (1U << RSEQ_EVENT_PREEMPT_BIT),
	RSEQ_EVENT_SIGNAL	= (1U << RSEQ_EVENT_SIGNAL_BIT),
	RSEQ_EVENT_MIGRATE	= (1U << RSEQ_EVENT_MIGRATE_BIT),
};

static inline void rseq_set_notify_resume(struct task_struct *t)
{
	if (t->rseq)
		set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
}

void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);

static inline void rseq_handle_notify_resume(struct ksignal *ksig,
					     struct pt_regs *regs)
{
	if (current->rseq)
		__rseq_handle_notify_resume(ksig, regs);
}

static inline void rseq_signal_deliver(struct ksignal *ksig,
				       struct pt_regs *regs)
{
	preempt_disable();
	__set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
	preempt_enable();
	rseq_handle_notify_resume(ksig, regs);
}

/* rseq_preempt() requires preemption to be disabled. */
static inline void rseq_preempt(struct task_struct *t)
{
	__set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
	rseq_set_notify_resume(t);
}

/* rseq_migrate() requires preemption to be disabled. */
static inline void rseq_migrate(struct task_struct *t)
{
	__set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask);
	rseq_set_notify_resume(t);
}

/*
 * If the parent process has a registered restartable sequences area, the
 * child inherits it.  Unregister rseq for a clone with CLONE_VM set.
 */
static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
{
	if (clone_flags & CLONE_VM) {
		t->rseq = NULL;
		t->rseq_sig = 0;
		t->rseq_event_mask = 0;
	} else {
		t->rseq = current->rseq;
		t->rseq_sig = current->rseq_sig;
		t->rseq_event_mask = current->rseq_event_mask;
	}
}

static inline void rseq_execve(struct task_struct *t)
{
	t->rseq = NULL;
	t->rseq_sig = 0;
	t->rseq_event_mask = 0;
}

#else

static inline void rseq_set_notify_resume(struct task_struct *t)
{
}
static inline void rseq_handle_notify_resume(struct ksignal *ksig,
					     struct pt_regs *regs)
{
}
static inline void rseq_signal_deliver(struct ksignal *ksig,
				       struct pt_regs *regs)
{
}
static inline void rseq_preempt(struct task_struct *t)
{
}
static inline void rseq_migrate(struct task_struct *t)
{
}
static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
{
}
static inline void rseq_execve(struct task_struct *t)
{
}

#endif

#ifdef CONFIG_DEBUG_RSEQ

void rseq_syscall(struct pt_regs *regs);

#else

static inline void rseq_syscall(struct pt_regs *regs)
{
}

#endif
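/*
 * Illustrative sketch (assumed call sites, simplified; not part of this
 * header): how the rseq hooks above are driven.  The scheduler flags the
 * event with preemption disabled; the fixup then runs on the way back to
 * user space.
 *
 *	// context switch / migration paths, preemption disabled:
 *	rseq_preempt(prev);	// RSEQ_EVENT_PREEMPT + TIF_NOTIFY_RESUME
 *	rseq_migrate(p);	// RSEQ_EVENT_MIGRATE + TIF_NOTIFY_RESUME
 *
 *	// return to user space, TIF_NOTIFY_RESUME set:
 *	rseq_handle_notify_resume(NULL, regs);	// abort an interrupted
 *						// critical section if needed
 */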
#ifdef CONFIG_SCHED_CORE
extern void sched_core_free(struct task_struct *tsk);
extern void sched_core_fork(struct task_struct *p);
extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
				unsigned long uaddr);
#else
static inline void sched_core_free(struct task_struct *tsk) { }
static inline void sched_core_fork(struct task_struct *p) { }
#endif

extern void sched_set_stop_task(int cpu, struct task_struct *stop);

#endif