deadline.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
deadline.c (83553B)
      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Deadline Scheduling Class (SCHED_DEADLINE)
      4 *
      5 * Earliest Deadline First (EDF) + Constant Bandwidth Server (CBS).
      6 *
      7 * Tasks that periodically executes their instances for less than their
      8 * runtime won't miss any of their deadlines.
      9 * Tasks that are not periodic or sporadic or that tries to execute more
     10 * than their reserved bandwidth will be slowed down (and may potentially
     11 * miss some of their deadlines), and won't affect any other task.
     12 *
     13 * Copyright (C) 2012 Dario Faggioli <raistlin@linux.it>,
     14 *                    Juri Lelli <juri.lelli@gmail.com>,
     15 *                    Michael Trimarchi <michael@amarulasolutions.com>,
     16 *                    Fabio Checconi <fchecconi@gmail.com>
     17 */
     18
     19/*
     20 * Default limits for DL period; on the top end we guard against small util
     21 * tasks still getting ridiculously long effective runtimes, on the bottom end we
     22 * guard against timer DoS.
     23 */
     24static unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */
     25static unsigned int sysctl_sched_dl_period_min = 100;     /* 100 us */
     26#ifdef CONFIG_SYSCTL
     27static struct ctl_table sched_dl_sysctls[] = {
     28	{
     29		.procname       = "sched_deadline_period_max_us",
     30		.data           = &sysctl_sched_dl_period_max,
     31		.maxlen         = sizeof(unsigned int),
     32		.mode           = 0644,
     33		.proc_handler   = proc_dointvec,
     34	},
     35	{
     36		.procname       = "sched_deadline_period_min_us",
     37		.data           = &sysctl_sched_dl_period_min,
     38		.maxlen         = sizeof(unsigned int),
     39		.mode           = 0644,
     40		.proc_handler   = proc_dointvec,
     41	},
     42	{}
     43};
     44
     45static int __init sched_dl_sysctl_init(void)
     46{
     47	register_sysctl_init("kernel", sched_dl_sysctls);
     48	return 0;
     49}
     50late_initcall(sched_dl_sysctl_init);
     51#endif
     52
     53static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
     54{
     55	return container_of(dl_se, struct task_struct, dl);
     56}
     57
     58static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq)
     59{
     60	return container_of(dl_rq, struct rq, dl);
     61}
     62
     63static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
     64{
     65	struct task_struct *p = dl_task_of(dl_se);
     66	struct rq *rq = task_rq(p);
     67
     68	return &rq->dl;
     69}
     70
     71static inline int on_dl_rq(struct sched_dl_entity *dl_se)
     72{
     73	return !RB_EMPTY_NODE(&dl_se->rb_node);
     74}
     75
     76#ifdef CONFIG_RT_MUTEXES
     77static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se)
     78{
     79	return dl_se->pi_se;
     80}
     81
     82static inline bool is_dl_boosted(struct sched_dl_entity *dl_se)
     83{
     84	return pi_of(dl_se) != dl_se;
     85}
     86#else
     87static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se)
     88{
     89	return dl_se;
     90}
     91
     92static inline bool is_dl_boosted(struct sched_dl_entity *dl_se)
     93{
     94	return false;
     95}
     96#endif
     97
     98#ifdef CONFIG_SMP
     99static inline struct dl_bw *dl_bw_of(int i)
    100{
    101	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
    102			 "sched RCU must be held");
    103	return &cpu_rq(i)->rd->dl_bw;
    104}
    105
    106static inline int dl_bw_cpus(int i)
    107{
    108	struct root_domain *rd = cpu_rq(i)->rd;
    109	int cpus;
    110
    111	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
    112			 "sched RCU must be held");
    113
    114	if (cpumask_subset(rd->span, cpu_active_mask))
    115		return cpumask_weight(rd->span);
    116
    117	cpus = 0;
    118
    119	for_each_cpu_and(i, rd->span, cpu_active_mask)
    120		cpus++;
    121
    122	return cpus;
    123}
    124
    125static inline unsigned long __dl_bw_capacity(int i)
    126{
    127	struct root_domain *rd = cpu_rq(i)->rd;
    128	unsigned long cap = 0;
    129
    130	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
    131			 "sched RCU must be held");
    132
    133	for_each_cpu_and(i, rd->span, cpu_active_mask)
    134		cap += capacity_orig_of(i);
    135
    136	return cap;
    137}
    138
    139/*
    140 * XXX Fix: If 'rq->rd == def_root_domain' perform AC against capacity
    141 * of the CPU the task is running on rather rd's \Sum CPU capacity.
    142 */
    143static inline unsigned long dl_bw_capacity(int i)
    144{
    145	if (!static_branch_unlikely(&sched_asym_cpucapacity) &&
    146	    capacity_orig_of(i) == SCHED_CAPACITY_SCALE) {
    147		return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT;
    148	} else {
    149		return __dl_bw_capacity(i);
    150	}
    151}
    152
    153static inline bool dl_bw_visited(int cpu, u64 gen)
    154{
    155	struct root_domain *rd = cpu_rq(cpu)->rd;
    156
    157	if (rd->visit_gen == gen)
    158		return true;
    159
    160	rd->visit_gen = gen;
    161	return false;
    162}
    163
    164static inline
    165void __dl_update(struct dl_bw *dl_b, s64 bw)
    166{
    167	struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw);
    168	int i;
    169
    170	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
    171			 "sched RCU must be held");
    172	for_each_cpu_and(i, rd->span, cpu_active_mask) {
    173		struct rq *rq = cpu_rq(i);
    174
    175		rq->dl.extra_bw += bw;
    176	}
    177}
    178#else
    179static inline struct dl_bw *dl_bw_of(int i)
    180{
    181	return &cpu_rq(i)->dl.dl_bw;
    182}
    183
    184static inline int dl_bw_cpus(int i)
    185{
    186	return 1;
    187}
    188
    189static inline unsigned long dl_bw_capacity(int i)
    190{
    191	return SCHED_CAPACITY_SCALE;
    192}
    193
    194static inline bool dl_bw_visited(int cpu, u64 gen)
    195{
    196	return false;
    197}
    198
    199static inline
    200void __dl_update(struct dl_bw *dl_b, s64 bw)
    201{
    202	struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw);
    203
    204	dl->extra_bw += bw;
    205}
    206#endif
    207
    208static inline
    209void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
    210{
    211	dl_b->total_bw -= tsk_bw;
    212	__dl_update(dl_b, (s32)tsk_bw / cpus);
    213}
    214
    215static inline
    216void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
    217{
    218	dl_b->total_bw += tsk_bw;
    219	__dl_update(dl_b, -((s32)tsk_bw / cpus));
    220}
    221
    222static inline bool
    223__dl_overflow(struct dl_bw *dl_b, unsigned long cap, u64 old_bw, u64 new_bw)
    224{
    225	return dl_b->bw != -1 &&
    226	       cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw;
    227}
    228
    229static inline
    230void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
    231{
    232	u64 old = dl_rq->running_bw;
    233
    234	lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
    235	dl_rq->running_bw += dl_bw;
    236	SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */
    237	SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
    238	/* kick cpufreq (see the comment in kernel/sched/sched.h). */
    239	cpufreq_update_util(rq_of_dl_rq(dl_rq), 0);
    240}
    241
    242static inline
    243void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
    244{
    245	u64 old = dl_rq->running_bw;
    246
    247	lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
    248	dl_rq->running_bw -= dl_bw;
    249	SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */
    250	if (dl_rq->running_bw > old)
    251		dl_rq->running_bw = 0;
    252	/* kick cpufreq (see the comment in kernel/sched/sched.h). */
    253	cpufreq_update_util(rq_of_dl_rq(dl_rq), 0);
    254}
    255
    256static inline
    257void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
    258{
    259	u64 old = dl_rq->this_bw;
    260
    261	lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
    262	dl_rq->this_bw += dl_bw;
    263	SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */
    264}
    265
    266static inline
    267void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
    268{
    269	u64 old = dl_rq->this_bw;
    270
    271	lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
    272	dl_rq->this_bw -= dl_bw;
    273	SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */
    274	if (dl_rq->this_bw > old)
    275		dl_rq->this_bw = 0;
    276	SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
    277}
    278
    279static inline
    280void add_rq_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
    281{
    282	if (!dl_entity_is_special(dl_se))
    283		__add_rq_bw(dl_se->dl_bw, dl_rq);
    284}
    285
    286static inline
    287void sub_rq_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
    288{
    289	if (!dl_entity_is_special(dl_se))
    290		__sub_rq_bw(dl_se->dl_bw, dl_rq);
    291}
    292
    293static inline
    294void add_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
    295{
    296	if (!dl_entity_is_special(dl_se))
    297		__add_running_bw(dl_se->dl_bw, dl_rq);
    298}
    299
    300static inline
    301void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
    302{
    303	if (!dl_entity_is_special(dl_se))
    304		__sub_running_bw(dl_se->dl_bw, dl_rq);
    305}
    306
    307static void dl_change_utilization(struct task_struct *p, u64 new_bw)
    308{
    309	struct rq *rq;
    310
    311	BUG_ON(p->dl.flags & SCHED_FLAG_SUGOV);
    312
    313	if (task_on_rq_queued(p))
    314		return;
    315
    316	rq = task_rq(p);
    317	if (p->dl.dl_non_contending) {
    318		sub_running_bw(&p->dl, &rq->dl);
    319		p->dl.dl_non_contending = 0;
    320		/*
    321		 * If the timer handler is currently running and the
    322		 * timer cannot be canceled, inactive_task_timer()
    323		 * will see that dl_not_contending is not set, and
    324		 * will not touch the rq's active utilization,
    325		 * so we are still safe.
    326		 */
    327		if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
    328			put_task_struct(p);
    329	}
    330	__sub_rq_bw(p->dl.dl_bw, &rq->dl);
    331	__add_rq_bw(new_bw, &rq->dl);
    332}
    333
    334/*
    335 * The utilization of a task cannot be immediately removed from
    336 * the rq active utilization (running_bw) when the task blocks.
    337 * Instead, we have to wait for the so called "0-lag time".
    338 *
    339 * If a task blocks before the "0-lag time", a timer (the inactive
    340 * timer) is armed, and running_bw is decreased when the timer
    341 * fires.
    342 *
    343 * If the task wakes up again before the inactive timer fires,
    344 * the timer is canceled, whereas if the task wakes up after the
    345 * inactive timer fired (and running_bw has been decreased) the
    346 * task's utilization has to be added to running_bw again.
    347 * A flag in the deadline scheduling entity (dl_non_contending)
    348 * is used to avoid race conditions between the inactive timer handler
    349 * and task wakeups.
    350 *
    351 * The following diagram shows how running_bw is updated. A task is
    352 * "ACTIVE" when its utilization contributes to running_bw; an
    353 * "ACTIVE contending" task is in the TASK_RUNNING state, while an
    354 * "ACTIVE non contending" task is a blocked task for which the "0-lag time"
    355 * has not passed yet. An "INACTIVE" task is a task for which the "0-lag"
    356 * time already passed, which does not contribute to running_bw anymore.
    357 *                              +------------------+
    358 *             wakeup           |    ACTIVE        |
    359 *          +------------------>+   contending     |
    360 *          | add_running_bw    |                  |
    361 *          |                   +----+------+------+
    362 *          |                        |      ^
    363 *          |                dequeue |      |
    364 * +--------+-------+                |      |
    365 * |                |   t >= 0-lag   |      | wakeup
    366 * |    INACTIVE    |<---------------+      |
    367 * |                | sub_running_bw |      |
    368 * +--------+-------+                |      |
    369 *          ^                        |      |
    370 *          |              t < 0-lag |      |
    371 *          |                        |      |
    372 *          |                        V      |
    373 *          |                   +----+------+------+
    374 *          | sub_running_bw    |    ACTIVE        |
    375 *          +-------------------+                  |
    376 *            inactive timer    |  non contending  |
    377 *            fired             +------------------+
    378 *
    379 * The task_non_contending() function is invoked when a task
    380 * blocks, and checks if the 0-lag time already passed or
    381 * not (in the first case, it directly updates running_bw;
    382 * in the second case, it arms the inactive timer).
    383 *
    384 * The task_contending() function is invoked when a task wakes
    385 * up, and checks if the task is still in the "ACTIVE non contending"
    386 * state or not (in the second case, it updates running_bw).
    387 */
    388static void task_non_contending(struct task_struct *p)
    389{
    390	struct sched_dl_entity *dl_se = &p->dl;
    391	struct hrtimer *timer = &dl_se->inactive_timer;
    392	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
    393	struct rq *rq = rq_of_dl_rq(dl_rq);
    394	s64 zerolag_time;
    395
    396	/*
    397	 * If this is a non-deadline task that has been boosted,
    398	 * do nothing
    399	 */
    400	if (dl_se->dl_runtime == 0)
    401		return;
    402
    403	if (dl_entity_is_special(dl_se))
    404		return;
    405
    406	WARN_ON(dl_se->dl_non_contending);
    407
    408	zerolag_time = dl_se->deadline -
    409		 div64_long((dl_se->runtime * dl_se->dl_period),
    410			dl_se->dl_runtime);
    411
    412	/*
    413	 * Using relative times instead of the absolute "0-lag time"
    414	 * allows to simplify the code
    415	 */
    416	zerolag_time -= rq_clock(rq);
    417
    418	/*
    419	 * If the "0-lag time" already passed, decrease the active
    420	 * utilization now, instead of starting a timer
    421	 */
    422	if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) {
    423		if (dl_task(p))
    424			sub_running_bw(dl_se, dl_rq);
    425		if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
    426			struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
    427
    428			if (READ_ONCE(p->__state) == TASK_DEAD)
    429				sub_rq_bw(&p->dl, &rq->dl);
    430			raw_spin_lock(&dl_b->lock);
    431			__dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
    432			__dl_clear_params(p);
    433			raw_spin_unlock(&dl_b->lock);
    434		}
    435
    436		return;
    437	}
    438
    439	dl_se->dl_non_contending = 1;
    440	get_task_struct(p);
    441	hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL_HARD);
    442}
    443
    444static void task_contending(struct sched_dl_entity *dl_se, int flags)
    445{
    446	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
    447
    448	/*
    449	 * If this is a non-deadline task that has been boosted,
    450	 * do nothing
    451	 */
    452	if (dl_se->dl_runtime == 0)
    453		return;
    454
    455	if (flags & ENQUEUE_MIGRATED)
    456		add_rq_bw(dl_se, dl_rq);
    457
    458	if (dl_se->dl_non_contending) {
    459		dl_se->dl_non_contending = 0;
    460		/*
    461		 * If the timer handler is currently running and the
    462		 * timer cannot be canceled, inactive_task_timer()
    463		 * will see that dl_not_contending is not set, and
    464		 * will not touch the rq's active utilization,
    465		 * so we are still safe.
    466		 */
    467		if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1)
    468			put_task_struct(dl_task_of(dl_se));
    469	} else {
    470		/*
    471		 * Since "dl_non_contending" is not set, the
    472		 * task's utilization has already been removed from
    473		 * active utilization (either when the task blocked,
    474		 * when the "inactive timer" fired).
    475		 * So, add it back.
    476		 */
    477		add_running_bw(dl_se, dl_rq);
    478	}
    479}
    480
    481static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
    482{
    483	struct sched_dl_entity *dl_se = &p->dl;
    484
    485	return rb_first_cached(&dl_rq->root) == &dl_se->rb_node;
    486}
    487
    488static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
    489
    490void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
    491{
    492	raw_spin_lock_init(&dl_b->dl_runtime_lock);
    493	dl_b->dl_period = period;
    494	dl_b->dl_runtime = runtime;
    495}
    496
    497void init_dl_bw(struct dl_bw *dl_b)
    498{
    499	raw_spin_lock_init(&dl_b->lock);
    500	if (global_rt_runtime() == RUNTIME_INF)
    501		dl_b->bw = -1;
    502	else
    503		dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime());
    504	dl_b->total_bw = 0;
    505}
    506
    507void init_dl_rq(struct dl_rq *dl_rq)
    508{
    509	dl_rq->root = RB_ROOT_CACHED;
    510
    511#ifdef CONFIG_SMP
    512	/* zero means no -deadline tasks */
    513	dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0;
    514
    515	dl_rq->dl_nr_migratory = 0;
    516	dl_rq->overloaded = 0;
    517	dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED;
    518#else
    519	init_dl_bw(&dl_rq->dl_bw);
    520#endif
    521
    522	dl_rq->running_bw = 0;
    523	dl_rq->this_bw = 0;
    524	init_dl_rq_bw_ratio(dl_rq);
    525}
    526
    527#ifdef CONFIG_SMP
    528
    529static inline int dl_overloaded(struct rq *rq)
    530{
    531	return atomic_read(&rq->rd->dlo_count);
    532}
    533
    534static inline void dl_set_overload(struct rq *rq)
    535{
    536	if (!rq->online)
    537		return;
    538
    539	cpumask_set_cpu(rq->cpu, rq->rd->dlo_mask);
    540	/*
    541	 * Must be visible before the overload count is
    542	 * set (as in sched_rt.c).
    543	 *
    544	 * Matched by the barrier in pull_dl_task().
    545	 */
    546	smp_wmb();
    547	atomic_inc(&rq->rd->dlo_count);
    548}
    549
    550static inline void dl_clear_overload(struct rq *rq)
    551{
    552	if (!rq->online)
    553		return;
    554
    555	atomic_dec(&rq->rd->dlo_count);
    556	cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask);
    557}
    558
    559static void update_dl_migration(struct dl_rq *dl_rq)
    560{
    561	if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) {
    562		if (!dl_rq->overloaded) {
    563			dl_set_overload(rq_of_dl_rq(dl_rq));
    564			dl_rq->overloaded = 1;
    565		}
    566	} else if (dl_rq->overloaded) {
    567		dl_clear_overload(rq_of_dl_rq(dl_rq));
    568		dl_rq->overloaded = 0;
    569	}
    570}
    571
    572static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
    573{
    574	struct task_struct *p = dl_task_of(dl_se);
    575
    576	if (p->nr_cpus_allowed > 1)
    577		dl_rq->dl_nr_migratory++;
    578
    579	update_dl_migration(dl_rq);
    580}
    581
    582static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
    583{
    584	struct task_struct *p = dl_task_of(dl_se);
    585
    586	if (p->nr_cpus_allowed > 1)
    587		dl_rq->dl_nr_migratory--;
    588
    589	update_dl_migration(dl_rq);
    590}
    591
    592#define __node_2_pdl(node) \
    593	rb_entry((node), struct task_struct, pushable_dl_tasks)
    594
    595static inline bool __pushable_less(struct rb_node *a, const struct rb_node *b)
    596{
    597	return dl_entity_preempt(&__node_2_pdl(a)->dl, &__node_2_pdl(b)->dl);
    598}
    599
    600/*
    601 * The list of pushable -deadline task is not a plist, like in
    602 * sched_rt.c, it is an rb-tree with tasks ordered by deadline.
    603 */
    604static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
    605{
    606	struct rb_node *leftmost;
    607
    608	BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks));
    609
    610	leftmost = rb_add_cached(&p->pushable_dl_tasks,
    611				 &rq->dl.pushable_dl_tasks_root,
    612				 __pushable_less);
    613	if (leftmost)
    614		rq->dl.earliest_dl.next = p->dl.deadline;
    615}
    616
    617static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
    618{
    619	struct dl_rq *dl_rq = &rq->dl;
    620	struct rb_root_cached *root = &dl_rq->pushable_dl_tasks_root;
    621	struct rb_node *leftmost;
    622
    623	if (RB_EMPTY_NODE(&p->pushable_dl_tasks))
    624		return;
    625
    626	leftmost = rb_erase_cached(&p->pushable_dl_tasks, root);
    627	if (leftmost)
    628		dl_rq->earliest_dl.next = __node_2_pdl(leftmost)->dl.deadline;
    629
    630	RB_CLEAR_NODE(&p->pushable_dl_tasks);
    631}
    632
    633static inline int has_pushable_dl_tasks(struct rq *rq)
    634{
    635	return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root);
    636}
    637
    638static int push_dl_task(struct rq *rq);
    639
    640static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
    641{
    642	return rq->online && dl_task(prev);
    643}
    644
    645static DEFINE_PER_CPU(struct callback_head, dl_push_head);
    646static DEFINE_PER_CPU(struct callback_head, dl_pull_head);
    647
    648static void push_dl_tasks(struct rq *);
    649static void pull_dl_task(struct rq *);
    650
    651static inline void deadline_queue_push_tasks(struct rq *rq)
    652{
    653	if (!has_pushable_dl_tasks(rq))
    654		return;
    655
    656	queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks);
    657}
    658
    659static inline void deadline_queue_pull_task(struct rq *rq)
    660{
    661	queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task);
    662}
    663
    664static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
    665
    666static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
    667{
    668	struct rq *later_rq = NULL;
    669	struct dl_bw *dl_b;
    670
    671	later_rq = find_lock_later_rq(p, rq);
    672	if (!later_rq) {
    673		int cpu;
    674
    675		/*
    676		 * If we cannot preempt any rq, fall back to pick any
    677		 * online CPU:
    678		 */
    679		cpu = cpumask_any_and(cpu_active_mask, p->cpus_ptr);
    680		if (cpu >= nr_cpu_ids) {
    681			/*
    682			 * Failed to find any suitable CPU.
    683			 * The task will never come back!
    684			 */
    685			BUG_ON(dl_bandwidth_enabled());
    686
    687			/*
    688			 * If admission control is disabled we
    689			 * try a little harder to let the task
    690			 * run.
    691			 */
    692			cpu = cpumask_any(cpu_active_mask);
    693		}
    694		later_rq = cpu_rq(cpu);
    695		double_lock_balance(rq, later_rq);
    696	}
    697
    698	if (p->dl.dl_non_contending || p->dl.dl_throttled) {
    699		/*
    700		 * Inactive timer is armed (or callback is running, but
    701		 * waiting for us to release rq locks). In any case, when it
    702		 * will fire (or continue), it will see running_bw of this
    703		 * task migrated to later_rq (and correctly handle it).
    704		 */
    705		sub_running_bw(&p->dl, &rq->dl);
    706		sub_rq_bw(&p->dl, &rq->dl);
    707
    708		add_rq_bw(&p->dl, &later_rq->dl);
    709		add_running_bw(&p->dl, &later_rq->dl);
    710	} else {
    711		sub_rq_bw(&p->dl, &rq->dl);
    712		add_rq_bw(&p->dl, &later_rq->dl);
    713	}
    714
    715	/*
    716	 * And we finally need to fixup root_domain(s) bandwidth accounting,
    717	 * since p is still hanging out in the old (now moved to default) root
    718	 * domain.
    719	 */
    720	dl_b = &rq->rd->dl_bw;
    721	raw_spin_lock(&dl_b->lock);
    722	__dl_sub(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
    723	raw_spin_unlock(&dl_b->lock);
    724
    725	dl_b = &later_rq->rd->dl_bw;
    726	raw_spin_lock(&dl_b->lock);
    727	__dl_add(dl_b, p->dl.dl_bw, cpumask_weight(later_rq->rd->span));
    728	raw_spin_unlock(&dl_b->lock);
    729
    730	set_task_cpu(p, later_rq->cpu);
    731	double_unlock_balance(later_rq, rq);
    732
    733	return later_rq;
    734}
    735
    736#else
    737
    738static inline
    739void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
    740{
    741}
    742
    743static inline
    744void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
    745{
    746}
    747
    748static inline
    749void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
    750{
    751}
    752
    753static inline
    754void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
    755{
    756}
    757
    758static inline void deadline_queue_push_tasks(struct rq *rq)
    759{
    760}
    761
    762static inline void deadline_queue_pull_task(struct rq *rq)
    763{
    764}
    765#endif /* CONFIG_SMP */
    766
    767static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
    768static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
    769static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags);
    770
    771/*
    772 * We are being explicitly informed that a new instance is starting,
    773 * and this means that:
    774 *  - the absolute deadline of the entity has to be placed at
    775 *    current time + relative deadline;
    776 *  - the runtime of the entity has to be set to the maximum value.
    777 *
    778 * The capability of specifying such event is useful whenever a -deadline
    779 * entity wants to (try to!) synchronize its behaviour with the scheduler's
    780 * one, and to (try to!) reconcile itself with its own scheduling
    781 * parameters.
    782 */
    783static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
    784{
    785	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
    786	struct rq *rq = rq_of_dl_rq(dl_rq);
    787
    788	WARN_ON(is_dl_boosted(dl_se));
    789	WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
    790
    791	/*
    792	 * We are racing with the deadline timer. So, do nothing because
    793	 * the deadline timer handler will take care of properly recharging
    794	 * the runtime and postponing the deadline
    795	 */
    796	if (dl_se->dl_throttled)
    797		return;
    798
    799	/*
    800	 * We use the regular wall clock time to set deadlines in the
    801	 * future; in fact, we must consider execution overheads (time
    802	 * spent on hardirq context, etc.).
    803	 */
    804	dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline;
    805	dl_se->runtime = dl_se->dl_runtime;
    806}
    807
    808/*
    809 * Pure Earliest Deadline First (EDF) scheduling does not deal with the
    810 * possibility of a entity lasting more than what it declared, and thus
    811 * exhausting its runtime.
    812 *
    813 * Here we are interested in making runtime overrun possible, but we do
    814 * not want a entity which is misbehaving to affect the scheduling of all
    815 * other entities.
    816 * Therefore, a budgeting strategy called Constant Bandwidth Server (CBS)
    817 * is used, in order to confine each entity within its own bandwidth.
    818 *
    819 * This function deals exactly with that, and ensures that when the runtime
    820 * of a entity is replenished, its deadline is also postponed. That ensures
    821 * the overrunning entity can't interfere with other entity in the system and
    822 * can't make them miss their deadlines. Reasons why this kind of overruns
    823 * could happen are, typically, a entity voluntarily trying to overcome its
    824 * runtime, or it just underestimated it during sched_setattr().
    825 */
    826static void replenish_dl_entity(struct sched_dl_entity *dl_se)
    827{
    828	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
    829	struct rq *rq = rq_of_dl_rq(dl_rq);
    830
    831	BUG_ON(pi_of(dl_se)->dl_runtime <= 0);
    832
    833	/*
    834	 * This could be the case for a !-dl task that is boosted.
    835	 * Just go with full inherited parameters.
    836	 */
    837	if (dl_se->dl_deadline == 0) {
    838		dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
    839		dl_se->runtime = pi_of(dl_se)->dl_runtime;
    840	}
    841
    842	if (dl_se->dl_yielded && dl_se->runtime > 0)
    843		dl_se->runtime = 0;
    844
    845	/*
    846	 * We keep moving the deadline away until we get some
    847	 * available runtime for the entity. This ensures correct
    848	 * handling of situations where the runtime overrun is
    849	 * arbitrary large.
    850	 */
    851	while (dl_se->runtime <= 0) {
    852		dl_se->deadline += pi_of(dl_se)->dl_period;
    853		dl_se->runtime += pi_of(dl_se)->dl_runtime;
    854	}
    855
    856	/*
    857	 * At this point, the deadline really should be "in
    858	 * the future" with respect to rq->clock. If it's
    859	 * not, we are, for some reason, lagging too much!
    860	 * Anyway, after having warn userspace abut that,
    861	 * we still try to keep the things running by
    862	 * resetting the deadline and the budget of the
    863	 * entity.
    864	 */
    865	if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
    866		printk_deferred_once("sched: DL replenish lagged too much\n");
    867		dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
    868		dl_se->runtime = pi_of(dl_se)->dl_runtime;
    869	}
    870
    871	if (dl_se->dl_yielded)
    872		dl_se->dl_yielded = 0;
    873	if (dl_se->dl_throttled)
    874		dl_se->dl_throttled = 0;
    875}
    876
    877/*
    878 * Here we check if --at time t-- an entity (which is probably being
    879 * [re]activated or, in general, enqueued) can use its remaining runtime
    880 * and its current deadline _without_ exceeding the bandwidth it is
    881 * assigned (function returns true if it can't). We are in fact applying
    882 * one of the CBS rules: when a task wakes up, if the residual runtime
    883 * over residual deadline fits within the allocated bandwidth, then we
    884 * can keep the current (absolute) deadline and residual budget without
    885 * disrupting the schedulability of the system. Otherwise, we should
    886 * refill the runtime and set the deadline a period in the future,
    887 * because keeping the current (absolute) deadline of the task would
    888 * result in breaking guarantees promised to other tasks (refer to
    889 * Documentation/scheduler/sched-deadline.rst for more information).
    890 *
    891 * This function returns true if:
    892 *
    893 *   runtime / (deadline - t) > dl_runtime / dl_deadline ,
    894 *
    895 * IOW we can't recycle current parameters.
    896 *
    897 * Notice that the bandwidth check is done against the deadline. For
    898 * task with deadline equal to period this is the same of using
    899 * dl_period instead of dl_deadline in the equation above.
    900 */
    901static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t)
    902{
    903	u64 left, right;
    904
    905	/*
    906	 * left and right are the two sides of the equation above,
    907	 * after a bit of shuffling to use multiplications instead
    908	 * of divisions.
    909	 *
    910	 * Note that none of the time values involved in the two
    911	 * multiplications are absolute: dl_deadline and dl_runtime
    912	 * are the relative deadline and the maximum runtime of each
    913	 * instance, runtime is the runtime left for the last instance
    914	 * and (deadline - t), since t is rq->clock, is the time left
    915	 * to the (absolute) deadline. Even if overflowing the u64 type
    916	 * is very unlikely to occur in both cases, here we scale down
    917	 * as we want to avoid that risk at all. Scaling down by 10
    918	 * means that we reduce granularity to 1us. We are fine with it,
    919	 * since this is only a true/false check and, anyway, thinking
    920	 * of anything below microseconds resolution is actually fiction
    921	 * (but still we want to give the user that illusion >;).
    922	 */
    923	left = (pi_of(dl_se)->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
    924	right = ((dl_se->deadline - t) >> DL_SCALE) *
    925		(pi_of(dl_se)->dl_runtime >> DL_SCALE);
    926
    927	return dl_time_before(right, left);
    928}
    929
    930/*
    931 * Revised wakeup rule [1]: For self-suspending tasks, rather then
    932 * re-initializing task's runtime and deadline, the revised wakeup
    933 * rule adjusts the task's runtime to avoid the task to overrun its
    934 * density.
    935 *
    936 * Reasoning: a task may overrun the density if:
    937 *    runtime / (deadline - t) > dl_runtime / dl_deadline
    938 *
    939 * Therefore, runtime can be adjusted to:
    940 *     runtime = (dl_runtime / dl_deadline) * (deadline - t)
    941 *
    942 * In such way that runtime will be equal to the maximum density
    943 * the task can use without breaking any rule.
    944 *
    945 * [1] Luca Abeni, Giuseppe Lipari, and Juri Lelli. 2015. Constant
    946 * bandwidth server revisited. SIGBED Rev. 11, 4 (January 2015), 19-24.
    947 */
    948static void
    949update_dl_revised_wakeup(struct sched_dl_entity *dl_se, struct rq *rq)
    950{
    951	u64 laxity = dl_se->deadline - rq_clock(rq);
    952
    953	/*
    954	 * If the task has deadline < period, and the deadline is in the past,
    955	 * it should already be throttled before this check.
    956	 *
    957	 * See update_dl_entity() comments for further details.
    958	 */
    959	WARN_ON(dl_time_before(dl_se->deadline, rq_clock(rq)));
    960
    961	dl_se->runtime = (dl_se->dl_density * laxity) >> BW_SHIFT;
    962}
    963
    964/*
    965 * Regarding the deadline, a task with implicit deadline has a relative
    966 * deadline == relative period. A task with constrained deadline has a
    967 * relative deadline <= relative period.
    968 *
    969 * We support constrained deadline tasks. However, there are some restrictions
    970 * applied only for tasks which do not have an implicit deadline. See
    971 * update_dl_entity() to know more about such restrictions.
    972 *
    973 * The dl_is_implicit() returns true if the task has an implicit deadline.
    974 */
    975static inline bool dl_is_implicit(struct sched_dl_entity *dl_se)
    976{
    977	return dl_se->dl_deadline == dl_se->dl_period;
    978}
    979
    980/*
    981 * When a deadline entity is placed in the runqueue, its runtime and deadline
    982 * might need to be updated. This is done by a CBS wake up rule. There are two
    983 * different rules: 1) the original CBS; and 2) the Revisited CBS.
    984 *
    985 * When the task is starting a new period, the Original CBS is used. In this
    986 * case, the runtime is replenished and a new absolute deadline is set.
    987 *
    988 * When a task is queued before the begin of the next period, using the
    989 * remaining runtime and deadline could make the entity to overflow, see
    990 * dl_entity_overflow() to find more about runtime overflow. When such case
    991 * is detected, the runtime and deadline need to be updated.
    992 *
    993 * If the task has an implicit deadline, i.e., deadline == period, the Original
    994 * CBS is applied. the runtime is replenished and a new absolute deadline is
    995 * set, as in the previous cases.
    996 *
    997 * However, the Original CBS does not work properly for tasks with
    998 * deadline < period, which are said to have a constrained deadline. By
    999 * applying the Original CBS, a constrained deadline task would be able to run
   1000 * runtime/deadline in a period. With deadline < period, the task would
   1001 * overrun the runtime/period allowed bandwidth, breaking the admission test.
   1002 *
   1003 * In order to prevent this misbehave, the Revisited CBS is used for
   1004 * constrained deadline tasks when a runtime overflow is detected. In the
   1005 * Revisited CBS, rather than replenishing & setting a new absolute deadline,
   1006 * the remaining runtime of the task is reduced to avoid runtime overflow.
   1007 * Please refer to the comments update_dl_revised_wakeup() function to find
   1008 * more about the Revised CBS rule.
   1009 */
   1010static void update_dl_entity(struct sched_dl_entity *dl_se)
   1011{
   1012	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
   1013	struct rq *rq = rq_of_dl_rq(dl_rq);
   1014
   1015	if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
   1016	    dl_entity_overflow(dl_se, rq_clock(rq))) {
   1017
   1018		if (unlikely(!dl_is_implicit(dl_se) &&
   1019			     !dl_time_before(dl_se->deadline, rq_clock(rq)) &&
   1020			     !is_dl_boosted(dl_se))) {
   1021			update_dl_revised_wakeup(dl_se, rq);
   1022			return;
   1023		}
   1024
   1025		dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
   1026		dl_se->runtime = pi_of(dl_se)->dl_runtime;
   1027	}
   1028}
   1029
   1030static inline u64 dl_next_period(struct sched_dl_entity *dl_se)
   1031{
   1032	return dl_se->deadline - dl_se->dl_deadline + dl_se->dl_period;
   1033}
   1034
   1035/*
   1036 * If the entity depleted all its runtime, and if we want it to sleep
   1037 * while waiting for some new execution time to become available, we
   1038 * set the bandwidth replenishment timer to the replenishment instant
   1039 * and try to activate it.
   1040 *
   1041 * Notice that it is important for the caller to know if the timer
   1042 * actually started or not (i.e., the replenishment instant is in
   1043 * the future or in the past).
   1044 */
   1045static int start_dl_timer(struct task_struct *p)
   1046{
   1047	struct sched_dl_entity *dl_se = &p->dl;
   1048	struct hrtimer *timer = &dl_se->dl_timer;
   1049	struct rq *rq = task_rq(p);
   1050	ktime_t now, act;
   1051	s64 delta;
   1052
   1053	lockdep_assert_rq_held(rq);
   1054
   1055	/*
   1056	 * We want the timer to fire at the deadline, but considering
   1057	 * that it is actually coming from rq->clock and not from
   1058	 * hrtimer's time base reading.
   1059	 */
   1060	act = ns_to_ktime(dl_next_period(dl_se));
   1061	now = hrtimer_cb_get_time(timer);
   1062	delta = ktime_to_ns(now) - rq_clock(rq);
   1063	act = ktime_add_ns(act, delta);
   1064
   1065	/*
   1066	 * If the expiry time already passed, e.g., because the value
   1067	 * chosen as the deadline is too small, don't even try to
   1068	 * start the timer in the past!
   1069	 */
   1070	if (ktime_us_delta(act, now) < 0)
   1071		return 0;
   1072
   1073	/*
   1074	 * !enqueued will guarantee another callback; even if one is already in
   1075	 * progress. This ensures a balanced {get,put}_task_struct().
   1076	 *
   1077	 * The race against __run_timer() clearing the enqueued state is
   1078	 * harmless because we're holding task_rq()->lock, therefore the timer
   1079	 * expiring after we've done the check will wait on its task_rq_lock()
   1080	 * and observe our state.
   1081	 */
   1082	if (!hrtimer_is_queued(timer)) {
   1083		get_task_struct(p);
   1084		hrtimer_start(timer, act, HRTIMER_MODE_ABS_HARD);
   1085	}
   1086
   1087	return 1;
   1088}
   1089
   1090/*
   1091 * This is the bandwidth enforcement timer callback. If here, we know
   1092 * a task is not on its dl_rq, since the fact that the timer was running
   1093 * means the task is throttled and needs a runtime replenishment.
   1094 *
   1095 * However, what we actually do depends on the fact the task is active,
   1096 * (it is on its rq) or has been removed from there by a call to
   1097 * dequeue_task_dl(). In the former case we must issue the runtime
   1098 * replenishment and add the task back to the dl_rq; in the latter, we just
   1099 * do nothing but clearing dl_throttled, so that runtime and deadline
   1100 * updating (and the queueing back to dl_rq) will be done by the
   1101 * next call to enqueue_task_dl().
   1102 */
   1103static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
   1104{
   1105	struct sched_dl_entity *dl_se = container_of(timer,
   1106						     struct sched_dl_entity,
   1107						     dl_timer);
   1108	struct task_struct *p = dl_task_of(dl_se);
   1109	struct rq_flags rf;
   1110	struct rq *rq;
   1111
   1112	rq = task_rq_lock(p, &rf);
   1113
   1114	/*
   1115	 * The task might have changed its scheduling policy to something
   1116	 * different than SCHED_DEADLINE (through switched_from_dl()).
   1117	 */
   1118	if (!dl_task(p))
   1119		goto unlock;
   1120
   1121	/*
   1122	 * The task might have been boosted by someone else and might be in the
   1123	 * boosting/deboosting path, its not throttled.
   1124	 */
   1125	if (is_dl_boosted(dl_se))
   1126		goto unlock;
   1127
   1128	/*
   1129	 * Spurious timer due to start_dl_timer() race; or we already received
   1130	 * a replenishment from rt_mutex_setprio().
   1131	 */
   1132	if (!dl_se->dl_throttled)
   1133		goto unlock;
   1134
   1135	sched_clock_tick();
   1136	update_rq_clock(rq);
   1137
   1138	/*
   1139	 * If the throttle happened during sched-out; like:
   1140	 *
   1141	 *   schedule()
   1142	 *     deactivate_task()
   1143	 *       dequeue_task_dl()
   1144	 *         update_curr_dl()
   1145	 *           start_dl_timer()
   1146	 *         __dequeue_task_dl()
   1147	 *     prev->on_rq = 0;
   1148	 *
   1149	 * We can be both throttled and !queued. Replenish the counter
   1150	 * but do not enqueue -- wait for our wakeup to do that.
   1151	 */
   1152	if (!task_on_rq_queued(p)) {
   1153		replenish_dl_entity(dl_se);
   1154		goto unlock;
   1155	}
   1156
   1157#ifdef CONFIG_SMP
   1158	if (unlikely(!rq->online)) {
   1159		/*
   1160		 * If the runqueue is no longer available, migrate the
   1161		 * task elsewhere. This necessarily changes rq.
   1162		 */
   1163		lockdep_unpin_lock(__rq_lockp(rq), rf.cookie);
   1164		rq = dl_task_offline_migration(rq, p);
   1165		rf.cookie = lockdep_pin_lock(__rq_lockp(rq));
   1166		update_rq_clock(rq);
   1167
   1168		/*
   1169		 * Now that the task has been migrated to the new RQ and we
   1170		 * have that locked, proceed as normal and enqueue the task
   1171		 * there.
   1172		 */
   1173	}
   1174#endif
   1175
   1176	enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
   1177	if (dl_task(rq->curr))
   1178		check_preempt_curr_dl(rq, p, 0);
   1179	else
   1180		resched_curr(rq);
   1181
   1182#ifdef CONFIG_SMP
   1183	/*
   1184	 * Queueing this task back might have overloaded rq, check if we need
   1185	 * to kick someone away.
   1186	 */
   1187	if (has_pushable_dl_tasks(rq)) {
   1188		/*
   1189		 * Nothing relies on rq->lock after this, so its safe to drop
   1190		 * rq->lock.
   1191		 */
   1192		rq_unpin_lock(rq, &rf);
   1193		push_dl_task(rq);
   1194		rq_repin_lock(rq, &rf);
   1195	}
   1196#endif
   1197
   1198unlock:
   1199	task_rq_unlock(rq, p, &rf);
   1200
   1201	/*
   1202	 * This can free the task_struct, including this hrtimer, do not touch
   1203	 * anything related to that after this.
   1204	 */
   1205	put_task_struct(p);
   1206
   1207	return HRTIMER_NORESTART;
   1208}
   1209
   1210void init_dl_task_timer(struct sched_dl_entity *dl_se)
   1211{
   1212	struct hrtimer *timer = &dl_se->dl_timer;
   1213
   1214	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
   1215	timer->function = dl_task_timer;
   1216}
   1217
   1218/*
   1219 * During the activation, CBS checks if it can reuse the current task's
   1220 * runtime and period. If the deadline of the task is in the past, CBS
   1221 * cannot use the runtime, and so it replenishes the task. This rule
   1222 * works fine for implicit deadline tasks (deadline == period), and the
   1223 * CBS was designed for implicit deadline tasks. However, a task with
   1224 * constrained deadline (deadline < period) might be awakened after the
   1225 * deadline, but before the next period. In this case, replenishing the
   1226 * task would allow it to run for runtime / deadline. As in this case
   1227 * deadline < period, CBS enables a task to run for more than the
   1228 * runtime / period. In a very loaded system, this can cause a domino
   1229 * effect, making other tasks miss their deadlines.
   1230 *
   1231 * To avoid this problem, in the activation of a constrained deadline
   1232 * task after the deadline but before the next period, throttle the
   1233 * task and set the replenishing timer to the begin of the next period,
   1234 * unless it is boosted.
   1235 */
   1236static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se)
   1237{
   1238	struct task_struct *p = dl_task_of(dl_se);
   1239	struct rq *rq = rq_of_dl_rq(dl_rq_of_se(dl_se));
   1240
   1241	if (dl_time_before(dl_se->deadline, rq_clock(rq)) &&
   1242	    dl_time_before(rq_clock(rq), dl_next_period(dl_se))) {
   1243		if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(p)))
   1244			return;
   1245		dl_se->dl_throttled = 1;
   1246		if (dl_se->runtime > 0)
   1247			dl_se->runtime = 0;
   1248	}
   1249}
   1250
   1251static
   1252int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
   1253{
   1254	return (dl_se->runtime <= 0);
   1255}
   1256
   1257/*
   1258 * This function implements the GRUB accounting rule:
   1259 * according to the GRUB reclaiming algorithm, the runtime is
   1260 * not decreased as "dq = -dt", but as
   1261 * "dq = -max{u / Umax, (1 - Uinact - Uextra)} dt",
   1262 * where u is the utilization of the task, Umax is the maximum reclaimable
   1263 * utilization, Uinact is the (per-runqueue) inactive utilization, computed
   1264 * as the difference between the "total runqueue utilization" and the
   1265 * runqueue active utilization, and Uextra is the (per runqueue) extra
   1266 * reclaimable utilization.
   1267 * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations
   1268 * multiplied by 2^BW_SHIFT, the result has to be shifted right by
   1269 * BW_SHIFT.
   1270 * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT,
   1271 * dl_bw is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
   1272 * Since delta is a 64 bit variable, to have an overflow its value
   1273 * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds.
   1274 * So, overflow is not an issue here.
   1275 */
   1276static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
   1277{
   1278	u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
   1279	u64 u_act;
   1280	u64 u_act_min = (dl_se->dl_bw * rq->dl.bw_ratio) >> RATIO_SHIFT;
   1281
   1282	/*
   1283	 * Instead of computing max{u * bw_ratio, (1 - u_inact - u_extra)},
   1284	 * we compare u_inact + rq->dl.extra_bw with
   1285	 * 1 - (u * rq->dl.bw_ratio >> RATIO_SHIFT), because
   1286	 * u_inact + rq->dl.extra_bw can be larger than
   1287	 * 1 * (so, 1 - u_inact - rq->dl.extra_bw would be negative
   1288	 * leading to wrong results)
   1289	 */
   1290	if (u_inact + rq->dl.extra_bw > BW_UNIT - u_act_min)
   1291		u_act = u_act_min;
   1292	else
   1293		u_act = BW_UNIT - u_inact - rq->dl.extra_bw;
   1294
   1295	return (delta * u_act) >> BW_SHIFT;
   1296}
   1297
   1298/*
   1299 * Update the current task's runtime statistics (provided it is still
   1300 * a -deadline task and has not been removed from the dl_rq).
   1301 */
   1302static void update_curr_dl(struct rq *rq)
   1303{
   1304	struct task_struct *curr = rq->curr;
   1305	struct sched_dl_entity *dl_se = &curr->dl;
   1306	u64 delta_exec, scaled_delta_exec;
   1307	int cpu = cpu_of(rq);
   1308	u64 now;
   1309
   1310	if (!dl_task(curr) || !on_dl_rq(dl_se))
   1311		return;
   1312
   1313	/*
   1314	 * Consumed budget is computed considering the time as
   1315	 * observed by schedulable tasks (excluding time spent
   1316	 * in hardirq context, etc.). Deadlines are instead
   1317	 * computed using hard walltime. This seems to be the more
   1318	 * natural solution, but the full ramifications of this
   1319	 * approach need further study.
   1320	 */
   1321	now = rq_clock_task(rq);
   1322	delta_exec = now - curr->se.exec_start;
   1323	if (unlikely((s64)delta_exec <= 0)) {
   1324		if (unlikely(dl_se->dl_yielded))
   1325			goto throttle;
   1326		return;
   1327	}
   1328
   1329	schedstat_set(curr->stats.exec_max,
   1330		      max(curr->stats.exec_max, delta_exec));
   1331
   1332	trace_sched_stat_runtime(curr, delta_exec, 0);
   1333
   1334	curr->se.sum_exec_runtime += delta_exec;
   1335	account_group_exec_runtime(curr, delta_exec);
   1336
   1337	curr->se.exec_start = now;
   1338	cgroup_account_cputime(curr, delta_exec);
   1339
   1340	if (dl_entity_is_special(dl_se))
   1341		return;
   1342
   1343	/*
   1344	 * For tasks that participate in GRUB, we implement GRUB-PA: the
   1345	 * spare reclaimed bandwidth is used to clock down frequency.
   1346	 *
   1347	 * For the others, we still need to scale reservation parameters
   1348	 * according to current frequency and CPU maximum capacity.
   1349	 */
   1350	if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) {
   1351		scaled_delta_exec = grub_reclaim(delta_exec,
   1352						 rq,
   1353						 &curr->dl);
   1354	} else {
   1355		unsigned long scale_freq = arch_scale_freq_capacity(cpu);
   1356		unsigned long scale_cpu = arch_scale_cpu_capacity(cpu);
   1357
   1358		scaled_delta_exec = cap_scale(delta_exec, scale_freq);
   1359		scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu);
   1360	}
   1361
   1362	dl_se->runtime -= scaled_delta_exec;
   1363
   1364throttle:
   1365	if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) {
   1366		dl_se->dl_throttled = 1;
   1367
   1368		/* If requested, inform the user about runtime overruns. */
   1369		if (dl_runtime_exceeded(dl_se) &&
   1370		    (dl_se->flags & SCHED_FLAG_DL_OVERRUN))
   1371			dl_se->dl_overrun = 1;
   1372
   1373		__dequeue_task_dl(rq, curr, 0);
   1374		if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(curr)))
   1375			enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
   1376
   1377		if (!is_leftmost(curr, &rq->dl))
   1378			resched_curr(rq);
   1379	}
   1380
   1381	/*
   1382	 * Because -- for now -- we share the rt bandwidth, we need to
   1383	 * account our runtime there too, otherwise actual rt tasks
   1384	 * would be able to exceed the shared quota.
   1385	 *
   1386	 * Account to the root rt group for now.
   1387	 *
   1388	 * The solution we're working towards is having the RT groups scheduled
   1389	 * using deadline servers -- however there's a few nasties to figure
   1390	 * out before that can happen.
   1391	 */
   1392	if (rt_bandwidth_enabled()) {
   1393		struct rt_rq *rt_rq = &rq->rt;
   1394
   1395		raw_spin_lock(&rt_rq->rt_runtime_lock);
   1396		/*
   1397		 * We'll let actual RT tasks worry about the overflow here, we
   1398		 * have our own CBS to keep us inline; only account when RT
   1399		 * bandwidth is relevant.
   1400		 */
   1401		if (sched_rt_bandwidth_account(rt_rq))
   1402			rt_rq->rt_time += delta_exec;
   1403		raw_spin_unlock(&rt_rq->rt_runtime_lock);
   1404	}
   1405}
   1406
   1407static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
   1408{
   1409	struct sched_dl_entity *dl_se = container_of(timer,
   1410						     struct sched_dl_entity,
   1411						     inactive_timer);
   1412	struct task_struct *p = dl_task_of(dl_se);
   1413	struct rq_flags rf;
   1414	struct rq *rq;
   1415
   1416	rq = task_rq_lock(p, &rf);
   1417
   1418	sched_clock_tick();
   1419	update_rq_clock(rq);
   1420
   1421	if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
   1422		struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
   1423
   1424		if (READ_ONCE(p->__state) == TASK_DEAD && dl_se->dl_non_contending) {
   1425			sub_running_bw(&p->dl, dl_rq_of_se(&p->dl));
   1426			sub_rq_bw(&p->dl, dl_rq_of_se(&p->dl));
   1427			dl_se->dl_non_contending = 0;
   1428		}
   1429
   1430		raw_spin_lock(&dl_b->lock);
   1431		__dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
   1432		raw_spin_unlock(&dl_b->lock);
   1433		__dl_clear_params(p);
   1434
   1435		goto unlock;
   1436	}
   1437	if (dl_se->dl_non_contending == 0)
   1438		goto unlock;
   1439
   1440	sub_running_bw(dl_se, &rq->dl);
   1441	dl_se->dl_non_contending = 0;
   1442unlock:
   1443	task_rq_unlock(rq, p, &rf);
   1444	put_task_struct(p);
   1445
   1446	return HRTIMER_NORESTART;
   1447}
   1448
   1449void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
   1450{
   1451	struct hrtimer *timer = &dl_se->inactive_timer;
   1452
   1453	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
   1454	timer->function = inactive_task_timer;
   1455}
   1456
   1457#define __node_2_dle(node) \
   1458	rb_entry((node), struct sched_dl_entity, rb_node)
   1459
   1460#ifdef CONFIG_SMP
   1461
   1462static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
   1463{
   1464	struct rq *rq = rq_of_dl_rq(dl_rq);
   1465
   1466	if (dl_rq->earliest_dl.curr == 0 ||
   1467	    dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
   1468		if (dl_rq->earliest_dl.curr == 0)
   1469			cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_HIGHER);
   1470		dl_rq->earliest_dl.curr = deadline;
   1471		cpudl_set(&rq->rd->cpudl, rq->cpu, deadline);
   1472	}
   1473}
   1474
   1475static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
   1476{
   1477	struct rq *rq = rq_of_dl_rq(dl_rq);
   1478
   1479	/*
   1480	 * Since we may have removed our earliest (and/or next earliest)
   1481	 * task we must recompute them.
   1482	 */
   1483	if (!dl_rq->dl_nr_running) {
   1484		dl_rq->earliest_dl.curr = 0;
   1485		dl_rq->earliest_dl.next = 0;
   1486		cpudl_clear(&rq->rd->cpudl, rq->cpu);
   1487		cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
   1488	} else {
   1489		struct rb_node *leftmost = rb_first_cached(&dl_rq->root);
   1490		struct sched_dl_entity *entry = __node_2_dle(leftmost);
   1491
   1492		dl_rq->earliest_dl.curr = entry->deadline;
   1493		cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline);
   1494	}
   1495}
   1496
   1497#else
   1498
   1499static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
   1500static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
   1501
   1502#endif /* CONFIG_SMP */
   1503
   1504static inline
   1505void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
   1506{
   1507	int prio = dl_task_of(dl_se)->prio;
   1508	u64 deadline = dl_se->deadline;
   1509
   1510	WARN_ON(!dl_prio(prio));
   1511	dl_rq->dl_nr_running++;
   1512	add_nr_running(rq_of_dl_rq(dl_rq), 1);
   1513
   1514	inc_dl_deadline(dl_rq, deadline);
   1515	inc_dl_migration(dl_se, dl_rq);
   1516}
   1517
   1518static inline
   1519void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
   1520{
   1521	int prio = dl_task_of(dl_se)->prio;
   1522
   1523	WARN_ON(!dl_prio(prio));
   1524	WARN_ON(!dl_rq->dl_nr_running);
   1525	dl_rq->dl_nr_running--;
   1526	sub_nr_running(rq_of_dl_rq(dl_rq), 1);
   1527
   1528	dec_dl_deadline(dl_rq, dl_se->deadline);
   1529	dec_dl_migration(dl_se, dl_rq);
   1530}
   1531
   1532static inline bool __dl_less(struct rb_node *a, const struct rb_node *b)
   1533{
   1534	return dl_time_before(__node_2_dle(a)->deadline, __node_2_dle(b)->deadline);
   1535}
   1536
   1537static inline struct sched_statistics *
   1538__schedstats_from_dl_se(struct sched_dl_entity *dl_se)
   1539{
   1540	return &dl_task_of(dl_se)->stats;
   1541}
   1542
   1543static inline void
   1544update_stats_wait_start_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se)
   1545{
   1546	struct sched_statistics *stats;
   1547
   1548	if (!schedstat_enabled())
   1549		return;
   1550
   1551	stats = __schedstats_from_dl_se(dl_se);
   1552	__update_stats_wait_start(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats);
   1553}
   1554
   1555static inline void
   1556update_stats_wait_end_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se)
   1557{
   1558	struct sched_statistics *stats;
   1559
   1560	if (!schedstat_enabled())
   1561		return;
   1562
   1563	stats = __schedstats_from_dl_se(dl_se);
   1564	__update_stats_wait_end(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats);
   1565}
   1566
   1567static inline void
   1568update_stats_enqueue_sleeper_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se)
   1569{
   1570	struct sched_statistics *stats;
   1571
   1572	if (!schedstat_enabled())
   1573		return;
   1574
   1575	stats = __schedstats_from_dl_se(dl_se);
   1576	__update_stats_enqueue_sleeper(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats);
   1577}
   1578
   1579static inline void
   1580update_stats_enqueue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
   1581			int flags)
   1582{
   1583	if (!schedstat_enabled())
   1584		return;
   1585
   1586	if (flags & ENQUEUE_WAKEUP)
   1587		update_stats_enqueue_sleeper_dl(dl_rq, dl_se);
   1588}
   1589
   1590static inline void
   1591update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
   1592			int flags)
   1593{
   1594	struct task_struct *p = dl_task_of(dl_se);
   1595
   1596	if (!schedstat_enabled())
   1597		return;
   1598
   1599	if ((flags & DEQUEUE_SLEEP)) {
   1600		unsigned int state;
   1601
   1602		state = READ_ONCE(p->__state);
   1603		if (state & TASK_INTERRUPTIBLE)
   1604			__schedstat_set(p->stats.sleep_start,
   1605					rq_clock(rq_of_dl_rq(dl_rq)));
   1606
   1607		if (state & TASK_UNINTERRUPTIBLE)
   1608			__schedstat_set(p->stats.block_start,
   1609					rq_clock(rq_of_dl_rq(dl_rq)));
   1610	}
   1611}
   1612
   1613static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
   1614{
   1615	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
   1616
   1617	BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node));
   1618
   1619	rb_add_cached(&dl_se->rb_node, &dl_rq->root, __dl_less);
   1620
   1621	inc_dl_tasks(dl_se, dl_rq);
   1622}
   1623
   1624static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
   1625{
   1626	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
   1627
   1628	if (RB_EMPTY_NODE(&dl_se->rb_node))
   1629		return;
   1630
   1631	rb_erase_cached(&dl_se->rb_node, &dl_rq->root);
   1632
   1633	RB_CLEAR_NODE(&dl_se->rb_node);
   1634
   1635	dec_dl_tasks(dl_se, dl_rq);
   1636}
   1637
   1638static void
   1639enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
   1640{
   1641	BUG_ON(on_dl_rq(dl_se));
   1642
   1643	update_stats_enqueue_dl(dl_rq_of_se(dl_se), dl_se, flags);
   1644
   1645	/*
   1646	 * If this is a wakeup or a new instance, the scheduling
   1647	 * parameters of the task might need updating. Otherwise,
   1648	 * we want a replenishment of its runtime.
   1649	 */
   1650	if (flags & ENQUEUE_WAKEUP) {
   1651		task_contending(dl_se, flags);
   1652		update_dl_entity(dl_se);
   1653	} else if (flags & ENQUEUE_REPLENISH) {
   1654		replenish_dl_entity(dl_se);
   1655	} else if ((flags & ENQUEUE_RESTORE) &&
   1656		  dl_time_before(dl_se->deadline,
   1657				 rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) {
   1658		setup_new_dl_entity(dl_se);
   1659	}
   1660
   1661	__enqueue_dl_entity(dl_se);
   1662}
   1663
   1664static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
   1665{
   1666	__dequeue_dl_entity(dl_se);
   1667}
   1668
   1669static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
   1670{
   1671	if (is_dl_boosted(&p->dl)) {
   1672		/*
   1673		 * Because of delays in the detection of the overrun of a
   1674		 * thread's runtime, it might be the case that a thread
   1675		 * goes to sleep in a rt mutex with negative runtime. As
   1676		 * a consequence, the thread will be throttled.
   1677		 *
   1678		 * While waiting for the mutex, this thread can also be
   1679		 * boosted via PI, resulting in a thread that is throttled
   1680		 * and boosted at the same time.
   1681		 *
   1682		 * In this case, the boost overrides the throttle.
   1683		 */
   1684		if (p->dl.dl_throttled) {
   1685			/*
   1686			 * The replenish timer needs to be canceled. No
   1687			 * problem if it fires concurrently: boosted threads
   1688			 * are ignored in dl_task_timer().
   1689			 */
   1690			hrtimer_try_to_cancel(&p->dl.dl_timer);
   1691			p->dl.dl_throttled = 0;
   1692		}
   1693	} else if (!dl_prio(p->normal_prio)) {
   1694		/*
   1695		 * Special case in which we have a !SCHED_DEADLINE task that is going
   1696		 * to be deboosted, but exceeds its runtime while doing so. No point in
   1697		 * replenishing it, as it's going to return back to its original
   1698		 * scheduling class after this. If it has been throttled, we need to
   1699		 * clear the flag, otherwise the task may wake up as throttled after
   1700		 * being boosted again with no means to replenish the runtime and clear
   1701		 * the throttle.
   1702		 */
   1703		p->dl.dl_throttled = 0;
   1704		BUG_ON(!is_dl_boosted(&p->dl) || flags != ENQUEUE_REPLENISH);
   1705		return;
   1706	}
   1707
   1708	/*
   1709	 * Check if a constrained deadline task was activated
   1710	 * after the deadline but before the next period.
   1711	 * If that is the case, the task will be throttled and
   1712	 * the replenishment timer will be set to the next period.
   1713	 */
   1714	if (!p->dl.dl_throttled && !dl_is_implicit(&p->dl))
   1715		dl_check_constrained_dl(&p->dl);
   1716
   1717	if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) {
   1718		add_rq_bw(&p->dl, &rq->dl);
   1719		add_running_bw(&p->dl, &rq->dl);
   1720	}
   1721
   1722	/*
   1723	 * If p is throttled, we do not enqueue it. In fact, if it exhausted
   1724	 * its budget it needs a replenishment and, since it now is on
   1725	 * its rq, the bandwidth timer callback (which clearly has not
   1726	 * run yet) will take care of this.
   1727	 * However, the active utilization does not depend on the fact
   1728	 * that the task is on the runqueue or not (but depends on the
   1729	 * task's state - in GRUB parlance, "inactive" vs "active contending").
   1730	 * In other words, even if a task is throttled its utilization must
   1731	 * be counted in the active utilization; hence, we need to call
   1732	 * add_running_bw().
   1733	 */
   1734	if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) {
   1735		if (flags & ENQUEUE_WAKEUP)
   1736			task_contending(&p->dl, flags);
   1737
   1738		return;
   1739	}
   1740
   1741	check_schedstat_required();
   1742	update_stats_wait_start_dl(dl_rq_of_se(&p->dl), &p->dl);
   1743
   1744	enqueue_dl_entity(&p->dl, flags);
   1745
   1746	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
   1747		enqueue_pushable_dl_task(rq, p);
   1748}
   1749
   1750static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
   1751{
   1752	update_stats_dequeue_dl(&rq->dl, &p->dl, flags);
   1753	dequeue_dl_entity(&p->dl);
   1754	dequeue_pushable_dl_task(rq, p);
   1755}
   1756
   1757static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
   1758{
   1759	update_curr_dl(rq);
   1760	__dequeue_task_dl(rq, p, flags);
   1761
   1762	if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) {
   1763		sub_running_bw(&p->dl, &rq->dl);
   1764		sub_rq_bw(&p->dl, &rq->dl);
   1765	}
   1766
   1767	/*
   1768	 * This check allows to start the inactive timer (or to immediately
   1769	 * decrease the active utilization, if needed) in two cases:
   1770	 * when the task blocks and when it is terminating
   1771	 * (p->state == TASK_DEAD). We can handle the two cases in the same
   1772	 * way, because from GRUB's point of view the same thing is happening
   1773	 * (the task moves from "active contending" to "active non contending"
   1774	 * or "inactive")
   1775	 */
   1776	if (flags & DEQUEUE_SLEEP)
   1777		task_non_contending(p);
   1778}
   1779
   1780/*
   1781 * Yield task semantic for -deadline tasks is:
   1782 *
   1783 *   get off from the CPU until our next instance, with
   1784 *   a new runtime. This is of little use now, since we
   1785 *   don't have a bandwidth reclaiming mechanism. Anyway,
   1786 *   bandwidth reclaiming is planned for the future, and
   1787 *   yield_task_dl will indicate that some spare budget
   1788 *   is available for other task instances to use it.
   1789 */
   1790static void yield_task_dl(struct rq *rq)
   1791{
   1792	/*
   1793	 * We make the task go to sleep until its current deadline by
   1794	 * forcing its runtime to zero. This way, update_curr_dl() stops
   1795	 * it and the bandwidth timer will wake it up and will give it
   1796	 * new scheduling parameters (thanks to dl_yielded=1).
   1797	 */
   1798	rq->curr->dl.dl_yielded = 1;
   1799
   1800	update_rq_clock(rq);
   1801	update_curr_dl(rq);
   1802	/*
   1803	 * Tell update_rq_clock() that we've just updated,
   1804	 * so we don't do microscopic update in schedule()
   1805	 * and double the fastpath cost.
   1806	 */
   1807	rq_clock_skip_update(rq);
   1808}
   1809
   1810#ifdef CONFIG_SMP
   1811
   1812static int find_later_rq(struct task_struct *task);
   1813
   1814static int
   1815select_task_rq_dl(struct task_struct *p, int cpu, int flags)
   1816{
   1817	struct task_struct *curr;
   1818	bool select_rq;
   1819	struct rq *rq;
   1820
   1821	if (!(flags & WF_TTWU))
   1822		goto out;
   1823
   1824	rq = cpu_rq(cpu);
   1825
   1826	rcu_read_lock();
   1827	curr = READ_ONCE(rq->curr); /* unlocked access */
   1828
   1829	/*
   1830	 * If we are dealing with a -deadline task, we must
   1831	 * decide where to wake it up.
   1832	 * If it has a later deadline and the current task
   1833	 * on this rq can't move (provided the waking task
   1834	 * can!) we prefer to send it somewhere else. On the
   1835	 * other hand, if it has a shorter deadline, we
   1836	 * try to make it stay here, it might be important.
   1837	 */
   1838	select_rq = unlikely(dl_task(curr)) &&
   1839		    (curr->nr_cpus_allowed < 2 ||
   1840		     !dl_entity_preempt(&p->dl, &curr->dl)) &&
   1841		    p->nr_cpus_allowed > 1;
   1842
   1843	/*
   1844	 * Take the capacity of the CPU into account to
   1845	 * ensure it fits the requirement of the task.
   1846	 */
   1847	if (static_branch_unlikely(&sched_asym_cpucapacity))
   1848		select_rq |= !dl_task_fits_capacity(p, cpu);
   1849
   1850	if (select_rq) {
   1851		int target = find_later_rq(p);
   1852
   1853		if (target != -1 &&
   1854				(dl_time_before(p->dl.deadline,
   1855					cpu_rq(target)->dl.earliest_dl.curr) ||
   1856				(cpu_rq(target)->dl.dl_nr_running == 0)))
   1857			cpu = target;
   1858	}
   1859	rcu_read_unlock();
   1860
   1861out:
   1862	return cpu;
   1863}
   1864
   1865static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused)
   1866{
   1867	struct rq_flags rf;
   1868	struct rq *rq;
   1869
   1870	if (READ_ONCE(p->__state) != TASK_WAKING)
   1871		return;
   1872
   1873	rq = task_rq(p);
   1874	/*
   1875	 * Since p->state == TASK_WAKING, set_task_cpu() has been called
   1876	 * from try_to_wake_up(). Hence, p->pi_lock is locked, but
   1877	 * rq->lock is not... So, lock it
   1878	 */
   1879	rq_lock(rq, &rf);
   1880	if (p->dl.dl_non_contending) {
   1881		update_rq_clock(rq);
   1882		sub_running_bw(&p->dl, &rq->dl);
   1883		p->dl.dl_non_contending = 0;
   1884		/*
   1885		 * If the timer handler is currently running and the
   1886		 * timer cannot be canceled, inactive_task_timer()
   1887		 * will see that dl_not_contending is not set, and
   1888		 * will not touch the rq's active utilization,
   1889		 * so we are still safe.
   1890		 */
   1891		if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
   1892			put_task_struct(p);
   1893	}
   1894	sub_rq_bw(&p->dl, &rq->dl);
   1895	rq_unlock(rq, &rf);
   1896}
   1897
   1898static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
   1899{
   1900	/*
   1901	 * Current can't be migrated, useless to reschedule,
   1902	 * let's hope p can move out.
   1903	 */
   1904	if (rq->curr->nr_cpus_allowed == 1 ||
   1905	    !cpudl_find(&rq->rd->cpudl, rq->curr, NULL))
   1906		return;
   1907
   1908	/*
   1909	 * p is migratable, so let's not schedule it and
   1910	 * see if it is pushed or pulled somewhere else.
   1911	 */
   1912	if (p->nr_cpus_allowed != 1 &&
   1913	    cpudl_find(&rq->rd->cpudl, p, NULL))
   1914		return;
   1915
   1916	resched_curr(rq);
   1917}
   1918
   1919static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
   1920{
   1921	if (!on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) {
   1922		/*
   1923		 * This is OK, because current is on_cpu, which avoids it being
   1924		 * picked for load-balance and preemption/IRQs are still
   1925		 * disabled avoiding further scheduler activity on it and we've
   1926		 * not yet started the picking loop.
   1927		 */
   1928		rq_unpin_lock(rq, rf);
   1929		pull_dl_task(rq);
   1930		rq_repin_lock(rq, rf);
   1931	}
   1932
   1933	return sched_stop_runnable(rq) || sched_dl_runnable(rq);
   1934}
   1935#endif /* CONFIG_SMP */
   1936
   1937/*
   1938 * Only called when both the current and waking task are -deadline
   1939 * tasks.
   1940 */
   1941static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
   1942				  int flags)
   1943{
   1944	if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
   1945		resched_curr(rq);
   1946		return;
   1947	}
   1948
   1949#ifdef CONFIG_SMP
   1950	/*
   1951	 * In the unlikely case current and p have the same deadline
   1952	 * let us try to decide what's the best thing to do...
   1953	 */
   1954	if ((p->dl.deadline == rq->curr->dl.deadline) &&
   1955	    !test_tsk_need_resched(rq->curr))
   1956		check_preempt_equal_dl(rq, p);
   1957#endif /* CONFIG_SMP */
   1958}
   1959
   1960#ifdef CONFIG_SCHED_HRTICK
   1961static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
   1962{
   1963	hrtick_start(rq, p->dl.runtime);
   1964}
   1965#else /* !CONFIG_SCHED_HRTICK */
   1966static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
   1967{
   1968}
   1969#endif
   1970
   1971static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
   1972{
   1973	struct sched_dl_entity *dl_se = &p->dl;
   1974	struct dl_rq *dl_rq = &rq->dl;
   1975
   1976	p->se.exec_start = rq_clock_task(rq);
   1977	if (on_dl_rq(&p->dl))
   1978		update_stats_wait_end_dl(dl_rq, dl_se);
   1979
   1980	/* You can't push away the running task */
   1981	dequeue_pushable_dl_task(rq, p);
   1982
   1983	if (!first)
   1984		return;
   1985
   1986	if (hrtick_enabled_dl(rq))
   1987		start_hrtick_dl(rq, p);
   1988
   1989	if (rq->curr->sched_class != &dl_sched_class)
   1990		update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
   1991
   1992	deadline_queue_push_tasks(rq);
   1993}
   1994
   1995static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq)
   1996{
   1997	struct rb_node *left = rb_first_cached(&dl_rq->root);
   1998
   1999	if (!left)
   2000		return NULL;
   2001
   2002	return __node_2_dle(left);
   2003}
   2004
   2005static struct task_struct *pick_task_dl(struct rq *rq)
   2006{
   2007	struct sched_dl_entity *dl_se;
   2008	struct dl_rq *dl_rq = &rq->dl;
   2009	struct task_struct *p;
   2010
   2011	if (!sched_dl_runnable(rq))
   2012		return NULL;
   2013
   2014	dl_se = pick_next_dl_entity(dl_rq);
   2015	BUG_ON(!dl_se);
   2016	p = dl_task_of(dl_se);
   2017
   2018	return p;
   2019}
   2020
   2021static struct task_struct *pick_next_task_dl(struct rq *rq)
   2022{
   2023	struct task_struct *p;
   2024
   2025	p = pick_task_dl(rq);
   2026	if (p)
   2027		set_next_task_dl(rq, p, true);
   2028
   2029	return p;
   2030}
   2031
   2032static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
   2033{
   2034	struct sched_dl_entity *dl_se = &p->dl;
   2035	struct dl_rq *dl_rq = &rq->dl;
   2036
   2037	if (on_dl_rq(&p->dl))
   2038		update_stats_wait_start_dl(dl_rq, dl_se);
   2039
   2040	update_curr_dl(rq);
   2041
   2042	update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
   2043	if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
   2044		enqueue_pushable_dl_task(rq, p);
   2045}
   2046
   2047/*
   2048 * scheduler tick hitting a task of our scheduling class.
   2049 *
   2050 * NOTE: This function can be called remotely by the tick offload that
   2051 * goes along full dynticks. Therefore no local assumption can be made
   2052 * and everything must be accessed through the @rq and @curr passed in
   2053 * parameters.
   2054 */
   2055static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
   2056{
   2057	update_curr_dl(rq);
   2058
   2059	update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
   2060	/*
   2061	 * Even when we have runtime, update_curr_dl() might have resulted in us
   2062	 * not being the leftmost task anymore. In that case NEED_RESCHED will
   2063	 * be set and schedule() will start a new hrtick for the next task.
   2064	 */
   2065	if (hrtick_enabled_dl(rq) && queued && p->dl.runtime > 0 &&
   2066	    is_leftmost(p, &rq->dl))
   2067		start_hrtick_dl(rq, p);
   2068}
   2069
   2070static void task_fork_dl(struct task_struct *p)
   2071{
   2072	/*
   2073	 * SCHED_DEADLINE tasks cannot fork and this is achieved through
   2074	 * sched_fork()
   2075	 */
   2076}
   2077
   2078#ifdef CONFIG_SMP
   2079
   2080/* Only try algorithms three times */
   2081#define DL_MAX_TRIES 3
   2082
   2083static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
   2084{
   2085	if (!task_running(rq, p) &&
   2086	    cpumask_test_cpu(cpu, &p->cpus_mask))
   2087		return 1;
   2088	return 0;
   2089}
   2090
   2091/*
   2092 * Return the earliest pushable rq's task, which is suitable to be executed
   2093 * on the CPU, NULL otherwise:
   2094 */
   2095static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu)
   2096{
   2097	struct task_struct *p = NULL;
   2098	struct rb_node *next_node;
   2099
   2100	if (!has_pushable_dl_tasks(rq))
   2101		return NULL;
   2102
   2103	next_node = rb_first_cached(&rq->dl.pushable_dl_tasks_root);
   2104
   2105next_node:
   2106	if (next_node) {
   2107		p = __node_2_pdl(next_node);
   2108
   2109		if (pick_dl_task(rq, p, cpu))
   2110			return p;
   2111
   2112		next_node = rb_next(next_node);
   2113		goto next_node;
   2114	}
   2115
   2116	return NULL;
   2117}
   2118
   2119static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
   2120
   2121static int find_later_rq(struct task_struct *task)
   2122{
   2123	struct sched_domain *sd;
   2124	struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
   2125	int this_cpu = smp_processor_id();
   2126	int cpu = task_cpu(task);
   2127
   2128	/* Make sure the mask is initialized first */
   2129	if (unlikely(!later_mask))
   2130		return -1;
   2131
   2132	if (task->nr_cpus_allowed == 1)
   2133		return -1;
   2134
   2135	/*
   2136	 * We have to consider system topology and task affinity
   2137	 * first, then we can look for a suitable CPU.
   2138	 */
   2139	if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask))
   2140		return -1;
   2141
   2142	/*
   2143	 * If we are here, some targets have been found, including
   2144	 * the most suitable which is, among the runqueues where the
   2145	 * current tasks have later deadlines than the task's one, the
   2146	 * rq with the latest possible one.
   2147	 *
   2148	 * Now we check how well this matches with task's
   2149	 * affinity and system topology.
   2150	 *
   2151	 * The last CPU where the task run is our first
   2152	 * guess, since it is most likely cache-hot there.
   2153	 */
   2154	if (cpumask_test_cpu(cpu, later_mask))
   2155		return cpu;
   2156	/*
   2157	 * Check if this_cpu is to be skipped (i.e., it is
   2158	 * not in the mask) or not.
   2159	 */
   2160	if (!cpumask_test_cpu(this_cpu, later_mask))
   2161		this_cpu = -1;
   2162
   2163	rcu_read_lock();
   2164	for_each_domain(cpu, sd) {
   2165		if (sd->flags & SD_WAKE_AFFINE) {
   2166			int best_cpu;
   2167
   2168			/*
   2169			 * If possible, preempting this_cpu is
   2170			 * cheaper than migrating.
   2171			 */
   2172			if (this_cpu != -1 &&
   2173			    cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
   2174				rcu_read_unlock();
   2175				return this_cpu;
   2176			}
   2177
   2178			best_cpu = cpumask_any_and_distribute(later_mask,
   2179							      sched_domain_span(sd));
   2180			/*
   2181			 * Last chance: if a CPU being in both later_mask
   2182			 * and current sd span is valid, that becomes our
   2183			 * choice. Of course, the latest possible CPU is
   2184			 * already under consideration through later_mask.
   2185			 */
   2186			if (best_cpu < nr_cpu_ids) {
   2187				rcu_read_unlock();
   2188				return best_cpu;
   2189			}
   2190		}
   2191	}
   2192	rcu_read_unlock();
   2193
   2194	/*
   2195	 * At this point, all our guesses failed, we just return
   2196	 * 'something', and let the caller sort the things out.
   2197	 */
   2198	if (this_cpu != -1)
   2199		return this_cpu;
   2200
   2201	cpu = cpumask_any_distribute(later_mask);
   2202	if (cpu < nr_cpu_ids)
   2203		return cpu;
   2204
   2205	return -1;
   2206}
   2207
   2208/* Locks the rq it finds */
   2209static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
   2210{
   2211	struct rq *later_rq = NULL;
   2212	int tries;
   2213	int cpu;
   2214
   2215	for (tries = 0; tries < DL_MAX_TRIES; tries++) {
   2216		cpu = find_later_rq(task);
   2217
   2218		if ((cpu == -1) || (cpu == rq->cpu))
   2219			break;
   2220
   2221		later_rq = cpu_rq(cpu);
   2222
   2223		if (later_rq->dl.dl_nr_running &&
   2224		    !dl_time_before(task->dl.deadline,
   2225					later_rq->dl.earliest_dl.curr)) {
   2226			/*
   2227			 * Target rq has tasks of equal or earlier deadline,
   2228			 * retrying does not release any lock and is unlikely
   2229			 * to yield a different result.
   2230			 */
   2231			later_rq = NULL;
   2232			break;
   2233		}
   2234
   2235		/* Retry if something changed. */
   2236		if (double_lock_balance(rq, later_rq)) {
   2237			if (unlikely(task_rq(task) != rq ||
   2238				     !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) ||
   2239				     task_running(rq, task) ||
   2240				     !dl_task(task) ||
   2241				     !task_on_rq_queued(task))) {
   2242				double_unlock_balance(rq, later_rq);
   2243				later_rq = NULL;
   2244				break;
   2245			}
   2246		}
   2247
   2248		/*
   2249		 * If the rq we found has no -deadline task, or
   2250		 * its earliest one has a later deadline than our
   2251		 * task, the rq is a good one.
   2252		 */
   2253		if (!later_rq->dl.dl_nr_running ||
   2254		    dl_time_before(task->dl.deadline,
   2255				   later_rq->dl.earliest_dl.curr))
   2256			break;
   2257
   2258		/* Otherwise we try again. */
   2259		double_unlock_balance(rq, later_rq);
   2260		later_rq = NULL;
   2261	}
   2262
   2263	return later_rq;
   2264}
   2265
   2266static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
   2267{
   2268	struct task_struct *p;
   2269
   2270	if (!has_pushable_dl_tasks(rq))
   2271		return NULL;
   2272
   2273	p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root));
   2274
   2275	BUG_ON(rq->cpu != task_cpu(p));
   2276	BUG_ON(task_current(rq, p));
   2277	BUG_ON(p->nr_cpus_allowed <= 1);
   2278
   2279	BUG_ON(!task_on_rq_queued(p));
   2280	BUG_ON(!dl_task(p));
   2281
   2282	return p;
   2283}
   2284
   2285/*
   2286 * See if the non running -deadline tasks on this rq
   2287 * can be sent to some other CPU where they can preempt
   2288 * and start executing.
   2289 */
   2290static int push_dl_task(struct rq *rq)
   2291{
   2292	struct task_struct *next_task;
   2293	struct rq *later_rq;
   2294	int ret = 0;
   2295
   2296	if (!rq->dl.overloaded)
   2297		return 0;
   2298
   2299	next_task = pick_next_pushable_dl_task(rq);
   2300	if (!next_task)
   2301		return 0;
   2302
   2303retry:
   2304	/*
   2305	 * If next_task preempts rq->curr, and rq->curr
   2306	 * can move away, it makes sense to just reschedule
   2307	 * without going further in pushing next_task.
   2308	 */
   2309	if (dl_task(rq->curr) &&
   2310	    dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
   2311	    rq->curr->nr_cpus_allowed > 1) {
   2312		resched_curr(rq);
   2313		return 0;
   2314	}
   2315
   2316	if (is_migration_disabled(next_task))
   2317		return 0;
   2318
   2319	if (WARN_ON(next_task == rq->curr))
   2320		return 0;
   2321
   2322	/* We might release rq lock */
   2323	get_task_struct(next_task);
   2324
   2325	/* Will lock the rq it'll find */
   2326	later_rq = find_lock_later_rq(next_task, rq);
   2327	if (!later_rq) {
   2328		struct task_struct *task;
   2329
   2330		/*
   2331		 * We must check all this again, since
   2332		 * find_lock_later_rq releases rq->lock and it is
   2333		 * then possible that next_task has migrated.
   2334		 */
   2335		task = pick_next_pushable_dl_task(rq);
   2336		if (task == next_task) {
   2337			/*
   2338			 * The task is still there. We don't try
   2339			 * again, some other CPU will pull it when ready.
   2340			 */
   2341			goto out;
   2342		}
   2343
   2344		if (!task)
   2345			/* No more tasks */
   2346			goto out;
   2347
   2348		put_task_struct(next_task);
   2349		next_task = task;
   2350		goto retry;
   2351	}
   2352
   2353	deactivate_task(rq, next_task, 0);
   2354	set_task_cpu(next_task, later_rq->cpu);
   2355	activate_task(later_rq, next_task, 0);
   2356	ret = 1;
   2357
   2358	resched_curr(later_rq);
   2359
   2360	double_unlock_balance(rq, later_rq);
   2361
   2362out:
   2363	put_task_struct(next_task);
   2364
   2365	return ret;
   2366}
   2367
   2368static void push_dl_tasks(struct rq *rq)
   2369{
   2370	/* push_dl_task() will return true if it moved a -deadline task */
   2371	while (push_dl_task(rq))
   2372		;
   2373}
   2374
   2375static void pull_dl_task(struct rq *this_rq)
   2376{
   2377	int this_cpu = this_rq->cpu, cpu;
   2378	struct task_struct *p, *push_task;
   2379	bool resched = false;
   2380	struct rq *src_rq;
   2381	u64 dmin = LONG_MAX;
   2382
   2383	if (likely(!dl_overloaded(this_rq)))
   2384		return;
   2385
   2386	/*
   2387	 * Match the barrier from dl_set_overloaded; this guarantees that if we
   2388	 * see overloaded we must also see the dlo_mask bit.
   2389	 */
   2390	smp_rmb();
   2391
   2392	for_each_cpu(cpu, this_rq->rd->dlo_mask) {
   2393		if (this_cpu == cpu)
   2394			continue;
   2395
   2396		src_rq = cpu_rq(cpu);
   2397
   2398		/*
   2399		 * It looks racy, abd it is! However, as in sched_rt.c,
   2400		 * we are fine with this.
   2401		 */
   2402		if (this_rq->dl.dl_nr_running &&
   2403		    dl_time_before(this_rq->dl.earliest_dl.curr,
   2404				   src_rq->dl.earliest_dl.next))
   2405			continue;
   2406
   2407		/* Might drop this_rq->lock */
   2408		push_task = NULL;
   2409		double_lock_balance(this_rq, src_rq);
   2410
   2411		/*
   2412		 * If there are no more pullable tasks on the
   2413		 * rq, we're done with it.
   2414		 */
   2415		if (src_rq->dl.dl_nr_running <= 1)
   2416			goto skip;
   2417
   2418		p = pick_earliest_pushable_dl_task(src_rq, this_cpu);
   2419
   2420		/*
   2421		 * We found a task to be pulled if:
   2422		 *  - it preempts our current (if there's one),
   2423		 *  - it will preempt the last one we pulled (if any).
   2424		 */
   2425		if (p && dl_time_before(p->dl.deadline, dmin) &&
   2426		    (!this_rq->dl.dl_nr_running ||
   2427		     dl_time_before(p->dl.deadline,
   2428				    this_rq->dl.earliest_dl.curr))) {
   2429			WARN_ON(p == src_rq->curr);
   2430			WARN_ON(!task_on_rq_queued(p));
   2431
   2432			/*
   2433			 * Then we pull iff p has actually an earlier
   2434			 * deadline than the current task of its runqueue.
   2435			 */
   2436			if (dl_time_before(p->dl.deadline,
   2437					   src_rq->curr->dl.deadline))
   2438				goto skip;
   2439
   2440			if (is_migration_disabled(p)) {
   2441				push_task = get_push_task(src_rq);
   2442			} else {
   2443				deactivate_task(src_rq, p, 0);
   2444				set_task_cpu(p, this_cpu);
   2445				activate_task(this_rq, p, 0);
   2446				dmin = p->dl.deadline;
   2447				resched = true;
   2448			}
   2449
   2450			/* Is there any other task even earlier? */
   2451		}
   2452skip:
   2453		double_unlock_balance(this_rq, src_rq);
   2454
   2455		if (push_task) {
   2456			raw_spin_rq_unlock(this_rq);
   2457			stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
   2458					    push_task, &src_rq->push_work);
   2459			raw_spin_rq_lock(this_rq);
   2460		}
   2461	}
   2462
   2463	if (resched)
   2464		resched_curr(this_rq);
   2465}
   2466
   2467/*
   2468 * Since the task is not running and a reschedule is not going to happen
   2469 * anytime soon on its runqueue, we try pushing it away now.
   2470 */
   2471static void task_woken_dl(struct rq *rq, struct task_struct *p)
   2472{
   2473	if (!task_running(rq, p) &&
   2474	    !test_tsk_need_resched(rq->curr) &&
   2475	    p->nr_cpus_allowed > 1 &&
   2476	    dl_task(rq->curr) &&
   2477	    (rq->curr->nr_cpus_allowed < 2 ||
   2478	     !dl_entity_preempt(&p->dl, &rq->curr->dl))) {
   2479		push_dl_tasks(rq);
   2480	}
   2481}
   2482
   2483static void set_cpus_allowed_dl(struct task_struct *p,
   2484				const struct cpumask *new_mask,
   2485				u32 flags)
   2486{
   2487	struct root_domain *src_rd;
   2488	struct rq *rq;
   2489
   2490	BUG_ON(!dl_task(p));
   2491
   2492	rq = task_rq(p);
   2493	src_rd = rq->rd;
   2494	/*
   2495	 * Migrating a SCHED_DEADLINE task between exclusive
   2496	 * cpusets (different root_domains) entails a bandwidth
   2497	 * update. We already made space for us in the destination
   2498	 * domain (see cpuset_can_attach()).
   2499	 */
   2500	if (!cpumask_intersects(src_rd->span, new_mask)) {
   2501		struct dl_bw *src_dl_b;
   2502
   2503		src_dl_b = dl_bw_of(cpu_of(rq));
   2504		/*
   2505		 * We now free resources of the root_domain we are migrating
   2506		 * off. In the worst case, sched_setattr() may temporary fail
   2507		 * until we complete the update.
   2508		 */
   2509		raw_spin_lock(&src_dl_b->lock);
   2510		__dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
   2511		raw_spin_unlock(&src_dl_b->lock);
   2512	}
   2513
   2514	set_cpus_allowed_common(p, new_mask, flags);
   2515}
   2516
   2517/* Assumes rq->lock is held */
   2518static void rq_online_dl(struct rq *rq)
   2519{
   2520	if (rq->dl.overloaded)
   2521		dl_set_overload(rq);
   2522
   2523	cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
   2524	if (rq->dl.dl_nr_running > 0)
   2525		cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr);
   2526}
   2527
   2528/* Assumes rq->lock is held */
   2529static void rq_offline_dl(struct rq *rq)
   2530{
   2531	if (rq->dl.overloaded)
   2532		dl_clear_overload(rq);
   2533
   2534	cpudl_clear(&rq->rd->cpudl, rq->cpu);
   2535	cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
   2536}
   2537
   2538void __init init_sched_dl_class(void)
   2539{
   2540	unsigned int i;
   2541
   2542	for_each_possible_cpu(i)
   2543		zalloc_cpumask_var_node(&per_cpu(local_cpu_mask_dl, i),
   2544					GFP_KERNEL, cpu_to_node(i));
   2545}
   2546
   2547void dl_add_task_root_domain(struct task_struct *p)
   2548{
   2549	struct rq_flags rf;
   2550	struct rq *rq;
   2551	struct dl_bw *dl_b;
   2552
   2553	raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
   2554	if (!dl_task(p)) {
   2555		raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
   2556		return;
   2557	}
   2558
   2559	rq = __task_rq_lock(p, &rf);
   2560
   2561	dl_b = &rq->rd->dl_bw;
   2562	raw_spin_lock(&dl_b->lock);
   2563
   2564	__dl_add(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
   2565
   2566	raw_spin_unlock(&dl_b->lock);
   2567
   2568	task_rq_unlock(rq, p, &rf);
   2569}
   2570
   2571void dl_clear_root_domain(struct root_domain *rd)
   2572{
   2573	unsigned long flags;
   2574
   2575	raw_spin_lock_irqsave(&rd->dl_bw.lock, flags);
   2576	rd->dl_bw.total_bw = 0;
   2577	raw_spin_unlock_irqrestore(&rd->dl_bw.lock, flags);
   2578}
   2579
   2580#endif /* CONFIG_SMP */
   2581
   2582static void switched_from_dl(struct rq *rq, struct task_struct *p)
   2583{
   2584	/*
   2585	 * task_non_contending() can start the "inactive timer" (if the 0-lag
   2586	 * time is in the future). If the task switches back to dl before
   2587	 * the "inactive timer" fires, it can continue to consume its current
   2588	 * runtime using its current deadline. If it stays outside of
   2589	 * SCHED_DEADLINE until the 0-lag time passes, inactive_task_timer()
   2590	 * will reset the task parameters.
   2591	 */
   2592	if (task_on_rq_queued(p) && p->dl.dl_runtime)
   2593		task_non_contending(p);
   2594
   2595	if (!task_on_rq_queued(p)) {
   2596		/*
   2597		 * Inactive timer is armed. However, p is leaving DEADLINE and
   2598		 * might migrate away from this rq while continuing to run on
   2599		 * some other class. We need to remove its contribution from
   2600		 * this rq running_bw now, or sub_rq_bw (below) will complain.
   2601		 */
   2602		if (p->dl.dl_non_contending)
   2603			sub_running_bw(&p->dl, &rq->dl);
   2604		sub_rq_bw(&p->dl, &rq->dl);
   2605	}
   2606
   2607	/*
   2608	 * We cannot use inactive_task_timer() to invoke sub_running_bw()
   2609	 * at the 0-lag time, because the task could have been migrated
   2610	 * while SCHED_OTHER in the meanwhile.
   2611	 */
   2612	if (p->dl.dl_non_contending)
   2613		p->dl.dl_non_contending = 0;
   2614
   2615	/*
   2616	 * Since this might be the only -deadline task on the rq,
   2617	 * this is the right place to try to pull some other one
   2618	 * from an overloaded CPU, if any.
   2619	 */
   2620	if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
   2621		return;
   2622
   2623	deadline_queue_pull_task(rq);
   2624}
   2625
   2626/*
   2627 * When switching to -deadline, we may overload the rq, then
   2628 * we try to push someone off, if possible.
   2629 */
   2630static void switched_to_dl(struct rq *rq, struct task_struct *p)
   2631{
   2632	if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
   2633		put_task_struct(p);
   2634
   2635	/* If p is not queued we will update its parameters at next wakeup. */
   2636	if (!task_on_rq_queued(p)) {
   2637		add_rq_bw(&p->dl, &rq->dl);
   2638
   2639		return;
   2640	}
   2641
   2642	if (rq->curr != p) {
   2643#ifdef CONFIG_SMP
   2644		if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
   2645			deadline_queue_push_tasks(rq);
   2646#endif
   2647		if (dl_task(rq->curr))
   2648			check_preempt_curr_dl(rq, p, 0);
   2649		else
   2650			resched_curr(rq);
   2651	} else {
   2652		update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
   2653	}
   2654}
   2655
   2656/*
   2657 * If the scheduling parameters of a -deadline task changed,
   2658 * a push or pull operation might be needed.
   2659 */
   2660static void prio_changed_dl(struct rq *rq, struct task_struct *p,
   2661			    int oldprio)
   2662{
   2663	if (task_on_rq_queued(p) || task_current(rq, p)) {
   2664#ifdef CONFIG_SMP
   2665		/*
   2666		 * This might be too much, but unfortunately
   2667		 * we don't have the old deadline value, and
   2668		 * we can't argue if the task is increasing
   2669		 * or lowering its prio, so...
   2670		 */
   2671		if (!rq->dl.overloaded)
   2672			deadline_queue_pull_task(rq);
   2673
   2674		/*
   2675		 * If we now have a earlier deadline task than p,
   2676		 * then reschedule, provided p is still on this
   2677		 * runqueue.
   2678		 */
   2679		if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline))
   2680			resched_curr(rq);
   2681#else
   2682		/*
   2683		 * Again, we don't know if p has a earlier
   2684		 * or later deadline, so let's blindly set a
   2685		 * (maybe not needed) rescheduling point.
   2686		 */
   2687		resched_curr(rq);
   2688#endif /* CONFIG_SMP */
   2689	}
   2690}
   2691
   2692DEFINE_SCHED_CLASS(dl) = {
   2693
   2694	.enqueue_task		= enqueue_task_dl,
   2695	.dequeue_task		= dequeue_task_dl,
   2696	.yield_task		= yield_task_dl,
   2697
   2698	.check_preempt_curr	= check_preempt_curr_dl,
   2699
   2700	.pick_next_task		= pick_next_task_dl,
   2701	.put_prev_task		= put_prev_task_dl,
   2702	.set_next_task		= set_next_task_dl,
   2703
   2704#ifdef CONFIG_SMP
   2705	.balance		= balance_dl,
   2706	.pick_task		= pick_task_dl,
   2707	.select_task_rq		= select_task_rq_dl,
   2708	.migrate_task_rq	= migrate_task_rq_dl,
   2709	.set_cpus_allowed       = set_cpus_allowed_dl,
   2710	.rq_online              = rq_online_dl,
   2711	.rq_offline             = rq_offline_dl,
   2712	.task_woken		= task_woken_dl,
   2713	.find_lock_rq		= find_lock_later_rq,
   2714#endif
   2715
   2716	.task_tick		= task_tick_dl,
   2717	.task_fork              = task_fork_dl,
   2718
   2719	.prio_changed           = prio_changed_dl,
   2720	.switched_from		= switched_from_dl,
   2721	.switched_to		= switched_to_dl,
   2722
   2723	.update_curr		= update_curr_dl,
   2724};
   2725
   2726/* Used for dl_bw check and update, used under sched_rt_handler()::mutex */
   2727static u64 dl_generation;
   2728
   2729int sched_dl_global_validate(void)
   2730{
   2731	u64 runtime = global_rt_runtime();
   2732	u64 period = global_rt_period();
   2733	u64 new_bw = to_ratio(period, runtime);
   2734	u64 gen = ++dl_generation;
   2735	struct dl_bw *dl_b;
   2736	int cpu, cpus, ret = 0;
   2737	unsigned long flags;
   2738
   2739	/*
   2740	 * Here we want to check the bandwidth not being set to some
   2741	 * value smaller than the currently allocated bandwidth in
   2742	 * any of the root_domains.
   2743	 */
   2744	for_each_possible_cpu(cpu) {
   2745		rcu_read_lock_sched();
   2746
   2747		if (dl_bw_visited(cpu, gen))
   2748			goto next;
   2749
   2750		dl_b = dl_bw_of(cpu);
   2751		cpus = dl_bw_cpus(cpu);
   2752
   2753		raw_spin_lock_irqsave(&dl_b->lock, flags);
   2754		if (new_bw * cpus < dl_b->total_bw)
   2755			ret = -EBUSY;
   2756		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
   2757
   2758next:
   2759		rcu_read_unlock_sched();
   2760
   2761		if (ret)
   2762			break;
   2763	}
   2764
   2765	return ret;
   2766}
   2767
   2768static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
   2769{
   2770	if (global_rt_runtime() == RUNTIME_INF) {
   2771		dl_rq->bw_ratio = 1 << RATIO_SHIFT;
   2772		dl_rq->extra_bw = 1 << BW_SHIFT;
   2773	} else {
   2774		dl_rq->bw_ratio = to_ratio(global_rt_runtime(),
   2775			  global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT);
   2776		dl_rq->extra_bw = to_ratio(global_rt_period(),
   2777						    global_rt_runtime());
   2778	}
   2779}
   2780
   2781void sched_dl_do_global(void)
   2782{
   2783	u64 new_bw = -1;
   2784	u64 gen = ++dl_generation;
   2785	struct dl_bw *dl_b;
   2786	int cpu;
   2787	unsigned long flags;
   2788
   2789	if (global_rt_runtime() != RUNTIME_INF)
   2790		new_bw = to_ratio(global_rt_period(), global_rt_runtime());
   2791
   2792	for_each_possible_cpu(cpu) {
   2793		rcu_read_lock_sched();
   2794
   2795		if (dl_bw_visited(cpu, gen)) {
   2796			rcu_read_unlock_sched();
   2797			continue;
   2798		}
   2799
   2800		dl_b = dl_bw_of(cpu);
   2801
   2802		raw_spin_lock_irqsave(&dl_b->lock, flags);
   2803		dl_b->bw = new_bw;
   2804		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
   2805
   2806		rcu_read_unlock_sched();
   2807		init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl);
   2808	}
   2809}
   2810
   2811/*
   2812 * We must be sure that accepting a new task (or allowing changing the
   2813 * parameters of an existing one) is consistent with the bandwidth
   2814 * constraints. If yes, this function also accordingly updates the currently
   2815 * allocated bandwidth to reflect the new situation.
   2816 *
   2817 * This function is called while holding p's rq->lock.
   2818 */
   2819int sched_dl_overflow(struct task_struct *p, int policy,
   2820		      const struct sched_attr *attr)
   2821{
   2822	u64 period = attr->sched_period ?: attr->sched_deadline;
   2823	u64 runtime = attr->sched_runtime;
   2824	u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
   2825	int cpus, err = -1, cpu = task_cpu(p);
   2826	struct dl_bw *dl_b = dl_bw_of(cpu);
   2827	unsigned long cap;
   2828
   2829	if (attr->sched_flags & SCHED_FLAG_SUGOV)
   2830		return 0;
   2831
   2832	/* !deadline task may carry old deadline bandwidth */
   2833	if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
   2834		return 0;
   2835
   2836	/*
   2837	 * Either if a task, enters, leave, or stays -deadline but changes
   2838	 * its parameters, we may need to update accordingly the total
   2839	 * allocated bandwidth of the container.
   2840	 */
   2841	raw_spin_lock(&dl_b->lock);
   2842	cpus = dl_bw_cpus(cpu);
   2843	cap = dl_bw_capacity(cpu);
   2844
   2845	if (dl_policy(policy) && !task_has_dl_policy(p) &&
   2846	    !__dl_overflow(dl_b, cap, 0, new_bw)) {
   2847		if (hrtimer_active(&p->dl.inactive_timer))
   2848			__dl_sub(dl_b, p->dl.dl_bw, cpus);
   2849		__dl_add(dl_b, new_bw, cpus);
   2850		err = 0;
   2851	} else if (dl_policy(policy) && task_has_dl_policy(p) &&
   2852		   !__dl_overflow(dl_b, cap, p->dl.dl_bw, new_bw)) {
   2853		/*
   2854		 * XXX this is slightly incorrect: when the task
   2855		 * utilization decreases, we should delay the total
   2856		 * utilization change until the task's 0-lag point.
   2857		 * But this would require to set the task's "inactive
   2858		 * timer" when the task is not inactive.
   2859		 */
   2860		__dl_sub(dl_b, p->dl.dl_bw, cpus);
   2861		__dl_add(dl_b, new_bw, cpus);
   2862		dl_change_utilization(p, new_bw);
   2863		err = 0;
   2864	} else if (!dl_policy(policy) && task_has_dl_policy(p)) {
   2865		/*
   2866		 * Do not decrease the total deadline utilization here,
   2867		 * switched_from_dl() will take care to do it at the correct
   2868		 * (0-lag) time.
   2869		 */
   2870		err = 0;
   2871	}
   2872	raw_spin_unlock(&dl_b->lock);
   2873
   2874	return err;
   2875}
   2876
   2877/*
   2878 * This function initializes the sched_dl_entity of a newly becoming
   2879 * SCHED_DEADLINE task.
   2880 *
   2881 * Only the static values are considered here, the actual runtime and the
   2882 * absolute deadline will be properly calculated when the task is enqueued
   2883 * for the first time with its new policy.
   2884 */
   2885void __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
   2886{
   2887	struct sched_dl_entity *dl_se = &p->dl;
   2888
   2889	dl_se->dl_runtime = attr->sched_runtime;
   2890	dl_se->dl_deadline = attr->sched_deadline;
   2891	dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
   2892	dl_se->flags = attr->sched_flags & SCHED_DL_FLAGS;
   2893	dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
   2894	dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
   2895}
   2896
   2897void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
   2898{
   2899	struct sched_dl_entity *dl_se = &p->dl;
   2900
   2901	attr->sched_priority = p->rt_priority;
   2902	attr->sched_runtime = dl_se->dl_runtime;
   2903	attr->sched_deadline = dl_se->dl_deadline;
   2904	attr->sched_period = dl_se->dl_period;
   2905	attr->sched_flags &= ~SCHED_DL_FLAGS;
   2906	attr->sched_flags |= dl_se->flags;
   2907}
   2908
   2909/*
   2910 * This function validates the new parameters of a -deadline task.
   2911 * We ask for the deadline not being zero, and greater or equal
   2912 * than the runtime, as well as the period of being zero or
   2913 * greater than deadline. Furthermore, we have to be sure that
   2914 * user parameters are above the internal resolution of 1us (we
   2915 * check sched_runtime only since it is always the smaller one) and
   2916 * below 2^63 ns (we have to check both sched_deadline and
   2917 * sched_period, as the latter can be zero).
   2918 */
   2919bool __checkparam_dl(const struct sched_attr *attr)
   2920{
   2921	u64 period, max, min;
   2922
   2923	/* special dl tasks don't actually use any parameter */
   2924	if (attr->sched_flags & SCHED_FLAG_SUGOV)
   2925		return true;
   2926
   2927	/* deadline != 0 */
   2928	if (attr->sched_deadline == 0)
   2929		return false;
   2930
   2931	/*
   2932	 * Since we truncate DL_SCALE bits, make sure we're at least
   2933	 * that big.
   2934	 */
   2935	if (attr->sched_runtime < (1ULL << DL_SCALE))
   2936		return false;
   2937
   2938	/*
   2939	 * Since we use the MSB for wrap-around and sign issues, make
   2940	 * sure it's not set (mind that period can be equal to zero).
   2941	 */
   2942	if (attr->sched_deadline & (1ULL << 63) ||
   2943	    attr->sched_period & (1ULL << 63))
   2944		return false;
   2945
   2946	period = attr->sched_period;
   2947	if (!period)
   2948		period = attr->sched_deadline;
   2949
   2950	/* runtime <= deadline <= period (if period != 0) */
   2951	if (period < attr->sched_deadline ||
   2952	    attr->sched_deadline < attr->sched_runtime)
   2953		return false;
   2954
   2955	max = (u64)READ_ONCE(sysctl_sched_dl_period_max) * NSEC_PER_USEC;
   2956	min = (u64)READ_ONCE(sysctl_sched_dl_period_min) * NSEC_PER_USEC;
   2957
   2958	if (period < min || period > max)
   2959		return false;
   2960
   2961	return true;
   2962}
   2963
   2964/*
   2965 * This function clears the sched_dl_entity static params.
   2966 */
   2967void __dl_clear_params(struct task_struct *p)
   2968{
   2969	struct sched_dl_entity *dl_se = &p->dl;
   2970
   2971	dl_se->dl_runtime		= 0;
   2972	dl_se->dl_deadline		= 0;
   2973	dl_se->dl_period		= 0;
   2974	dl_se->flags			= 0;
   2975	dl_se->dl_bw			= 0;
   2976	dl_se->dl_density		= 0;
   2977
   2978	dl_se->dl_throttled		= 0;
   2979	dl_se->dl_yielded		= 0;
   2980	dl_se->dl_non_contending	= 0;
   2981	dl_se->dl_overrun		= 0;
   2982
   2983#ifdef CONFIG_RT_MUTEXES
   2984	dl_se->pi_se			= dl_se;
   2985#endif
   2986}
   2987
   2988bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
   2989{
   2990	struct sched_dl_entity *dl_se = &p->dl;
   2991
   2992	if (dl_se->dl_runtime != attr->sched_runtime ||
   2993	    dl_se->dl_deadline != attr->sched_deadline ||
   2994	    dl_se->dl_period != attr->sched_period ||
   2995	    dl_se->flags != (attr->sched_flags & SCHED_DL_FLAGS))
   2996		return true;
   2997
   2998	return false;
   2999}
   3000
   3001#ifdef CONFIG_SMP
   3002int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
   3003				 const struct cpumask *trial)
   3004{
   3005	int ret = 1, trial_cpus;
   3006	struct dl_bw *cur_dl_b;
   3007	unsigned long flags;
   3008
   3009	rcu_read_lock_sched();
   3010	cur_dl_b = dl_bw_of(cpumask_any(cur));
   3011	trial_cpus = cpumask_weight(trial);
   3012
   3013	raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
   3014	if (cur_dl_b->bw != -1 &&
   3015	    cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
   3016		ret = 0;
   3017	raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
   3018	rcu_read_unlock_sched();
   3019
   3020	return ret;
   3021}
   3022
   3023int dl_cpu_busy(int cpu, struct task_struct *p)
   3024{
   3025	unsigned long flags, cap;
   3026	struct dl_bw *dl_b;
   3027	bool overflow;
   3028
   3029	rcu_read_lock_sched();
   3030	dl_b = dl_bw_of(cpu);
   3031	raw_spin_lock_irqsave(&dl_b->lock, flags);
   3032	cap = dl_bw_capacity(cpu);
   3033	overflow = __dl_overflow(dl_b, cap, 0, p ? p->dl.dl_bw : 0);
   3034
   3035	if (!overflow && p) {
   3036		/*
   3037		 * We reserve space for this task in the destination
   3038		 * root_domain, as we can't fail after this point.
   3039		 * We will free resources in the source root_domain
   3040		 * later on (see set_cpus_allowed_dl()).
   3041		 */
   3042		__dl_add(dl_b, p->dl.dl_bw, dl_bw_cpus(cpu));
   3043	}
   3044
   3045	raw_spin_unlock_irqrestore(&dl_b->lock, flags);
   3046	rcu_read_unlock_sched();
   3047
   3048	return overflow ? -EBUSY : 0;
   3049}
   3050#endif
   3051
   3052#ifdef CONFIG_SCHED_DEBUG
   3053void print_dl_stats(struct seq_file *m, int cpu)
   3054{
   3055	print_dl_rq(m, cpu, &cpu_rq(cpu)->dl);
   3056}
   3057#endif /* CONFIG_SCHED_DEBUG */