cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

rt.c (73043B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
      4 * policies)
      5 */
      6
      7int sched_rr_timeslice = RR_TIMESLICE;
      8/* More than 4 hours if BW_SHIFT equals 20. */
      9static const u64 max_rt_runtime = MAX_BW;
     10
     11static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
     12
     13struct rt_bandwidth def_rt_bandwidth;
     14
     15/*
     16 * period over which we measure -rt task CPU usage in us.
     17 * default: 1s
     18 */
     19unsigned int sysctl_sched_rt_period = 1000000;
     20
     21/*
     22 * part of the period that we allow rt tasks to run in us.
     23 * default: 0.95s
     24 */
     25int sysctl_sched_rt_runtime = 950000;
     26
     27#ifdef CONFIG_SYSCTL
     28static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
     29static int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
     30		size_t *lenp, loff_t *ppos);
     31static int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
     32		size_t *lenp, loff_t *ppos);
     33static struct ctl_table sched_rt_sysctls[] = {
     34	{
     35		.procname       = "sched_rt_period_us",
     36		.data           = &sysctl_sched_rt_period,
     37		.maxlen         = sizeof(unsigned int),
     38		.mode           = 0644,
     39		.proc_handler   = sched_rt_handler,
     40	},
     41	{
     42		.procname       = "sched_rt_runtime_us",
     43		.data           = &sysctl_sched_rt_runtime,
     44		.maxlen         = sizeof(int),
     45		.mode           = 0644,
     46		.proc_handler   = sched_rt_handler,
     47	},
     48	{
     49		.procname       = "sched_rr_timeslice_ms",
     50		.data           = &sysctl_sched_rr_timeslice,
     51		.maxlen         = sizeof(int),
     52		.mode           = 0644,
     53		.proc_handler   = sched_rr_handler,
     54	},
     55	{}
     56};
     57
     58static int __init sched_rt_sysctl_init(void)
     59{
     60	register_sysctl_init("kernel", sched_rt_sysctls);
     61	return 0;
     62}
     63late_initcall(sched_rt_sysctl_init);
     64#endif
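For context, the table above surfaces these knobs as /proc/sys/kernel/sched_rt_period_us, /proc/sys/kernel/sched_rt_runtime_us and /proc/sys/kernel/sched_rr_timeslice_ms. A minimal user-space sketch, not part of rt.c, that reads the bandwidth pair (a runtime of -1 means throttling is disabled):

#include <stdio.h>

int main(void)
{
	long period_us = 0, runtime_us = 0;
	FILE *f;

	f = fopen("/proc/sys/kernel/sched_rt_period_us", "r");
	if (f) { if (fscanf(f, "%ld", &period_us) != 1) period_us = 0; fclose(f); }

	f = fopen("/proc/sys/kernel/sched_rt_runtime_us", "r");
	if (f) { if (fscanf(f, "%ld", &runtime_us) != 1) runtime_us = 0; fclose(f); }

	if (runtime_us < 0)
		printf("RT throttling disabled\n");
	else if (period_us > 0)
		printf("RT tasks may use %.1f%% of each %ld us period\n",
		       100.0 * runtime_us / period_us, period_us);
	return 0;
}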
     65
     66static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
     67{
     68	struct rt_bandwidth *rt_b =
     69		container_of(timer, struct rt_bandwidth, rt_period_timer);
     70	int idle = 0;
     71	int overrun;
     72
     73	raw_spin_lock(&rt_b->rt_runtime_lock);
     74	for (;;) {
     75		overrun = hrtimer_forward_now(timer, rt_b->rt_period);
     76		if (!overrun)
     77			break;
     78
     79		raw_spin_unlock(&rt_b->rt_runtime_lock);
     80		idle = do_sched_rt_period_timer(rt_b, overrun);
     81		raw_spin_lock(&rt_b->rt_runtime_lock);
     82	}
     83	if (idle)
     84		rt_b->rt_period_active = 0;
     85	raw_spin_unlock(&rt_b->rt_runtime_lock);
     86
     87	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
     88}
     89
     90void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
     91{
     92	rt_b->rt_period = ns_to_ktime(period);
     93	rt_b->rt_runtime = runtime;
     94
     95	raw_spin_lock_init(&rt_b->rt_runtime_lock);
     96
     97	hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC,
     98		     HRTIMER_MODE_REL_HARD);
     99	rt_b->rt_period_timer.function = sched_rt_period_timer;
    100}
    101
    102static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b)
    103{
    104	raw_spin_lock(&rt_b->rt_runtime_lock);
    105	if (!rt_b->rt_period_active) {
    106		rt_b->rt_period_active = 1;
     107		 * SCHED_DEADLINE updates the bandwidth, as a runaway
    108		 * SCHED_DEADLINE updates the bandwidth, as a run away
    109		 * RT task with a DL task could hog a CPU. But DL does
    110		 * not reset the period. If a deadline task was running
    111		 * without an RT task running, it can cause RT tasks to
    112		 * throttle when they start up. Kick the timer right away
    113		 * to update the period.
    114		 */
    115		hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
    116		hrtimer_start_expires(&rt_b->rt_period_timer,
    117				      HRTIMER_MODE_ABS_PINNED_HARD);
    118	}
    119	raw_spin_unlock(&rt_b->rt_runtime_lock);
    120}
    121
    122static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
    123{
    124	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
    125		return;
    126
    127	do_start_rt_bandwidth(rt_b);
    128}
    129
    130void init_rt_rq(struct rt_rq *rt_rq)
    131{
    132	struct rt_prio_array *array;
    133	int i;
    134
    135	array = &rt_rq->active;
    136	for (i = 0; i < MAX_RT_PRIO; i++) {
    137		INIT_LIST_HEAD(array->queue + i);
    138		__clear_bit(i, array->bitmap);
    139	}
    140	/* delimiter for bitsearch: */
    141	__set_bit(MAX_RT_PRIO, array->bitmap);
    142
    143#if defined CONFIG_SMP
    144	rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
    145	rt_rq->highest_prio.next = MAX_RT_PRIO-1;
    146	rt_rq->rt_nr_migratory = 0;
    147	rt_rq->overloaded = 0;
    148	plist_head_init(&rt_rq->pushable_tasks);
    149#endif /* CONFIG_SMP */
     150	/* We start in dequeued state, because no RT tasks are queued */
    151	rt_rq->rt_queued = 0;
    152
    153	rt_rq->rt_time = 0;
    154	rt_rq->rt_throttled = 0;
    155	rt_rq->rt_runtime = 0;
    156	raw_spin_lock_init(&rt_rq->rt_runtime_lock);
    157}
    158
    159#ifdef CONFIG_RT_GROUP_SCHED
    160static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
    161{
    162	hrtimer_cancel(&rt_b->rt_period_timer);
    163}
    164
    165#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
    166
    167static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
    168{
    169#ifdef CONFIG_SCHED_DEBUG
    170	WARN_ON_ONCE(!rt_entity_is_task(rt_se));
    171#endif
    172	return container_of(rt_se, struct task_struct, rt);
    173}
    174
    175static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
    176{
    177	return rt_rq->rq;
    178}
    179
    180static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
    181{
    182	return rt_se->rt_rq;
    183}
    184
    185static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
    186{
    187	struct rt_rq *rt_rq = rt_se->rt_rq;
    188
    189	return rt_rq->rq;
    190}
    191
    192void unregister_rt_sched_group(struct task_group *tg)
    193{
    194	if (tg->rt_se)
    195		destroy_rt_bandwidth(&tg->rt_bandwidth);
    196
    197}
    198
    199void free_rt_sched_group(struct task_group *tg)
    200{
    201	int i;
    202
    203	for_each_possible_cpu(i) {
    204		if (tg->rt_rq)
    205			kfree(tg->rt_rq[i]);
    206		if (tg->rt_se)
    207			kfree(tg->rt_se[i]);
    208	}
    209
    210	kfree(tg->rt_rq);
    211	kfree(tg->rt_se);
    212}
    213
    214void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
    215		struct sched_rt_entity *rt_se, int cpu,
    216		struct sched_rt_entity *parent)
    217{
    218	struct rq *rq = cpu_rq(cpu);
    219
    220	rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
    221	rt_rq->rt_nr_boosted = 0;
    222	rt_rq->rq = rq;
    223	rt_rq->tg = tg;
    224
    225	tg->rt_rq[cpu] = rt_rq;
    226	tg->rt_se[cpu] = rt_se;
    227
    228	if (!rt_se)
    229		return;
    230
    231	if (!parent)
    232		rt_se->rt_rq = &rq->rt;
    233	else
    234		rt_se->rt_rq = parent->my_q;
    235
    236	rt_se->my_q = rt_rq;
    237	rt_se->parent = parent;
    238	INIT_LIST_HEAD(&rt_se->run_list);
    239}
    240
    241int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
    242{
    243	struct rt_rq *rt_rq;
    244	struct sched_rt_entity *rt_se;
    245	int i;
    246
    247	tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL);
    248	if (!tg->rt_rq)
    249		goto err;
    250	tg->rt_se = kcalloc(nr_cpu_ids, sizeof(rt_se), GFP_KERNEL);
    251	if (!tg->rt_se)
    252		goto err;
    253
    254	init_rt_bandwidth(&tg->rt_bandwidth,
    255			ktime_to_ns(def_rt_bandwidth.rt_period), 0);
    256
    257	for_each_possible_cpu(i) {
    258		rt_rq = kzalloc_node(sizeof(struct rt_rq),
    259				     GFP_KERNEL, cpu_to_node(i));
    260		if (!rt_rq)
    261			goto err;
    262
    263		rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
    264				     GFP_KERNEL, cpu_to_node(i));
    265		if (!rt_se)
    266			goto err_free_rq;
    267
    268		init_rt_rq(rt_rq);
    269		rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
    270		init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
    271	}
    272
    273	return 1;
    274
    275err_free_rq:
    276	kfree(rt_rq);
    277err:
    278	return 0;
    279}
    280
    281#else /* CONFIG_RT_GROUP_SCHED */
    282
    283#define rt_entity_is_task(rt_se) (1)
    284
    285static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
    286{
    287	return container_of(rt_se, struct task_struct, rt);
    288}
    289
    290static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
    291{
    292	return container_of(rt_rq, struct rq, rt);
    293}
    294
    295static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
    296{
    297	struct task_struct *p = rt_task_of(rt_se);
    298
    299	return task_rq(p);
    300}
    301
    302static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
    303{
    304	struct rq *rq = rq_of_rt_se(rt_se);
    305
    306	return &rq->rt;
    307}
    308
    309void unregister_rt_sched_group(struct task_group *tg) { }
    310
    311void free_rt_sched_group(struct task_group *tg) { }
    312
    313int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
    314{
    315	return 1;
    316}
    317#endif /* CONFIG_RT_GROUP_SCHED */
    318
    319#ifdef CONFIG_SMP
    320
    321static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
    322{
    323	/* Try to pull RT tasks here if we lower this rq's prio */
    324	return rq->online && rq->rt.highest_prio.curr > prev->prio;
    325}
    326
    327static inline int rt_overloaded(struct rq *rq)
    328{
    329	return atomic_read(&rq->rd->rto_count);
    330}
    331
    332static inline void rt_set_overload(struct rq *rq)
    333{
    334	if (!rq->online)
    335		return;
    336
    337	cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
    338	/*
    339	 * Make sure the mask is visible before we set
    340	 * the overload count. That is checked to determine
    341	 * if we should look at the mask. It would be a shame
    342	 * if we looked at the mask, but the mask was not
    343	 * updated yet.
    344	 *
    345	 * Matched by the barrier in pull_rt_task().
    346	 */
    347	smp_wmb();
    348	atomic_inc(&rq->rd->rto_count);
    349}
    350
    351static inline void rt_clear_overload(struct rq *rq)
    352{
    353	if (!rq->online)
    354		return;
    355
    356	/* the order here really doesn't matter */
    357	atomic_dec(&rq->rd->rto_count);
    358	cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
    359}
    360
    361static void update_rt_migration(struct rt_rq *rt_rq)
    362{
    363	if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
    364		if (!rt_rq->overloaded) {
    365			rt_set_overload(rq_of_rt_rq(rt_rq));
    366			rt_rq->overloaded = 1;
    367		}
    368	} else if (rt_rq->overloaded) {
    369		rt_clear_overload(rq_of_rt_rq(rt_rq));
    370		rt_rq->overloaded = 0;
    371	}
    372}
    373
    374static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
    375{
    376	struct task_struct *p;
    377
    378	if (!rt_entity_is_task(rt_se))
    379		return;
    380
    381	p = rt_task_of(rt_se);
    382	rt_rq = &rq_of_rt_rq(rt_rq)->rt;
    383
    384	rt_rq->rt_nr_total++;
    385	if (p->nr_cpus_allowed > 1)
    386		rt_rq->rt_nr_migratory++;
    387
    388	update_rt_migration(rt_rq);
    389}
    390
    391static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
    392{
    393	struct task_struct *p;
    394
    395	if (!rt_entity_is_task(rt_se))
    396		return;
    397
    398	p = rt_task_of(rt_se);
    399	rt_rq = &rq_of_rt_rq(rt_rq)->rt;
    400
    401	rt_rq->rt_nr_total--;
    402	if (p->nr_cpus_allowed > 1)
    403		rt_rq->rt_nr_migratory--;
    404
    405	update_rt_migration(rt_rq);
    406}
    407
    408static inline int has_pushable_tasks(struct rq *rq)
    409{
    410	return !plist_head_empty(&rq->rt.pushable_tasks);
    411}
    412
    413static DEFINE_PER_CPU(struct callback_head, rt_push_head);
    414static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
    415
    416static void push_rt_tasks(struct rq *);
    417static void pull_rt_task(struct rq *);
    418
    419static inline void rt_queue_push_tasks(struct rq *rq)
    420{
    421	if (!has_pushable_tasks(rq))
    422		return;
    423
    424	queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
    425}
    426
    427static inline void rt_queue_pull_task(struct rq *rq)
    428{
    429	queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
    430}
    431
    432static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
    433{
    434	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
    435	plist_node_init(&p->pushable_tasks, p->prio);
    436	plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
    437
    438	/* Update the highest prio pushable task */
    439	if (p->prio < rq->rt.highest_prio.next)
    440		rq->rt.highest_prio.next = p->prio;
    441}
    442
    443static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
    444{
    445	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
    446
    447	/* Update the new highest prio pushable task */
    448	if (has_pushable_tasks(rq)) {
    449		p = plist_first_entry(&rq->rt.pushable_tasks,
    450				      struct task_struct, pushable_tasks);
    451		rq->rt.highest_prio.next = p->prio;
    452	} else {
    453		rq->rt.highest_prio.next = MAX_RT_PRIO-1;
    454	}
    455}
    456
    457#else
    458
    459static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
    460{
    461}
    462
    463static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
    464{
    465}
    466
    467static inline
    468void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
    469{
    470}
    471
    472static inline
    473void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
    474{
    475}
    476
    477static inline void rt_queue_push_tasks(struct rq *rq)
    478{
    479}
    480#endif /* CONFIG_SMP */
    481
    482static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
    483static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
    484
    485static inline int on_rt_rq(struct sched_rt_entity *rt_se)
    486{
    487	return rt_se->on_rq;
    488}
    489
    490#ifdef CONFIG_UCLAMP_TASK
    491/*
    492 * Verify the fitness of task @p to run on @cpu taking into account the uclamp
    493 * settings.
    494 *
    495 * This check is only important for heterogeneous systems where uclamp_min value
    496 * is higher than the capacity of a @cpu. For non-heterogeneous system this
    497 * function will always return true.
    498 *
    499 * The function will return true if the capacity of the @cpu is >= the
    500 * uclamp_min and false otherwise.
    501 *
    502 * Note that uclamp_min will be clamped to uclamp_max if uclamp_min
    503 * > uclamp_max.
    504 */
    505static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
    506{
    507	unsigned int min_cap;
    508	unsigned int max_cap;
    509	unsigned int cpu_cap;
    510
    511	/* Only heterogeneous systems can benefit from this check */
    512	if (!static_branch_unlikely(&sched_asym_cpucapacity))
    513		return true;
    514
    515	min_cap = uclamp_eff_value(p, UCLAMP_MIN);
    516	max_cap = uclamp_eff_value(p, UCLAMP_MAX);
    517
    518	cpu_cap = capacity_orig_of(cpu);
    519
    520	return cpu_cap >= min(min_cap, max_cap);
    521}
    522#else
    523static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
    524{
    525	return true;
    526}
    527#endif
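On asymmetric-capacity systems the check above boils down to comparing the CPU's original capacity with the task's effective uclamp floor, where uclamp_min is clamped by uclamp_max. A self-contained sketch of just that predicate (capacities in the scheduler's 0..1024 scale; the names here are illustrative, not kernel API):

#include <stdbool.h>

#define CAPACITY_SCALE 1024	/* full capacity of the biggest CPU */

static bool fits_capacity_sketch(unsigned int uclamp_min,
				 unsigned int uclamp_max,
				 unsigned int cpu_cap)
{
	/* uclamp_min above uclamp_max is clamped down, hence the min(). */
	unsigned int need = uclamp_min < uclamp_max ? uclamp_min : uclamp_max;

	return cpu_cap >= need;
}

/*
 * Example: a task with uclamp_min = 512 does not fit a LITTLE CPU of
 * capacity 256, but fits a big CPU of capacity CAPACITY_SCALE.
 */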
    528
    529#ifdef CONFIG_RT_GROUP_SCHED
    530
    531static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
    532{
    533	if (!rt_rq->tg)
    534		return RUNTIME_INF;
    535
    536	return rt_rq->rt_runtime;
    537}
    538
    539static inline u64 sched_rt_period(struct rt_rq *rt_rq)
    540{
    541	return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
    542}
    543
    544typedef struct task_group *rt_rq_iter_t;
    545
    546static inline struct task_group *next_task_group(struct task_group *tg)
    547{
    548	do {
    549		tg = list_entry_rcu(tg->list.next,
    550			typeof(struct task_group), list);
    551	} while (&tg->list != &task_groups && task_group_is_autogroup(tg));
    552
    553	if (&tg->list == &task_groups)
    554		tg = NULL;
    555
    556	return tg;
    557}
    558
    559#define for_each_rt_rq(rt_rq, iter, rq)					\
    560	for (iter = container_of(&task_groups, typeof(*iter), list);	\
    561		(iter = next_task_group(iter)) &&			\
    562		(rt_rq = iter->rt_rq[cpu_of(rq)]);)
    563
    564#define for_each_sched_rt_entity(rt_se) \
    565	for (; rt_se; rt_se = rt_se->parent)
    566
    567static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
    568{
    569	return rt_se->my_q;
    570}
    571
    572static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
    573static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
    574
    575static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
    576{
    577	struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
    578	struct rq *rq = rq_of_rt_rq(rt_rq);
    579	struct sched_rt_entity *rt_se;
    580
    581	int cpu = cpu_of(rq);
    582
    583	rt_se = rt_rq->tg->rt_se[cpu];
    584
    585	if (rt_rq->rt_nr_running) {
    586		if (!rt_se)
    587			enqueue_top_rt_rq(rt_rq);
    588		else if (!on_rt_rq(rt_se))
    589			enqueue_rt_entity(rt_se, 0);
    590
    591		if (rt_rq->highest_prio.curr < curr->prio)
    592			resched_curr(rq);
    593	}
    594}
    595
    596static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
    597{
    598	struct sched_rt_entity *rt_se;
    599	int cpu = cpu_of(rq_of_rt_rq(rt_rq));
    600
    601	rt_se = rt_rq->tg->rt_se[cpu];
    602
    603	if (!rt_se) {
    604		dequeue_top_rt_rq(rt_rq);
    605		/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
    606		cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
    607	}
    608	else if (on_rt_rq(rt_se))
    609		dequeue_rt_entity(rt_se, 0);
    610}
    611
    612static inline int rt_rq_throttled(struct rt_rq *rt_rq)
    613{
    614	return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
    615}
    616
    617static int rt_se_boosted(struct sched_rt_entity *rt_se)
    618{
    619	struct rt_rq *rt_rq = group_rt_rq(rt_se);
    620	struct task_struct *p;
    621
    622	if (rt_rq)
    623		return !!rt_rq->rt_nr_boosted;
    624
    625	p = rt_task_of(rt_se);
    626	return p->prio != p->normal_prio;
    627}
    628
    629#ifdef CONFIG_SMP
    630static inline const struct cpumask *sched_rt_period_mask(void)
    631{
    632	return this_rq()->rd->span;
    633}
    634#else
    635static inline const struct cpumask *sched_rt_period_mask(void)
    636{
    637	return cpu_online_mask;
    638}
    639#endif
    640
    641static inline
    642struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
    643{
    644	return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
    645}
    646
    647static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
    648{
    649	return &rt_rq->tg->rt_bandwidth;
    650}
    651
    652#else /* !CONFIG_RT_GROUP_SCHED */
    653
    654static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
    655{
    656	return rt_rq->rt_runtime;
    657}
    658
    659static inline u64 sched_rt_period(struct rt_rq *rt_rq)
    660{
    661	return ktime_to_ns(def_rt_bandwidth.rt_period);
    662}
    663
    664typedef struct rt_rq *rt_rq_iter_t;
    665
    666#define for_each_rt_rq(rt_rq, iter, rq) \
    667	for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
    668
    669#define for_each_sched_rt_entity(rt_se) \
    670	for (; rt_se; rt_se = NULL)
    671
    672static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
    673{
    674	return NULL;
    675}
    676
    677static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
    678{
    679	struct rq *rq = rq_of_rt_rq(rt_rq);
    680
    681	if (!rt_rq->rt_nr_running)
    682		return;
    683
    684	enqueue_top_rt_rq(rt_rq);
    685	resched_curr(rq);
    686}
    687
    688static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
    689{
    690	dequeue_top_rt_rq(rt_rq);
    691}
    692
    693static inline int rt_rq_throttled(struct rt_rq *rt_rq)
    694{
    695	return rt_rq->rt_throttled;
    696}
    697
    698static inline const struct cpumask *sched_rt_period_mask(void)
    699{
    700	return cpu_online_mask;
    701}
    702
    703static inline
    704struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
    705{
    706	return &cpu_rq(cpu)->rt;
    707}
    708
    709static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
    710{
    711	return &def_rt_bandwidth;
    712}
    713
    714#endif /* CONFIG_RT_GROUP_SCHED */
    715
    716bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
    717{
    718	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
    719
    720	return (hrtimer_active(&rt_b->rt_period_timer) ||
    721		rt_rq->rt_time < rt_b->rt_runtime);
    722}
    723
    724#ifdef CONFIG_SMP
    725/*
    726 * We ran out of runtime, see if we can borrow some from our neighbours.
    727 */
    728static void do_balance_runtime(struct rt_rq *rt_rq)
    729{
    730	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
    731	struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
    732	int i, weight;
    733	u64 rt_period;
    734
    735	weight = cpumask_weight(rd->span);
    736
    737	raw_spin_lock(&rt_b->rt_runtime_lock);
    738	rt_period = ktime_to_ns(rt_b->rt_period);
    739	for_each_cpu(i, rd->span) {
    740		struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
    741		s64 diff;
    742
    743		if (iter == rt_rq)
    744			continue;
    745
    746		raw_spin_lock(&iter->rt_runtime_lock);
    747		/*
    748		 * Either all rqs have inf runtime and there's nothing to steal
    749		 * or __disable_runtime() below sets a specific rq to inf to
     750		 * indicate it's been disabled and disallow stealing.
    751		 */
    752		if (iter->rt_runtime == RUNTIME_INF)
    753			goto next;
    754
    755		/*
    756		 * From runqueues with spare time, take 1/n part of their
    757		 * spare time, but no more than our period.
    758		 */
    759		diff = iter->rt_runtime - iter->rt_time;
    760		if (diff > 0) {
    761			diff = div_u64((u64)diff, weight);
    762			if (rt_rq->rt_runtime + diff > rt_period)
    763				diff = rt_period - rt_rq->rt_runtime;
    764			iter->rt_runtime -= diff;
    765			rt_rq->rt_runtime += diff;
    766			if (rt_rq->rt_runtime == rt_period) {
    767				raw_spin_unlock(&iter->rt_runtime_lock);
    768				break;
    769			}
    770		}
    771next:
    772		raw_spin_unlock(&iter->rt_runtime_lock);
    773	}
    774	raw_spin_unlock(&rt_b->rt_runtime_lock);
    775}
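The loop above lets a depleted rt_rq take at most 1/n of each neighbour's spare budget (n being the number of CPUs in the root domain) and never lets the borrower exceed one full period. A standalone sketch of that per-donor arithmetic, with hypothetical names:

#include <stdint.h>

/* Runtime one donor hands over in a single do_balance_runtime() pass. */
static int64_t borrow_from_donor(int64_t donor_runtime, int64_t donor_time,
				 int64_t borrower_runtime, int64_t period,
				 int weight)
{
	int64_t diff = donor_runtime - donor_time;	/* donor's spare budget */

	if (diff <= 0)
		return 0;
	diff /= weight;					/* take 1/n of the spare */
	if (borrower_runtime + diff > period)		/* cap at one period */
		diff = period - borrower_runtime;
	return diff;
}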
    776
    777/*
     778 * Ensure this RQ takes back all the runtime it lent to its neighbours.
    779 */
    780static void __disable_runtime(struct rq *rq)
    781{
    782	struct root_domain *rd = rq->rd;
    783	rt_rq_iter_t iter;
    784	struct rt_rq *rt_rq;
    785
    786	if (unlikely(!scheduler_running))
    787		return;
    788
    789	for_each_rt_rq(rt_rq, iter, rq) {
    790		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
    791		s64 want;
    792		int i;
    793
    794		raw_spin_lock(&rt_b->rt_runtime_lock);
    795		raw_spin_lock(&rt_rq->rt_runtime_lock);
    796		/*
    797		 * Either we're all inf and nobody needs to borrow, or we're
    798		 * already disabled and thus have nothing to do, or we have
    799		 * exactly the right amount of runtime to take out.
    800		 */
    801		if (rt_rq->rt_runtime == RUNTIME_INF ||
    802				rt_rq->rt_runtime == rt_b->rt_runtime)
    803			goto balanced;
    804		raw_spin_unlock(&rt_rq->rt_runtime_lock);
    805
    806		/*
    807		 * Calculate the difference between what we started out with
     808		 * and what we currently have; that's the amount of runtime
     809		 * we lent out and now have to reclaim.
    810		 */
    811		want = rt_b->rt_runtime - rt_rq->rt_runtime;
    812
    813		/*
    814		 * Greedy reclaim, take back as much as we can.
    815		 */
    816		for_each_cpu(i, rd->span) {
    817			struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
    818			s64 diff;
    819
    820			/*
    821			 * Can't reclaim from ourselves or disabled runqueues.
    822			 */
    823			if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
    824				continue;
    825
    826			raw_spin_lock(&iter->rt_runtime_lock);
    827			if (want > 0) {
    828				diff = min_t(s64, iter->rt_runtime, want);
    829				iter->rt_runtime -= diff;
    830				want -= diff;
    831			} else {
    832				iter->rt_runtime -= want;
    833				want -= want;
    834			}
    835			raw_spin_unlock(&iter->rt_runtime_lock);
    836
    837			if (!want)
    838				break;
    839		}
    840
    841		raw_spin_lock(&rt_rq->rt_runtime_lock);
    842		/*
    843		 * We cannot be left wanting - that would mean some runtime
    844		 * leaked out of the system.
    845		 */
    846		BUG_ON(want);
    847balanced:
    848		/*
    849		 * Disable all the borrow logic by pretending we have inf
    850		 * runtime - in which case borrowing doesn't make sense.
    851		 */
    852		rt_rq->rt_runtime = RUNTIME_INF;
    853		rt_rq->rt_throttled = 0;
    854		raw_spin_unlock(&rt_rq->rt_runtime_lock);
    855		raw_spin_unlock(&rt_b->rt_runtime_lock);
    856
    857		/* Make rt_rq available for pick_next_task() */
    858		sched_rt_rq_enqueue(rt_rq);
    859	}
    860}
    861
    862static void __enable_runtime(struct rq *rq)
    863{
    864	rt_rq_iter_t iter;
    865	struct rt_rq *rt_rq;
    866
    867	if (unlikely(!scheduler_running))
    868		return;
    869
    870	/*
    871	 * Reset each runqueue's bandwidth settings
    872	 */
    873	for_each_rt_rq(rt_rq, iter, rq) {
    874		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
    875
    876		raw_spin_lock(&rt_b->rt_runtime_lock);
    877		raw_spin_lock(&rt_rq->rt_runtime_lock);
    878		rt_rq->rt_runtime = rt_b->rt_runtime;
    879		rt_rq->rt_time = 0;
    880		rt_rq->rt_throttled = 0;
    881		raw_spin_unlock(&rt_rq->rt_runtime_lock);
    882		raw_spin_unlock(&rt_b->rt_runtime_lock);
    883	}
    884}
    885
    886static void balance_runtime(struct rt_rq *rt_rq)
    887{
    888	if (!sched_feat(RT_RUNTIME_SHARE))
    889		return;
    890
    891	if (rt_rq->rt_time > rt_rq->rt_runtime) {
    892		raw_spin_unlock(&rt_rq->rt_runtime_lock);
    893		do_balance_runtime(rt_rq);
    894		raw_spin_lock(&rt_rq->rt_runtime_lock);
    895	}
    896}
    897#else /* !CONFIG_SMP */
    898static inline void balance_runtime(struct rt_rq *rt_rq) {}
    899#endif /* CONFIG_SMP */
    900
    901static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
    902{
    903	int i, idle = 1, throttled = 0;
    904	const struct cpumask *span;
    905
    906	span = sched_rt_period_mask();
    907#ifdef CONFIG_RT_GROUP_SCHED
    908	/*
    909	 * FIXME: isolated CPUs should really leave the root task group,
    910	 * whether they are isolcpus or were isolated via cpusets, lest
    911	 * the timer run on a CPU which does not service all runqueues,
    912	 * potentially leaving other CPUs indefinitely throttled.  If
    913	 * isolation is really required, the user will turn the throttle
    914	 * off to kill the perturbations it causes anyway.  Meanwhile,
    915	 * this maintains functionality for boot and/or troubleshooting.
    916	 */
    917	if (rt_b == &root_task_group.rt_bandwidth)
    918		span = cpu_online_mask;
    919#endif
    920	for_each_cpu(i, span) {
    921		int enqueue = 0;
    922		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
    923		struct rq *rq = rq_of_rt_rq(rt_rq);
    924		struct rq_flags rf;
    925		int skip;
    926
    927		/*
    928		 * When span == cpu_online_mask, taking each rq->lock
    929		 * can be time-consuming. Try to avoid it when possible.
    930		 */
    931		raw_spin_lock(&rt_rq->rt_runtime_lock);
    932		if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF)
    933			rt_rq->rt_runtime = rt_b->rt_runtime;
    934		skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
    935		raw_spin_unlock(&rt_rq->rt_runtime_lock);
    936		if (skip)
    937			continue;
    938
    939		rq_lock(rq, &rf);
    940		update_rq_clock(rq);
    941
    942		if (rt_rq->rt_time) {
    943			u64 runtime;
    944
    945			raw_spin_lock(&rt_rq->rt_runtime_lock);
    946			if (rt_rq->rt_throttled)
    947				balance_runtime(rt_rq);
    948			runtime = rt_rq->rt_runtime;
    949			rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
    950			if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
    951				rt_rq->rt_throttled = 0;
    952				enqueue = 1;
    953
    954				/*
    955				 * When we're idle and a woken (rt) task is
     956				 * throttled, check_preempt_curr() will set
    957				 * skip_update and the time between the wakeup
    958				 * and this unthrottle will get accounted as
    959				 * 'runtime'.
    960				 */
    961				if (rt_rq->rt_nr_running && rq->curr == rq->idle)
    962					rq_clock_cancel_skipupdate(rq);
    963			}
    964			if (rt_rq->rt_time || rt_rq->rt_nr_running)
    965				idle = 0;
    966			raw_spin_unlock(&rt_rq->rt_runtime_lock);
    967		} else if (rt_rq->rt_nr_running) {
    968			idle = 0;
    969			if (!rt_rq_throttled(rt_rq))
    970				enqueue = 1;
    971		}
    972		if (rt_rq->rt_throttled)
    973			throttled = 1;
    974
    975		if (enqueue)
    976			sched_rt_rq_enqueue(rt_rq);
    977		rq_unlock(rq, &rf);
    978	}
    979
    980	if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
    981		return 1;
    982
    983	return idle;
    984}
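For scale: the replenishment line rt_time -= min(rt_time, overrun*runtime) above forgives up to overrun periods' worth of budget, and the rt_rq is unthrottled once the remaining rt_time drops below one runtime. A minimal sketch of that step, with a worked example using the 950 ms default:

#include <stdint.h>

/* Account 'overrun' expired periods against the accumulated rt_time. */
static uint64_t replenish(uint64_t rt_time, uint64_t runtime_ns, int overrun)
{
	uint64_t credit = (uint64_t)overrun * runtime_ns;

	return rt_time > credit ? rt_time - credit : 0;
}

/*
 * Example: rt_time = 1.2 s of accrued RT execution, runtime = 0.95 s and one
 * missed period -> 0.25 s remains, which is below the 0.95 s budget, so the
 * rt_rq is unthrottled and re-enqueued.
 */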
    985
    986static inline int rt_se_prio(struct sched_rt_entity *rt_se)
    987{
    988#ifdef CONFIG_RT_GROUP_SCHED
    989	struct rt_rq *rt_rq = group_rt_rq(rt_se);
    990
    991	if (rt_rq)
    992		return rt_rq->highest_prio.curr;
    993#endif
    994
    995	return rt_task_of(rt_se)->prio;
    996}
    997
    998static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
    999{
   1000	u64 runtime = sched_rt_runtime(rt_rq);
   1001
   1002	if (rt_rq->rt_throttled)
   1003		return rt_rq_throttled(rt_rq);
   1004
   1005	if (runtime >= sched_rt_period(rt_rq))
   1006		return 0;
   1007
   1008	balance_runtime(rt_rq);
   1009	runtime = sched_rt_runtime(rt_rq);
   1010	if (runtime == RUNTIME_INF)
   1011		return 0;
   1012
   1013	if (rt_rq->rt_time > runtime) {
   1014		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
   1015
   1016		/*
   1017		 * Don't actually throttle groups that have no runtime assigned
   1018		 * but accrue some time due to boosting.
   1019		 */
   1020		if (likely(rt_b->rt_runtime)) {
   1021			rt_rq->rt_throttled = 1;
   1022			printk_deferred_once("sched: RT throttling activated\n");
   1023		} else {
   1024			/*
   1025			 * In case we did anyway, make it go away,
   1026			 * replenishment is a joke, since it will replenish us
   1027			 * with exactly 0 ns.
   1028			 */
   1029			rt_rq->rt_time = 0;
   1030		}
   1031
   1032		if (rt_rq_throttled(rt_rq)) {
   1033			sched_rt_rq_dequeue(rt_rq);
   1034			return 1;
   1035		}
   1036	}
   1037
   1038	return 0;
   1039}
   1040
   1041/*
   1042 * Update the current task's runtime statistics. Skip current tasks that
   1043 * are not in our scheduling class.
   1044 */
   1045static void update_curr_rt(struct rq *rq)
   1046{
   1047	struct task_struct *curr = rq->curr;
   1048	struct sched_rt_entity *rt_se = &curr->rt;
   1049	u64 delta_exec;
   1050	u64 now;
   1051
   1052	if (curr->sched_class != &rt_sched_class)
   1053		return;
   1054
   1055	now = rq_clock_task(rq);
   1056	delta_exec = now - curr->se.exec_start;
   1057	if (unlikely((s64)delta_exec <= 0))
   1058		return;
   1059
   1060	schedstat_set(curr->stats.exec_max,
   1061		      max(curr->stats.exec_max, delta_exec));
   1062
   1063	trace_sched_stat_runtime(curr, delta_exec, 0);
   1064
   1065	curr->se.sum_exec_runtime += delta_exec;
   1066	account_group_exec_runtime(curr, delta_exec);
   1067
   1068	curr->se.exec_start = now;
   1069	cgroup_account_cputime(curr, delta_exec);
   1070
   1071	if (!rt_bandwidth_enabled())
   1072		return;
   1073
   1074	for_each_sched_rt_entity(rt_se) {
   1075		struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
   1076		int exceeded;
   1077
   1078		if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
   1079			raw_spin_lock(&rt_rq->rt_runtime_lock);
   1080			rt_rq->rt_time += delta_exec;
   1081			exceeded = sched_rt_runtime_exceeded(rt_rq);
   1082			if (exceeded)
   1083				resched_curr(rq);
   1084			raw_spin_unlock(&rt_rq->rt_runtime_lock);
   1085			if (exceeded)
   1086				do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
   1087		}
   1088	}
   1089}
   1090
   1091static void
   1092dequeue_top_rt_rq(struct rt_rq *rt_rq)
   1093{
   1094	struct rq *rq = rq_of_rt_rq(rt_rq);
   1095
   1096	BUG_ON(&rq->rt != rt_rq);
   1097
   1098	if (!rt_rq->rt_queued)
   1099		return;
   1100
   1101	BUG_ON(!rq->nr_running);
   1102
   1103	sub_nr_running(rq, rt_rq->rt_nr_running);
   1104	rt_rq->rt_queued = 0;
   1105
   1106}
   1107
   1108static void
   1109enqueue_top_rt_rq(struct rt_rq *rt_rq)
   1110{
   1111	struct rq *rq = rq_of_rt_rq(rt_rq);
   1112
   1113	BUG_ON(&rq->rt != rt_rq);
   1114
   1115	if (rt_rq->rt_queued)
   1116		return;
   1117
   1118	if (rt_rq_throttled(rt_rq))
   1119		return;
   1120
   1121	if (rt_rq->rt_nr_running) {
   1122		add_nr_running(rq, rt_rq->rt_nr_running);
   1123		rt_rq->rt_queued = 1;
   1124	}
   1125
   1126	/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
   1127	cpufreq_update_util(rq, 0);
   1128}
   1129
   1130#if defined CONFIG_SMP
   1131
   1132static void
   1133inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
   1134{
   1135	struct rq *rq = rq_of_rt_rq(rt_rq);
   1136
   1137#ifdef CONFIG_RT_GROUP_SCHED
   1138	/*
   1139	 * Change rq's cpupri only if rt_rq is the top queue.
   1140	 */
   1141	if (&rq->rt != rt_rq)
   1142		return;
   1143#endif
   1144	if (rq->online && prio < prev_prio)
   1145		cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
   1146}
   1147
   1148static void
   1149dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
   1150{
   1151	struct rq *rq = rq_of_rt_rq(rt_rq);
   1152
   1153#ifdef CONFIG_RT_GROUP_SCHED
   1154	/*
   1155	 * Change rq's cpupri only if rt_rq is the top queue.
   1156	 */
   1157	if (&rq->rt != rt_rq)
   1158		return;
   1159#endif
   1160	if (rq->online && rt_rq->highest_prio.curr != prev_prio)
   1161		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
   1162}
   1163
   1164#else /* CONFIG_SMP */
   1165
   1166static inline
   1167void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
   1168static inline
   1169void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
   1170
   1171#endif /* CONFIG_SMP */
   1172
   1173#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
   1174static void
   1175inc_rt_prio(struct rt_rq *rt_rq, int prio)
   1176{
   1177	int prev_prio = rt_rq->highest_prio.curr;
   1178
   1179	if (prio < prev_prio)
   1180		rt_rq->highest_prio.curr = prio;
   1181
   1182	inc_rt_prio_smp(rt_rq, prio, prev_prio);
   1183}
   1184
   1185static void
   1186dec_rt_prio(struct rt_rq *rt_rq, int prio)
   1187{
   1188	int prev_prio = rt_rq->highest_prio.curr;
   1189
   1190	if (rt_rq->rt_nr_running) {
   1191
   1192		WARN_ON(prio < prev_prio);
   1193
   1194		/*
   1195		 * This may have been our highest task, and therefore
   1196		 * we may have some recomputation to do
   1197		 */
   1198		if (prio == prev_prio) {
   1199			struct rt_prio_array *array = &rt_rq->active;
   1200
   1201			rt_rq->highest_prio.curr =
   1202				sched_find_first_bit(array->bitmap);
   1203		}
   1204
   1205	} else {
   1206		rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
   1207	}
   1208
   1209	dec_rt_prio_smp(rt_rq, prio, prev_prio);
   1210}
   1211
   1212#else
   1213
   1214static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
   1215static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
   1216
   1217#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
   1218
   1219#ifdef CONFIG_RT_GROUP_SCHED
   1220
   1221static void
   1222inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
   1223{
   1224	if (rt_se_boosted(rt_se))
   1225		rt_rq->rt_nr_boosted++;
   1226
   1227	if (rt_rq->tg)
   1228		start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
   1229}
   1230
   1231static void
   1232dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
   1233{
   1234	if (rt_se_boosted(rt_se))
   1235		rt_rq->rt_nr_boosted--;
   1236
   1237	WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
   1238}
   1239
   1240#else /* CONFIG_RT_GROUP_SCHED */
   1241
   1242static void
   1243inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
   1244{
   1245	start_rt_bandwidth(&def_rt_bandwidth);
   1246}
   1247
   1248static inline
   1249void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
   1250
   1251#endif /* CONFIG_RT_GROUP_SCHED */
   1252
   1253static inline
   1254unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
   1255{
   1256	struct rt_rq *group_rq = group_rt_rq(rt_se);
   1257
   1258	if (group_rq)
   1259		return group_rq->rt_nr_running;
   1260	else
   1261		return 1;
   1262}
   1263
   1264static inline
   1265unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
   1266{
   1267	struct rt_rq *group_rq = group_rt_rq(rt_se);
   1268	struct task_struct *tsk;
   1269
   1270	if (group_rq)
   1271		return group_rq->rr_nr_running;
   1272
   1273	tsk = rt_task_of(rt_se);
   1274
   1275	return (tsk->policy == SCHED_RR) ? 1 : 0;
   1276}
   1277
   1278static inline
   1279void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
   1280{
   1281	int prio = rt_se_prio(rt_se);
   1282
   1283	WARN_ON(!rt_prio(prio));
   1284	rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
   1285	rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
   1286
   1287	inc_rt_prio(rt_rq, prio);
   1288	inc_rt_migration(rt_se, rt_rq);
   1289	inc_rt_group(rt_se, rt_rq);
   1290}
   1291
   1292static inline
   1293void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
   1294{
   1295	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
   1296	WARN_ON(!rt_rq->rt_nr_running);
   1297	rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
   1298	rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
   1299
   1300	dec_rt_prio(rt_rq, rt_se_prio(rt_se));
   1301	dec_rt_migration(rt_se, rt_rq);
   1302	dec_rt_group(rt_se, rt_rq);
   1303}
   1304
   1305/*
   1306 * Change rt_se->run_list location unless SAVE && !MOVE
   1307 *
   1308 * assumes ENQUEUE/DEQUEUE flags match
   1309 */
   1310static inline bool move_entity(unsigned int flags)
   1311{
   1312	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
   1313		return false;
   1314
   1315	return true;
   1316}
   1317
   1318static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
   1319{
   1320	list_del_init(&rt_se->run_list);
   1321
   1322	if (list_empty(array->queue + rt_se_prio(rt_se)))
   1323		__clear_bit(rt_se_prio(rt_se), array->bitmap);
   1324
   1325	rt_se->on_list = 0;
   1326}
   1327
   1328static inline struct sched_statistics *
   1329__schedstats_from_rt_se(struct sched_rt_entity *rt_se)
   1330{
   1331#ifdef CONFIG_RT_GROUP_SCHED
   1332	/* schedstats is not supported for rt group. */
   1333	if (!rt_entity_is_task(rt_se))
   1334		return NULL;
   1335#endif
   1336
   1337	return &rt_task_of(rt_se)->stats;
   1338}
   1339
   1340static inline void
   1341update_stats_wait_start_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
   1342{
   1343	struct sched_statistics *stats;
   1344	struct task_struct *p = NULL;
   1345
   1346	if (!schedstat_enabled())
   1347		return;
   1348
   1349	if (rt_entity_is_task(rt_se))
   1350		p = rt_task_of(rt_se);
   1351
   1352	stats = __schedstats_from_rt_se(rt_se);
   1353	if (!stats)
   1354		return;
   1355
   1356	__update_stats_wait_start(rq_of_rt_rq(rt_rq), p, stats);
   1357}
   1358
   1359static inline void
   1360update_stats_enqueue_sleeper_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
   1361{
   1362	struct sched_statistics *stats;
   1363	struct task_struct *p = NULL;
   1364
   1365	if (!schedstat_enabled())
   1366		return;
   1367
   1368	if (rt_entity_is_task(rt_se))
   1369		p = rt_task_of(rt_se);
   1370
   1371	stats = __schedstats_from_rt_se(rt_se);
   1372	if (!stats)
   1373		return;
   1374
   1375	__update_stats_enqueue_sleeper(rq_of_rt_rq(rt_rq), p, stats);
   1376}
   1377
   1378static inline void
   1379update_stats_enqueue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
   1380			int flags)
   1381{
   1382	if (!schedstat_enabled())
   1383		return;
   1384
   1385	if (flags & ENQUEUE_WAKEUP)
   1386		update_stats_enqueue_sleeper_rt(rt_rq, rt_se);
   1387}
   1388
   1389static inline void
   1390update_stats_wait_end_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
   1391{
   1392	struct sched_statistics *stats;
   1393	struct task_struct *p = NULL;
   1394
   1395	if (!schedstat_enabled())
   1396		return;
   1397
   1398	if (rt_entity_is_task(rt_se))
   1399		p = rt_task_of(rt_se);
   1400
   1401	stats = __schedstats_from_rt_se(rt_se);
   1402	if (!stats)
   1403		return;
   1404
   1405	__update_stats_wait_end(rq_of_rt_rq(rt_rq), p, stats);
   1406}
   1407
   1408static inline void
   1409update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
   1410			int flags)
   1411{
   1412	struct task_struct *p = NULL;
   1413
   1414	if (!schedstat_enabled())
   1415		return;
   1416
   1417	if (rt_entity_is_task(rt_se))
   1418		p = rt_task_of(rt_se);
   1419
   1420	if ((flags & DEQUEUE_SLEEP) && p) {
   1421		unsigned int state;
   1422
   1423		state = READ_ONCE(p->__state);
   1424		if (state & TASK_INTERRUPTIBLE)
   1425			__schedstat_set(p->stats.sleep_start,
   1426					rq_clock(rq_of_rt_rq(rt_rq)));
   1427
   1428		if (state & TASK_UNINTERRUPTIBLE)
   1429			__schedstat_set(p->stats.block_start,
   1430					rq_clock(rq_of_rt_rq(rt_rq)));
   1431	}
   1432}
   1433
   1434static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
   1435{
   1436	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
   1437	struct rt_prio_array *array = &rt_rq->active;
   1438	struct rt_rq *group_rq = group_rt_rq(rt_se);
   1439	struct list_head *queue = array->queue + rt_se_prio(rt_se);
   1440
   1441	/*
    1442	 * Don't enqueue the group if it's throttled, or when empty.
   1443	 * The latter is a consequence of the former when a child group
   1444	 * get throttled and the current group doesn't have any other
   1445	 * active members.
   1446	 */
   1447	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
   1448		if (rt_se->on_list)
   1449			__delist_rt_entity(rt_se, array);
   1450		return;
   1451	}
   1452
   1453	if (move_entity(flags)) {
   1454		WARN_ON_ONCE(rt_se->on_list);
   1455		if (flags & ENQUEUE_HEAD)
   1456			list_add(&rt_se->run_list, queue);
   1457		else
   1458			list_add_tail(&rt_se->run_list, queue);
   1459
   1460		__set_bit(rt_se_prio(rt_se), array->bitmap);
   1461		rt_se->on_list = 1;
   1462	}
   1463	rt_se->on_rq = 1;
   1464
   1465	inc_rt_tasks(rt_se, rt_rq);
   1466}
   1467
   1468static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
   1469{
   1470	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
   1471	struct rt_prio_array *array = &rt_rq->active;
   1472
   1473	if (move_entity(flags)) {
   1474		WARN_ON_ONCE(!rt_se->on_list);
   1475		__delist_rt_entity(rt_se, array);
   1476	}
   1477	rt_se->on_rq = 0;
   1478
   1479	dec_rt_tasks(rt_se, rt_rq);
   1480}
   1481
   1482/*
   1483 * Because the prio of an upper entry depends on the lower
   1484 * entries, we must remove entries top - down.
   1485 */
   1486static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
   1487{
   1488	struct sched_rt_entity *back = NULL;
   1489
   1490	for_each_sched_rt_entity(rt_se) {
   1491		rt_se->back = back;
   1492		back = rt_se;
   1493	}
   1494
   1495	dequeue_top_rt_rq(rt_rq_of_se(back));
   1496
   1497	for (rt_se = back; rt_se; rt_se = rt_se->back) {
   1498		if (on_rt_rq(rt_se))
   1499			__dequeue_rt_entity(rt_se, flags);
   1500	}
   1501}
   1502
   1503static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
   1504{
   1505	struct rq *rq = rq_of_rt_se(rt_se);
   1506
   1507	update_stats_enqueue_rt(rt_rq_of_se(rt_se), rt_se, flags);
   1508
   1509	dequeue_rt_stack(rt_se, flags);
   1510	for_each_sched_rt_entity(rt_se)
   1511		__enqueue_rt_entity(rt_se, flags);
   1512	enqueue_top_rt_rq(&rq->rt);
   1513}
   1514
   1515static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
   1516{
   1517	struct rq *rq = rq_of_rt_se(rt_se);
   1518
   1519	update_stats_dequeue_rt(rt_rq_of_se(rt_se), rt_se, flags);
   1520
   1521	dequeue_rt_stack(rt_se, flags);
   1522
   1523	for_each_sched_rt_entity(rt_se) {
   1524		struct rt_rq *rt_rq = group_rt_rq(rt_se);
   1525
   1526		if (rt_rq && rt_rq->rt_nr_running)
   1527			__enqueue_rt_entity(rt_se, flags);
   1528	}
   1529	enqueue_top_rt_rq(&rq->rt);
   1530}
   1531
   1532/*
   1533 * Adding/removing a task to/from a priority array:
   1534 */
   1535static void
   1536enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
   1537{
   1538	struct sched_rt_entity *rt_se = &p->rt;
   1539
   1540	if (flags & ENQUEUE_WAKEUP)
   1541		rt_se->timeout = 0;
   1542
   1543	check_schedstat_required();
   1544	update_stats_wait_start_rt(rt_rq_of_se(rt_se), rt_se);
   1545
   1546	enqueue_rt_entity(rt_se, flags);
   1547
   1548	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
   1549		enqueue_pushable_task(rq, p);
   1550}
   1551
   1552static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
   1553{
   1554	struct sched_rt_entity *rt_se = &p->rt;
   1555
   1556	update_curr_rt(rq);
   1557	dequeue_rt_entity(rt_se, flags);
   1558
   1559	dequeue_pushable_task(rq, p);
   1560}
   1561
   1562/*
    1563 * Put a task at the head or the end of the run list without the overhead of
    1564 * a dequeue followed by an enqueue.
   1565 */
   1566static void
   1567requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
   1568{
   1569	if (on_rt_rq(rt_se)) {
   1570		struct rt_prio_array *array = &rt_rq->active;
   1571		struct list_head *queue = array->queue + rt_se_prio(rt_se);
   1572
   1573		if (head)
   1574			list_move(&rt_se->run_list, queue);
   1575		else
   1576			list_move_tail(&rt_se->run_list, queue);
   1577	}
   1578}
   1579
   1580static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
   1581{
   1582	struct sched_rt_entity *rt_se = &p->rt;
   1583	struct rt_rq *rt_rq;
   1584
   1585	for_each_sched_rt_entity(rt_se) {
   1586		rt_rq = rt_rq_of_se(rt_se);
   1587		requeue_rt_entity(rt_rq, rt_se, head);
   1588	}
   1589}
   1590
   1591static void yield_task_rt(struct rq *rq)
   1592{
   1593	requeue_task_rt(rq, rq->curr, 0);
   1594}
   1595
   1596#ifdef CONFIG_SMP
   1597static int find_lowest_rq(struct task_struct *task);
   1598
   1599static int
   1600select_task_rq_rt(struct task_struct *p, int cpu, int flags)
   1601{
   1602	struct task_struct *curr;
   1603	struct rq *rq;
   1604	bool test;
   1605
   1606	/* For anything but wake ups, just return the task_cpu */
   1607	if (!(flags & (WF_TTWU | WF_FORK)))
   1608		goto out;
   1609
   1610	rq = cpu_rq(cpu);
   1611
   1612	rcu_read_lock();
   1613	curr = READ_ONCE(rq->curr); /* unlocked access */
   1614
   1615	/*
   1616	 * If the current task on @p's runqueue is an RT task, then
   1617	 * try to see if we can wake this RT task up on another
   1618	 * runqueue. Otherwise simply start this RT task
   1619	 * on its current runqueue.
   1620	 *
   1621	 * We want to avoid overloading runqueues. If the woken
    1622	 * task is of higher priority, then it will stay on this CPU
   1623	 * and the lower prio task should be moved to another CPU.
   1624	 * Even though this will probably make the lower prio task
   1625	 * lose its cache, we do not want to bounce a higher task
   1626	 * around just because it gave up its CPU, perhaps for a
   1627	 * lock?
   1628	 *
   1629	 * For equal prio tasks, we just let the scheduler sort it out.
   1630	 *
   1631	 * Otherwise, just let it ride on the affined RQ and the
   1632	 * post-schedule router will push the preempted task away
   1633	 *
   1634	 * This test is optimistic, if we get it wrong the load-balancer
   1635	 * will have to sort it out.
   1636	 *
   1637	 * We take into account the capacity of the CPU to ensure it fits the
   1638	 * requirement of the task - which is only important on heterogeneous
   1639	 * systems like big.LITTLE.
   1640	 */
   1641	test = curr &&
   1642	       unlikely(rt_task(curr)) &&
   1643	       (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio);
   1644
   1645	if (test || !rt_task_fits_capacity(p, cpu)) {
   1646		int target = find_lowest_rq(p);
   1647
   1648		/*
   1649		 * Bail out if we were forcing a migration to find a better
   1650		 * fitting CPU but our search failed.
   1651		 */
   1652		if (!test && target != -1 && !rt_task_fits_capacity(p, target))
   1653			goto out_unlock;
   1654
   1655		/*
   1656		 * Don't bother moving it if the destination CPU is
   1657		 * not running a lower priority task.
   1658		 */
   1659		if (target != -1 &&
   1660		    p->prio < cpu_rq(target)->rt.highest_prio.curr)
   1661			cpu = target;
   1662	}
   1663
   1664out_unlock:
   1665	rcu_read_unlock();
   1666
   1667out:
   1668	return cpu;
   1669}
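One thing worth keeping in mind when reading the test above: lower ->prio means higher priority, so curr->prio <= p->prio says the currently running RT task is at least as important as the waking one, which is exactly when it pays to steer the waker elsewhere. A plain-C mock of that expression (the struct and field names here are illustrative only):

#include <stdbool.h>

struct mock_task {
	int  prio;		/* kernel prio: lower value = higher priority */
	int  nr_cpus_allowed;
	bool is_rt;
};

/* Mirrors the 'test' expression in select_task_rq_rt(). */
static bool try_to_place_elsewhere(const struct mock_task *curr,
				   const struct mock_task *p)
{
	return curr && curr->is_rt &&
	       (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio);
}

/*
 * Example: curr at prio 10 (SCHED_FIFO 89) vs. a waker at prio 20
 * (SCHED_FIFO 79): curr->prio <= p->prio, so find_lowest_rq() is consulted.
 */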
   1670
   1671static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
   1672{
   1673	/*
   1674	 * Current can't be migrated, useless to reschedule,
   1675	 * let's hope p can move out.
   1676	 */
   1677	if (rq->curr->nr_cpus_allowed == 1 ||
   1678	    !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
   1679		return;
   1680
   1681	/*
   1682	 * p is migratable, so let's not schedule it and
   1683	 * see if it is pushed or pulled somewhere else.
   1684	 */
   1685	if (p->nr_cpus_allowed != 1 &&
   1686	    cpupri_find(&rq->rd->cpupri, p, NULL))
   1687		return;
   1688
   1689	/*
   1690	 * There appear to be other CPUs that can accept
    1691	 * the current task but none can run 'p', so let's reschedule
   1692	 * to try and push the current task away:
   1693	 */
   1694	requeue_task_rt(rq, p, 1);
   1695	resched_curr(rq);
   1696}
   1697
   1698static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
   1699{
   1700	if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
   1701		/*
   1702		 * This is OK, because current is on_cpu, which avoids it being
   1703		 * picked for load-balance and preemption/IRQs are still
   1704		 * disabled avoiding further scheduler activity on it and we've
   1705		 * not yet started the picking loop.
   1706		 */
   1707		rq_unpin_lock(rq, rf);
   1708		pull_rt_task(rq);
   1709		rq_repin_lock(rq, rf);
   1710	}
   1711
   1712	return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq);
   1713}
   1714#endif /* CONFIG_SMP */
   1715
   1716/*
   1717 * Preempt the current task with a newly woken task if needed:
   1718 */
   1719static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
   1720{
   1721	if (p->prio < rq->curr->prio) {
   1722		resched_curr(rq);
   1723		return;
   1724	}
   1725
   1726#ifdef CONFIG_SMP
   1727	/*
   1728	 * If:
   1729	 *
   1730	 * - the newly woken task is of equal priority to the current task
   1731	 * - the newly woken task is non-migratable while current is migratable
   1732	 * - current will be preempted on the next reschedule
   1733	 *
   1734	 * we should check to see if current can readily move to a different
   1735	 * cpu.  If so, we will reschedule to allow the push logic to try
   1736	 * to move current somewhere else, making room for our non-migratable
   1737	 * task.
   1738	 */
   1739	if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
   1740		check_preempt_equal_prio(rq, p);
   1741#endif
   1742}
   1743
   1744static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first)
   1745{
   1746	struct sched_rt_entity *rt_se = &p->rt;
   1747	struct rt_rq *rt_rq = &rq->rt;
   1748
   1749	p->se.exec_start = rq_clock_task(rq);
   1750	if (on_rt_rq(&p->rt))
   1751		update_stats_wait_end_rt(rt_rq, rt_se);
   1752
   1753	/* The running task is never eligible for pushing */
   1754	dequeue_pushable_task(rq, p);
   1755
   1756	if (!first)
   1757		return;
   1758
   1759	/*
   1760	 * If prev task was rt, put_prev_task() has already updated the
   1761	 * utilization. We only care of the case where we start to schedule a
   1762	 * rt task
   1763	 */
   1764	if (rq->curr->sched_class != &rt_sched_class)
   1765		update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
   1766
   1767	rt_queue_push_tasks(rq);
   1768}
   1769
   1770static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq)
   1771{
   1772	struct rt_prio_array *array = &rt_rq->active;
   1773	struct sched_rt_entity *next = NULL;
   1774	struct list_head *queue;
   1775	int idx;
   1776
   1777	idx = sched_find_first_bit(array->bitmap);
   1778	BUG_ON(idx >= MAX_RT_PRIO);
   1779
   1780	queue = array->queue + idx;
   1781	next = list_entry(queue->next, struct sched_rt_entity, run_list);
   1782
   1783	return next;
   1784}
   1785
   1786static struct task_struct *_pick_next_task_rt(struct rq *rq)
   1787{
   1788	struct sched_rt_entity *rt_se;
   1789	struct rt_rq *rt_rq  = &rq->rt;
   1790
   1791	do {
   1792		rt_se = pick_next_rt_entity(rt_rq);
   1793		BUG_ON(!rt_se);
   1794		rt_rq = group_rt_rq(rt_se);
   1795	} while (rt_rq);
   1796
   1797	return rt_task_of(rt_se);
   1798}
   1799
   1800static struct task_struct *pick_task_rt(struct rq *rq)
   1801{
   1802	struct task_struct *p;
   1803
   1804	if (!sched_rt_runnable(rq))
   1805		return NULL;
   1806
   1807	p = _pick_next_task_rt(rq);
   1808
   1809	return p;
   1810}
   1811
   1812static struct task_struct *pick_next_task_rt(struct rq *rq)
   1813{
   1814	struct task_struct *p = pick_task_rt(rq);
   1815
   1816	if (p)
   1817		set_next_task_rt(rq, p, true);
   1818
   1819	return p;
   1820}
   1821
   1822static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
   1823{
   1824	struct sched_rt_entity *rt_se = &p->rt;
   1825	struct rt_rq *rt_rq = &rq->rt;
   1826
   1827	if (on_rt_rq(&p->rt))
   1828		update_stats_wait_start_rt(rt_rq, rt_se);
   1829
   1830	update_curr_rt(rq);
   1831
   1832	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
   1833
   1834	/*
   1835	 * The previous task needs to be made eligible for pushing
   1836	 * if it is still active
   1837	 */
   1838	if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
   1839		enqueue_pushable_task(rq, p);
   1840}
   1841
   1842#ifdef CONFIG_SMP
   1843
   1844/* Only try algorithms three times */
   1845#define RT_MAX_TRIES 3
   1846
   1847static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
   1848{
   1849	if (!task_running(rq, p) &&
   1850	    cpumask_test_cpu(cpu, &p->cpus_mask))
   1851		return 1;
   1852
   1853	return 0;
   1854}
   1855
   1856/*
    1857 * Return the highest-priority pushable task on this rq that can run on the
    1858 * given CPU, or NULL if there is none.
   1859 */
   1860static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
   1861{
   1862	struct plist_head *head = &rq->rt.pushable_tasks;
   1863	struct task_struct *p;
   1864
   1865	if (!has_pushable_tasks(rq))
   1866		return NULL;
   1867
   1868	plist_for_each_entry(p, head, pushable_tasks) {
   1869		if (pick_rt_task(rq, p, cpu))
   1870			return p;
   1871	}
   1872
   1873	return NULL;
   1874}
   1875
   1876static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
   1877
   1878static int find_lowest_rq(struct task_struct *task)
   1879{
   1880	struct sched_domain *sd;
   1881	struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
   1882	int this_cpu = smp_processor_id();
   1883	int cpu      = task_cpu(task);
   1884	int ret;
   1885
   1886	/* Make sure the mask is initialized first */
   1887	if (unlikely(!lowest_mask))
   1888		return -1;
   1889
   1890	if (task->nr_cpus_allowed == 1)
   1891		return -1; /* No other targets possible */
   1892
   1893	/*
   1894	 * If we're on an asymmetric system, ensure we consider the different
   1895	 * capacities of the CPUs when searching for the lowest_mask.
   1896	 */
   1897	if (static_branch_unlikely(&sched_asym_cpucapacity)) {
   1898
   1899		ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri,
   1900					  task, lowest_mask,
   1901					  rt_task_fits_capacity);
   1902	} else {
   1903
   1904		ret = cpupri_find(&task_rq(task)->rd->cpupri,
   1905				  task, lowest_mask);
   1906	}
   1907
   1908	if (!ret)
   1909		return -1; /* No targets found */
   1910
   1911	/*
   1912	 * At this point we have built a mask of CPUs representing the
   1913	 * lowest priority tasks in the system.  Now we want to elect
   1914	 * the best one based on our affinity and topology.
   1915	 *
   1916	 * We prioritize the last CPU that the task executed on since
   1917	 * it is most likely cache-hot in that location.
   1918	 */
   1919	if (cpumask_test_cpu(cpu, lowest_mask))
   1920		return cpu;
   1921
   1922	/*
   1923	 * Otherwise, we consult the sched_domains span maps to figure
   1924	 * out which CPU is logically closest to our hot cache data.
   1925	 */
   1926	if (!cpumask_test_cpu(this_cpu, lowest_mask))
   1927		this_cpu = -1; /* Skip this_cpu opt if not among lowest */
   1928
   1929	rcu_read_lock();
   1930	for_each_domain(cpu, sd) {
   1931		if (sd->flags & SD_WAKE_AFFINE) {
   1932			int best_cpu;
   1933
   1934			/*
   1935			 * "this_cpu" is cheaper to preempt than a
   1936			 * remote processor.
   1937			 */
   1938			if (this_cpu != -1 &&
   1939			    cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
   1940				rcu_read_unlock();
   1941				return this_cpu;
   1942			}
   1943
   1944			best_cpu = cpumask_any_and_distribute(lowest_mask,
   1945							      sched_domain_span(sd));
   1946			if (best_cpu < nr_cpu_ids) {
   1947				rcu_read_unlock();
   1948				return best_cpu;
   1949			}
   1950		}
   1951	}
   1952	rcu_read_unlock();
   1953
   1954	/*
   1955	 * And finally, if there were no matches within the domains
   1956	 * just give the caller *something* to work with from the compatible
   1957	 * locations.
   1958	 */
   1959	if (this_cpu != -1)
   1960		return this_cpu;
   1961
   1962	cpu = cpumask_any_distribute(lowest_mask);
   1963	if (cpu < nr_cpu_ids)
   1964		return cpu;
   1965
   1966	return -1;
   1967}
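
       /*
        * [Editor's note -- illustrative sketch, not part of the original source]
        *
        * Selection order of find_lowest_rq(), walked through with made-up CPU
        * numbers: assume the task last ran on CPU2 and cpupri built a
        * lowest_mask of {1,3,5}.
        *
        *   1. CPU2 is not in lowest_mask, so the cache-hot shortcut is skipped.
        *   2. If the current CPU (this_cpu) is in lowest_mask and shares an
        *      SD_WAKE_AFFINE domain with CPU2, this_cpu is returned.
        *   3. Otherwise the first SD_WAKE_AFFINE domain around CPU2 whose span
        *      intersects lowest_mask supplies a CPU via
        *      cpumask_any_and_distribute().
        *   4. Failing that, this_cpu (if it was in lowest_mask), otherwise any
        *      CPU from lowest_mask, otherwise -1.
        */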
   1968
   1969/* Will lock the rq it finds */
   1970static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
   1971{
   1972	struct rq *lowest_rq = NULL;
   1973	int tries;
   1974	int cpu;
   1975
   1976	for (tries = 0; tries < RT_MAX_TRIES; tries++) {
   1977		cpu = find_lowest_rq(task);
   1978
   1979		if ((cpu == -1) || (cpu == rq->cpu))
   1980			break;
   1981
   1982		lowest_rq = cpu_rq(cpu);
   1983
   1984		if (lowest_rq->rt.highest_prio.curr <= task->prio) {
   1985			/*
   1986			 * Target rq has tasks of equal or higher priority;
   1987			 * retrying does not release any lock and is unlikely
   1988			 * to yield a different result.
   1989			 */
   1990			lowest_rq = NULL;
   1991			break;
   1992		}
   1993
   1994		/* if the prio of this runqueue changed, try again */
   1995		if (double_lock_balance(rq, lowest_rq)) {
   1996			/*
   1997			 * We had to unlock the run queue. In
   1998			 * the meantime, the task could have
   1999			 * migrated already or had its affinity changed.
   2000			 * Also make sure that it wasn't scheduled on its rq.
   2001			 */
   2002			if (unlikely(task_rq(task) != rq ||
   2003				     !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
   2004				     task_running(rq, task) ||
   2005				     !rt_task(task) ||
   2006				     !task_on_rq_queued(task))) {
   2007
   2008				double_unlock_balance(rq, lowest_rq);
   2009				lowest_rq = NULL;
   2010				break;
   2011			}
   2012		}
   2013
   2014		/* If this rq is still suitable, use it. */
   2015		if (lowest_rq->rt.highest_prio.curr > task->prio)
   2016			break;
   2017
   2018		/* try again */
   2019		double_unlock_balance(rq, lowest_rq);
   2020		lowest_rq = NULL;
   2021	}
   2022
   2023	return lowest_rq;
   2024}
   2025
   2026static struct task_struct *pick_next_pushable_task(struct rq *rq)
   2027{
   2028	struct task_struct *p;
   2029
   2030	if (!has_pushable_tasks(rq))
   2031		return NULL;
   2032
   2033	p = plist_first_entry(&rq->rt.pushable_tasks,
   2034			      struct task_struct, pushable_tasks);
   2035
   2036	BUG_ON(rq->cpu != task_cpu(p));
   2037	BUG_ON(task_current(rq, p));
   2038	BUG_ON(p->nr_cpus_allowed <= 1);
   2039
   2040	BUG_ON(!task_on_rq_queued(p));
   2041	BUG_ON(!rt_task(p));
   2042
   2043	return p;
   2044}
   2045
   2046/*
   2047 * If the current CPU has more than one RT task, see if the
   2048 * non-running task can migrate over to a CPU that is running a task
   2049 * of lesser priority.
   2050 */
   2051static int push_rt_task(struct rq *rq, bool pull)
   2052{
   2053	struct task_struct *next_task;
   2054	struct rq *lowest_rq;
   2055	int ret = 0;
   2056
   2057	if (!rq->rt.overloaded)
   2058		return 0;
   2059
   2060	next_task = pick_next_pushable_task(rq);
   2061	if (!next_task)
   2062		return 0;
   2063
   2064retry:
   2065	/*
   2066	 * It's possible that the next_task slipped in with a
   2067	 * higher priority than current. If that's the case,
   2068	 * just reschedule current.
   2069	 */
   2070	if (unlikely(next_task->prio < rq->curr->prio)) {
   2071		resched_curr(rq);
   2072		return 0;
   2073	}
   2074
   2075	if (is_migration_disabled(next_task)) {
   2076		struct task_struct *push_task = NULL;
   2077		int cpu;
   2078
   2079		if (!pull || rq->push_busy)
   2080			return 0;
   2081
   2082		/*
   2083		 * Invoking find_lowest_rq() on anything but an RT task doesn't
   2084		 * make sense. Per the above priority check, curr has to
   2085		 * be of higher priority than next_task, so no need to
   2086		 * reschedule when bailing out.
   2087		 *
   2088		 * Note that the stoppers are masqueraded as SCHED_FIFO
   2089		 * (cf. sched_set_stop_task()), so we can't rely on rt_task().
   2090		 */
   2091		if (rq->curr->sched_class != &rt_sched_class)
   2092			return 0;
   2093
   2094		cpu = find_lowest_rq(rq->curr);
   2095		if (cpu == -1 || cpu == rq->cpu)
   2096			return 0;
   2097
   2098		/*
   2099		 * Since we found a CPU with a lower priority than @next_task,
   2100		 * it should be running. However, we cannot migrate it
   2101		 * to this other CPU; instead, attempt to push the current
   2102		 * running task on this CPU away.
   2103		 */
   2104		push_task = get_push_task(rq);
   2105		if (push_task) {
   2106			raw_spin_rq_unlock(rq);
   2107			stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
   2108					    push_task, &rq->push_work);
   2109			raw_spin_rq_lock(rq);
   2110		}
   2111
   2112		return 0;
   2113	}
   2114
   2115	if (WARN_ON(next_task == rq->curr))
   2116		return 0;
   2117
   2118	/* We might release rq lock */
   2119	get_task_struct(next_task);
   2120
   2121	/* find_lock_lowest_rq locks the rq if found */
   2122	lowest_rq = find_lock_lowest_rq(next_task, rq);
   2123	if (!lowest_rq) {
   2124		struct task_struct *task;
   2125		/*
   2126		 * find_lock_lowest_rq releases rq->lock
   2127		 * so it is possible that next_task has migrated.
   2128		 *
   2129		 * We need to make sure that the task is still on the same
   2130		 * run-queue and is also still the next task eligible for
   2131		 * pushing.
   2132		 */
   2133		task = pick_next_pushable_task(rq);
   2134		if (task == next_task) {
   2135			/*
   2136			 * The task hasn't migrated, and is still the next
   2137			 * eligible task, but we failed to find a run-queue
   2138			 * to push it to.  Do not retry in this case, since
   2139			 * other CPUs will pull from us when ready.
   2140			 */
   2141			goto out;
   2142		}
   2143
   2144		if (!task)
   2145			/* No more tasks, just exit */
   2146			goto out;
   2147
   2148		/*
   2149		 * Something has shifted, try again.
   2150		 */
   2151		put_task_struct(next_task);
   2152		next_task = task;
   2153		goto retry;
   2154	}
   2155
   2156	deactivate_task(rq, next_task, 0);
   2157	set_task_cpu(next_task, lowest_rq->cpu);
   2158	activate_task(lowest_rq, next_task, 0);
   2159	resched_curr(lowest_rq);
   2160	ret = 1;
   2161
   2162	double_unlock_balance(rq, lowest_rq);
   2163out:
   2164	put_task_struct(next_task);
   2165
   2166	return ret;
   2167}
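
       /*
        * [Editor's note -- illustrative scenario, not part of the original
        * source; CPU numbers and priorities are made up, lower prio value
        * means higher priority]
        *
        * Example of a successful push: CPU0's curr has prio 10 and a pushable
        * queued task T has prio 20, so CPU0 is RT overloaded. CPU1's
        * rt.highest_prio.curr is 50. pick_next_pushable_task() returns T,
        * find_lock_lowest_rq() selects CPU1 (50 > 20, so T can run there
        * immediately), then T is deactivated on CPU0, set_task_cpu()'d to
        * CPU1, activated there, and CPU1 is rescheduled.
        */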
   2168
   2169static void push_rt_tasks(struct rq *rq)
   2170{
   2171	/* push_rt_task() will return true if it moved an RT task */
   2172	while (push_rt_task(rq, false))
   2173		;
   2174}
   2175
   2176#ifdef HAVE_RT_PUSH_IPI
   2177
   2178/*
   2179 * When a high priority task schedules out from a CPU and a lower priority
   2180 * task is scheduled in, a check is made to see if there's any RT tasks
   2181 * on other CPUs that are waiting to run because a higher priority RT task
   2182 * is currently running on its CPU. In this case, the CPU with multiple RT
   2183 * tasks queued on it (overloaded) needs to be notified that a CPU has opened
   2184 * up that may be able to run one of its non-running queued RT tasks.
   2185 *
   2186 * All CPUs with overloaded RT tasks need to be notified as there is currently
   2187 * no way to know which of these CPUs have the highest priority task waiting
   2188 * to run. Instead of trying to take a spinlock on each of these CPUs,
   2189 * which has been shown to cause large latency on machines with many
   2190 * CPUs, an IPI is sent to the CPUs to have them push off the overloaded
   2191 * RT tasks waiting to run.
   2192 *
   2193 * Just sending an IPI to each of the CPUs is also an issue, as on machines
   2194 * with a large CPU count this can cause an IPI storm on a CPU, especially
   2195 * if it's the only CPU with multiple RT tasks queued and a large number
   2196 * of CPUs are scheduling a lower priority task at the same time.
   2197 *
   2198 * Each root domain has its own irq work function that can iterate over
   2199 * all CPUs with RT overloaded tasks. Since all CPUs with an overloaded RT
   2200 * task must be checked regardless of whether one or many CPUs are lowering
   2201 * their priority, there's a single irq work iterator that will try to
   2202 * push off the RT tasks that are waiting to run.
   2203 *
   2204 * When a CPU schedules a lower priority task, it will kick off the
   2205 * irq work iterator that will jump to each CPU with overloaded RT tasks.
   2206 * As it only takes the first CPU that schedules a lower priority task
   2207 * to start the process, the rto_loop_start variable is taken as a trylock
   2208 * and only the CPU that acquires it will go on to take the rto_lock.
   2209 * This prevents high contention on the lock as the process handles all
   2210 * CPUs scheduling lower priority tasks.
   2211 *
   2212 * All CPUs that are scheduling a lower priority task will increment the
   2213 * rto_loop_next variable. This will make sure that the irq work iterator
   2214 * checks all RT overloaded CPUs whenever a CPU schedules a new lower
   2215 * priority task, even if the iterator is in the middle of a scan. Incrementing
   2216 * rto_loop_next will cause the iterator to perform another scan.
   2217 *
   2218 */
   2219static int rto_next_cpu(struct root_domain *rd)
   2220{
   2221	int next;
   2222	int cpu;
   2223
   2224	/*
   2225	 * When starting the IPI RT pushing, rto_cpu is set to -1, so
   2226	 * rto_next_cpu() will simply return the first CPU found in
   2227	 * the rto_mask.
   2228	 *
   2229	 * If rto_next_cpu() is called while rto_cpu is a valid CPU, it
   2230	 * will return the next CPU found in the rto_mask.
   2231	 *
   2232	 * If there are no more CPUs left in the rto_mask, then a check is made
   2233	 * against rto_loop and rto_loop_next. rto_loop is only updated with
   2234	 * the rto_lock held, but any CPU may increment the rto_loop_next
   2235	 * without any locking.
   2236	 */
   2237	for (;;) {
   2238
   2239		/* When rto_cpu is -1 this acts like cpumask_first() */
   2240		cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
   2241
   2242		rd->rto_cpu = cpu;
   2243
   2244		if (cpu < nr_cpu_ids)
   2245			return cpu;
   2246
   2247		rd->rto_cpu = -1;
   2248
   2249		/*
   2250		 * ACQUIRE ensures we see the @rto_mask changes
   2251		 * made prior to the @next value observed.
   2252		 *
   2253		 * Matches WMB in rt_set_overload().
   2254		 */
   2255		next = atomic_read_acquire(&rd->rto_loop_next);
   2256
   2257		if (rd->rto_loop == next)
   2258			break;
   2259
   2260		rd->rto_loop = next;
   2261	}
   2262
   2263	return -1;
   2264}
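
       /*
        * [Editor's note -- illustrative walk-through, not part of the
        * original source; the counter values are made up]
        *
        * Why the rto_loop/rto_loop_next pair avoids missed pushes: suppose the
        * iterator has just visited the last CPU in rto_mask and rd->rto_loop
        * is 3. If some CPU lowered its priority in the meantime, it bumped
        * rto_loop_next to 4, so the acquire-read above sees 3 != 4, rto_loop
        * is set to 4 and the scan restarts from the beginning of rto_mask
        * instead of terminating and losing that event.
        */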
   2265
   2266static inline bool rto_start_trylock(atomic_t *v)
   2267{
   2268	return !atomic_cmpxchg_acquire(v, 0, 1);
   2269}
   2270
   2271static inline void rto_start_unlock(atomic_t *v)
   2272{
   2273	atomic_set_release(v, 0);
   2274}
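
       /*
        * [Editor's note -- explanatory comment, not part of the original
        * source]
        *
        * rto_start_trylock() relies on atomic_cmpxchg_acquire() returning the
        * value previously stored in *v: the caller that observes 0 (and hence
        * swaps in 1) is the single winner, every later caller sees 1 and the
        * negation makes the trylock return false. rto_start_unlock() pairs
        * with it via a release store back to 0, so the next loop start sees
        * all updates made while the "lock" was held.
        */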
   2275
   2276static void tell_cpu_to_push(struct rq *rq)
   2277{
   2278	int cpu = -1;
   2279
   2280	/* Keep the loop going if the IPI is currently active */
   2281	atomic_inc(&rq->rd->rto_loop_next);
   2282
   2283	/* Only one CPU can initiate a loop at a time */
   2284	if (!rto_start_trylock(&rq->rd->rto_loop_start))
   2285		return;
   2286
   2287	raw_spin_lock(&rq->rd->rto_lock);
   2288
   2289	/*
   2290	 * The rto_cpu is updated under the lock; if it holds a valid CPU
   2291	 * then the IPI is still running and will continue due to the
   2292	 * update to rto_loop_next, and nothing needs to be done here.
   2293	 * Otherwise it is finishing up and an IPI needs to be sent.
   2294	 */
   2295	if (rq->rd->rto_cpu < 0)
   2296		cpu = rto_next_cpu(rq->rd);
   2297
   2298	raw_spin_unlock(&rq->rd->rto_lock);
   2299
   2300	rto_start_unlock(&rq->rd->rto_loop_start);
   2301
   2302	if (cpu >= 0) {
   2303		/* Make sure the rd does not get freed while pushing */
   2304		sched_get_rd(rq->rd);
   2305		irq_work_queue_on(&rq->rd->rto_push_work, cpu);
   2306	}
   2307}
   2308
   2309/* Called from hardirq context */
   2310void rto_push_irq_work_func(struct irq_work *work)
   2311{
   2312	struct root_domain *rd =
   2313		container_of(work, struct root_domain, rto_push_work);
   2314	struct rq *rq;
   2315	int cpu;
   2316
   2317	rq = this_rq();
   2318
   2319	/*
   2320	 * We do not need to grab the lock to check for has_pushable_tasks.
   2321	 * When it gets updated, a check is made whether a push is possible.
   2322	 */
   2323	if (has_pushable_tasks(rq)) {
   2324		raw_spin_rq_lock(rq);
   2325		while (push_rt_task(rq, true))
   2326			;
   2327		raw_spin_rq_unlock(rq);
   2328	}
   2329
   2330	raw_spin_lock(&rd->rto_lock);
   2331
   2332	/* Pass the IPI to the next rt overloaded queue */
   2333	cpu = rto_next_cpu(rd);
   2334
   2335	raw_spin_unlock(&rd->rto_lock);
   2336
   2337	if (cpu < 0) {
   2338		sched_put_rd(rd);
   2339		return;
   2340	}
   2341
   2342	/* Try the next RT overloaded CPU */
   2343	irq_work_queue_on(&rd->rto_push_work, cpu);
   2344}
   2345#endif /* HAVE_RT_PUSH_IPI */
   2346
   2347static void pull_rt_task(struct rq *this_rq)
   2348{
   2349	int this_cpu = this_rq->cpu, cpu;
   2350	bool resched = false;
   2351	struct task_struct *p, *push_task;
   2352	struct rq *src_rq;
   2353	int rt_overload_count = rt_overloaded(this_rq);
   2354
   2355	if (likely(!rt_overload_count))
   2356		return;
   2357
   2358	/*
   2359	 * Match the barrier from rt_set_overload(); this guarantees that if we
   2360	 * see overloaded we must also see the rto_mask bit.
   2361	 */
   2362	smp_rmb();
   2363
   2364	/* If we are the only overloaded CPU do nothing */
   2365	if (rt_overload_count == 1 &&
   2366	    cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
   2367		return;
   2368
   2369#ifdef HAVE_RT_PUSH_IPI
   2370	if (sched_feat(RT_PUSH_IPI)) {
   2371		tell_cpu_to_push(this_rq);
   2372		return;
   2373	}
   2374#endif
   2375
   2376	for_each_cpu(cpu, this_rq->rd->rto_mask) {
   2377		if (this_cpu == cpu)
   2378			continue;
   2379
   2380		src_rq = cpu_rq(cpu);
   2381
   2382		/*
   2383		 * Don't bother taking the src_rq->lock if the next highest
   2384		 * task is known to be lower-priority than our current task.
   2385		 * This may look racy, but if this value is about to go
   2386		 * logically higher, the src_rq will push this task away.
   2387		 * And if it's going logically lower, we do not care.
   2388		 */
   2389		if (src_rq->rt.highest_prio.next >=
   2390		    this_rq->rt.highest_prio.curr)
   2391			continue;
   2392
   2393		/*
   2394		 * We can potentially drop this_rq's lock in
   2395		 * double_lock_balance, and another CPU could
   2396		 * alter this_rq
   2397		 */
   2398		push_task = NULL;
   2399		double_lock_balance(this_rq, src_rq);
   2400
   2401		/*
   2402		 * We can only pull a task that is pushable
   2403		 * on its rq, and no others.
   2404		 */
   2405		p = pick_highest_pushable_task(src_rq, this_cpu);
   2406
   2407		/*
   2408		 * Do we have an RT task that preempts
   2409		 * the to-be-scheduled task?
   2410		 */
   2411		if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
   2412			WARN_ON(p == src_rq->curr);
   2413			WARN_ON(!task_on_rq_queued(p));
   2414
   2415			/*
   2416			 * There's a chance that p is higher in priority
   2417			 * than what's currently running on its CPU.
   2418			 * This is just because p is waking up and hasn't
   2419			 * had a chance to schedule yet. We only pull
   2420			 * p if it is lower in priority than the
   2421			 * current task on its run queue.
   2422			 */
   2423			if (p->prio < src_rq->curr->prio)
   2424				goto skip;
   2425
   2426			if (is_migration_disabled(p)) {
   2427				push_task = get_push_task(src_rq);
   2428			} else {
   2429				deactivate_task(src_rq, p, 0);
   2430				set_task_cpu(p, this_cpu);
   2431				activate_task(this_rq, p, 0);
   2432				resched = true;
   2433			}
   2434			/*
   2435			 * We continue with the search, just in
   2436			 * case there's an even higher prio task
   2437			 * in another runqueue. (low likelihood
   2438			 * but possible)
   2439			 */
   2440		}
   2441skip:
   2442		double_unlock_balance(this_rq, src_rq);
   2443
   2444		if (push_task) {
   2445			raw_spin_rq_unlock(this_rq);
   2446			stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
   2447					    push_task, &src_rq->push_work);
   2448			raw_spin_rq_lock(this_rq);
   2449		}
   2450	}
   2451
   2452	if (resched)
   2453		resched_curr(this_rq);
   2454}
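
       /*
        * [Editor's note -- illustrative scenario, not part of the original
        * source; CPU numbers and priorities are made up, lower prio value
        * means higher priority]
        *
        * Example pull: this CPU is about to run a prio 60 task, so its
        * rt.highest_prio.curr is 60. CPU3 is in rto_mask with a pushable task
        * P of prio 40 (rt.highest_prio.next == 40) queued behind a prio 20
        * task. Since 40 < 60 the rq locks are taken, P is found by
        * pick_highest_pushable_task(), P is not higher priority than CPU3's
        * prio 20 curr, so P is deactivated there, moved here with
        * set_task_cpu(), activated, and this CPU reschedules into P.
        */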
   2455
   2456/*
   2457 * If we are not running and we are not going to reschedule soon, we should
   2458 * try to push tasks away now
   2459 */
   2460static void task_woken_rt(struct rq *rq, struct task_struct *p)
   2461{
   2462	bool need_to_push = !task_running(rq, p) &&
   2463			    !test_tsk_need_resched(rq->curr) &&
   2464			    p->nr_cpus_allowed > 1 &&
   2465			    (dl_task(rq->curr) || rt_task(rq->curr)) &&
   2466			    (rq->curr->nr_cpus_allowed < 2 ||
   2467			     rq->curr->prio <= p->prio);
   2468
   2469	if (need_to_push)
   2470		push_rt_tasks(rq);
   2471}
   2472
   2473/* Assumes rq->lock is held */
   2474static void rq_online_rt(struct rq *rq)
   2475{
   2476	if (rq->rt.overloaded)
   2477		rt_set_overload(rq);
   2478
   2479	__enable_runtime(rq);
   2480
   2481	cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
   2482}
   2483
   2484/* Assumes rq->lock is held */
   2485static void rq_offline_rt(struct rq *rq)
   2486{
   2487	if (rq->rt.overloaded)
   2488		rt_clear_overload(rq);
   2489
   2490	__disable_runtime(rq);
   2491
   2492	cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
   2493}
   2494
   2495/*
   2496 * When switching away from the RT queue, we bring ourselves to a position
   2497 * where we might want to pull RT tasks from other runqueues.
   2498 */
   2499static void switched_from_rt(struct rq *rq, struct task_struct *p)
   2500{
   2501	/*
   2502	 * If there are other RT tasks then we will reschedule
   2503	 * and the scheduling of the other RT tasks will handle
   2504	 * the balancing. But if we are the last RT task
   2505	 * we may need to handle the pulling of RT tasks
   2506	 * now.
   2507	 */
   2508	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
   2509		return;
   2510
   2511	rt_queue_pull_task(rq);
   2512}
   2513
   2514void __init init_sched_rt_class(void)
   2515{
   2516	unsigned int i;
   2517
   2518	for_each_possible_cpu(i) {
   2519		zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
   2520					GFP_KERNEL, cpu_to_node(i));
   2521	}
   2522}
   2523#endif /* CONFIG_SMP */
   2524
   2525/*
   2526 * When switching a task to RT, we may overload the runqueue
   2527 * with RT tasks. In this case we try to push them off to
   2528 * other runqueues.
   2529 */
   2530static void switched_to_rt(struct rq *rq, struct task_struct *p)
   2531{
   2532	/*
   2533	 * If we are running, update the avg_rt tracking, as the running time
   2534	 * will from now on be accounted to the latter.
   2535	 */
   2536	if (task_current(rq, p)) {
   2537		update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
   2538		return;
   2539	}
   2540
   2541	/*
   2542	 * If we are not running we may need to preempt the current
   2543	 * running task. If that current running task is also an RT task
   2544	 * then see if we can move to another run queue.
   2545	 */
   2546	if (task_on_rq_queued(p)) {
   2547#ifdef CONFIG_SMP
   2548		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
   2549			rt_queue_push_tasks(rq);
   2550#endif /* CONFIG_SMP */
   2551		if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
   2552			resched_curr(rq);
   2553	}
   2554}
   2555
   2556/*
   2557 * Priority of the task has changed. This may cause
   2558 * us to initiate a push or pull.
   2559 */
   2560static void
   2561prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
   2562{
   2563	if (!task_on_rq_queued(p))
   2564		return;
   2565
   2566	if (task_current(rq, p)) {
   2567#ifdef CONFIG_SMP
   2568		/*
   2569		 * If our priority decreases while running, we
   2570		 * may need to pull tasks to this runqueue.
   2571		 */
   2572		if (oldprio < p->prio)
   2573			rt_queue_pull_task(rq);
   2574
   2575		/*
   2576		 * If there's a higher priority task waiting to run
   2577		 * then reschedule.
   2578		 */
   2579		if (p->prio > rq->rt.highest_prio.curr)
   2580			resched_curr(rq);
   2581#else
   2582		/* For UP simply resched on drop of prio */
   2583		if (oldprio < p->prio)
   2584			resched_curr(rq);
   2585#endif /* CONFIG_SMP */
   2586	} else {
   2587		/*
   2588		 * This task is not running, but if its priority
   2589		 * is higher than that of the current running task,
   2590		 * then reschedule.
   2591		 */
   2592		if (p->prio < rq->curr->prio)
   2593			resched_curr(rq);
   2594	}
   2595}
   2596
   2597#ifdef CONFIG_POSIX_TIMERS
   2598static void watchdog(struct rq *rq, struct task_struct *p)
   2599{
   2600	unsigned long soft, hard;
   2601
   2602	/* max may change after cur was read; this will be fixed next tick */
   2603	soft = task_rlimit(p, RLIMIT_RTTIME);
   2604	hard = task_rlimit_max(p, RLIMIT_RTTIME);
   2605
   2606	if (soft != RLIM_INFINITY) {
   2607		unsigned long next;
   2608
   2609		if (p->rt.watchdog_stamp != jiffies) {
   2610			p->rt.timeout++;
   2611			p->rt.watchdog_stamp = jiffies;
   2612		}
   2613
   2614		next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
   2615		if (p->rt.timeout > next) {
   2616			posix_cputimers_rt_watchdog(&p->posix_cputimers,
   2617						    p->se.sum_exec_runtime);
   2618		}
   2619	}
   2620}
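
       /*
        * [Editor's note -- worked example, not part of the original source,
        * using assumed values and assuming the hard limit is not below the
        * soft limit]
        *
        * RLIMIT_RTTIME is expressed in microseconds while p->rt.timeout counts
        * scheduler ticks. With a soft limit of 1000000 us and HZ == 250, one
        * tick is USEC_PER_SEC/HZ == 4000 us, so
        * next = DIV_ROUND_UP(1000000, 4000) == 250 ticks; once timeout exceeds
        * that, the posix CPU timer code is asked to act on the RLIMIT_RTTIME
        * limit for this task.
        */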
   2621#else
   2622static inline void watchdog(struct rq *rq, struct task_struct *p) { }
   2623#endif
   2624
   2625/*
   2626 * scheduler tick hitting a task of our scheduling class.
   2627 *
   2628 * NOTE: This function can be called remotely by the tick offload that
   2629 * goes along with full dynticks. Therefore no local assumption can be made
   2630 * and everything must be accessed through the @rq and @curr passed in
   2631 * parameters.
   2632 */
   2633static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
   2634{
   2635	struct sched_rt_entity *rt_se = &p->rt;
   2636
   2637	update_curr_rt(rq);
   2638	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
   2639
   2640	watchdog(rq, p);
   2641
   2642	/*
   2643	 * RR tasks need a special form of timeslice management.
   2644	 * FIFO tasks have no timeslices.
   2645	 */
   2646	if (p->policy != SCHED_RR)
   2647		return;
   2648
   2649	if (--p->rt.time_slice)
   2650		return;
   2651
   2652	p->rt.time_slice = sched_rr_timeslice;
   2653
   2654	/*
   2655	 * Requeue to the end of the queue if we (and all of our ancestors) are
   2656	 * not the only element on the queue.
   2657	 */
   2658	for_each_sched_rt_entity(rt_se) {
   2659		if (rt_se->run_list.prev != rt_se->run_list.next) {
   2660			requeue_task_rt(rq, p, 0);
   2661			resched_curr(rq);
   2662			return;
   2663		}
   2664	}
   2665}
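
       /*
        * [Editor's note -- worked example, not part of the original source,
        * using assumed values]
        *
        * For a SCHED_RR task, time_slice is decremented once per tick. With
        * the default RR_TIMESLICE of 100 ms worth of jiffies and HZ == 1000,
        * the task runs for 100 ticks before time_slice reaches 0; it is then
        * given a fresh sched_rr_timeslice and requeued to the tail of its
        * priority list (triggering a reschedule), unless it and all of its
        * ancestors are the only entity on their queue.
        */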
   2666
   2667static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
   2668{
   2669	/*
   2670	 * Time slice is 0 for SCHED_FIFO tasks
   2671	 */
   2672	if (task->policy == SCHED_RR)
   2673		return sched_rr_timeslice;
   2674	else
   2675		return 0;
   2676}
   2677
   2678DEFINE_SCHED_CLASS(rt) = {
   2679
   2680	.enqueue_task		= enqueue_task_rt,
   2681	.dequeue_task		= dequeue_task_rt,
   2682	.yield_task		= yield_task_rt,
   2683
   2684	.check_preempt_curr	= check_preempt_curr_rt,
   2685
   2686	.pick_next_task		= pick_next_task_rt,
   2687	.put_prev_task		= put_prev_task_rt,
   2688	.set_next_task          = set_next_task_rt,
   2689
   2690#ifdef CONFIG_SMP
   2691	.balance		= balance_rt,
   2692	.pick_task		= pick_task_rt,
   2693	.select_task_rq		= select_task_rq_rt,
   2694	.set_cpus_allowed       = set_cpus_allowed_common,
   2695	.rq_online              = rq_online_rt,
   2696	.rq_offline             = rq_offline_rt,
   2697	.task_woken		= task_woken_rt,
   2698	.switched_from		= switched_from_rt,
   2699	.find_lock_rq		= find_lock_lowest_rq,
   2700#endif
   2701
   2702	.task_tick		= task_tick_rt,
   2703
   2704	.get_rr_interval	= get_rr_interval_rt,
   2705
   2706	.prio_changed		= prio_changed_rt,
   2707	.switched_to		= switched_to_rt,
   2708
   2709	.update_curr		= update_curr_rt,
   2710
   2711#ifdef CONFIG_UCLAMP_TASK
   2712	.uclamp_enabled		= 1,
   2713#endif
   2714};
   2715
   2716#ifdef CONFIG_RT_GROUP_SCHED
   2717/*
   2718 * Ensure that the real-time constraints are schedulable.
   2719 */
   2720static DEFINE_MUTEX(rt_constraints_mutex);
   2721
   2722static inline int tg_has_rt_tasks(struct task_group *tg)
   2723{
   2724	struct task_struct *task;
   2725	struct css_task_iter it;
   2726	int ret = 0;
   2727
   2728	/*
   2729	 * Autogroups do not have RT tasks; see autogroup_create().
   2730	 */
   2731	if (task_group_is_autogroup(tg))
   2732		return 0;
   2733
   2734	css_task_iter_start(&tg->css, 0, &it);
   2735	while (!ret && (task = css_task_iter_next(&it)))
   2736		ret |= rt_task(task);
   2737	css_task_iter_end(&it);
   2738
   2739	return ret;
   2740}
   2741
   2742struct rt_schedulable_data {
   2743	struct task_group *tg;
   2744	u64 rt_period;
   2745	u64 rt_runtime;
   2746};
   2747
   2748static int tg_rt_schedulable(struct task_group *tg, void *data)
   2749{
   2750	struct rt_schedulable_data *d = data;
   2751	struct task_group *child;
   2752	unsigned long total, sum = 0;
   2753	u64 period, runtime;
   2754
   2755	period = ktime_to_ns(tg->rt_bandwidth.rt_period);
   2756	runtime = tg->rt_bandwidth.rt_runtime;
   2757
   2758	if (tg == d->tg) {
   2759		period = d->rt_period;
   2760		runtime = d->rt_runtime;
   2761	}
   2762
   2763	/*
   2764	 * Cannot have more runtime than the period.
   2765	 */
   2766	if (runtime > period && runtime != RUNTIME_INF)
   2767		return -EINVAL;
   2768
   2769	/*
   2770	 * Ensure we don't starve existing RT tasks if runtime turns zero.
   2771	 */
   2772	if (rt_bandwidth_enabled() && !runtime &&
   2773	    tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))
   2774		return -EBUSY;
   2775
   2776	total = to_ratio(period, runtime);
   2777
   2778	/*
   2779	 * Nobody can have more than the global setting allows.
   2780	 */
   2781	if (total > to_ratio(global_rt_period(), global_rt_runtime()))
   2782		return -EINVAL;
   2783
   2784	/*
   2785	 * The sum of our children's runtime should not exceed our own.
   2786	 */
   2787	list_for_each_entry_rcu(child, &tg->children, siblings) {
   2788		period = ktime_to_ns(child->rt_bandwidth.rt_period);
   2789		runtime = child->rt_bandwidth.rt_runtime;
   2790
   2791		if (child == d->tg) {
   2792			period = d->rt_period;
   2793			runtime = d->rt_runtime;
   2794		}
   2795
   2796		sum += to_ratio(period, runtime);
   2797	}
   2798
   2799	if (sum > total)
   2800		return -EINVAL;
   2801
   2802	return 0;
   2803}
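
       /*
        * [Editor's note -- worked example, not part of the original source;
        * it assumes to_ratio() computes (runtime << BW_SHIFT) / period with
        * BW_SHIFT == 20, as defined elsewhere in the scheduler]
        *
        * Under that assumption, global limits of period = 1000000 us and
        * runtime = 950000 us give a total of roughly 0.95 * 2^20 ~= 996147.
        * A group whose own runtime/period ratio exceeds that global ratio, or
        * whose children's ratios sum to more than its own total, is rejected
        * with -EINVAL.
        */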
   2804
   2805static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
   2806{
   2807	int ret;
   2808
   2809	struct rt_schedulable_data data = {
   2810		.tg = tg,
   2811		.rt_period = period,
   2812		.rt_runtime = runtime,
   2813	};
   2814
   2815	rcu_read_lock();
   2816	ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
   2817	rcu_read_unlock();
   2818
   2819	return ret;
   2820}
   2821
   2822static int tg_set_rt_bandwidth(struct task_group *tg,
   2823		u64 rt_period, u64 rt_runtime)
   2824{
   2825	int i, err = 0;
   2826
   2827	/*
   2828	 * Disallowing the root group's RT runtime is BAD; it would prevent the
   2829	 * kernel from creating (and/or operating) RT threads.
   2830	 */
   2831	if (tg == &root_task_group && rt_runtime == 0)
   2832		return -EINVAL;
   2833
   2834	/* No period doesn't make any sense. */
   2835	if (rt_period == 0)
   2836		return -EINVAL;
   2837
   2838	/*
   2839	 * Bound the quota to defend against overflow during the bandwidth shift.
   2840	 */
   2841	if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
   2842		return -EINVAL;
   2843
   2844	mutex_lock(&rt_constraints_mutex);
   2845	err = __rt_schedulable(tg, rt_period, rt_runtime);
   2846	if (err)
   2847		goto unlock;
   2848
   2849	raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
   2850	tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
   2851	tg->rt_bandwidth.rt_runtime = rt_runtime;
   2852
   2853	for_each_possible_cpu(i) {
   2854		struct rt_rq *rt_rq = tg->rt_rq[i];
   2855
   2856		raw_spin_lock(&rt_rq->rt_runtime_lock);
   2857		rt_rq->rt_runtime = rt_runtime;
   2858		raw_spin_unlock(&rt_rq->rt_runtime_lock);
   2859	}
   2860	raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
   2861unlock:
   2862	mutex_unlock(&rt_constraints_mutex);
   2863
   2864	return err;
   2865}
   2866
   2867int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
   2868{
   2869	u64 rt_runtime, rt_period;
   2870
   2871	rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
   2872	rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
   2873	if (rt_runtime_us < 0)
   2874		rt_runtime = RUNTIME_INF;
   2875	else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC)
   2876		return -EINVAL;
   2877
   2878	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
   2879}
   2880
   2881long sched_group_rt_runtime(struct task_group *tg)
   2882{
   2883	u64 rt_runtime_us;
   2884
   2885	if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
   2886		return -1;
   2887
   2888	rt_runtime_us = tg->rt_bandwidth.rt_runtime;
   2889	do_div(rt_runtime_us, NSEC_PER_USEC);
   2890	return rt_runtime_us;
   2891}
   2892
   2893int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
   2894{
   2895	u64 rt_runtime, rt_period;
   2896
   2897	if (rt_period_us > U64_MAX / NSEC_PER_USEC)
   2898		return -EINVAL;
   2899
   2900	rt_period = rt_period_us * NSEC_PER_USEC;
   2901	rt_runtime = tg->rt_bandwidth.rt_runtime;
   2902
   2903	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
   2904}
   2905
   2906long sched_group_rt_period(struct task_group *tg)
   2907{
   2908	u64 rt_period_us;
   2909
   2910	rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
   2911	do_div(rt_period_us, NSEC_PER_USEC);
   2912	return rt_period_us;
   2913}
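
       /*
        * [Editor's note -- illustrative usage, not part of the original
        * source; the cgroup paths and group name are examples]
        *
        * With CONFIG_RT_GROUP_SCHED these helpers back the cgroup v1 cpu
        * controller files cpu.rt_runtime_us and cpu.rt_period_us, so giving a
        * group 30% of the RT bandwidth might look like:
        *
        *   echo 1000000 > /sys/fs/cgroup/cpu/mygroup/cpu.rt_period_us
        *   echo  300000 > /sys/fs/cgroup/cpu/mygroup/cpu.rt_runtime_us
        *
        * A negative runtime (e.g. -1) is mapped to RUNTIME_INF by
        * sched_group_set_rt_runtime() above.
        */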
   2914
   2915#ifdef CONFIG_SYSCTL
   2916static int sched_rt_global_constraints(void)
   2917{
   2918	int ret = 0;
   2919
   2920	mutex_lock(&rt_constraints_mutex);
   2921	ret = __rt_schedulable(NULL, 0, 0);
   2922	mutex_unlock(&rt_constraints_mutex);
   2923
   2924	return ret;
   2925}
   2926#endif /* CONFIG_SYSCTL */
   2927
   2928int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
   2929{
   2930	/* Don't accept realtime tasks when there is no way for them to run */
   2931	if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
   2932		return 0;
   2933
   2934	return 1;
   2935}
   2936
   2937#else /* !CONFIG_RT_GROUP_SCHED */
   2938
   2939#ifdef CONFIG_SYSCTL
   2940static int sched_rt_global_constraints(void)
   2941{
   2942	unsigned long flags;
   2943	int i;
   2944
   2945	raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
   2946	for_each_possible_cpu(i) {
   2947		struct rt_rq *rt_rq = &cpu_rq(i)->rt;
   2948
   2949		raw_spin_lock(&rt_rq->rt_runtime_lock);
   2950		rt_rq->rt_runtime = global_rt_runtime();
   2951		raw_spin_unlock(&rt_rq->rt_runtime_lock);
   2952	}
   2953	raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
   2954
   2955	return 0;
   2956}
   2957#endif /* CONFIG_SYSCTL */
   2958#endif /* CONFIG_RT_GROUP_SCHED */
   2959
   2960#ifdef CONFIG_SYSCTL
   2961static int sched_rt_global_validate(void)
   2962{
   2963	if (sysctl_sched_rt_period <= 0)
   2964		return -EINVAL;
   2965
   2966	if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
   2967		((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
   2968		 ((u64)sysctl_sched_rt_runtime *
   2969			NSEC_PER_USEC > max_rt_runtime)))
   2970		return -EINVAL;
   2971
   2972	return 0;
   2973}
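
       /*
        * [Editor's note -- worked example, not part of the original source,
        * using assumed values]
        *
        * A period of 1000000 us with a runtime of 950000 us passes these
        * checks. A runtime of 1200000 us with the same period fails
        * (runtime > period), as does any runtime whose nanosecond value
        * (runtime * NSEC_PER_USEC) exceeds max_rt_runtime. A negative runtime
        * is treated as RUNTIME_INF (no global RT throttling) and skips the
        * runtime checks entirely.
        */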
   2974
   2975static void sched_rt_do_global(void)
   2976{
   2977	unsigned long flags;
   2978
   2979	raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
   2980	def_rt_bandwidth.rt_runtime = global_rt_runtime();
   2981	def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
   2982	raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
   2983}
   2984
   2985static int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
   2986		size_t *lenp, loff_t *ppos)
   2987{
   2988	int old_period, old_runtime;
   2989	static DEFINE_MUTEX(mutex);
   2990	int ret;
   2991
   2992	mutex_lock(&mutex);
   2993	old_period = sysctl_sched_rt_period;
   2994	old_runtime = sysctl_sched_rt_runtime;
   2995
   2996	ret = proc_dointvec(table, write, buffer, lenp, ppos);
   2997
   2998	if (!ret && write) {
   2999		ret = sched_rt_global_validate();
   3000		if (ret)
   3001			goto undo;
   3002
   3003		ret = sched_dl_global_validate();
   3004		if (ret)
   3005			goto undo;
   3006
   3007		ret = sched_rt_global_constraints();
   3008		if (ret)
   3009			goto undo;
   3010
   3011		sched_rt_do_global();
   3012		sched_dl_do_global();
   3013	}
   3014	if (0) {
   3015undo:
   3016		sysctl_sched_rt_period = old_period;
   3017		sysctl_sched_rt_runtime = old_runtime;
   3018	}
   3019	mutex_unlock(&mutex);
   3020
   3021	return ret;
   3022}
   3023
   3024static int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
   3025		size_t *lenp, loff_t *ppos)
   3026{
   3027	int ret;
   3028	static DEFINE_MUTEX(mutex);
   3029
   3030	mutex_lock(&mutex);
   3031	ret = proc_dointvec(table, write, buffer, lenp, ppos);
   3032	/*
   3033	 * Make sure that internally we keep jiffies.
   3034	 * Also, writing zero resets the timeslice to default:
   3035	 */
   3036	if (!ret && write) {
   3037		sched_rr_timeslice =
   3038			sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
   3039			msecs_to_jiffies(sysctl_sched_rr_timeslice);
   3040	}
   3041	mutex_unlock(&mutex);
   3042
   3043	return ret;
   3044}
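
       /*
        * [Editor's note -- illustrative usage, not part of the original
        * source; the values shown are examples]
        *
        * These handlers back the kernel.sched_rt_period_us,
        * kernel.sched_rt_runtime_us and kernel.sched_rr_timeslice_ms sysctls,
        * so tuning from userspace might look like:
        *
        *   sysctl -w kernel.sched_rr_timeslice_ms=100
        *   sysctl -w kernel.sched_rt_runtime_us=950000
        *   sysctl -w kernel.sched_rt_period_us=1000000
        *
        * Writing a value <= 0 to sched_rr_timeslice_ms restores the default
        * RR_TIMESLICE; internally the value is kept in jiffies via
        * msecs_to_jiffies().
        */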
   3045#endif /* CONFIG_SYSCTL */
   3046
   3047#ifdef CONFIG_SCHED_DEBUG
   3048void print_rt_stats(struct seq_file *m, int cpu)
   3049{
   3050	rt_rq_iter_t iter;
   3051	struct rt_rq *rt_rq;
   3052
   3053	rcu_read_lock();
   3054	for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
   3055		print_rt_rq(m, cpu, rt_rq);
   3056	rcu_read_unlock();
   3057}
   3058#endif /* CONFIG_SCHED_DEBUG */