cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

fair.c (316769B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
      4 *
      5 *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
      6 *
      7 *  Interactivity improvements by Mike Galbraith
      8 *  (C) 2007 Mike Galbraith <efault@gmx.de>
      9 *
     10 *  Various enhancements by Dmitry Adamushko.
     11 *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
     12 *
     13 *  Group scheduling enhancements by Srivatsa Vaddagiri
     14 *  Copyright IBM Corporation, 2007
     15 *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
     16 *
     17 *  Scaled math optimizations by Thomas Gleixner
     18 *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
     19 *
     20 *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
     21 *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
     22 */
     23#include <linux/energy_model.h>
     24#include <linux/mmap_lock.h>
     25#include <linux/hugetlb_inline.h>
     26#include <linux/jiffies.h>
     27#include <linux/mm_api.h>
     28#include <linux/highmem.h>
     29#include <linux/spinlock_api.h>
     30#include <linux/cpumask_api.h>
     31#include <linux/lockdep_api.h>
     32#include <linux/softirq.h>
     33#include <linux/refcount_api.h>
     34#include <linux/topology.h>
     35#include <linux/sched/clock.h>
     36#include <linux/sched/cond_resched.h>
     37#include <linux/sched/cputime.h>
     38#include <linux/sched/isolation.h>
     39#include <linux/sched/nohz.h>
     40
     41#include <linux/cpuidle.h>
     42#include <linux/interrupt.h>
     43#include <linux/mempolicy.h>
     44#include <linux/mutex_api.h>
     45#include <linux/profile.h>
     46#include <linux/psi.h>
     47#include <linux/ratelimit.h>
     48#include <linux/task_work.h>
     49
     50#include <asm/switch_to.h>
     51
     52#include <linux/sched/cond_resched.h>
     53
     54#include "sched.h"
     55#include "stats.h"
     56#include "autogroup.h"
     57
     58/*
     59 * Targeted preemption latency for CPU-bound tasks:
     60 *
     61 * NOTE: this latency value is not the same as the concept of
     62 * 'timeslice length' - timeslices in CFS are of variable length
     63 * and have no persistent notion like in traditional, time-slice
     64 * based scheduling concepts.
     65 *
     66 * (to see the precise effective timeslice length of your workload,
     67 *  run vmstat and monitor the context-switches (cs) field)
     68 *
     69 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
     70 */
     71unsigned int sysctl_sched_latency			= 6000000ULL;
     72static unsigned int normalized_sysctl_sched_latency	= 6000000ULL;
     73
     74/*
     75 * The initial- and re-scaling of tunables is configurable
     76 *
     77 * Options are:
     78 *
     79 *   SCHED_TUNABLESCALING_NONE - unscaled, always *1
      80 *   SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
     81 *   SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
     82 *
      83 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
     84 */
     85unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
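
        /*
         * Illustrative arithmetic (editor's note, not part of the original
         * source): on a machine with 8 online CPUs, get_update_sysctl_factor()
         * below yields, per scaling mode:
         *
         *   SCHED_TUNABLESCALING_NONE:   factor = 1                -> latency  6 ms
         *   SCHED_TUNABLESCALING_LOG:    factor = 1 + ilog2(8) = 4 -> latency 24 ms
         *   SCHED_TUNABLESCALING_LINEAR: factor = 8                -> latency 48 ms
         *
         * The CPU count is clamped to 8 in get_update_sysctl_factor(), so larger
         * machines do not scale these tunables any further.
         */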
     86
     87/*
     88 * Minimal preemption granularity for CPU-bound tasks:
     89 *
     90 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
     91 */
     92unsigned int sysctl_sched_min_granularity			= 750000ULL;
     93static unsigned int normalized_sysctl_sched_min_granularity	= 750000ULL;
     94
     95/*
     96 * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks.
     97 * Applies only when SCHED_IDLE tasks compete with normal tasks.
     98 *
     99 * (default: 0.75 msec)
    100 */
    101unsigned int sysctl_sched_idle_min_granularity			= 750000ULL;
    102
    103/*
    104 * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
    105 */
    106static unsigned int sched_nr_latency = 8;
    107
    108/*
    109 * After fork, child runs first. If set to 0 (default) then
    110 * parent will (try to) run first.
    111 */
    112unsigned int sysctl_sched_child_runs_first __read_mostly;
    113
    114/*
    115 * SCHED_OTHER wake-up granularity.
    116 *
    117 * This option delays the preemption effects of decoupled workloads
    118 * and reduces their over-scheduling. Synchronous workloads will still
    119 * have immediate wakeup/sleep latencies.
    120 *
    121 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
    122 */
    123unsigned int sysctl_sched_wakeup_granularity			= 1000000UL;
    124static unsigned int normalized_sysctl_sched_wakeup_granularity	= 1000000UL;
    125
    126const_debug unsigned int sysctl_sched_migration_cost	= 500000UL;
    127
    128int sched_thermal_decay_shift;
    129static int __init setup_sched_thermal_decay_shift(char *str)
    130{
    131	int _shift = 0;
    132
    133	if (kstrtoint(str, 0, &_shift))
    134		pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n");
    135
    136	sched_thermal_decay_shift = clamp(_shift, 0, 10);
    137	return 1;
    138}
    139__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
    140
    141#ifdef CONFIG_SMP
    142/*
    143 * For asym packing, by default the lower numbered CPU has higher priority.
    144 */
    145int __weak arch_asym_cpu_priority(int cpu)
    146{
    147	return -cpu;
    148}
    149
    150/*
    151 * The margin used when comparing utilization with CPU capacity.
    152 *
    153 * (default: ~20%)
    154 */
    155#define fits_capacity(cap, max)	((cap) * 1280 < (max) * 1024)
    156
    157/*
    158 * The margin used when comparing CPU capacities.
     159 * i.e. is 'cap1' noticeably greater than 'cap2'?
    160 *
    161 * (default: ~5%)
    162 */
    163#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
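
        /*
         * Worked example (editor's note, not part of the original source):
         * the fixed-point ratios above encode the quoted margins:
         *
         *   fits_capacity(cap, max):   cap * 1280 < max * 1024
         *                          <=> cap < max * 1024/1280 = 0.8 * max,
         *   i.e. utilization must leave ~20% headroom before it "fits" a CPU.
         *
         *   capacity_greater(cap1, cap2):   cap1 * 1024 > cap2 * 1078
         *                               <=> cap1 > cap2 * ~1.053,
         *   i.e. cap1 must exceed cap2 by roughly 5% to count as greater.
         */
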
    164#endif
    165
    166#ifdef CONFIG_CFS_BANDWIDTH
    167/*
    168 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
    169 * each time a cfs_rq requests quota.
    170 *
    171 * Note: in the case that the slice exceeds the runtime remaining (either due
    172 * to consumption or the quota being specified to be smaller than the slice)
    173 * we will always only issue the remaining available time.
    174 *
    175 * (default: 5 msec, units: microseconds)
    176 */
    177static unsigned int sysctl_sched_cfs_bandwidth_slice		= 5000UL;
    178#endif
    179
    180#ifdef CONFIG_SYSCTL
    181static struct ctl_table sched_fair_sysctls[] = {
    182	{
    183		.procname       = "sched_child_runs_first",
    184		.data           = &sysctl_sched_child_runs_first,
    185		.maxlen         = sizeof(unsigned int),
    186		.mode           = 0644,
    187		.proc_handler   = proc_dointvec,
    188	},
    189#ifdef CONFIG_CFS_BANDWIDTH
    190	{
    191		.procname       = "sched_cfs_bandwidth_slice_us",
    192		.data           = &sysctl_sched_cfs_bandwidth_slice,
    193		.maxlen         = sizeof(unsigned int),
    194		.mode           = 0644,
    195		.proc_handler   = proc_dointvec_minmax,
    196		.extra1         = SYSCTL_ONE,
    197	},
    198#endif
    199	{}
    200};
    201
    202static int __init sched_fair_sysctl_init(void)
    203{
    204	register_sysctl_init("kernel", sched_fair_sysctls);
    205	return 0;
    206}
    207late_initcall(sched_fair_sysctl_init);
    208#endif
    209
    210static inline void update_load_add(struct load_weight *lw, unsigned long inc)
    211{
    212	lw->weight += inc;
    213	lw->inv_weight = 0;
    214}
    215
    216static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
    217{
    218	lw->weight -= dec;
    219	lw->inv_weight = 0;
    220}
    221
    222static inline void update_load_set(struct load_weight *lw, unsigned long w)
    223{
    224	lw->weight = w;
    225	lw->inv_weight = 0;
    226}
    227
    228/*
    229 * Increase the granularity value when there are more CPUs,
    230 * because with more CPUs the 'effective latency' as visible
    231 * to users decreases. But the relationship is not linear,
    232 * so pick a second-best guess by going with the log2 of the
    233 * number of CPUs.
    234 *
    235 * This idea comes from the SD scheduler of Con Kolivas:
    236 */
    237static unsigned int get_update_sysctl_factor(void)
    238{
    239	unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
    240	unsigned int factor;
    241
    242	switch (sysctl_sched_tunable_scaling) {
    243	case SCHED_TUNABLESCALING_NONE:
    244		factor = 1;
    245		break;
    246	case SCHED_TUNABLESCALING_LINEAR:
    247		factor = cpus;
    248		break;
    249	case SCHED_TUNABLESCALING_LOG:
    250	default:
    251		factor = 1 + ilog2(cpus);
    252		break;
    253	}
    254
    255	return factor;
    256}
    257
    258static void update_sysctl(void)
    259{
    260	unsigned int factor = get_update_sysctl_factor();
    261
    262#define SET_SYSCTL(name) \
    263	(sysctl_##name = (factor) * normalized_sysctl_##name)
    264	SET_SYSCTL(sched_min_granularity);
    265	SET_SYSCTL(sched_latency);
    266	SET_SYSCTL(sched_wakeup_granularity);
    267#undef SET_SYSCTL
    268}
    269
    270void __init sched_init_granularity(void)
    271{
    272	update_sysctl();
    273}
    274
    275#define WMULT_CONST	(~0U)
    276#define WMULT_SHIFT	32
    277
    278static void __update_inv_weight(struct load_weight *lw)
    279{
    280	unsigned long w;
    281
    282	if (likely(lw->inv_weight))
    283		return;
    284
    285	w = scale_load_down(lw->weight);
    286
    287	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
    288		lw->inv_weight = 1;
    289	else if (unlikely(!w))
    290		lw->inv_weight = WMULT_CONST;
    291	else
    292		lw->inv_weight = WMULT_CONST / w;
    293}
    294
    295/*
    296 * delta_exec * weight / lw.weight
    297 *   OR
    298 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
    299 *
    300 * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
    301 * we're guaranteed shift stays positive because inv_weight is guaranteed to
    302 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
    303 *
     304 * Or, weight <= lw.weight (because lw.weight is the runqueue weight), thus
    305 * weight/lw.weight <= 1, and therefore our shift will also be positive.
    306 */
    307static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
    308{
    309	u64 fact = scale_load_down(weight);
    310	u32 fact_hi = (u32)(fact >> 32);
    311	int shift = WMULT_SHIFT;
    312	int fs;
    313
    314	__update_inv_weight(lw);
    315
    316	if (unlikely(fact_hi)) {
    317		fs = fls(fact_hi);
    318		shift -= fs;
    319		fact >>= fs;
    320	}
    321
    322	fact = mul_u32_u32(fact, lw->inv_weight);
    323
    324	fact_hi = (u32)(fact >> 32);
    325	if (fact_hi) {
    326		fs = fls(fact_hi);
    327		shift -= fs;
    328		fact >>= fs;
    329	}
    330
    331	return mul_u64_u32_shr(delta_exec, fact, shift);
    332}
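
        /*
         * Worked example (editor's note, not part of the original source):
         * when called via calc_delta_fair() for a nice +5 entity,
         * weight == NICE_0_LOAD (1024 after scale_load_down()) and
         * scale_load_down(lw->weight) == 335, so
         * lw->inv_weight = (2^32 - 1) / 335 and the result is
         *
         *   delta_exec * 1024 * inv_weight >> 32  ~=  delta_exec * 1024 / 335
         *
         * e.g. delta_exec = 1,000,000 ns yields roughly 3,056,716 ns of weighted
         * time, using only multiplies and shifts instead of a 64-bit division.
         * The fls()-based pre-shifting above merely drops low bits so that fact
         * fits in 32 bits and the final mul_u64_u32_shr() cannot overflow.
         */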
    333
    334
    335const struct sched_class fair_sched_class;
    336
    337/**************************************************************
    338 * CFS operations on generic schedulable entities:
    339 */
    340
    341#ifdef CONFIG_FAIR_GROUP_SCHED
    342
    343/* Walk up scheduling entities hierarchy */
    344#define for_each_sched_entity(se) \
    345		for (; se; se = se->parent)
    346
    347static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
    348{
    349	struct rq *rq = rq_of(cfs_rq);
    350	int cpu = cpu_of(rq);
    351
    352	if (cfs_rq->on_list)
    353		return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
    354
    355	cfs_rq->on_list = 1;
    356
    357	/*
    358	 * Ensure we either appear before our parent (if already
    359	 * enqueued) or force our parent to appear after us when it is
    360	 * enqueued. The fact that we always enqueue bottom-up
    361	 * reduces this to two cases and a special case for the root
     362	 * cfs_rq. It also means that we will always reset
     363	 * tmp_alone_branch either when the branch is connected
     364	 * to a tree or when we reach the top of the tree.
    365	 */
    366	if (cfs_rq->tg->parent &&
    367	    cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
    368		/*
    369		 * If parent is already on the list, we add the child
    370		 * just before. Thanks to circular linked property of
    371		 * the list, this means to put the child at the tail
    372		 * of the list that starts by parent.
    373		 */
    374		list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
    375			&(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
    376		/*
    377		 * The branch is now connected to its tree so we can
    378		 * reset tmp_alone_branch to the beginning of the
    379		 * list.
    380		 */
    381		rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
    382		return true;
    383	}
    384
    385	if (!cfs_rq->tg->parent) {
    386		/*
    387		 * cfs rq without parent should be put
    388		 * at the tail of the list.
    389		 */
    390		list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
    391			&rq->leaf_cfs_rq_list);
    392		/*
     393		 * We have reached the top of the tree so we can reset
    394		 * tmp_alone_branch to the beginning of the list.
    395		 */
    396		rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
    397		return true;
    398	}
    399
    400	/*
    401	 * The parent has not already been added so we want to
    402	 * make sure that it will be put after us.
     403	 * tmp_alone_branch points to the beginning of the branch
     404	 * where we will add the parent.
    405	 */
    406	list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
    407	/*
     408	 * Update tmp_alone_branch to point to the new beginning
     409	 * of the branch.
    410	 */
    411	rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
    412	return false;
    413}
    414
    415static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
    416{
    417	if (cfs_rq->on_list) {
    418		struct rq *rq = rq_of(cfs_rq);
    419
    420		/*
    421		 * With cfs_rq being unthrottled/throttled during an enqueue,
     422		 * it can happen that tmp_alone_branch points to a leaf that
     423		 * we finally want to delete. In this case, tmp_alone_branch moves
    424		 * to the prev element but it will point to rq->leaf_cfs_rq_list
    425		 * at the end of the enqueue.
    426		 */
    427		if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
    428			rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
    429
    430		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
    431		cfs_rq->on_list = 0;
    432	}
    433}
    434
    435static inline void assert_list_leaf_cfs_rq(struct rq *rq)
    436{
    437	SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
    438}
    439
     440/* Iterate through all leaf cfs_rq's on a runqueue */
    441#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)			\
    442	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,	\
    443				 leaf_cfs_rq_list)
    444
    445/* Do the two (enqueued) entities belong to the same group ? */
    446static inline struct cfs_rq *
    447is_same_group(struct sched_entity *se, struct sched_entity *pse)
    448{
    449	if (se->cfs_rq == pse->cfs_rq)
    450		return se->cfs_rq;
    451
    452	return NULL;
    453}
    454
    455static inline struct sched_entity *parent_entity(struct sched_entity *se)
    456{
    457	return se->parent;
    458}
    459
    460static void
    461find_matching_se(struct sched_entity **se, struct sched_entity **pse)
    462{
    463	int se_depth, pse_depth;
    464
    465	/*
     466	 * The preemption test can only be made between sibling entities that
     467	 * are in the same cfs_rq, i.e. that have a common parent. Walk up the
     468	 * hierarchy of both tasks until we find ancestors that are siblings of
     469	 * a common parent.
    470	 */
    471
    472	/* First walk up until both entities are at same depth */
    473	se_depth = (*se)->depth;
    474	pse_depth = (*pse)->depth;
    475
    476	while (se_depth > pse_depth) {
    477		se_depth--;
    478		*se = parent_entity(*se);
    479	}
    480
    481	while (pse_depth > se_depth) {
    482		pse_depth--;
    483		*pse = parent_entity(*pse);
    484	}
    485
    486	while (!is_same_group(*se, *pse)) {
    487		*se = parent_entity(*se);
    488		*pse = parent_entity(*pse);
    489	}
    490}
    491
    492static int tg_is_idle(struct task_group *tg)
    493{
    494	return tg->idle > 0;
    495}
    496
    497static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
    498{
    499	return cfs_rq->idle > 0;
    500}
    501
    502static int se_is_idle(struct sched_entity *se)
    503{
    504	if (entity_is_task(se))
    505		return task_has_idle_policy(task_of(se));
    506	return cfs_rq_is_idle(group_cfs_rq(se));
    507}
    508
    509#else	/* !CONFIG_FAIR_GROUP_SCHED */
    510
    511#define for_each_sched_entity(se) \
    512		for (; se; se = NULL)
    513
    514static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
    515{
    516	return true;
    517}
    518
    519static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
    520{
    521}
    522
    523static inline void assert_list_leaf_cfs_rq(struct rq *rq)
    524{
    525}
    526
    527#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)	\
    528		for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
    529
    530static inline struct sched_entity *parent_entity(struct sched_entity *se)
    531{
    532	return NULL;
    533}
    534
    535static inline void
    536find_matching_se(struct sched_entity **se, struct sched_entity **pse)
    537{
    538}
    539
    540static inline int tg_is_idle(struct task_group *tg)
    541{
    542	return 0;
    543}
    544
    545static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
    546{
    547	return 0;
    548}
    549
    550static int se_is_idle(struct sched_entity *se)
    551{
    552	return 0;
    553}
    554
    555#endif	/* CONFIG_FAIR_GROUP_SCHED */
    556
    557static __always_inline
    558void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
    559
    560/**************************************************************
    561 * Scheduling class tree data structure manipulation methods:
    562 */
    563
    564static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
    565{
    566	s64 delta = (s64)(vruntime - max_vruntime);
    567	if (delta > 0)
    568		max_vruntime = vruntime;
    569
    570	return max_vruntime;
    571}
    572
    573static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
    574{
    575	s64 delta = (s64)(vruntime - min_vruntime);
    576	if (delta < 0)
    577		min_vruntime = vruntime;
    578
    579	return min_vruntime;
    580}
    581
    582static inline bool entity_before(struct sched_entity *a,
    583				struct sched_entity *b)
    584{
    585	return (s64)(a->vruntime - b->vruntime) < 0;
    586}
    587
    588#define __node_2_se(node) \
    589	rb_entry((node), struct sched_entity, run_node)
    590
    591static void update_min_vruntime(struct cfs_rq *cfs_rq)
    592{
    593	struct sched_entity *curr = cfs_rq->curr;
    594	struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
    595
    596	u64 vruntime = cfs_rq->min_vruntime;
    597
    598	if (curr) {
    599		if (curr->on_rq)
    600			vruntime = curr->vruntime;
    601		else
    602			curr = NULL;
    603	}
    604
    605	if (leftmost) { /* non-empty tree */
    606		struct sched_entity *se = __node_2_se(leftmost);
    607
    608		if (!curr)
    609			vruntime = se->vruntime;
    610		else
    611			vruntime = min_vruntime(vruntime, se->vruntime);
    612	}
    613
    614	/* ensure we never gain time by being placed backwards. */
    615	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
    616#ifndef CONFIG_64BIT
    617	smp_wmb();
    618	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
    619#endif
    620}
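
        /*
         * Illustrative example (editor's note, not part of the original
         * source): with cfs_rq->min_vruntime = 100, a running curr at
         * vruntime 105 and a leftmost queued entity at vruntime 103, the
         * candidate becomes min(105, 103) = 103 and the result
         * max(100, 103) = 103.  If both candidates were behind (say 95),
         * min_vruntime would stay at 100: it only ever moves forward, so the
         * queue's reference time never runs backwards.
         */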
    621
    622static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
    623{
    624	return entity_before(__node_2_se(a), __node_2_se(b));
    625}
    626
    627/*
    628 * Enqueue an entity into the rb-tree:
    629 */
    630static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
    631{
    632	rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
    633}
    634
    635static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
    636{
    637	rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
    638}
    639
    640struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
    641{
    642	struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
    643
    644	if (!left)
    645		return NULL;
    646
    647	return __node_2_se(left);
    648}
    649
    650static struct sched_entity *__pick_next_entity(struct sched_entity *se)
    651{
    652	struct rb_node *next = rb_next(&se->run_node);
    653
    654	if (!next)
    655		return NULL;
    656
    657	return __node_2_se(next);
    658}
    659
    660#ifdef CONFIG_SCHED_DEBUG
    661struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
    662{
    663	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
    664
    665	if (!last)
    666		return NULL;
    667
    668	return __node_2_se(last);
    669}
    670
    671/**************************************************************
    672 * Scheduling class statistics methods:
    673 */
    674
    675int sched_update_scaling(void)
    676{
    677	unsigned int factor = get_update_sysctl_factor();
    678
    679	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
    680					sysctl_sched_min_granularity);
    681
    682#define WRT_SYSCTL(name) \
    683	(normalized_sysctl_##name = sysctl_##name / (factor))
    684	WRT_SYSCTL(sched_min_granularity);
    685	WRT_SYSCTL(sched_latency);
    686	WRT_SYSCTL(sched_wakeup_granularity);
    687#undef WRT_SYSCTL
    688
    689	return 0;
    690}
    691#endif
    692
    693/*
    694 * delta /= w
    695 */
    696static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
    697{
    698	if (unlikely(se->load.weight != NICE_0_LOAD))
    699		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
    700
    701	return delta;
    702}
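
        /*
         * Worked example (editor's note, not part of the original source):
         * a nice 0 task (weight 1024) accrues vruntime 1:1 with wall time.
         * A nice +5 task (weight 335) accrues it about 3.06x faster
         * (1024/335), while a nice -5 task (weight 3121) accrues it only
         * about 0.33x as fast (1024/3121), which is how heavier tasks end up
         * receiving proportionally more CPU time from the same
         * vruntime-ordered timeline.
         */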
    703
    704/*
    705 * The idea is to set a period in which each task runs once.
    706 *
    707 * When there are too many tasks (sched_nr_latency) we have to stretch
    708 * this period because otherwise the slices get too small.
    709 *
    710 * p = (nr <= nl) ? l : l*nr/nl
    711 */
    712static u64 __sched_period(unsigned long nr_running)
    713{
    714	if (unlikely(nr_running > sched_nr_latency))
    715		return nr_running * sysctl_sched_min_granularity;
    716	else
    717		return sysctl_sched_latency;
    718}
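
        /*
         * Worked example (editor's note, not part of the original source):
         * with the defaults above (latency 6 ms, minimum granularity 0.75 ms,
         * hence sched_nr_latency = 8):
         *
         *   nr_running = 5  -> period = 6 ms  (everyone fits within the latency target)
         *   nr_running = 12 -> period = 12 * 0.75 ms = 9 ms
         *
         * i.e. the period stretches once more than 8 tasks are runnable, so no
         * slice ever shrinks below the minimum granularity.
         */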
    719
    720static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq);
    721
    722/*
    723 * We calculate the wall-time slice from the period by taking a part
    724 * proportional to the weight.
    725 *
    726 * s = p*P[w/rw]
    727 */
    728static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
    729{
    730	unsigned int nr_running = cfs_rq->nr_running;
    731	struct sched_entity *init_se = se;
    732	unsigned int min_gran;
    733	u64 slice;
    734
    735	if (sched_feat(ALT_PERIOD))
    736		nr_running = rq_of(cfs_rq)->cfs.h_nr_running;
    737
    738	slice = __sched_period(nr_running + !se->on_rq);
    739
    740	for_each_sched_entity(se) {
    741		struct load_weight *load;
    742		struct load_weight lw;
    743		struct cfs_rq *qcfs_rq;
    744
    745		qcfs_rq = cfs_rq_of(se);
    746		load = &qcfs_rq->load;
    747
    748		if (unlikely(!se->on_rq)) {
    749			lw = qcfs_rq->load;
    750
    751			update_load_add(&lw, se->load.weight);
    752			load = &lw;
    753		}
    754		slice = __calc_delta(slice, se->load.weight, load);
    755	}
    756
    757	if (sched_feat(BASE_SLICE)) {
    758		if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq))
    759			min_gran = sysctl_sched_idle_min_granularity;
    760		else
    761			min_gran = sysctl_sched_min_granularity;
    762
    763		slice = max_t(u64, slice, min_gran);
    764	}
    765
    766	return slice;
    767}
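
        /*
         * Worked example (editor's note, not part of the original source):
         * with a 6 ms period and two nice 0 tasks (weight 1024 each), each
         * receives a 3 ms slice.  With one nice 0 task and one nice +5 task
         * (weight 335), the slices become
         *
         *   6 ms * 1024 / 1359 ~= 4.52 ms   and   6 ms * 335 / 1359 ~= 1.48 ms
         *
         * With BASE_SLICE enabled, any slice that fell below
         * sysctl_sched_min_granularity would be raised back up to that floor.
         */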
    768
    769/*
    770 * We calculate the vruntime slice of a to-be-inserted task.
    771 *
    772 * vs = s/w
    773 */
    774static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
    775{
    776	return calc_delta_fair(sched_slice(cfs_rq, se), se);
    777}
    778
    779#include "pelt.h"
    780#ifdef CONFIG_SMP
    781
    782static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
    783static unsigned long task_h_load(struct task_struct *p);
    784static unsigned long capacity_of(int cpu);
    785
     786/* Give a new sched_entity initial runnable values so it is seen as a heavy load while young */
    787void init_entity_runnable_average(struct sched_entity *se)
    788{
    789	struct sched_avg *sa = &se->avg;
    790
    791	memset(sa, 0, sizeof(*sa));
    792
    793	/*
    794	 * Tasks are initialized with full load to be seen as heavy tasks until
    795	 * they get a chance to stabilize to their real load level.
    796	 * Group entities are initialized with zero load to reflect the fact that
    797	 * nothing has been attached to the task group yet.
    798	 */
    799	if (entity_is_task(se))
    800		sa->load_avg = scale_load_down(se->load.weight);
    801
    802	/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
    803}
    804
    805static void attach_entity_cfs_rq(struct sched_entity *se);
    806
    807/*
    808 * With new tasks being created, their initial util_avgs are extrapolated
    809 * based on the cfs_rq's current util_avg:
    810 *
    811 *   util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
    812 *
    813 * However, in many cases, the above util_avg does not give a desired
    814 * value. Moreover, the sum of the util_avgs may be divergent, such
    815 * as when the series is a harmonic series.
    816 *
    817 * To solve this problem, we also cap the util_avg of successive tasks to
    818 * only 1/2 of the left utilization budget:
    819 *
    820 *   util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
    821 *
    822 * where n denotes the nth task and cpu_scale the CPU capacity.
    823 *
    824 * For example, for a CPU with 1024 of capacity, a simplest series from
    825 * the beginning would be like:
    826 *
    827 *  task  util_avg: 512, 256, 128,  64,  32,   16,    8, ...
    828 * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
    829 *
    830 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
    831 * if util_avg > util_avg_cap.
    832 */
    833void post_init_entity_util_avg(struct task_struct *p)
    834{
    835	struct sched_entity *se = &p->se;
    836	struct cfs_rq *cfs_rq = cfs_rq_of(se);
    837	struct sched_avg *sa = &se->avg;
    838	long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
    839	long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
    840
    841	if (cap > 0) {
    842		if (cfs_rq->avg.util_avg != 0) {
    843			sa->util_avg  = cfs_rq->avg.util_avg * se->load.weight;
    844			sa->util_avg /= (cfs_rq->avg.load_avg + 1);
    845
    846			if (sa->util_avg > cap)
    847				sa->util_avg = cap;
    848		} else {
    849			sa->util_avg = cap;
    850		}
    851	}
    852
    853	sa->runnable_avg = sa->util_avg;
    854
    855	if (p->sched_class != &fair_sched_class) {
    856		/*
    857		 * For !fair tasks do:
    858		 *
    859		update_cfs_rq_load_avg(now, cfs_rq);
    860		attach_entity_load_avg(cfs_rq, se);
    861		switched_from_fair(rq, p);
    862		 *
    863		 * such that the next switched_to_fair() has the
    864		 * expected state.
    865		 */
    866		se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
    867		return;
    868	}
    869
    870	attach_entity_cfs_rq(se);
    871}
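
        /*
         * Illustrative walk-through (editor's note, not part of the original
         * source): on an idle CPU of capacity 1024 (cfs_rq->avg.util_avg == 0),
         * the first new task simply receives util_avg = cap = (1024 - 0) / 2
         * = 512.  The next new task's extrapolated value is clamped to
         * (1024 - 512) / 2 = 256, the one after that to 128, and so on (the
         * halving series shown in the comment above), so forking many tasks
         * cannot inflate the runqueue's apparent utilization beyond the CPU's
         * capacity before the tasks actually run.
         */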
    872
    873#else /* !CONFIG_SMP */
    874void init_entity_runnable_average(struct sched_entity *se)
    875{
    876}
    877void post_init_entity_util_avg(struct task_struct *p)
    878{
    879}
    880static void update_tg_load_avg(struct cfs_rq *cfs_rq)
    881{
    882}
    883#endif /* CONFIG_SMP */
    884
    885/*
    886 * Update the current task's runtime statistics.
    887 */
    888static void update_curr(struct cfs_rq *cfs_rq)
    889{
    890	struct sched_entity *curr = cfs_rq->curr;
    891	u64 now = rq_clock_task(rq_of(cfs_rq));
    892	u64 delta_exec;
    893
    894	if (unlikely(!curr))
    895		return;
    896
    897	delta_exec = now - curr->exec_start;
    898	if (unlikely((s64)delta_exec <= 0))
    899		return;
    900
    901	curr->exec_start = now;
    902
    903	if (schedstat_enabled()) {
    904		struct sched_statistics *stats;
    905
    906		stats = __schedstats_from_se(curr);
    907		__schedstat_set(stats->exec_max,
    908				max(delta_exec, stats->exec_max));
    909	}
    910
    911	curr->sum_exec_runtime += delta_exec;
    912	schedstat_add(cfs_rq->exec_clock, delta_exec);
    913
    914	curr->vruntime += calc_delta_fair(delta_exec, curr);
    915	update_min_vruntime(cfs_rq);
    916
    917	if (entity_is_task(curr)) {
    918		struct task_struct *curtask = task_of(curr);
    919
    920		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
    921		cgroup_account_cputime(curtask, delta_exec);
    922		account_group_exec_runtime(curtask, delta_exec);
    923	}
    924
    925	account_cfs_rq_runtime(cfs_rq, delta_exec);
    926}
    927
    928static void update_curr_fair(struct rq *rq)
    929{
    930	update_curr(cfs_rq_of(&rq->curr->se));
    931}
    932
    933static inline void
    934update_stats_wait_start_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
    935{
    936	struct sched_statistics *stats;
    937	struct task_struct *p = NULL;
    938
    939	if (!schedstat_enabled())
    940		return;
    941
    942	stats = __schedstats_from_se(se);
    943
    944	if (entity_is_task(se))
    945		p = task_of(se);
    946
    947	__update_stats_wait_start(rq_of(cfs_rq), p, stats);
    948}
    949
    950static inline void
    951update_stats_wait_end_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
    952{
    953	struct sched_statistics *stats;
    954	struct task_struct *p = NULL;
    955
    956	if (!schedstat_enabled())
    957		return;
    958
    959	stats = __schedstats_from_se(se);
    960
    961	/*
     962	 * When sched_schedstat changes from 0 to 1, some sched entities
     963	 * may already be on the runqueue with se->statistics.wait_start
     964	 * still 0, which would make the delta wrong. We need to avoid this
     965	 * scenario.
    966	 */
    967	if (unlikely(!schedstat_val(stats->wait_start)))
    968		return;
    969
    970	if (entity_is_task(se))
    971		p = task_of(se);
    972
    973	__update_stats_wait_end(rq_of(cfs_rq), p, stats);
    974}
    975
    976static inline void
    977update_stats_enqueue_sleeper_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
    978{
    979	struct sched_statistics *stats;
    980	struct task_struct *tsk = NULL;
    981
    982	if (!schedstat_enabled())
    983		return;
    984
    985	stats = __schedstats_from_se(se);
    986
    987	if (entity_is_task(se))
    988		tsk = task_of(se);
    989
    990	__update_stats_enqueue_sleeper(rq_of(cfs_rq), tsk, stats);
    991}
    992
    993/*
    994 * Task is being enqueued - update stats:
    995 */
    996static inline void
    997update_stats_enqueue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
    998{
    999	if (!schedstat_enabled())
   1000		return;
   1001
   1002	/*
   1003	 * Are we enqueueing a waiting task? (for current tasks
   1004	 * a dequeue/enqueue event is a NOP)
   1005	 */
   1006	if (se != cfs_rq->curr)
   1007		update_stats_wait_start_fair(cfs_rq, se);
   1008
   1009	if (flags & ENQUEUE_WAKEUP)
   1010		update_stats_enqueue_sleeper_fair(cfs_rq, se);
   1011}
   1012
   1013static inline void
   1014update_stats_dequeue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
   1015{
   1016
   1017	if (!schedstat_enabled())
   1018		return;
   1019
   1020	/*
   1021	 * Mark the end of the wait period if dequeueing a
   1022	 * waiting task:
   1023	 */
   1024	if (se != cfs_rq->curr)
   1025		update_stats_wait_end_fair(cfs_rq, se);
   1026
   1027	if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
   1028		struct task_struct *tsk = task_of(se);
   1029		unsigned int state;
   1030
   1031		/* XXX racy against TTWU */
   1032		state = READ_ONCE(tsk->__state);
   1033		if (state & TASK_INTERRUPTIBLE)
   1034			__schedstat_set(tsk->stats.sleep_start,
   1035				      rq_clock(rq_of(cfs_rq)));
   1036		if (state & TASK_UNINTERRUPTIBLE)
   1037			__schedstat_set(tsk->stats.block_start,
   1038				      rq_clock(rq_of(cfs_rq)));
   1039	}
   1040}
   1041
   1042/*
   1043 * We are picking a new current task - update its stats:
   1044 */
   1045static inline void
   1046update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
   1047{
   1048	/*
   1049	 * We are starting a new run period:
   1050	 */
   1051	se->exec_start = rq_clock_task(rq_of(cfs_rq));
   1052}
   1053
   1054/**************************************************
   1055 * Scheduling class queueing methods:
   1056 */
   1057
   1058#ifdef CONFIG_NUMA_BALANCING
   1059/*
    1060 * Approximate time to scan a task's full virtual memory, in ms. The task
    1061 * scan period is calculated based on the task's virtual memory size and
   1062 * numa_balancing_scan_size.
   1063 */
   1064unsigned int sysctl_numa_balancing_scan_period_min = 1000;
   1065unsigned int sysctl_numa_balancing_scan_period_max = 60000;
   1066
   1067/* Portion of address space to scan in MB */
   1068unsigned int sysctl_numa_balancing_scan_size = 256;
   1069
   1070/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
   1071unsigned int sysctl_numa_balancing_scan_delay = 1000;
   1072
   1073struct numa_group {
   1074	refcount_t refcount;
   1075
   1076	spinlock_t lock; /* nr_tasks, tasks */
   1077	int nr_tasks;
   1078	pid_t gid;
   1079	int active_nodes;
   1080
   1081	struct rcu_head rcu;
   1082	unsigned long total_faults;
   1083	unsigned long max_faults_cpu;
   1084	/*
   1085	 * faults[] array is split into two regions: faults_mem and faults_cpu.
   1086	 *
   1087	 * Faults_cpu is used to decide whether memory should move
   1088	 * towards the CPU. As a consequence, these stats are weighted
   1089	 * more by CPU use than by memory faults.
   1090	 */
   1091	unsigned long faults[];
   1092};
   1093
   1094/*
   1095 * For functions that can be called in multiple contexts that permit reading
   1096 * ->numa_group (see struct task_struct for locking rules).
   1097 */
   1098static struct numa_group *deref_task_numa_group(struct task_struct *p)
   1099{
   1100	return rcu_dereference_check(p->numa_group, p == current ||
   1101		(lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu)));
   1102}
   1103
   1104static struct numa_group *deref_curr_numa_group(struct task_struct *p)
   1105{
   1106	return rcu_dereference_protected(p->numa_group, p == current);
   1107}
   1108
   1109static inline unsigned long group_faults_priv(struct numa_group *ng);
   1110static inline unsigned long group_faults_shared(struct numa_group *ng);
   1111
   1112static unsigned int task_nr_scan_windows(struct task_struct *p)
   1113{
   1114	unsigned long rss = 0;
   1115	unsigned long nr_scan_pages;
   1116
   1117	/*
    1118	 * Calculations are based on RSS, as non-present and empty pages are
    1119	 * skipped by the PTE scanner and NUMA hinting faults should be trapped
    1120	 * based on resident pages.
   1121	 */
   1122	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
   1123	rss = get_mm_rss(p->mm);
   1124	if (!rss)
   1125		rss = nr_scan_pages;
   1126
   1127	rss = round_up(rss, nr_scan_pages);
   1128	return rss / nr_scan_pages;
   1129}
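
        /*
         * Worked example (editor's note, not part of the original source),
         * assuming 4 KiB pages: the default scan size of 256 MB corresponds to
         * nr_scan_pages = 256 << (20 - 12) = 65536 pages.  A task with an RSS
         * of 1 GB (262144 pages) therefore needs 262144 / 65536 = 4 scan
         * windows to cover its resident memory, while a task with little or no
         * RSS is rounded up to a single window.
         */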
   1130
   1131/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
   1132#define MAX_SCAN_WINDOW 2560
   1133
   1134static unsigned int task_scan_min(struct task_struct *p)
   1135{
   1136	unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
   1137	unsigned int scan, floor;
   1138	unsigned int windows = 1;
   1139
   1140	if (scan_size < MAX_SCAN_WINDOW)
   1141		windows = MAX_SCAN_WINDOW / scan_size;
   1142	floor = 1000 / windows;
   1143
   1144	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
   1145	return max_t(unsigned int, floor, scan);
   1146}
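
        /*
         * Worked example (editor's note, not part of the original source):
         * with the default scan size of 256 MB, windows = 2560 / 256 = 10 and
         * the floor is 1000 / 10 = 100 ms, i.e. never scan more than roughly
         * 2.5 GB worth of PTEs per second.  For the 4-window task from the
         * example above, scan = 1000 ms / 4 = 250 ms, so task_scan_min()
         * returns max(100, 250) = 250 ms.
         */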
   1147
   1148static unsigned int task_scan_start(struct task_struct *p)
   1149{
   1150	unsigned long smin = task_scan_min(p);
   1151	unsigned long period = smin;
   1152	struct numa_group *ng;
   1153
   1154	/* Scale the maximum scan period with the amount of shared memory. */
   1155	rcu_read_lock();
   1156	ng = rcu_dereference(p->numa_group);
   1157	if (ng) {
   1158		unsigned long shared = group_faults_shared(ng);
   1159		unsigned long private = group_faults_priv(ng);
   1160
   1161		period *= refcount_read(&ng->refcount);
   1162		period *= shared + 1;
   1163		period /= private + shared + 1;
   1164	}
   1165	rcu_read_unlock();
   1166
   1167	return max(smin, period);
   1168}
   1169
   1170static unsigned int task_scan_max(struct task_struct *p)
   1171{
   1172	unsigned long smin = task_scan_min(p);
   1173	unsigned long smax;
   1174	struct numa_group *ng;
   1175
   1176	/* Watch for min being lower than max due to floor calculations */
   1177	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
   1178
   1179	/* Scale the maximum scan period with the amount of shared memory. */
   1180	ng = deref_curr_numa_group(p);
   1181	if (ng) {
   1182		unsigned long shared = group_faults_shared(ng);
   1183		unsigned long private = group_faults_priv(ng);
   1184		unsigned long period = smax;
   1185
   1186		period *= refcount_read(&ng->refcount);
   1187		period *= shared + 1;
   1188		period /= private + shared + 1;
   1189
   1190		smax = max(smax, period);
   1191	}
   1192
   1193	return max(smin, smax);
   1194}
   1195
   1196static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
   1197{
   1198	rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
   1199	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
   1200}
   1201
   1202static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
   1203{
   1204	rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
   1205	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
   1206}
   1207
   1208/* Shared or private faults. */
   1209#define NR_NUMA_HINT_FAULT_TYPES 2
   1210
   1211/* Memory and CPU locality */
   1212#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
   1213
   1214/* Averaged statistics, and temporary buffers. */
   1215#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
   1216
   1217pid_t task_numa_group_id(struct task_struct *p)
   1218{
   1219	struct numa_group *ng;
   1220	pid_t gid = 0;
   1221
   1222	rcu_read_lock();
   1223	ng = rcu_dereference(p->numa_group);
   1224	if (ng)
   1225		gid = ng->gid;
   1226	rcu_read_unlock();
   1227
   1228	return gid;
   1229}
   1230
   1231/*
   1232 * The averaged statistics, shared & private, memory & CPU,
   1233 * occupy the first half of the array. The second half of the
   1234 * array is for current counters, which are averaged into the
   1235 * first set by task_numa_placement.
   1236 */
   1237static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
   1238{
   1239	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
   1240}
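
        /*
         * Layout example (editor's note, not part of the original source):
         * on a 2-node system (nr_node_ids == 2) the faults[] array is laid
         * out as
         *
         *   [0] mem, node 0, shared    [1] mem, node 0, private
         *   [2] mem, node 1, shared    [3] mem, node 1, private
         *   [4] cpu, node 0, shared    [5] cpu, node 0, private
         *   [6] cpu, node 1, shared    [7] cpu, node 1, private
         *
         * with a second half of the same shape holding the temporary buffers
         * that task_numa_placement() folds into these averaged counters.
         */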
   1241
   1242static inline unsigned long task_faults(struct task_struct *p, int nid)
   1243{
   1244	if (!p->numa_faults)
   1245		return 0;
   1246
   1247	return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
   1248		p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
   1249}
   1250
   1251static inline unsigned long group_faults(struct task_struct *p, int nid)
   1252{
   1253	struct numa_group *ng = deref_task_numa_group(p);
   1254
   1255	if (!ng)
   1256		return 0;
   1257
   1258	return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
   1259		ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
   1260}
   1261
   1262static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
   1263{
   1264	return group->faults[task_faults_idx(NUMA_CPU, nid, 0)] +
   1265		group->faults[task_faults_idx(NUMA_CPU, nid, 1)];
   1266}
   1267
   1268static inline unsigned long group_faults_priv(struct numa_group *ng)
   1269{
   1270	unsigned long faults = 0;
   1271	int node;
   1272
   1273	for_each_online_node(node) {
   1274		faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
   1275	}
   1276
   1277	return faults;
   1278}
   1279
   1280static inline unsigned long group_faults_shared(struct numa_group *ng)
   1281{
   1282	unsigned long faults = 0;
   1283	int node;
   1284
   1285	for_each_online_node(node) {
   1286		faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
   1287	}
   1288
   1289	return faults;
   1290}
   1291
   1292/*
   1293 * A node triggering more than 1/3 as many NUMA faults as the maximum is
   1294 * considered part of a numa group's pseudo-interleaving set. Migrations
   1295 * between these nodes are slowed down, to allow things to settle down.
   1296 */
   1297#define ACTIVE_NODE_FRACTION 3
   1298
   1299static bool numa_is_active_node(int nid, struct numa_group *ng)
   1300{
   1301	return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
   1302}
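
        /*
         * Worked example (editor's note, not part of the original source):
         * if the busiest node of a group records max_faults_cpu = 900 CPU
         * faults, a node with 350 CPU faults is active (350 * 3 = 1050 > 900)
         * and joins the pseudo-interleaving set, while a node with 250 CPU
         * faults is not (250 * 3 = 750 <= 900).
         */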
   1303
   1304/* Handle placement on systems where not all nodes are directly connected. */
   1305static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
   1306					int lim_dist, bool task)
   1307{
   1308	unsigned long score = 0;
   1309	int node, max_dist;
   1310
   1311	/*
   1312	 * All nodes are directly connected, and the same distance
   1313	 * from each other. No need for fancy placement algorithms.
   1314	 */
   1315	if (sched_numa_topology_type == NUMA_DIRECT)
   1316		return 0;
   1317
   1318	/* sched_max_numa_distance may be changed in parallel. */
   1319	max_dist = READ_ONCE(sched_max_numa_distance);
   1320	/*
   1321	 * This code is called for each node, introducing N^2 complexity,
   1322	 * which should be ok given the number of nodes rarely exceeds 8.
   1323	 */
   1324	for_each_online_node(node) {
   1325		unsigned long faults;
   1326		int dist = node_distance(nid, node);
   1327
   1328		/*
   1329		 * The furthest away nodes in the system are not interesting
   1330		 * for placement; nid was already counted.
   1331		 */
   1332		if (dist >= max_dist || node == nid)
   1333			continue;
   1334
   1335		/*
   1336		 * On systems with a backplane NUMA topology, compare groups
   1337		 * of nodes, and move tasks towards the group with the most
   1338		 * memory accesses. When comparing two nodes at distance
   1339		 * "hoplimit", only nodes closer by than "hoplimit" are part
   1340		 * of each group. Skip other nodes.
   1341		 */
   1342		if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist)
   1343			continue;
   1344
   1345		/* Add up the faults from nearby nodes. */
   1346		if (task)
   1347			faults = task_faults(p, node);
   1348		else
   1349			faults = group_faults(p, node);
   1350
   1351		/*
   1352		 * On systems with a glueless mesh NUMA topology, there are
   1353		 * no fixed "groups of nodes". Instead, nodes that are not
   1354		 * directly connected bounce traffic through intermediate
   1355		 * nodes; a numa_group can occupy any set of nodes.
   1356		 * The further away a node is, the less the faults count.
   1357		 * This seems to result in good task placement.
   1358		 */
   1359		if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
   1360			faults *= (max_dist - dist);
   1361			faults /= (max_dist - LOCAL_DISTANCE);
   1362		}
   1363
   1364		score += faults;
   1365	}
   1366
   1367	return score;
   1368}
   1369
   1370/*
   1371 * These return the fraction of accesses done by a particular task, or
   1372 * task group, on a particular numa node.  The group weight is given a
   1373 * larger multiplier, in order to group tasks together that are almost
   1374 * evenly spread out between numa nodes.
   1375 */
   1376static inline unsigned long task_weight(struct task_struct *p, int nid,
   1377					int dist)
   1378{
   1379	unsigned long faults, total_faults;
   1380
   1381	if (!p->numa_faults)
   1382		return 0;
   1383
   1384	total_faults = p->total_numa_faults;
   1385
   1386	if (!total_faults)
   1387		return 0;
   1388
   1389	faults = task_faults(p, nid);
   1390	faults += score_nearby_nodes(p, nid, dist, true);
   1391
   1392	return 1000 * faults / total_faults;
   1393}
   1394
   1395static inline unsigned long group_weight(struct task_struct *p, int nid,
   1396					 int dist)
   1397{
   1398	struct numa_group *ng = deref_task_numa_group(p);
   1399	unsigned long faults, total_faults;
   1400
   1401	if (!ng)
   1402		return 0;
   1403
   1404	total_faults = ng->total_faults;
   1405
   1406	if (!total_faults)
   1407		return 0;
   1408
   1409	faults = group_faults(p, nid);
   1410	faults += score_nearby_nodes(p, nid, dist, false);
   1411
   1412	return 1000 * faults / total_faults;
   1413}
   1414
   1415bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
   1416				int src_nid, int dst_cpu)
   1417{
   1418	struct numa_group *ng = deref_curr_numa_group(p);
   1419	int dst_nid = cpu_to_node(dst_cpu);
   1420	int last_cpupid, this_cpupid;
   1421
   1422	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
   1423	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
   1424
   1425	/*
   1426	 * Allow first faults or private faults to migrate immediately early in
   1427	 * the lifetime of a task. The magic number 4 is based on waiting for
   1428	 * two full passes of the "multi-stage node selection" test that is
   1429	 * executed below.
   1430	 */
   1431	if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
   1432	    (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
   1433		return true;
   1434
   1435	/*
   1436	 * Multi-stage node selection is used in conjunction with a periodic
   1437	 * migration fault to build a temporal task<->page relation. By using
   1438	 * a two-stage filter we remove short/unlikely relations.
   1439	 *
   1440	 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
   1441	 * a task's usage of a particular page (n_p) per total usage of this
   1442	 * page (n_t) (in a given time-span) to a probability.
   1443	 *
   1444	 * Our periodic faults will sample this probability and getting the
   1445	 * same result twice in a row, given these samples are fully
   1446	 * independent, is then given by P(n)^2, provided our sample period
   1447	 * is sufficiently short compared to the usage pattern.
   1448	 *
    1449	 * This quadratic squishes small probabilities, making it less likely we
   1450	 * act on an unlikely task<->page relation.
   1451	 */
   1452	if (!cpupid_pid_unset(last_cpupid) &&
   1453				cpupid_to_nid(last_cpupid) != dst_nid)
   1454		return false;
   1455
   1456	/* Always allow migrate on private faults */
   1457	if (cpupid_match_pid(p, last_cpupid))
   1458		return true;
   1459
   1460	/* A shared fault, but p->numa_group has not been set up yet. */
   1461	if (!ng)
   1462		return true;
   1463
   1464	/*
   1465	 * Destination node is much more heavily used than the source
   1466	 * node? Allow migration.
   1467	 */
   1468	if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
   1469					ACTIVE_NODE_FRACTION)
   1470		return true;
   1471
   1472	/*
   1473	 * Distribute memory according to CPU & memory use on each node,
   1474	 * with 3/4 hysteresis to avoid unnecessary memory migrations:
   1475	 *
   1476	 * faults_cpu(dst)   3   faults_cpu(src)
   1477	 * --------------- * - > ---------------
   1478	 * faults_mem(dst)   4   faults_mem(src)
   1479	 */
   1480	return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
   1481	       group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
   1482}
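
        /*
         * Worked example (editor's note, not part of the original source)
         * for the 3/4 hysteresis check above: suppose the group sees
         *
         *   faults_cpu(dst) = 400, faults_mem(dst) = 200
         *   faults_cpu(src) = 300, faults_mem(src) = 300
         *
         * Then 400 * 300 * 3 = 360000 > 300 * 200 * 4 = 240000, so the page
         * is migrated: the destination node does twice as much CPU work per
         * unit of resident memory, which clears the 3/4 threshold.
         */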
   1483
   1484/*
   1485 * 'numa_type' describes the node at the moment of load balancing.
   1486 */
   1487enum numa_type {
   1488	/* The node has spare capacity that can be used to run more tasks.  */
   1489	node_has_spare = 0,
   1490	/*
   1491	 * The node is fully used and the tasks don't compete for more CPU
   1492	 * cycles. Nevertheless, some tasks might wait before running.
   1493	 */
   1494	node_fully_busy,
   1495	/*
   1496	 * The node is overloaded and can't provide expected CPU cycles to all
   1497	 * tasks.
   1498	 */
   1499	node_overloaded
   1500};
   1501
   1502/* Cached statistics for all CPUs within a node */
   1503struct numa_stats {
   1504	unsigned long load;
   1505	unsigned long runnable;
   1506	unsigned long util;
   1507	/* Total compute capacity of CPUs on a node */
   1508	unsigned long compute_capacity;
   1509	unsigned int nr_running;
   1510	unsigned int weight;
   1511	enum numa_type node_type;
   1512	int idle_cpu;
   1513};
   1514
   1515static inline bool is_core_idle(int cpu)
   1516{
   1517#ifdef CONFIG_SCHED_SMT
   1518	int sibling;
   1519
   1520	for_each_cpu(sibling, cpu_smt_mask(cpu)) {
   1521		if (cpu == sibling)
   1522			continue;
   1523
   1524		if (!idle_cpu(sibling))
   1525			return false;
   1526	}
   1527#endif
   1528
   1529	return true;
   1530}
   1531
   1532struct task_numa_env {
   1533	struct task_struct *p;
   1534
   1535	int src_cpu, src_nid;
   1536	int dst_cpu, dst_nid;
   1537	int imb_numa_nr;
   1538
   1539	struct numa_stats src_stats, dst_stats;
   1540
   1541	int imbalance_pct;
   1542	int dist;
   1543
   1544	struct task_struct *best_task;
   1545	long best_imp;
   1546	int best_cpu;
   1547};
   1548
   1549static unsigned long cpu_load(struct rq *rq);
   1550static unsigned long cpu_runnable(struct rq *rq);
   1551static inline long adjust_numa_imbalance(int imbalance,
   1552					int dst_running, int imb_numa_nr);
   1553
   1554static inline enum
   1555numa_type numa_classify(unsigned int imbalance_pct,
   1556			 struct numa_stats *ns)
   1557{
   1558	if ((ns->nr_running > ns->weight) &&
   1559	    (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
   1560	     ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
   1561		return node_overloaded;
   1562
   1563	if ((ns->nr_running < ns->weight) ||
   1564	    (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
   1565	     ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
   1566		return node_has_spare;
   1567
   1568	return node_fully_busy;
   1569}
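
        /*
         * Worked example (editor's note, not part of the original source):
         * consider a 4-CPU node with compute_capacity = 4096 and
         * imbalance_pct = 112.  With 3 running tasks it has spare capacity
         * (nr_running < weight).  With 5 running tasks and util = 3900 it is
         * overloaded, because 4096 * 100 = 409600 < 3900 * 112 = 436800.
         * Otherwise it is reported as node_fully_busy.
         */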
   1570
   1571#ifdef CONFIG_SCHED_SMT
   1572/* Forward declarations of select_idle_sibling helpers */
   1573static inline bool test_idle_cores(int cpu, bool def);
   1574static inline int numa_idle_core(int idle_core, int cpu)
   1575{
   1576	if (!static_branch_likely(&sched_smt_present) ||
   1577	    idle_core >= 0 || !test_idle_cores(cpu, false))
   1578		return idle_core;
   1579
   1580	/*
   1581	 * Prefer cores instead of packing HT siblings
   1582	 * and triggering future load balancing.
   1583	 */
   1584	if (is_core_idle(cpu))
   1585		idle_core = cpu;
   1586
   1587	return idle_core;
   1588}
   1589#else
   1590static inline int numa_idle_core(int idle_core, int cpu)
   1591{
   1592	return idle_core;
   1593}
   1594#endif
   1595
   1596/*
   1597 * Gather all necessary information to make NUMA balancing placement
   1598 * decisions that are compatible with standard load balancer. This
   1599 * borrows code and logic from update_sg_lb_stats but sharing a
   1600 * common implementation is impractical.
   1601 */
   1602static void update_numa_stats(struct task_numa_env *env,
   1603			      struct numa_stats *ns, int nid,
   1604			      bool find_idle)
   1605{
   1606	int cpu, idle_core = -1;
   1607
   1608	memset(ns, 0, sizeof(*ns));
   1609	ns->idle_cpu = -1;
   1610
   1611	rcu_read_lock();
   1612	for_each_cpu(cpu, cpumask_of_node(nid)) {
   1613		struct rq *rq = cpu_rq(cpu);
   1614
   1615		ns->load += cpu_load(rq);
   1616		ns->runnable += cpu_runnable(rq);
   1617		ns->util += cpu_util_cfs(cpu);
   1618		ns->nr_running += rq->cfs.h_nr_running;
   1619		ns->compute_capacity += capacity_of(cpu);
   1620
   1621		if (find_idle && !rq->nr_running && idle_cpu(cpu)) {
   1622			if (READ_ONCE(rq->numa_migrate_on) ||
   1623			    !cpumask_test_cpu(cpu, env->p->cpus_ptr))
   1624				continue;
   1625
   1626			if (ns->idle_cpu == -1)
   1627				ns->idle_cpu = cpu;
   1628
   1629			idle_core = numa_idle_core(idle_core, cpu);
   1630		}
   1631	}
   1632	rcu_read_unlock();
   1633
   1634	ns->weight = cpumask_weight(cpumask_of_node(nid));
   1635
   1636	ns->node_type = numa_classify(env->imbalance_pct, ns);
   1637
   1638	if (idle_core >= 0)
   1639		ns->idle_cpu = idle_core;
   1640}
   1641
   1642static void task_numa_assign(struct task_numa_env *env,
   1643			     struct task_struct *p, long imp)
   1644{
   1645	struct rq *rq = cpu_rq(env->dst_cpu);
   1646
   1647	/* Check if run-queue part of active NUMA balance. */
   1648	if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) {
   1649		int cpu;
   1650		int start = env->dst_cpu;
   1651
   1652		/* Find alternative idle CPU. */
   1653		for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) {
   1654			if (cpu == env->best_cpu || !idle_cpu(cpu) ||
   1655			    !cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
   1656				continue;
   1657			}
   1658
   1659			env->dst_cpu = cpu;
   1660			rq = cpu_rq(env->dst_cpu);
   1661			if (!xchg(&rq->numa_migrate_on, 1))
   1662				goto assign;
   1663		}
   1664
   1665		/* Failed to find an alternative idle CPU */
   1666		return;
   1667	}
   1668
   1669assign:
   1670	/*
   1671	 * Clear previous best_cpu/rq numa-migrate flag, since task now
   1672	 * found a better CPU to move/swap.
   1673	 */
   1674	if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) {
   1675		rq = cpu_rq(env->best_cpu);
   1676		WRITE_ONCE(rq->numa_migrate_on, 0);
   1677	}
   1678
   1679	if (env->best_task)
   1680		put_task_struct(env->best_task);
   1681	if (p)
   1682		get_task_struct(p);
   1683
   1684	env->best_task = p;
   1685	env->best_imp = imp;
   1686	env->best_cpu = env->dst_cpu;
   1687}
   1688
   1689static bool load_too_imbalanced(long src_load, long dst_load,
   1690				struct task_numa_env *env)
   1691{
   1692	long imb, old_imb;
   1693	long orig_src_load, orig_dst_load;
   1694	long src_capacity, dst_capacity;
   1695
   1696	/*
   1697	 * The load is corrected for the CPU capacity available on each node.
   1698	 *
   1699	 * src_load        dst_load
   1700	 * ------------ vs ---------
   1701	 * src_capacity    dst_capacity
   1702	 */
   1703	src_capacity = env->src_stats.compute_capacity;
   1704	dst_capacity = env->dst_stats.compute_capacity;
   1705
   1706	imb = abs(dst_load * src_capacity - src_load * dst_capacity);
   1707
   1708	orig_src_load = env->src_stats.load;
   1709	orig_dst_load = env->dst_stats.load;
   1710
   1711	old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
   1712
   1713	/* Would this change make things worse? */
   1714	return (imb > old_imb);
   1715}
   1716
   1717/*
   1718 * Maximum NUMA importance can be 1998 (2*999);
   1719 * SMALLIMP @ 30 would be close to 1998/64.
   1720 * Used to deter task migration.
   1721 */
   1722#define SMALLIMP	30
   1723
   1724/*
   1725 * This checks if the overall compute and NUMA accesses of the system would
    1726 * be improved if the source task were migrated to the target dst_cpu, taking
    1727 * into account that it might be best to exchange the task running on dst_cpu
    1728 * with the source task.
   1729 */
   1730static bool task_numa_compare(struct task_numa_env *env,
   1731			      long taskimp, long groupimp, bool maymove)
   1732{
   1733	struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
   1734	struct rq *dst_rq = cpu_rq(env->dst_cpu);
   1735	long imp = p_ng ? groupimp : taskimp;
   1736	struct task_struct *cur;
   1737	long src_load, dst_load;
   1738	int dist = env->dist;
   1739	long moveimp = imp;
   1740	long load;
   1741	bool stopsearch = false;
   1742
   1743	if (READ_ONCE(dst_rq->numa_migrate_on))
   1744		return false;
   1745
   1746	rcu_read_lock();
   1747	cur = rcu_dereference(dst_rq->curr);
   1748	if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
   1749		cur = NULL;
   1750
   1751	/*
   1752	 * Because we have preemption enabled we can get migrated around and
    1753	 * end up trying to select ourselves (current == env->p) as a swap candidate.
   1754	 */
   1755	if (cur == env->p) {
   1756		stopsearch = true;
   1757		goto unlock;
   1758	}
   1759
   1760	if (!cur) {
   1761		if (maymove && moveimp >= env->best_imp)
   1762			goto assign;
   1763		else
   1764			goto unlock;
   1765	}
   1766
    1767	/* Skip this swap candidate if it cannot move to the source CPU. */
   1768	if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
   1769		goto unlock;
   1770
   1771	/*
   1772	 * Skip this swap candidate if it is not moving to its preferred
   1773	 * node and the best task is.
   1774	 */
   1775	if (env->best_task &&
   1776	    env->best_task->numa_preferred_nid == env->src_nid &&
   1777	    cur->numa_preferred_nid != env->src_nid) {
   1778		goto unlock;
   1779	}
   1780
   1781	/*
   1782	 * "imp" is the fault differential for the source task between the
   1783	 * source and destination node. Calculate the total differential for
   1784	 * the source task and potential destination task. The more negative
   1785	 * the value is, the more remote accesses that would be expected to
   1786	 * be incurred if the tasks were swapped.
   1787	 *
   1788	 * If dst and source tasks are in the same NUMA group, or not
    1789	 * in any group, then look only at task weights.
   1790	 */
   1791	cur_ng = rcu_dereference(cur->numa_group);
   1792	if (cur_ng == p_ng) {
   1793		imp = taskimp + task_weight(cur, env->src_nid, dist) -
   1794		      task_weight(cur, env->dst_nid, dist);
   1795		/*
   1796		 * Add some hysteresis to prevent swapping the
   1797		 * tasks within a group over tiny differences.
   1798		 */
   1799		if (cur_ng)
   1800			imp -= imp / 16;
   1801	} else {
   1802		/*
   1803		 * Compare the group weights. If a task is all by itself
   1804		 * (not part of a group), use the task weight instead.
   1805		 */
   1806		if (cur_ng && p_ng)
   1807			imp += group_weight(cur, env->src_nid, dist) -
   1808			       group_weight(cur, env->dst_nid, dist);
   1809		else
   1810			imp += task_weight(cur, env->src_nid, dist) -
   1811			       task_weight(cur, env->dst_nid, dist);
   1812	}
   1813
   1814	/* Discourage picking a task already on its preferred node */
   1815	if (cur->numa_preferred_nid == env->dst_nid)
   1816		imp -= imp / 16;
   1817
   1818	/*
   1819	 * Encourage picking a task that moves to its preferred node.
    1820	 * This potentially makes imp larger than its maximum of
   1821	 * 1998 (see SMALLIMP and task_weight for why) but in this
   1822	 * case, it does not matter.
   1823	 */
   1824	if (cur->numa_preferred_nid == env->src_nid)
   1825		imp += imp / 8;
   1826
   1827	if (maymove && moveimp > imp && moveimp > env->best_imp) {
   1828		imp = moveimp;
   1829		cur = NULL;
   1830		goto assign;
   1831	}
   1832
   1833	/*
   1834	 * Prefer swapping with a task moving to its preferred node over a
   1835	 * task that is not.
   1836	 */
   1837	if (env->best_task && cur->numa_preferred_nid == env->src_nid &&
   1838	    env->best_task->numa_preferred_nid != env->src_nid) {
   1839		goto assign;
   1840	}
   1841
   1842	/*
   1843	 * If the NUMA importance is less than SMALLIMP,
   1844	 * task migration might only result in ping pong
   1845	 * of tasks and also hurt performance due to cache
   1846	 * misses.
   1847	 */
   1848	if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
   1849		goto unlock;
   1850
   1851	/*
   1852	 * In the overloaded case, try and keep the load balanced.
   1853	 */
   1854	load = task_h_load(env->p) - task_h_load(cur);
   1855	if (!load)
   1856		goto assign;
   1857
   1858	dst_load = env->dst_stats.load + load;
   1859	src_load = env->src_stats.load - load;
   1860
   1861	if (load_too_imbalanced(src_load, dst_load, env))
   1862		goto unlock;
   1863
   1864assign:
   1865	/* Evaluate an idle CPU for a task numa move. */
   1866	if (!cur) {
   1867		int cpu = env->dst_stats.idle_cpu;
   1868
   1869		/* Nothing cached so current CPU went idle since the search. */
   1870		if (cpu < 0)
   1871			cpu = env->dst_cpu;
   1872
   1873		/*
   1874		 * If the CPU is no longer truly idle and the previous best CPU
   1875		 * is, keep using it.
   1876		 */
   1877		if (!idle_cpu(cpu) && env->best_cpu >= 0 &&
   1878		    idle_cpu(env->best_cpu)) {
   1879			cpu = env->best_cpu;
   1880		}
   1881
   1882		env->dst_cpu = cpu;
   1883	}
   1884
   1885	task_numa_assign(env, cur, imp);
   1886
   1887	/*
   1888	 * If a move to idle is allowed because there is capacity or load
   1889	 * balance improves then stop the search. While a better swap
   1890	 * candidate may exist, a search is not free.
   1891	 */
   1892	if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu))
   1893		stopsearch = true;
   1894
   1895	/*
   1896	 * If a swap candidate must be identified and the current best task
    1897	 * moves to its preferred node, then stop the search.
   1898	 */
   1899	if (!maymove && env->best_task &&
   1900	    env->best_task->numa_preferred_nid == env->src_nid) {
   1901		stopsearch = true;
   1902	}
   1903unlock:
   1904	rcu_read_unlock();
   1905
   1906	return stopsearch;
   1907}
   1908
   1909static void task_numa_find_cpu(struct task_numa_env *env,
   1910				long taskimp, long groupimp)
   1911{
   1912	bool maymove = false;
   1913	int cpu;
   1914
   1915	/*
   1916	 * If dst node has spare capacity, then check if there is an
   1917	 * imbalance that would be overruled by the load balancer.
   1918	 */
   1919	if (env->dst_stats.node_type == node_has_spare) {
   1920		unsigned int imbalance;
   1921		int src_running, dst_running;
   1922
   1923		/*
   1924		 * Would movement cause an imbalance? Note that if src has
    1925		 * more running tasks, the imbalance is ignored as the
   1926		 * move improves the imbalance from the perspective of the
   1927		 * CPU load balancer.
    1928		 */
   1929		src_running = env->src_stats.nr_running - 1;
   1930		dst_running = env->dst_stats.nr_running + 1;
   1931		imbalance = max(0, dst_running - src_running);
   1932		imbalance = adjust_numa_imbalance(imbalance, dst_running,
   1933						  env->imb_numa_nr);
   1934
   1935		/* Use idle CPU if there is no imbalance */
   1936		if (!imbalance) {
   1937			maymove = true;
   1938			if (env->dst_stats.idle_cpu >= 0) {
   1939				env->dst_cpu = env->dst_stats.idle_cpu;
   1940				task_numa_assign(env, NULL, 0);
   1941				return;
   1942			}
   1943		}
   1944	} else {
   1945		long src_load, dst_load, load;
   1946		/*
    1947		 * If the improvement from just moving env->p is better
   1948		 * than swapping tasks around, check if a move is possible.
   1949		 */
   1950		load = task_h_load(env->p);
   1951		dst_load = env->dst_stats.load + load;
   1952		src_load = env->src_stats.load - load;
   1953		maymove = !load_too_imbalanced(src_load, dst_load, env);
   1954	}
   1955
   1956	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
   1957		/* Skip this CPU if the source task cannot migrate */
   1958		if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
   1959			continue;
   1960
   1961		env->dst_cpu = cpu;
   1962		if (task_numa_compare(env, taskimp, groupimp, maymove))
   1963			break;
   1964	}
   1965}
   1966
   1967static int task_numa_migrate(struct task_struct *p)
   1968{
   1969	struct task_numa_env env = {
   1970		.p = p,
   1971
   1972		.src_cpu = task_cpu(p),
   1973		.src_nid = task_node(p),
   1974
   1975		.imbalance_pct = 112,
   1976
   1977		.best_task = NULL,
   1978		.best_imp = 0,
   1979		.best_cpu = -1,
   1980	};
   1981	unsigned long taskweight, groupweight;
   1982	struct sched_domain *sd;
   1983	long taskimp, groupimp;
   1984	struct numa_group *ng;
   1985	struct rq *best_rq;
   1986	int nid, ret, dist;
   1987
   1988	/*
   1989	 * Pick the lowest SD_NUMA domain, as that would have the smallest
   1990	 * imbalance and would be the first to start moving tasks about.
   1991	 *
   1992	 * And we want to avoid any moving of tasks about, as that would create
    1993 * random movement of tasks -- counter to the numa conditions we're trying
   1994	 * to satisfy here.
   1995	 */
   1996	rcu_read_lock();
   1997	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
   1998	if (sd) {
   1999		env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
   2000		env.imb_numa_nr = sd->imb_numa_nr;
   2001	}
   2002	rcu_read_unlock();
   2003
   2004	/*
   2005	 * Cpusets can break the scheduler domain tree into smaller
   2006	 * balance domains, some of which do not cross NUMA boundaries.
   2007	 * Tasks that are "trapped" in such domains cannot be migrated
   2008	 * elsewhere, so there is no point in (re)trying.
   2009	 */
   2010	if (unlikely(!sd)) {
   2011		sched_setnuma(p, task_node(p));
   2012		return -EINVAL;
   2013	}
   2014
   2015	env.dst_nid = p->numa_preferred_nid;
   2016	dist = env.dist = node_distance(env.src_nid, env.dst_nid);
   2017	taskweight = task_weight(p, env.src_nid, dist);
   2018	groupweight = group_weight(p, env.src_nid, dist);
   2019	update_numa_stats(&env, &env.src_stats, env.src_nid, false);
   2020	taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
   2021	groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
   2022	update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
   2023
   2024	/* Try to find a spot on the preferred nid. */
   2025	task_numa_find_cpu(&env, taskimp, groupimp);
   2026
   2027	/*
   2028	 * Look at other nodes in these cases:
   2029	 * - there is no space available on the preferred_nid
   2030	 * - the task is part of a numa_group that is interleaved across
   2031	 *   multiple NUMA nodes; in order to better consolidate the group,
   2032	 *   we need to check other locations.
   2033	 */
   2034	ng = deref_curr_numa_group(p);
   2035	if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
   2036		for_each_node_state(nid, N_CPU) {
   2037			if (nid == env.src_nid || nid == p->numa_preferred_nid)
   2038				continue;
   2039
   2040			dist = node_distance(env.src_nid, env.dst_nid);
   2041			if (sched_numa_topology_type == NUMA_BACKPLANE &&
   2042						dist != env.dist) {
   2043				taskweight = task_weight(p, env.src_nid, dist);
   2044				groupweight = group_weight(p, env.src_nid, dist);
   2045			}
   2046
    2047			/* Only consider nodes where both the task and the group benefit */
   2048			taskimp = task_weight(p, nid, dist) - taskweight;
   2049			groupimp = group_weight(p, nid, dist) - groupweight;
   2050			if (taskimp < 0 && groupimp < 0)
   2051				continue;
   2052
   2053			env.dist = dist;
   2054			env.dst_nid = nid;
   2055			update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
   2056			task_numa_find_cpu(&env, taskimp, groupimp);
   2057		}
   2058	}
   2059
   2060	/*
   2061	 * If the task is part of a workload that spans multiple NUMA nodes,
   2062	 * and is migrating into one of the workload's active nodes, remember
   2063	 * this node as the task's preferred numa node, so the workload can
   2064	 * settle down.
   2065	 * A task that migrated to a second choice node will be better off
   2066	 * trying for a better one later. Do not set the preferred node here.
   2067	 */
   2068	if (ng) {
   2069		if (env.best_cpu == -1)
   2070			nid = env.src_nid;
   2071		else
   2072			nid = cpu_to_node(env.best_cpu);
   2073
   2074		if (nid != p->numa_preferred_nid)
   2075			sched_setnuma(p, nid);
   2076	}
   2077
   2078	/* No better CPU than the current one was found. */
   2079	if (env.best_cpu == -1) {
   2080		trace_sched_stick_numa(p, env.src_cpu, NULL, -1);
   2081		return -EAGAIN;
   2082	}
   2083
   2084	best_rq = cpu_rq(env.best_cpu);
   2085	if (env.best_task == NULL) {
   2086		ret = migrate_task_to(p, env.best_cpu);
   2087		WRITE_ONCE(best_rq->numa_migrate_on, 0);
   2088		if (ret != 0)
   2089			trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu);
   2090		return ret;
   2091	}
   2092
   2093	ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
   2094	WRITE_ONCE(best_rq->numa_migrate_on, 0);
   2095
   2096	if (ret != 0)
   2097		trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu);
   2098	put_task_struct(env.best_task);
   2099	return ret;
   2100}
   2101
   2102/* Attempt to migrate a task to a CPU on the preferred node. */
   2103static void numa_migrate_preferred(struct task_struct *p)
   2104{
   2105	unsigned long interval = HZ;
   2106
   2107	/* This task has no NUMA fault statistics yet */
   2108	if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
   2109		return;
   2110
   2111	/* Periodically retry migrating the task to the preferred node */
   2112	interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
   2113	p->numa_migrate_retry = jiffies + interval;
   2114
   2115	/* Success if task is already running on preferred CPU */
   2116	if (task_node(p) == p->numa_preferred_nid)
   2117		return;
   2118
   2119	/* Otherwise, try migrate to a CPU on the preferred node */
   2120	task_numa_migrate(p);
   2121}
   2122
   2123/*
   2124 * Find out how many nodes the workload is actively running on. Do this by
   2125 * tracking the nodes from which NUMA hinting faults are triggered. This can
   2126 * be different from the set of nodes where the workload's memory is currently
   2127 * located.
   2128 */
   2129static void numa_group_count_active_nodes(struct numa_group *numa_group)
   2130{
   2131	unsigned long faults, max_faults = 0;
   2132	int nid, active_nodes = 0;
   2133
   2134	for_each_node_state(nid, N_CPU) {
   2135		faults = group_faults_cpu(numa_group, nid);
   2136		if (faults > max_faults)
   2137			max_faults = faults;
   2138	}
   2139
   2140	for_each_node_state(nid, N_CPU) {
   2141		faults = group_faults_cpu(numa_group, nid);
   2142		if (faults * ACTIVE_NODE_FRACTION > max_faults)
   2143			active_nodes++;
   2144	}
   2145
   2146	numa_group->max_faults_cpu = max_faults;
   2147	numa_group->active_nodes = active_nodes;
   2148}
   2149
   2150/*
   2151 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
   2152 * increments. The more local the fault statistics are, the higher the scan
    2153 * period will be for the next scan window. If the local/(local+remote)
    2154 * ratio is below NUMA_PERIOD_THRESHOLD (the ratio ranges over
    2155 * 1..NUMA_PERIOD_SLOTS), the scan period will decrease. Aim for 70% local accesses.
   2156 */
   2157#define NUMA_PERIOD_SLOTS 10
   2158#define NUMA_PERIOD_THRESHOLD 7
   2159
   2160/*
   2161 * Increase the scan period (slow down scanning) if the majority of
   2162 * our memory is already on our local node, or if the majority of
   2163 * the page accesses are shared with other processes.
   2164 * Otherwise, decrease the scan period.
   2165 */
   2166static void update_task_scan_period(struct task_struct *p,
   2167			unsigned long shared, unsigned long private)
   2168{
   2169	unsigned int period_slot;
   2170	int lr_ratio, ps_ratio;
   2171	int diff;
   2172
   2173	unsigned long remote = p->numa_faults_locality[0];
   2174	unsigned long local = p->numa_faults_locality[1];
   2175
   2176	/*
    2177	 * If there were no recorded hinting faults then either the task is
   2178	 * completely idle or all activity is in areas that are not of interest
   2179	 * to automatic numa balancing. Related to that, if there were failed
    2180	 * migrations then it implies we are migrating too quickly or the local
    2181	 * node is overloaded. In either case, scan slower.
   2182	 */
   2183	if (local + shared == 0 || p->numa_faults_locality[2]) {
   2184		p->numa_scan_period = min(p->numa_scan_period_max,
   2185			p->numa_scan_period << 1);
   2186
   2187		p->mm->numa_next_scan = jiffies +
   2188			msecs_to_jiffies(p->numa_scan_period);
   2189
   2190		return;
   2191	}
   2192
   2193	/*
   2194	 * Prepare to scale scan period relative to the current period.
   2195	 *	 == NUMA_PERIOD_THRESHOLD scan period stays the same
   2196	 *       <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
   2197	 *	 >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
   2198	 */
   2199	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
   2200	lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
   2201	ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
   2202
   2203	if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
   2204		/*
   2205		 * Most memory accesses are local. There is no need to
   2206		 * do fast NUMA scanning, since memory is already local.
   2207		 */
   2208		int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
   2209		if (!slot)
   2210			slot = 1;
   2211		diff = slot * period_slot;
   2212	} else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
   2213		/*
   2214		 * Most memory accesses are shared with other tasks.
   2215		 * There is no point in continuing fast NUMA scanning,
   2216		 * since other tasks may just move the memory elsewhere.
   2217		 */
   2218		int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
   2219		if (!slot)
   2220			slot = 1;
   2221		diff = slot * period_slot;
   2222	} else {
   2223		/*
   2224		 * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
   2225		 * yet they are not on the local NUMA node. Speed up
   2226		 * NUMA scanning to get the memory moved over.
   2227		 */
   2228		int ratio = max(lr_ratio, ps_ratio);
   2229		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
   2230	}
   2231
   2232	p->numa_scan_period = clamp(p->numa_scan_period + diff,
   2233			task_scan_min(p), task_scan_max(p));
   2234	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
   2235}
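
        /*
         * Worked example for the slot math above, with made-up numbers:
         * p->numa_scan_period = 1000 msec gives
         * period_slot = DIV_ROUND_UP(1000, 10) = 100.
         *
         *  - private = 9, shared = 1: ps_ratio = 9 >= NUMA_PERIOD_THRESHOLD,
         *    slot = 2, diff = +200 msec -> scan slower.
         *  - local = 2, remote = 8, private = 3, shared = 7: both ratios are
         *    below the threshold, diff = -(7 - 3) * 100 = -400 msec -> scan
         *    faster.
         *
         * Either way the result is clamped to [task_scan_min(), task_scan_max()].
         */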
   2236
   2237/*
   2238 * Get the fraction of time the task has been running since the last
   2239 * NUMA placement cycle. The scheduler keeps similar statistics, but
   2240 * decays those on a 32ms period, which is orders of magnitude off
   2241 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
   2242 * stats only if the task is so new there are no NUMA statistics yet.
   2243 */
   2244static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
   2245{
   2246	u64 runtime, delta, now;
   2247	/* Use the start of this time slice to avoid calculations. */
   2248	now = p->se.exec_start;
   2249	runtime = p->se.sum_exec_runtime;
   2250
   2251	if (p->last_task_numa_placement) {
   2252		delta = runtime - p->last_sum_exec_runtime;
   2253		*period = now - p->last_task_numa_placement;
   2254
   2255		/* Avoid time going backwards, prevent potential divide error: */
   2256		if (unlikely((s64)*period < 0))
   2257			*period = 0;
   2258	} else {
   2259		delta = p->se.avg.load_sum;
   2260		*period = LOAD_AVG_MAX;
   2261	}
   2262
   2263	p->last_sum_exec_runtime = runtime;
   2264	p->last_task_numa_placement = now;
   2265
   2266	return delta;
   2267}
   2268
   2269/*
   2270 * Determine the preferred nid for a task in a numa_group. This needs to
   2271 * be done in a way that produces consistent results with group_weight,
   2272 * otherwise workloads might not converge.
   2273 */
   2274static int preferred_group_nid(struct task_struct *p, int nid)
   2275{
   2276	nodemask_t nodes;
   2277	int dist;
   2278
   2279	/* Direct connections between all NUMA nodes. */
   2280	if (sched_numa_topology_type == NUMA_DIRECT)
   2281		return nid;
   2282
   2283	/*
   2284	 * On a system with glueless mesh NUMA topology, group_weight
   2285	 * scores nodes according to the number of NUMA hinting faults on
   2286	 * both the node itself, and on nearby nodes.
   2287	 */
   2288	if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
   2289		unsigned long score, max_score = 0;
   2290		int node, max_node = nid;
   2291
   2292		dist = sched_max_numa_distance;
   2293
   2294		for_each_node_state(node, N_CPU) {
   2295			score = group_weight(p, node, dist);
   2296			if (score > max_score) {
   2297				max_score = score;
   2298				max_node = node;
   2299			}
   2300		}
   2301		return max_node;
   2302	}
   2303
   2304	/*
   2305	 * Finding the preferred nid in a system with NUMA backplane
   2306	 * interconnect topology is more involved. The goal is to locate
   2307	 * tasks from numa_groups near each other in the system, and
   2308	 * untangle workloads from different sides of the system. This requires
   2309	 * searching down the hierarchy of node groups, recursively searching
   2310	 * inside the highest scoring group of nodes. The nodemask tricks
   2311	 * keep the complexity of the search down.
   2312	 */
   2313	nodes = node_states[N_CPU];
   2314	for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
   2315		unsigned long max_faults = 0;
   2316		nodemask_t max_group = NODE_MASK_NONE;
   2317		int a, b;
   2318
   2319		/* Are there nodes at this distance from each other? */
   2320		if (!find_numa_distance(dist))
   2321			continue;
   2322
   2323		for_each_node_mask(a, nodes) {
   2324			unsigned long faults = 0;
   2325			nodemask_t this_group;
   2326			nodes_clear(this_group);
   2327
   2328			/* Sum group's NUMA faults; includes a==b case. */
   2329			for_each_node_mask(b, nodes) {
   2330				if (node_distance(a, b) < dist) {
   2331					faults += group_faults(p, b);
   2332					node_set(b, this_group);
   2333					node_clear(b, nodes);
   2334				}
   2335			}
   2336
   2337			/* Remember the top group. */
   2338			if (faults > max_faults) {
   2339				max_faults = faults;
   2340				max_group = this_group;
   2341				/*
   2342				 * subtle: at the smallest distance there is
   2343				 * just one node left in each "group", the
   2344				 * winner is the preferred nid.
   2345				 */
   2346				nid = a;
   2347			}
   2348		}
   2349		/* Next round, evaluate the nodes within max_group. */
   2350		if (!max_faults)
   2351			break;
   2352		nodes = max_group;
   2353	}
   2354	return nid;
   2355}
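
        /*
         * Hypothetical walk-through of the backplane search above (made-up
         * 4-node topology, not a real machine): nodes 0-3, local distance 10,
         * distance 20 within the pairs {0,1} and {2,3}, distance 40 between
         * the pairs.
         *
         * At dist = 40 the inner loop builds the groups {0,1} and {2,3}; if
         * {2,3} has more group faults, nodes is narrowed to {2,3}. At
         * dist = 20 each remaining node forms a "group" of its own, so
         * whichever of node 2 or 3 has more faults becomes nid. Smaller
         * distances are skipped by find_numa_distance() and that nid is
         * returned.
         */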
   2356
   2357static void task_numa_placement(struct task_struct *p)
   2358{
   2359	int seq, nid, max_nid = NUMA_NO_NODE;
   2360	unsigned long max_faults = 0;
   2361	unsigned long fault_types[2] = { 0, 0 };
   2362	unsigned long total_faults;
   2363	u64 runtime, period;
   2364	spinlock_t *group_lock = NULL;
   2365	struct numa_group *ng;
   2366
   2367	/*
   2368	 * The p->mm->numa_scan_seq field gets updated without
   2369	 * exclusive access. Use READ_ONCE() here to ensure
   2370	 * that the field is read in a single access:
   2371	 */
   2372	seq = READ_ONCE(p->mm->numa_scan_seq);
   2373	if (p->numa_scan_seq == seq)
   2374		return;
   2375	p->numa_scan_seq = seq;
   2376	p->numa_scan_period_max = task_scan_max(p);
   2377
   2378	total_faults = p->numa_faults_locality[0] +
   2379		       p->numa_faults_locality[1];
   2380	runtime = numa_get_avg_runtime(p, &period);
   2381
    2382	/* If the task is part of a group, prevent parallel updates to group stats */
   2383	ng = deref_curr_numa_group(p);
   2384	if (ng) {
   2385		group_lock = &ng->lock;
   2386		spin_lock_irq(group_lock);
   2387	}
   2388
   2389	/* Find the node with the highest number of faults */
   2390	for_each_online_node(nid) {
   2391		/* Keep track of the offsets in numa_faults array */
   2392		int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
   2393		unsigned long faults = 0, group_faults = 0;
   2394		int priv;
   2395
   2396		for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
   2397			long diff, f_diff, f_weight;
   2398
   2399			mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
   2400			membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
   2401			cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
   2402			cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
   2403
   2404			/* Decay existing window, copy faults since last scan */
   2405			diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
   2406			fault_types[priv] += p->numa_faults[membuf_idx];
   2407			p->numa_faults[membuf_idx] = 0;
   2408
   2409			/*
   2410			 * Normalize the faults_from, so all tasks in a group
   2411			 * count according to CPU use, instead of by the raw
   2412			 * number of faults. Tasks with little runtime have
   2413			 * little over-all impact on throughput, and thus their
   2414			 * faults are less important.
   2415			 */
   2416			f_weight = div64_u64(runtime << 16, period + 1);
   2417			f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
   2418				   (total_faults + 1);
   2419			f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
   2420			p->numa_faults[cpubuf_idx] = 0;
   2421
   2422			p->numa_faults[mem_idx] += diff;
   2423			p->numa_faults[cpu_idx] += f_diff;
   2424			faults += p->numa_faults[mem_idx];
   2425			p->total_numa_faults += diff;
   2426			if (ng) {
   2427				/*
   2428				 * safe because we can only change our own group
   2429				 *
   2430				 * mem_idx represents the offset for a given
   2431				 * nid and priv in a specific region because it
   2432				 * is at the beginning of the numa_faults array.
   2433				 */
   2434				ng->faults[mem_idx] += diff;
   2435				ng->faults[cpu_idx] += f_diff;
   2436				ng->total_faults += diff;
   2437				group_faults += ng->faults[mem_idx];
   2438			}
   2439		}
   2440
   2441		if (!ng) {
   2442			if (faults > max_faults) {
   2443				max_faults = faults;
   2444				max_nid = nid;
   2445			}
   2446		} else if (group_faults > max_faults) {
   2447			max_faults = group_faults;
   2448			max_nid = nid;
   2449		}
   2450	}
   2451
   2452	/* Cannot migrate task to CPU-less node */
   2453	if (max_nid != NUMA_NO_NODE && !node_state(max_nid, N_CPU)) {
   2454		int near_nid = max_nid;
   2455		int distance, near_distance = INT_MAX;
   2456
   2457		for_each_node_state(nid, N_CPU) {
   2458			distance = node_distance(max_nid, nid);
   2459			if (distance < near_distance) {
   2460				near_nid = nid;
   2461				near_distance = distance;
   2462			}
   2463		}
   2464		max_nid = near_nid;
   2465	}
   2466
   2467	if (ng) {
   2468		numa_group_count_active_nodes(ng);
   2469		spin_unlock_irq(group_lock);
   2470		max_nid = preferred_group_nid(p, max_nid);
   2471	}
   2472
   2473	if (max_faults) {
   2474		/* Set the new preferred node */
   2475		if (max_nid != p->numa_preferred_nid)
   2476			sched_setnuma(p, max_nid);
   2477	}
   2478
   2479	update_task_scan_period(p, fault_types[0], fault_types[1]);
   2480}
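
        /*
         * Illustration of the decay above (made-up numbers): if the long-term
         * window p->numa_faults[mem_idx] holds 100 faults and 30 new faults
         * were buffered in numa_faults[membuf_idx] since the last scan, then
         * diff = 30 - 100/2 = -20 and the window becomes 100 - 20 = 80, i.e.
         * half of the old history plus the new faults. The CPU side
         * (cpu_idx/cpubuf_idx) decays the same way, except that the new
         * contribution is first scaled by the task's recent runtime via
         * f_weight.
         */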
   2481
   2482static inline int get_numa_group(struct numa_group *grp)
   2483{
   2484	return refcount_inc_not_zero(&grp->refcount);
   2485}
   2486
   2487static inline void put_numa_group(struct numa_group *grp)
   2488{
   2489	if (refcount_dec_and_test(&grp->refcount))
   2490		kfree_rcu(grp, rcu);
   2491}
   2492
   2493static void task_numa_group(struct task_struct *p, int cpupid, int flags,
   2494			int *priv)
   2495{
   2496	struct numa_group *grp, *my_grp;
   2497	struct task_struct *tsk;
   2498	bool join = false;
   2499	int cpu = cpupid_to_cpu(cpupid);
   2500	int i;
   2501
   2502	if (unlikely(!deref_curr_numa_group(p))) {
   2503		unsigned int size = sizeof(struct numa_group) +
   2504				    NR_NUMA_HINT_FAULT_STATS *
   2505				    nr_node_ids * sizeof(unsigned long);
   2506
   2507		grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
   2508		if (!grp)
   2509			return;
   2510
   2511		refcount_set(&grp->refcount, 1);
   2512		grp->active_nodes = 1;
   2513		grp->max_faults_cpu = 0;
   2514		spin_lock_init(&grp->lock);
   2515		grp->gid = p->pid;
   2516
   2517		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
   2518			grp->faults[i] = p->numa_faults[i];
   2519
   2520		grp->total_faults = p->total_numa_faults;
   2521
   2522		grp->nr_tasks++;
   2523		rcu_assign_pointer(p->numa_group, grp);
   2524	}
   2525
   2526	rcu_read_lock();
   2527	tsk = READ_ONCE(cpu_rq(cpu)->curr);
   2528
   2529	if (!cpupid_match_pid(tsk, cpupid))
   2530		goto no_join;
   2531
   2532	grp = rcu_dereference(tsk->numa_group);
   2533	if (!grp)
   2534		goto no_join;
   2535
   2536	my_grp = deref_curr_numa_group(p);
   2537	if (grp == my_grp)
   2538		goto no_join;
   2539
   2540	/*
    2541	 * Only join the other group if it's bigger; if we're the bigger group,
   2542	 * the other task will join us.
   2543	 */
   2544	if (my_grp->nr_tasks > grp->nr_tasks)
   2545		goto no_join;
   2546
   2547	/*
   2548	 * Tie-break on the grp address.
   2549	 */
   2550	if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
   2551		goto no_join;
   2552
   2553	/* Always join threads in the same process. */
   2554	if (tsk->mm == current->mm)
   2555		join = true;
   2556
   2557	/* Simple filter to avoid false positives due to PID collisions */
   2558	if (flags & TNF_SHARED)
   2559		join = true;
   2560
   2561	/* Update priv based on whether false sharing was detected */
   2562	*priv = !join;
   2563
   2564	if (join && !get_numa_group(grp))
   2565		goto no_join;
   2566
   2567	rcu_read_unlock();
   2568
   2569	if (!join)
   2570		return;
   2571
   2572	BUG_ON(irqs_disabled());
   2573	double_lock_irq(&my_grp->lock, &grp->lock);
   2574
   2575	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
   2576		my_grp->faults[i] -= p->numa_faults[i];
   2577		grp->faults[i] += p->numa_faults[i];
   2578	}
   2579	my_grp->total_faults -= p->total_numa_faults;
   2580	grp->total_faults += p->total_numa_faults;
   2581
   2582	my_grp->nr_tasks--;
   2583	grp->nr_tasks++;
   2584
   2585	spin_unlock(&my_grp->lock);
   2586	spin_unlock_irq(&grp->lock);
   2587
   2588	rcu_assign_pointer(p->numa_group, grp);
   2589
   2590	put_numa_group(my_grp);
   2591	return;
   2592
   2593no_join:
   2594	rcu_read_unlock();
   2595	return;
   2596}
   2597
   2598/*
   2599 * Get rid of NUMA statistics associated with a task (either current or dead).
   2600 * If @final is set, the task is dead and has reached refcount zero, so we can
   2601 * safely free all relevant data structures. Otherwise, there might be
   2602 * concurrent reads from places like load balancing and procfs, and we should
   2603 * reset the data back to default state without freeing ->numa_faults.
   2604 */
   2605void task_numa_free(struct task_struct *p, bool final)
   2606{
   2607	/* safe: p either is current or is being freed by current */
   2608	struct numa_group *grp = rcu_dereference_raw(p->numa_group);
   2609	unsigned long *numa_faults = p->numa_faults;
   2610	unsigned long flags;
   2611	int i;
   2612
   2613	if (!numa_faults)
   2614		return;
   2615
   2616	if (grp) {
   2617		spin_lock_irqsave(&grp->lock, flags);
   2618		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
   2619			grp->faults[i] -= p->numa_faults[i];
   2620		grp->total_faults -= p->total_numa_faults;
   2621
   2622		grp->nr_tasks--;
   2623		spin_unlock_irqrestore(&grp->lock, flags);
   2624		RCU_INIT_POINTER(p->numa_group, NULL);
   2625		put_numa_group(grp);
   2626	}
   2627
   2628	if (final) {
   2629		p->numa_faults = NULL;
   2630		kfree(numa_faults);
   2631	} else {
   2632		p->total_numa_faults = 0;
   2633		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
   2634			numa_faults[i] = 0;
   2635	}
   2636}
   2637
   2638/*
   2639 * Got a PROT_NONE fault for a page on @node.
   2640 */
   2641void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
   2642{
   2643	struct task_struct *p = current;
   2644	bool migrated = flags & TNF_MIGRATED;
   2645	int cpu_node = task_node(current);
   2646	int local = !!(flags & TNF_FAULT_LOCAL);
   2647	struct numa_group *ng;
   2648	int priv;
   2649
   2650	if (!static_branch_likely(&sched_numa_balancing))
   2651		return;
   2652
   2653	/* for example, ksmd faulting in a user's mm */
   2654	if (!p->mm)
   2655		return;
   2656
   2657	/* Allocate buffer to track faults on a per-node basis */
   2658	if (unlikely(!p->numa_faults)) {
   2659		int size = sizeof(*p->numa_faults) *
   2660			   NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
   2661
   2662		p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
   2663		if (!p->numa_faults)
   2664			return;
   2665
   2666		p->total_numa_faults = 0;
   2667		memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
   2668	}
   2669
   2670	/*
    2671	 * First accesses are treated as private; otherwise, consider accesses
    2672	 * to be private if the accessing pid has not changed.
   2673	 */
   2674	if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
   2675		priv = 1;
   2676	} else {
   2677		priv = cpupid_match_pid(p, last_cpupid);
   2678		if (!priv && !(flags & TNF_NO_GROUP))
   2679			task_numa_group(p, last_cpupid, flags, &priv);
   2680	}
   2681
   2682	/*
   2683	 * If a workload spans multiple NUMA nodes, a shared fault that
   2684	 * occurs wholly within the set of nodes that the workload is
   2685	 * actively using should be counted as local. This allows the
   2686	 * scan rate to slow down when a workload has settled down.
   2687	 */
   2688	ng = deref_curr_numa_group(p);
   2689	if (!priv && !local && ng && ng->active_nodes > 1 &&
   2690				numa_is_active_node(cpu_node, ng) &&
   2691				numa_is_active_node(mem_node, ng))
   2692		local = 1;
   2693
   2694	/*
    2695	 * Periodically retry migrating the task to its preferred node, in case
    2696	 * it previously failed or the scheduler moved us.
   2697	 */
   2698	if (time_after(jiffies, p->numa_migrate_retry)) {
   2699		task_numa_placement(p);
   2700		numa_migrate_preferred(p);
   2701	}
   2702
   2703	if (migrated)
   2704		p->numa_pages_migrated += pages;
   2705	if (flags & TNF_MIGRATE_FAIL)
   2706		p->numa_faults_locality[2] += pages;
   2707
   2708	p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
   2709	p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
   2710	p->numa_faults_locality[local] += pages;
   2711}
   2712
   2713static void reset_ptenuma_scan(struct task_struct *p)
   2714{
   2715	/*
   2716	 * We only did a read acquisition of the mmap sem, so
   2717	 * p->mm->numa_scan_seq is written to without exclusive access
   2718	 * and the update is not guaranteed to be atomic. That's not
   2719	 * much of an issue though, since this is just used for
   2720	 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
   2721	 * expensive, to avoid any form of compiler optimizations:
   2722	 */
   2723	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
   2724	p->mm->numa_scan_offset = 0;
   2725}
   2726
   2727/*
   2728 * The expensive part of numa migration is done from task_work context.
   2729 * Triggered from task_tick_numa().
   2730 */
   2731static void task_numa_work(struct callback_head *work)
   2732{
   2733	unsigned long migrate, next_scan, now = jiffies;
   2734	struct task_struct *p = current;
   2735	struct mm_struct *mm = p->mm;
   2736	u64 runtime = p->se.sum_exec_runtime;
   2737	struct vm_area_struct *vma;
   2738	unsigned long start, end;
   2739	unsigned long nr_pte_updates = 0;
   2740	long pages, virtpages;
   2741
   2742	SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
   2743
   2744	work->next = work;
   2745	/*
   2746	 * Who cares about NUMA placement when they're dying.
   2747	 *
   2748	 * NOTE: make sure not to dereference p->mm before this check,
   2749	 * exit_task_work() happens _after_ exit_mm() so we could be called
   2750	 * without p->mm even though we still had it when we enqueued this
   2751	 * work.
   2752	 */
   2753	if (p->flags & PF_EXITING)
   2754		return;
   2755
   2756	if (!mm->numa_next_scan) {
   2757		mm->numa_next_scan = now +
   2758			msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
   2759	}
   2760
   2761	/*
   2762	 * Enforce maximal scan/migration frequency..
   2763	 */
   2764	migrate = mm->numa_next_scan;
   2765	if (time_before(now, migrate))
   2766		return;
   2767
   2768	if (p->numa_scan_period == 0) {
   2769		p->numa_scan_period_max = task_scan_max(p);
   2770		p->numa_scan_period = task_scan_start(p);
   2771	}
   2772
   2773	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
   2774	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
   2775		return;
   2776
   2777	/*
   2778	 * Delay this task enough that another task of this mm will likely win
   2779	 * the next time around.
   2780	 */
   2781	p->node_stamp += 2 * TICK_NSEC;
   2782
   2783	start = mm->numa_scan_offset;
   2784	pages = sysctl_numa_balancing_scan_size;
   2785	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
   2786	virtpages = pages * 8;	   /* Scan up to this much virtual space */
   2787	if (!pages)
   2788		return;
   2789
   2790
   2791	if (!mmap_read_trylock(mm))
   2792		return;
   2793	vma = find_vma(mm, start);
   2794	if (!vma) {
   2795		reset_ptenuma_scan(p);
   2796		start = 0;
   2797		vma = mm->mmap;
   2798	}
   2799	for (; vma; vma = vma->vm_next) {
   2800		if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
   2801			is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
   2802			continue;
   2803		}
   2804
   2805		/*
   2806		 * Shared library pages mapped by multiple processes are not
   2807		 * migrated as it is expected they are cache replicated. Avoid
   2808		 * hinting faults in read-only file-backed mappings or the vdso
   2809		 * as migrating the pages will be of marginal benefit.
   2810		 */
   2811		if (!vma->vm_mm ||
   2812		    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
   2813			continue;
   2814
   2815		/*
   2816		 * Skip inaccessible VMAs to avoid any confusion between
   2817		 * PROT_NONE and NUMA hinting ptes
   2818		 */
   2819		if (!vma_is_accessible(vma))
   2820			continue;
   2821
   2822		do {
   2823			start = max(start, vma->vm_start);
   2824			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
   2825			end = min(end, vma->vm_end);
   2826			nr_pte_updates = change_prot_numa(vma, start, end);
   2827
   2828			/*
    2829			 * Try to scan sysctl_numa_balancing_scan_size worth of
   2830			 * hpages that have at least one present PTE that
   2831			 * is not already pte-numa. If the VMA contains
   2832			 * areas that are unused or already full of prot_numa
   2833			 * PTEs, scan up to virtpages, to skip through those
   2834			 * areas faster.
   2835			 */
   2836			if (nr_pte_updates)
   2837				pages -= (end - start) >> PAGE_SHIFT;
   2838			virtpages -= (end - start) >> PAGE_SHIFT;
   2839
   2840			start = end;
   2841			if (pages <= 0 || virtpages <= 0)
   2842				goto out;
   2843
   2844			cond_resched();
   2845		} while (end != vma->vm_end);
   2846	}
   2847
   2848out:
   2849	/*
   2850	 * It is possible to reach the end of the VMA list but the last few
    2851	 * VMAs are not guaranteed to be vma_migratable. If they are not, we
    2852	 * would find the !migratable VMA on the next scan but not reset the
    2853	 * scanner to the start, so check it now.
   2854	 */
   2855	if (vma)
   2856		mm->numa_scan_offset = start;
   2857	else
   2858		reset_ptenuma_scan(p);
   2859	mmap_read_unlock(mm);
   2860
   2861	/*
   2862	 * Make sure tasks use at least 32x as much time to run other code
   2863	 * than they used here, to limit NUMA PTE scanning overhead to 3% max.
   2864	 * Usually update_task_scan_period slows down scanning enough; on an
   2865	 * overloaded system we need to limit overhead on a per task basis.
   2866	 */
   2867	if (unlikely(p->se.sum_exec_runtime != runtime)) {
   2868		u64 diff = p->se.sum_exec_runtime - runtime;
   2869		p->node_stamp += 32 * diff;
   2870	}
   2871}
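
        /*
         * The 32x back-off above is what bounds the overhead: if this pass
         * consumed diff ns of runtime, node_stamp is pushed 32 * diff into
         * the future, so in the worst case roughly 1/(1+32) ~= 3% of the
         * task's runtime is spent scanning.
         */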
   2872
   2873void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
   2874{
   2875	int mm_users = 0;
   2876	struct mm_struct *mm = p->mm;
   2877
   2878	if (mm) {
   2879		mm_users = atomic_read(&mm->mm_users);
   2880		if (mm_users == 1) {
   2881			mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
   2882			mm->numa_scan_seq = 0;
   2883		}
   2884	}
   2885	p->node_stamp			= 0;
   2886	p->numa_scan_seq		= mm ? mm->numa_scan_seq : 0;
   2887	p->numa_scan_period		= sysctl_numa_balancing_scan_delay;
   2888	/* Protect against double add, see task_tick_numa and task_numa_work */
   2889	p->numa_work.next		= &p->numa_work;
   2890	p->numa_faults			= NULL;
   2891	p->numa_pages_migrated		= 0;
   2892	p->total_numa_faults		= 0;
   2893	RCU_INIT_POINTER(p->numa_group, NULL);
   2894	p->last_task_numa_placement	= 0;
   2895	p->last_sum_exec_runtime	= 0;
   2896
   2897	init_task_work(&p->numa_work, task_numa_work);
   2898
   2899	/* New address space, reset the preferred nid */
   2900	if (!(clone_flags & CLONE_VM)) {
   2901		p->numa_preferred_nid = NUMA_NO_NODE;
   2902		return;
   2903	}
   2904
   2905	/*
   2906	 * New thread, keep existing numa_preferred_nid which should be copied
   2907	 * already by arch_dup_task_struct but stagger when scans start.
   2908	 */
   2909	if (mm) {
   2910		unsigned int delay;
   2911
   2912		delay = min_t(unsigned int, task_scan_max(current),
   2913			current->numa_scan_period * mm_users * NSEC_PER_MSEC);
   2914		delay += 2 * TICK_NSEC;
   2915		p->node_stamp = delay;
   2916	}
   2917}
   2918
   2919/*
   2920 * Drive the periodic memory faults..
   2921 */
   2922static void task_tick_numa(struct rq *rq, struct task_struct *curr)
   2923{
   2924	struct callback_head *work = &curr->numa_work;
   2925	u64 period, now;
   2926
   2927	/*
   2928	 * We don't care about NUMA placement if we don't have memory.
   2929	 */
   2930	if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
   2931		return;
   2932
   2933	/*
   2934	 * Using runtime rather than walltime has the dual advantage that
   2935	 * we (mostly) drive the selection from busy threads and that the
   2936	 * task needs to have done some actual work before we bother with
   2937	 * NUMA placement.
   2938	 */
   2939	now = curr->se.sum_exec_runtime;
   2940	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
   2941
   2942	if (now > curr->node_stamp + period) {
   2943		if (!curr->node_stamp)
   2944			curr->numa_scan_period = task_scan_start(curr);
   2945		curr->node_stamp += period;
   2946
   2947		if (!time_before(jiffies, curr->mm->numa_next_scan))
   2948			task_work_add(curr, work, TWA_RESUME);
   2949	}
   2950}
   2951
   2952static void update_scan_period(struct task_struct *p, int new_cpu)
   2953{
   2954	int src_nid = cpu_to_node(task_cpu(p));
   2955	int dst_nid = cpu_to_node(new_cpu);
   2956
   2957	if (!static_branch_likely(&sched_numa_balancing))
   2958		return;
   2959
   2960	if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
   2961		return;
   2962
   2963	if (src_nid == dst_nid)
   2964		return;
   2965
   2966	/*
   2967	 * Allow resets if faults have been trapped before one scan
   2968	 * has completed. This is most likely due to a new task that
   2969	 * is pulled cross-node due to wakeups or load balancing.
   2970	 */
   2971	if (p->numa_scan_seq) {
   2972		/*
   2973		 * Avoid scan adjustments if moving to the preferred
   2974		 * node or if the task was not previously running on
   2975		 * the preferred node.
   2976		 */
   2977		if (dst_nid == p->numa_preferred_nid ||
   2978		    (p->numa_preferred_nid != NUMA_NO_NODE &&
   2979			src_nid != p->numa_preferred_nid))
   2980			return;
   2981	}
   2982
   2983	p->numa_scan_period = task_scan_start(p);
   2984}
   2985
   2986#else
   2987static void task_tick_numa(struct rq *rq, struct task_struct *curr)
   2988{
   2989}
   2990
   2991static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
   2992{
   2993}
   2994
   2995static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
   2996{
   2997}
   2998
   2999static inline void update_scan_period(struct task_struct *p, int new_cpu)
   3000{
   3001}
   3002
   3003#endif /* CONFIG_NUMA_BALANCING */
   3004
   3005static void
   3006account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
   3007{
   3008	update_load_add(&cfs_rq->load, se->load.weight);
   3009#ifdef CONFIG_SMP
   3010	if (entity_is_task(se)) {
   3011		struct rq *rq = rq_of(cfs_rq);
   3012
   3013		account_numa_enqueue(rq, task_of(se));
   3014		list_add(&se->group_node, &rq->cfs_tasks);
   3015	}
   3016#endif
   3017	cfs_rq->nr_running++;
   3018	if (se_is_idle(se))
   3019		cfs_rq->idle_nr_running++;
   3020}
   3021
   3022static void
   3023account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
   3024{
   3025	update_load_sub(&cfs_rq->load, se->load.weight);
   3026#ifdef CONFIG_SMP
   3027	if (entity_is_task(se)) {
   3028		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
   3029		list_del_init(&se->group_node);
   3030	}
   3031#endif
   3032	cfs_rq->nr_running--;
   3033	if (se_is_idle(se))
   3034		cfs_rq->idle_nr_running--;
   3035}
   3036
   3037/*
   3038 * Signed add and clamp on underflow.
   3039 *
   3040 * Explicitly do a load-store to ensure the intermediate value never hits
   3041 * memory. This allows lockless observations without ever seeing the negative
   3042 * values.
   3043 */
   3044#define add_positive(_ptr, _val) do {                           \
   3045	typeof(_ptr) ptr = (_ptr);                              \
   3046	typeof(_val) val = (_val);                              \
   3047	typeof(*ptr) res, var = READ_ONCE(*ptr);                \
   3048								\
   3049	res = var + val;                                        \
   3050								\
   3051	if (val < 0 && res > var)                               \
   3052		res = 0;                                        \
   3053								\
   3054	WRITE_ONCE(*ptr, res);                                  \
   3055} while (0)
   3056
   3057/*
   3058 * Unsigned subtract and clamp on underflow.
   3059 *
   3060 * Explicitly do a load-store to ensure the intermediate value never hits
   3061 * memory. This allows lockless observations without ever seeing the negative
   3062 * values.
   3063 */
   3064#define sub_positive(_ptr, _val) do {				\
   3065	typeof(_ptr) ptr = (_ptr);				\
   3066	typeof(*ptr) val = (_val);				\
   3067	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
   3068	res = var - val;					\
   3069	if (res > var)						\
   3070		res = 0;					\
   3071	WRITE_ONCE(*ptr, res);					\
   3072} while (0)
   3073
   3074/*
   3075 * Remove and clamp on negative, from a local variable.
   3076 *
   3077 * A variant of sub_positive(), which does not use explicit load-store
   3078 * and is thus optimized for local variable updates.
   3079 */
   3080#define lsub_positive(_ptr, _val) do {				\
   3081	typeof(_ptr) ptr = (_ptr);				\
   3082	*ptr -= min_t(typeof(*ptr), *ptr, _val);		\
   3083} while (0)
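
        /*
         * Example of the underflow clamp, with made-up values: if *ptr == 5
         * and _val == 7, the unsigned subtraction in sub_positive() wraps, so
         * res > var and 0 is stored instead; the READ_ONCE()/WRITE_ONCE()
         * pair keeps lockless readers from ever seeing the wrapped value.
         * lsub_positive() gets the same clamp by subtracting
         * min(*ptr, _val) = 5, but without the load-store dance, which is why
         * it is only meant for local variables.
         */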
   3084
   3085#ifdef CONFIG_SMP
   3086static inline void
   3087enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
   3088{
   3089	cfs_rq->avg.load_avg += se->avg.load_avg;
   3090	cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
   3091}
   3092
   3093static inline void
   3094dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
   3095{
   3096	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
   3097	sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
   3098	/* See update_cfs_rq_load_avg() */
   3099	cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
   3100					  cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
   3101}
   3102#else
   3103static inline void
   3104enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
   3105static inline void
   3106dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
   3107#endif
   3108
   3109static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
   3110			    unsigned long weight)
   3111{
   3112	if (se->on_rq) {
   3113		/* commit outstanding execution time */
   3114		if (cfs_rq->curr == se)
   3115			update_curr(cfs_rq);
   3116		update_load_sub(&cfs_rq->load, se->load.weight);
   3117	}
   3118	dequeue_load_avg(cfs_rq, se);
   3119
   3120	update_load_set(&se->load, weight);
   3121
   3122#ifdef CONFIG_SMP
   3123	do {
   3124		u32 divider = get_pelt_divider(&se->avg);
   3125
   3126		se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
   3127	} while (0);
   3128#endif
   3129
   3130	enqueue_load_avg(cfs_rq, se);
   3131	if (se->on_rq)
   3132		update_load_add(&cfs_rq->load, se->load.weight);
   3133
   3134}
   3135
   3136void reweight_task(struct task_struct *p, int prio)
   3137{
   3138	struct sched_entity *se = &p->se;
   3139	struct cfs_rq *cfs_rq = cfs_rq_of(se);
   3140	struct load_weight *load = &se->load;
   3141	unsigned long weight = scale_load(sched_prio_to_weight[prio]);
   3142
   3143	reweight_entity(cfs_rq, se, weight);
   3144	load->inv_weight = sched_prio_to_wmult[prio];
   3145}
   3146
   3147#ifdef CONFIG_FAIR_GROUP_SCHED
   3148#ifdef CONFIG_SMP
   3149/*
   3150 * All this does is approximate the hierarchical proportion which includes that
   3151 * global sum we all love to hate.
   3152 *
   3153 * That is, the weight of a group entity, is the proportional share of the
   3154 * group weight based on the group runqueue weights. That is:
   3155 *
   3156 *                     tg->weight * grq->load.weight
   3157 *   ge->load.weight = -----------------------------               (1)
   3158 *                       \Sum grq->load.weight
   3159 *
    3160 * Now, because that sum is prohibitively expensive to compute (been
    3161 * there, done that), we approximate it with this average stuff. The average
   3162 * moves slower and therefore the approximation is cheaper and more stable.
   3163 *
   3164 * So instead of the above, we substitute:
   3165 *
   3166 *   grq->load.weight -> grq->avg.load_avg                         (2)
   3167 *
   3168 * which yields the following:
   3169 *
   3170 *                     tg->weight * grq->avg.load_avg
   3171 *   ge->load.weight = ------------------------------              (3)
   3172 *                             tg->load_avg
   3173 *
   3174 * Where: tg->load_avg ~= \Sum grq->avg.load_avg
   3175 *
   3176 * That is shares_avg, and it is right (given the approximation (2)).
   3177 *
   3178 * The problem with it is that because the average is slow -- it was designed
   3179 * to be exactly that of course -- this leads to transients in boundary
    3180 * conditions. Specifically, the case where the group was idle and we start the
   3181 * one task. It takes time for our CPU's grq->avg.load_avg to build up,
   3182 * yielding bad latency etc..
   3183 *
   3184 * Now, in that special case (1) reduces to:
   3185 *
   3186 *                     tg->weight * grq->load.weight
   3187 *   ge->load.weight = ----------------------------- = tg->weight   (4)
    3188 *                         grq->load.weight
   3189 *
   3190 * That is, the sum collapses because all other CPUs are idle; the UP scenario.
   3191 *
   3192 * So what we do is modify our approximation (3) to approach (4) in the (near)
   3193 * UP case, like:
   3194 *
   3195 *   ge->load.weight =
   3196 *
   3197 *              tg->weight * grq->load.weight
   3198 *     ---------------------------------------------------         (5)
   3199 *     tg->load_avg - grq->avg.load_avg + grq->load.weight
   3200 *
   3201 * But because grq->load.weight can drop to 0, resulting in a divide by zero,
   3202 * we need to use grq->avg.load_avg as its lower bound, which then gives:
   3203 *
   3204 *
   3205 *                     tg->weight * grq->load.weight
   3206 *   ge->load.weight = -----------------------------		   (6)
   3207 *                             tg_load_avg'
   3208 *
   3209 * Where:
   3210 *
   3211 *   tg_load_avg' = tg->load_avg - grq->avg.load_avg +
   3212 *                  max(grq->load.weight, grq->avg.load_avg)
   3213 *
   3214 * And that is shares_weight and is icky. In the (near) UP case it approaches
   3215 * (4) while in the normal case it approaches (3). It consistently
   3216 * overestimates the ge->load.weight and therefore:
   3217 *
   3218 *   \Sum ge->load.weight >= tg->weight
   3219 *
   3220 * hence icky!
   3221 */
   3222static long calc_group_shares(struct cfs_rq *cfs_rq)
   3223{
   3224	long tg_weight, tg_shares, load, shares;
   3225	struct task_group *tg = cfs_rq->tg;
   3226
   3227	tg_shares = READ_ONCE(tg->shares);
   3228
   3229	load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
   3230
   3231	tg_weight = atomic_long_read(&tg->load_avg);
   3232
   3233	/* Ensure tg_weight >= load */
   3234	tg_weight -= cfs_rq->tg_load_avg_contrib;
   3235	tg_weight += load;
   3236
   3237	shares = (tg_shares * load);
   3238	if (tg_weight)
   3239		shares /= tg_weight;
   3240
   3241	/*
   3242	 * MIN_SHARES has to be unscaled here to support per-CPU partitioning
   3243	 * of a group with small tg->shares value. It is a floor value which is
   3244	 * assigned as a minimum load.weight to the sched_entity representing
   3245	 * the group on a CPU.
   3246	 *
   3247	 * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024
   3248	 * on an 8-core system with 8 tasks each runnable on one CPU shares has
   3249	 * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In
   3250	 * case no task is runnable on a CPU MIN_SHARES=2 should be returned
   3251	 * instead of 0.
   3252	 */
   3253	return clamp_t(long, shares, MIN_SHARES, tg_shares);
   3254}
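
        /*
         * Worked example with made-up numbers: tg_shares = 1024, this CPU's
         * grq contributes load = max(load.weight, load_avg) = 512, the global
         * tg->load_avg is 1024 of which this cfs_rq previously contributed
         * 256. Then tg_weight = 1024 - 256 + 512 = 1280 and
         * shares = 1024 * 512 / 1280 = 409, clamped to [MIN_SHARES, 1024].
         * This is formula (6) above with the local grq term refreshed to its
         * current value.
         */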
   3255#endif /* CONFIG_SMP */
   3256
   3257static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
   3258
   3259/*
   3260 * Recomputes the group entity based on the current state of its group
   3261 * runqueue.
   3262 */
   3263static void update_cfs_group(struct sched_entity *se)
   3264{
   3265	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
   3266	long shares;
   3267
   3268	if (!gcfs_rq)
   3269		return;
   3270
   3271	if (throttled_hierarchy(gcfs_rq))
   3272		return;
   3273
   3274#ifndef CONFIG_SMP
   3275	shares = READ_ONCE(gcfs_rq->tg->shares);
   3276
   3277	if (likely(se->load.weight == shares))
   3278		return;
   3279#else
   3280	shares   = calc_group_shares(gcfs_rq);
   3281#endif
   3282
   3283	reweight_entity(cfs_rq_of(se), se, shares);
   3284}
   3285
   3286#else /* CONFIG_FAIR_GROUP_SCHED */
   3287static inline void update_cfs_group(struct sched_entity *se)
   3288{
   3289}
   3290#endif /* CONFIG_FAIR_GROUP_SCHED */
   3291
   3292static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
   3293{
   3294	struct rq *rq = rq_of(cfs_rq);
   3295
   3296	if (&rq->cfs == cfs_rq) {
   3297		/*
   3298		 * There are a few boundary cases this might miss but it should
   3299		 * get called often enough that that should (hopefully) not be
   3300		 * a real problem.
   3301		 *
   3302		 * It will not get called when we go idle, because the idle
   3303		 * thread is a different class (!fair), nor will the utilization
   3304		 * number include things like RT tasks.
   3305		 *
   3306		 * As is, the util number is not freq-invariant (we'd have to
   3307		 * implement arch_scale_freq_capacity() for that).
   3308		 *
   3309		 * See cpu_util_cfs().
   3310		 */
   3311		cpufreq_update_util(rq, flags);
   3312	}
   3313}
   3314
   3315#ifdef CONFIG_SMP
   3316#ifdef CONFIG_FAIR_GROUP_SCHED
   3317/*
   3318 * Because list_add_leaf_cfs_rq always places a child cfs_rq on the list
   3319 * immediately before a parent cfs_rq, and cfs_rqs are removed from the list
   3320 * bottom-up, we only have to test whether the cfs_rq before us on the list
   3321 * is our child.
    3322 * If cfs_rq is not on the list, test whether a child needs it to be added to
    3323 * connect a branch to the tree (see list_add_leaf_cfs_rq() for details).
   3324 */
   3325static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq)
   3326{
   3327	struct cfs_rq *prev_cfs_rq;
   3328	struct list_head *prev;
   3329
   3330	if (cfs_rq->on_list) {
   3331		prev = cfs_rq->leaf_cfs_rq_list.prev;
   3332	} else {
   3333		struct rq *rq = rq_of(cfs_rq);
   3334
   3335		prev = rq->tmp_alone_branch;
   3336	}
   3337
   3338	prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list);
   3339
   3340	return (prev_cfs_rq->tg->parent == cfs_rq->tg);
   3341}
   3342
   3343static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
   3344{
   3345	if (cfs_rq->load.weight)
   3346		return false;
   3347
   3348	if (cfs_rq->avg.load_sum)
   3349		return false;
   3350
   3351	if (cfs_rq->avg.util_sum)
   3352		return false;
   3353
   3354	if (cfs_rq->avg.runnable_sum)
   3355		return false;
   3356
   3357	if (child_cfs_rq_on_list(cfs_rq))
   3358		return false;
   3359
   3360	/*
   3361	 * _avg must be null when _sum are null because _avg = _sum / divider
   3362	 * Make sure that rounding and/or propagation of PELT values never
   3363	 * break this.
   3364	 */
   3365	SCHED_WARN_ON(cfs_rq->avg.load_avg ||
   3366		      cfs_rq->avg.util_avg ||
   3367		      cfs_rq->avg.runnable_avg);
   3368
   3369	return true;
   3370}
   3371
   3372/**
   3373 * update_tg_load_avg - update the tg's load avg
   3374 * @cfs_rq: the cfs_rq whose avg changed
   3375 *
   3376 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
   3377 * However, because tg->load_avg is a global value there are performance
   3378 * considerations.
   3379 *
   3380 * In order to avoid having to look at the other cfs_rq's, we use a
   3381 * differential update where we store the last value we propagated. This in
   3382 * turn allows skipping updates if the differential is 'small'.
   3383 *
    3384 * Updating tg's load_avg is necessary before update_cfs_group().
   3385 */
   3386static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
   3387{
   3388	long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
   3389
   3390	/*
   3391	 * No need to update load_avg for root_task_group as it is not used.
   3392	 */
   3393	if (cfs_rq->tg == &root_task_group)
   3394		return;
   3395
   3396	if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
   3397		atomic_long_add(delta, &cfs_rq->tg->load_avg);
   3398		cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
   3399	}
   3400}
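
/*
 * For illustration: the differential update above only touches the shared
 * atomic when the local average has drifted by more than 1/64 (~1.5%) of
 * the last value propagated, e.g.:
 *
 *	tg_load_avg_contrib = 1024, avg.load_avg = 1030:
 *		delta = 6, threshold = 1024 / 64 = 16	-> skip the atomic
 *
 *	tg_load_avg_contrib = 1024, avg.load_avg = 1050:
 *		delta = 26 > 16	-> atomic_long_add(26, &tg->load_avg) and
 *				   tg_load_avg_contrib becomes 1050
 */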
   3401
   3402/*
   3403 * Called within set_task_rq() right before setting a task's CPU. The
   3404 * caller only guarantees p->pi_lock is held; no other assumptions,
   3405 * including the state of rq->lock, should be made.
   3406 */
   3407void set_task_rq_fair(struct sched_entity *se,
   3408		      struct cfs_rq *prev, struct cfs_rq *next)
   3409{
   3410	u64 p_last_update_time;
   3411	u64 n_last_update_time;
   3412
   3413	if (!sched_feat(ATTACH_AGE_LOAD))
   3414		return;
   3415
   3416	/*
    3417	 * We are supposed to update the task to "current" time so that it is
    3418	 * up to date and ready to go to the new CPU/cfs_rq. But it is hard to
    3419	 * tell what the current time is here, so simply throw away the
    3420	 * out-of-date time. This results in the wakee task being less decayed,
    3421	 * but giving the wakee more load is not a bad trade-off.
   3422	 */
   3423	if (!(se->avg.last_update_time && prev))
   3424		return;
   3425
   3426#ifndef CONFIG_64BIT
   3427	{
   3428		u64 p_last_update_time_copy;
   3429		u64 n_last_update_time_copy;
   3430
   3431		do {
   3432			p_last_update_time_copy = prev->load_last_update_time_copy;
   3433			n_last_update_time_copy = next->load_last_update_time_copy;
   3434
   3435			smp_rmb();
   3436
   3437			p_last_update_time = prev->avg.last_update_time;
   3438			n_last_update_time = next->avg.last_update_time;
   3439
   3440		} while (p_last_update_time != p_last_update_time_copy ||
   3441			 n_last_update_time != n_last_update_time_copy);
   3442	}
   3443#else
   3444	p_last_update_time = prev->avg.last_update_time;
   3445	n_last_update_time = next->avg.last_update_time;
   3446#endif
   3447	__update_load_avg_blocked_se(p_last_update_time, se);
   3448	se->avg.last_update_time = n_last_update_time;
   3449}
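
/*
 * Illustrative sketch (hypothetical names, not built): the retry loop above
 * is the read side of a minimal "write value, smp_wmb(), write copy"
 * protocol, used because 64-bit loads are not atomic on 32-bit machines.
 * The writer is the #ifndef CONFIG_64BIT tail of update_cfs_rq_load_avg();
 * cfs_rq_last_update_time() below is another reader.  The general shape:
 */
#if 0
static u64 example_val, example_val_copy;

static void example_writer(u64 new)
{
	example_val = new;
	smp_wmb();			/* order the value before its copy */
	example_val_copy = new;
}

static u64 example_reader(void)
{
	u64 copy, val;

	do {
		copy = example_val_copy;
		smp_rmb();		/* pairs with the smp_wmb() above */
		val = example_val;
	} while (val != copy);

	return val;
}
#endif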
   3450
   3451/*
   3452 * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
   3453 * propagate its contribution. The key to this propagation is the invariant
   3454 * that for each group:
   3455 *
   3456 *   ge->avg == grq->avg						(1)
   3457 *
   3458 * _IFF_ we look at the pure running and runnable sums. Because they
   3459 * represent the very same entity, just at different points in the hierarchy.
   3460 *
   3461 * Per the above update_tg_cfs_util() and update_tg_cfs_runnable() are trivial
   3462 * and simply copies the running/runnable sum over (but still wrong, because
   3463 * the group entity and group rq do not have their PELT windows aligned).
   3464 *
   3465 * However, update_tg_cfs_load() is more complex. So we have:
   3466 *
   3467 *   ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg		(2)
   3468 *
   3469 * And since, like util, the runnable part should be directly transferable,
   3470 * the following would _appear_ to be the straight forward approach:
   3471 *
   3472 *   grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg	(3)
   3473 *
   3474 * And per (1) we have:
   3475 *
   3476 *   ge->avg.runnable_avg == grq->avg.runnable_avg
   3477 *
   3478 * Which gives:
   3479 *
   3480 *                      ge->load.weight * grq->avg.load_avg
   3481 *   ge->avg.load_avg = -----------------------------------		(4)
   3482 *                               grq->load.weight
   3483 *
   3484 * Except that is wrong!
   3485 *
   3486 * Because while for entities historical weight is not important and we
   3487 * really only care about our future and therefore can consider a pure
   3488 * runnable sum, runqueues can NOT do this.
   3489 *
   3490 * We specifically want runqueues to have a load_avg that includes
   3491 * historical weights. Those represent the blocked load, the load we expect
   3492 * to (shortly) return to us. This only works by keeping the weights as
   3493 * integral part of the sum. We therefore cannot decompose as per (3).
   3494 *
   3495 * Another reason this doesn't work is that runnable isn't a 0-sum entity.
   3496 * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the
   3497 * rq itself is runnable anywhere between 2/3 and 1 depending on how the
   3498 * runnable section of these tasks overlap (or not). If they were to perfectly
   3499 * align the rq as a whole would be runnable 2/3 of the time. If however we
   3500 * always have at least 1 runnable task, the rq as a whole is always runnable.
   3501 *
   3502 * So we'll have to approximate.. :/
   3503 *
   3504 * Given the constraint:
   3505 *
   3506 *   ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX
   3507 *
   3508 * We can construct a rule that adds runnable to a rq by assuming minimal
   3509 * overlap.
   3510 *
   3511 * On removal, we'll assume each task is equally runnable; which yields:
   3512 *
   3513 *   grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight
   3514 *
   3515 * XXX: only do this for the part of runnable > running ?
   3516 *
   3517 */
   3518static inline void
   3519update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
   3520{
   3521	long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg;
   3522	u32 new_sum, divider;
   3523
   3524	/* Nothing to update */
   3525	if (!delta_avg)
   3526		return;
   3527
   3528	/*
   3529	 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
   3530	 * See ___update_load_avg() for details.
   3531	 */
   3532	divider = get_pelt_divider(&cfs_rq->avg);
   3533
   3534
   3535	/* Set new sched_entity's utilization */
   3536	se->avg.util_avg = gcfs_rq->avg.util_avg;
   3537	new_sum = se->avg.util_avg * divider;
   3538	delta_sum = (long)new_sum - (long)se->avg.util_sum;
   3539	se->avg.util_sum = new_sum;
   3540
   3541	/* Update parent cfs_rq utilization */
   3542	add_positive(&cfs_rq->avg.util_avg, delta_avg);
   3543	add_positive(&cfs_rq->avg.util_sum, delta_sum);
   3544
   3545	/* See update_cfs_rq_load_avg() */
   3546	cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
   3547					  cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
   3548}
   3549
   3550static inline void
   3551update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
   3552{
   3553	long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
   3554	u32 new_sum, divider;
   3555
   3556	/* Nothing to update */
   3557	if (!delta_avg)
   3558		return;
   3559
   3560	/*
   3561	 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
   3562	 * See ___update_load_avg() for details.
   3563	 */
   3564	divider = get_pelt_divider(&cfs_rq->avg);
   3565
   3566	/* Set new sched_entity's runnable */
   3567	se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
   3568	new_sum = se->avg.runnable_avg * divider;
   3569	delta_sum = (long)new_sum - (long)se->avg.runnable_sum;
   3570	se->avg.runnable_sum = new_sum;
   3571
   3572	/* Update parent cfs_rq runnable */
   3573	add_positive(&cfs_rq->avg.runnable_avg, delta_avg);
   3574	add_positive(&cfs_rq->avg.runnable_sum, delta_sum);
   3575	/* See update_cfs_rq_load_avg() */
   3576	cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
   3577					      cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
   3578}
   3579
   3580static inline void
   3581update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
   3582{
   3583	long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
   3584	unsigned long load_avg;
   3585	u64 load_sum = 0;
   3586	s64 delta_sum;
   3587	u32 divider;
   3588
   3589	if (!runnable_sum)
   3590		return;
   3591
   3592	gcfs_rq->prop_runnable_sum = 0;
   3593
   3594	/*
   3595	 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
   3596	 * See ___update_load_avg() for details.
   3597	 */
   3598	divider = get_pelt_divider(&cfs_rq->avg);
   3599
   3600	if (runnable_sum >= 0) {
   3601		/*
    3602		 * Add runnable; clip at the PELT divider. Reflects that until
   3603		 * the CPU is saturated running == runnable.
   3604		 */
   3605		runnable_sum += se->avg.load_sum;
   3606		runnable_sum = min_t(long, runnable_sum, divider);
   3607	} else {
   3608		/*
   3609		 * Estimate the new unweighted runnable_sum of the gcfs_rq by
   3610		 * assuming all tasks are equally runnable.
   3611		 */
   3612		if (scale_load_down(gcfs_rq->load.weight)) {
   3613			load_sum = div_u64(gcfs_rq->avg.load_sum,
   3614				scale_load_down(gcfs_rq->load.weight));
   3615		}
   3616
   3617		/* But make sure to not inflate se's runnable */
   3618		runnable_sum = min(se->avg.load_sum, load_sum);
   3619	}
   3620
   3621	/*
   3622	 * runnable_sum can't be lower than running_sum
   3623	 * Rescale running sum to be in the same range as runnable sum
   3624	 * running_sum is in [0 : LOAD_AVG_MAX <<  SCHED_CAPACITY_SHIFT]
   3625	 * runnable_sum is in [0 : LOAD_AVG_MAX]
   3626	 */
   3627	running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
   3628	runnable_sum = max(runnable_sum, running_sum);
   3629
   3630	load_sum = se_weight(se) * runnable_sum;
   3631	load_avg = div_u64(load_sum, divider);
   3632
   3633	delta_avg = load_avg - se->avg.load_avg;
   3634	if (!delta_avg)
   3635		return;
   3636
   3637	delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
   3638
   3639	se->avg.load_sum = runnable_sum;
   3640	se->avg.load_avg = load_avg;
   3641	add_positive(&cfs_rq->avg.load_avg, delta_avg);
   3642	add_positive(&cfs_rq->avg.load_sum, delta_sum);
   3643	/* See update_cfs_rq_load_avg() */
   3644	cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
   3645					  cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
   3646}
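
/*
 * Illustrative example for update_tg_cfs_load(), expressed in fractions of
 * the PELT divider: take the two tasks from the big comment above, each of
 * weight 1024 and runnable 2/3 of the time.
 *
 *   propagating an addition (runnable_sum >= 0): minimal overlap is
 *   assumed, so the group entity's new unweighted sum is its old one plus
 *   the propagated delta, growing towards 2/3 + 2/3 = 4/3 and clipped at
 *   the divider (fully runnable);
 *
 *   propagating a removal (runnable_sum < 0): each task is assumed equally
 *   runnable, so the estimate is the unweighted
 *
 *	grq->avg.load_sum / grq->load.weight
 *		= (1024 * 2/3 + 1024 * 2/3) / 2048 = 2/3,
 *
 *   further capped by the group entity's own load_sum so it never gets
 *   inflated.
 */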
   3647
   3648static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
   3649{
   3650	cfs_rq->propagate = 1;
   3651	cfs_rq->prop_runnable_sum += runnable_sum;
   3652}
   3653
   3654/* Update task and its cfs_rq load average */
   3655static inline int propagate_entity_load_avg(struct sched_entity *se)
   3656{
   3657	struct cfs_rq *cfs_rq, *gcfs_rq;
   3658
   3659	if (entity_is_task(se))
   3660		return 0;
   3661
   3662	gcfs_rq = group_cfs_rq(se);
   3663	if (!gcfs_rq->propagate)
   3664		return 0;
   3665
   3666	gcfs_rq->propagate = 0;
   3667
   3668	cfs_rq = cfs_rq_of(se);
   3669
   3670	add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
   3671
   3672	update_tg_cfs_util(cfs_rq, se, gcfs_rq);
   3673	update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
   3674	update_tg_cfs_load(cfs_rq, se, gcfs_rq);
   3675
   3676	trace_pelt_cfs_tp(cfs_rq);
   3677	trace_pelt_se_tp(se);
   3678
   3679	return 1;
   3680}
   3681
   3682/*
   3683 * Check if we need to update the load and the utilization of a blocked
   3684 * group_entity:
   3685 */
   3686static inline bool skip_blocked_update(struct sched_entity *se)
   3687{
   3688	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
   3689
   3690	/*
    3691	 * If the sched_entity still has non-zero load or utilization, we have
    3692	 * to decay it:
   3693	 */
   3694	if (se->avg.load_avg || se->avg.util_avg)
   3695		return false;
   3696
   3697	/*
   3698	 * If there is a pending propagation, we have to update the load and
   3699	 * the utilization of the sched_entity:
   3700	 */
   3701	if (gcfs_rq->propagate)
   3702		return false;
   3703
   3704	/*
    3705	 * Otherwise, the load and the utilization of the sched_entity are
    3706	 * already zero and there is no pending propagation, so it would be a
   3707	 * waste of time to try to decay it:
   3708	 */
   3709	return true;
   3710}
   3711
   3712#else /* CONFIG_FAIR_GROUP_SCHED */
   3713
   3714static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
   3715
   3716static inline int propagate_entity_load_avg(struct sched_entity *se)
   3717{
   3718	return 0;
   3719}
   3720
   3721static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
   3722
   3723#endif /* CONFIG_FAIR_GROUP_SCHED */
   3724
   3725/**
   3726 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
   3727 * @now: current time, as per cfs_rq_clock_pelt()
   3728 * @cfs_rq: cfs_rq to update
   3729 *
   3730 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
   3731 * avg. The immediate corollary is that all (fair) tasks must be attached, see
   3732 * post_init_entity_util_avg().
   3733 *
    3734 * cfs_rq->avg is used for task_h_load() and update_cfs_group() for example.
   3735 *
   3736 * Return: true if the load decayed or we removed load.
   3737 *
   3738 * Since both these conditions indicate a changed cfs_rq->avg.load we should
   3739 * call update_tg_load_avg() when this function returns true.
   3740 */
   3741static inline int
   3742update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
   3743{
   3744	unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;
   3745	struct sched_avg *sa = &cfs_rq->avg;
   3746	int decayed = 0;
   3747
   3748	if (cfs_rq->removed.nr) {
   3749		unsigned long r;
   3750		u32 divider = get_pelt_divider(&cfs_rq->avg);
   3751
   3752		raw_spin_lock(&cfs_rq->removed.lock);
   3753		swap(cfs_rq->removed.util_avg, removed_util);
   3754		swap(cfs_rq->removed.load_avg, removed_load);
   3755		swap(cfs_rq->removed.runnable_avg, removed_runnable);
   3756		cfs_rq->removed.nr = 0;
   3757		raw_spin_unlock(&cfs_rq->removed.lock);
   3758
   3759		r = removed_load;
   3760		sub_positive(&sa->load_avg, r);
   3761		sub_positive(&sa->load_sum, r * divider);
   3762		/* See sa->util_sum below */
   3763		sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER);
   3764
   3765		r = removed_util;
   3766		sub_positive(&sa->util_avg, r);
   3767		sub_positive(&sa->util_sum, r * divider);
   3768		/*
    3769		 * Because of rounding, se->util_sum might end up being +1 more than
    3770		 * cfs->util_sum. Although this is not a problem by itself, detaching
    3771		 * a lot of tasks with this rounding error between 2 updates of
    3772		 * util_avg (~1ms) can make cfs->util_sum become zero while
    3773		 * cfs->util_avg is not.
   3774		 * Check that util_sum is still above its lower bound for the new
   3775		 * util_avg. Given that period_contrib might have moved since the last
   3776		 * sync, we are only sure that util_sum must be above or equal to
   3777		 *    util_avg * minimum possible divider
   3778		 */
   3779		sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
   3780
   3781		r = removed_runnable;
   3782		sub_positive(&sa->runnable_avg, r);
   3783		sub_positive(&sa->runnable_sum, r * divider);
   3784		/* See sa->util_sum above */
   3785		sa->runnable_sum = max_t(u32, sa->runnable_sum,
   3786					      sa->runnable_avg * PELT_MIN_DIVIDER);
   3787
   3788		/*
   3789		 * removed_runnable is the unweighted version of removed_load so we
   3790		 * can use it to estimate removed_load_sum.
   3791		 */
   3792		add_tg_cfs_propagate(cfs_rq,
   3793			-(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT);
   3794
   3795		decayed = 1;
   3796	}
   3797
   3798	decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
   3799
   3800#ifndef CONFIG_64BIT
   3801	smp_wmb();
   3802	cfs_rq->load_last_update_time_copy = sa->last_update_time;
   3803#endif
   3804
   3805	return decayed;
   3806}
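
/*
 * Illustrative note on the "_sum >= _avg * PELT_MIN_DIVIDER" floors above:
 * _avg is always derived as _sum / divider with divider >= PELT_MIN_DIVIDER,
 * so e.g. with util_avg == 10 the smallest consistent util_sum is
 * 10 * PELT_MIN_DIVIDER.  If repeated detaches with rounding errors erode
 * util_sum below that, the max_t() pulls it back up, so util_sum can never
 * hit zero while util_avg is still non-zero.
 */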
   3807
   3808/**
   3809 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
   3810 * @cfs_rq: cfs_rq to attach to
   3811 * @se: sched_entity to attach
   3812 *
   3813 * Must call update_cfs_rq_load_avg() before this, since we rely on
   3814 * cfs_rq->avg.last_update_time being current.
   3815 */
   3816static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
   3817{
   3818	/*
   3819	 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
   3820	 * See ___update_load_avg() for details.
   3821	 */
   3822	u32 divider = get_pelt_divider(&cfs_rq->avg);
   3823
   3824	/*
   3825	 * When we attach the @se to the @cfs_rq, we must align the decay
   3826	 * window because without that, really weird and wonderful things can
   3827	 * happen.
   3828	 *
   3829	 * XXX illustrate
   3830	 */
   3831	se->avg.last_update_time = cfs_rq->avg.last_update_time;
   3832	se->avg.period_contrib = cfs_rq->avg.period_contrib;
   3833
   3834	/*
   3835	 * Hell(o) Nasty stuff.. we need to recompute _sum based on the new
   3836	 * period_contrib. This isn't strictly correct, but since we're
   3837	 * entirely outside of the PELT hierarchy, nobody cares if we truncate
   3838	 * _sum a little.
   3839	 */
   3840	se->avg.util_sum = se->avg.util_avg * divider;
   3841
   3842	se->avg.runnable_sum = se->avg.runnable_avg * divider;
   3843
   3844	se->avg.load_sum = se->avg.load_avg * divider;
   3845	if (se_weight(se) < se->avg.load_sum)
   3846		se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se));
   3847	else
   3848		se->avg.load_sum = 1;
   3849
   3850	enqueue_load_avg(cfs_rq, se);
   3851	cfs_rq->avg.util_avg += se->avg.util_avg;
   3852	cfs_rq->avg.util_sum += se->avg.util_sum;
   3853	cfs_rq->avg.runnable_avg += se->avg.runnable_avg;
   3854	cfs_rq->avg.runnable_sum += se->avg.runnable_sum;
   3855
   3856	add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
   3857
   3858	cfs_rq_util_change(cfs_rq, 0);
   3859
   3860	trace_pelt_cfs_tp(cfs_rq);
   3861}
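
/*
 * Illustrative worked example: for a task entity, load_avg is roughly
 * se_weight * load_sum / divider, so the recomputation above recovers an
 * unweighted load_sum from the weighted load_avg.  E.g. with
 * load_avg == 512, se_weight == 1024 and a divider of roughly 47k:
 *
 *	load_sum = 512 * 47k / 1024 ~= 23.5k ~= divider / 2,
 *
 * i.e. the entity was runnable about half of the recent PELT history.  The
 * else branch falls back to 1 so a non-zero load_avg is never paired with
 * a zero load_sum.
 */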
   3862
   3863/**
   3864 * detach_entity_load_avg - detach this entity from its cfs_rq load avg
   3865 * @cfs_rq: cfs_rq to detach from
   3866 * @se: sched_entity to detach
   3867 *
   3868 * Must call update_cfs_rq_load_avg() before this, since we rely on
   3869 * cfs_rq->avg.last_update_time being current.
   3870 */
   3871static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
   3872{
   3873	dequeue_load_avg(cfs_rq, se);
   3874	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
   3875	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
   3876	/* See update_cfs_rq_load_avg() */
   3877	cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
   3878					  cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
   3879
   3880	sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
   3881	sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
   3882	/* See update_cfs_rq_load_avg() */
   3883	cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
   3884					      cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
   3885
   3886	add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
   3887
   3888	cfs_rq_util_change(cfs_rq, 0);
   3889
   3890	trace_pelt_cfs_tp(cfs_rq);
   3891}
   3892
   3893/*
   3894 * Optional action to be done while updating the load average
   3895 */
   3896#define UPDATE_TG	0x1
   3897#define SKIP_AGE_LOAD	0x2
   3898#define DO_ATTACH	0x4
   3899
   3900/* Update task and its cfs_rq load average */
   3901static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
   3902{
   3903	u64 now = cfs_rq_clock_pelt(cfs_rq);
   3904	int decayed;
   3905
   3906	/*
   3907	 * Track task load average for carrying it to new CPU after migrated, and
   3908	 * track group sched_entity load average for task_h_load calc in migration
   3909	 */
   3910	if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
   3911		__update_load_avg_se(now, cfs_rq, se);
   3912
   3913	decayed  = update_cfs_rq_load_avg(now, cfs_rq);
   3914	decayed |= propagate_entity_load_avg(se);
   3915
   3916	if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
   3917
   3918		/*
   3919		 * DO_ATTACH means we're here from enqueue_entity().
   3920		 * !last_update_time means we've passed through
   3921		 * migrate_task_rq_fair() indicating we migrated.
   3922		 *
   3923		 * IOW we're enqueueing a task on a new CPU.
   3924		 */
   3925		attach_entity_load_avg(cfs_rq, se);
   3926		update_tg_load_avg(cfs_rq);
   3927
   3928	} else if (decayed) {
   3929		cfs_rq_util_change(cfs_rq, 0);
   3930
   3931		if (flags & UPDATE_TG)
   3932			update_tg_load_avg(cfs_rq);
   3933	}
   3934}
   3935
   3936#ifndef CONFIG_64BIT
   3937static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
   3938{
   3939	u64 last_update_time_copy;
   3940	u64 last_update_time;
   3941
   3942	do {
   3943		last_update_time_copy = cfs_rq->load_last_update_time_copy;
   3944		smp_rmb();
   3945		last_update_time = cfs_rq->avg.last_update_time;
   3946	} while (last_update_time != last_update_time_copy);
   3947
   3948	return last_update_time;
   3949}
   3950#else
   3951static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
   3952{
   3953	return cfs_rq->avg.last_update_time;
   3954}
   3955#endif
   3956
   3957/*
   3958 * Synchronize entity load avg of dequeued entity without locking
   3959 * the previous rq.
   3960 */
   3961static void sync_entity_load_avg(struct sched_entity *se)
   3962{
   3963	struct cfs_rq *cfs_rq = cfs_rq_of(se);
   3964	u64 last_update_time;
   3965
   3966	last_update_time = cfs_rq_last_update_time(cfs_rq);
   3967	__update_load_avg_blocked_se(last_update_time, se);
   3968}
   3969
   3970/*
   3971 * Task first catches up with cfs_rq, and then subtract
   3972 * itself from the cfs_rq (task must be off the queue now).
   3973 */
   3974static void remove_entity_load_avg(struct sched_entity *se)
   3975{
   3976	struct cfs_rq *cfs_rq = cfs_rq_of(se);
   3977	unsigned long flags;
   3978
   3979	/*
   3980	 * tasks cannot exit without having gone through wake_up_new_task() ->
   3981	 * post_init_entity_util_avg() which will have added things to the
   3982	 * cfs_rq, so we can remove unconditionally.
   3983	 */
   3984
   3985	sync_entity_load_avg(se);
   3986
   3987	raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
   3988	++cfs_rq->removed.nr;
   3989	cfs_rq->removed.util_avg	+= se->avg.util_avg;
   3990	cfs_rq->removed.load_avg	+= se->avg.load_avg;
   3991	cfs_rq->removed.runnable_avg	+= se->avg.runnable_avg;
   3992	raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
   3993}
   3994
   3995static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq)
   3996{
   3997	return cfs_rq->avg.runnable_avg;
   3998}
   3999
   4000static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
   4001{
   4002	return cfs_rq->avg.load_avg;
   4003}
   4004
   4005static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
   4006
   4007static inline unsigned long task_util(struct task_struct *p)
   4008{
   4009	return READ_ONCE(p->se.avg.util_avg);
   4010}
   4011
   4012static inline unsigned long _task_util_est(struct task_struct *p)
   4013{
   4014	struct util_est ue = READ_ONCE(p->se.avg.util_est);
   4015
   4016	return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
   4017}
   4018
   4019static inline unsigned long task_util_est(struct task_struct *p)
   4020{
   4021	return max(task_util(p), _task_util_est(p));
   4022}
   4023
   4024#ifdef CONFIG_UCLAMP_TASK
   4025static inline unsigned long uclamp_task_util(struct task_struct *p)
   4026{
   4027	return clamp(task_util_est(p),
   4028		     uclamp_eff_value(p, UCLAMP_MIN),
   4029		     uclamp_eff_value(p, UCLAMP_MAX));
   4030}
   4031#else
   4032static inline unsigned long uclamp_task_util(struct task_struct *p)
   4033{
   4034	return task_util_est(p);
   4035}
   4036#endif
   4037
   4038static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
   4039				    struct task_struct *p)
   4040{
   4041	unsigned int enqueued;
   4042
   4043	if (!sched_feat(UTIL_EST))
   4044		return;
   4045
   4046	/* Update root cfs_rq's estimated utilization */
   4047	enqueued  = cfs_rq->avg.util_est.enqueued;
   4048	enqueued += _task_util_est(p);
   4049	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
   4050
   4051	trace_sched_util_est_cfs_tp(cfs_rq);
   4052}
   4053
   4054static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
   4055				    struct task_struct *p)
   4056{
   4057	unsigned int enqueued;
   4058
   4059	if (!sched_feat(UTIL_EST))
   4060		return;
   4061
   4062	/* Update root cfs_rq's estimated utilization */
   4063	enqueued  = cfs_rq->avg.util_est.enqueued;
   4064	enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
   4065	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
   4066
   4067	trace_sched_util_est_cfs_tp(cfs_rq);
   4068}
   4069
   4070#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
   4071
   4072/*
   4073 * Check if a (signed) value is within a specified (unsigned) margin,
   4074 * based on the observation that:
   4075 *
   4076 *     abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
   4077 *
   4078 * NOTE: this only works when value + margin < INT_MAX.
   4079 */
   4080static inline bool within_margin(int value, int margin)
   4081{
   4082	return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
   4083}
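
/*
 * Illustrative sketch (hypothetical helper, not built): the identity above
 * spelled out for a margin of 10, which is what UTIL_EST_MARGIN evaluates
 * to when SCHED_CAPACITY_SCALE == 1024 (1024 / 100 == 10).
 */
#if 0
static bool example_within_10(int value)
{
	/* abs(value) < 10  <=>  (unsigned)(value + 9) < 19 */
	return (unsigned int)(value + 10 - 1) < (2 * 10 - 1);
}

/*
 * example_within_10(4)  and example_within_10(-4)  -> true
 * example_within_10(12) and example_within_10(-12) -> false
 *   (for -12, the unsigned cast of -3 wraps to a huge value, so the single
 *    compare rejects large negative values as well)
 */
#endif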
   4084
   4085static inline void util_est_update(struct cfs_rq *cfs_rq,
   4086				   struct task_struct *p,
   4087				   bool task_sleep)
   4088{
   4089	long last_ewma_diff, last_enqueued_diff;
   4090	struct util_est ue;
   4091
   4092	if (!sched_feat(UTIL_EST))
   4093		return;
   4094
   4095	/*
   4096	 * Skip update of task's estimated utilization when the task has not
   4097	 * yet completed an activation, e.g. being migrated.
   4098	 */
   4099	if (!task_sleep)
   4100		return;
   4101
   4102	/*
   4103	 * If the PELT values haven't changed since enqueue time,
   4104	 * skip the util_est update.
   4105	 */
   4106	ue = p->se.avg.util_est;
   4107	if (ue.enqueued & UTIL_AVG_UNCHANGED)
   4108		return;
   4109
   4110	last_enqueued_diff = ue.enqueued;
   4111
   4112	/*
   4113	 * Reset EWMA on utilization increases, the moving average is used only
   4114	 * to smooth utilization decreases.
   4115	 */
   4116	ue.enqueued = task_util(p);
   4117	if (sched_feat(UTIL_EST_FASTUP)) {
   4118		if (ue.ewma < ue.enqueued) {
   4119			ue.ewma = ue.enqueued;
   4120			goto done;
   4121		}
   4122	}
   4123
   4124	/*
    4125	 * Skip update of the task's estimated utilization when both of its
    4126	 * members are already within ~1% of the last activation value.
   4127	 */
   4128	last_ewma_diff = ue.enqueued - ue.ewma;
   4129	last_enqueued_diff -= ue.enqueued;
   4130	if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) {
   4131		if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN))
   4132			goto done;
   4133
   4134		return;
   4135	}
   4136
   4137	/*
   4138	 * To avoid overestimation of actual task utilization, skip updates if
    4139	 * we cannot guarantee there is idle time on this CPU.
   4140	 */
   4141	if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq))))
   4142		return;
   4143
   4144	/*
   4145	 * Update Task's estimated utilization
   4146	 *
   4147	 * When *p completes an activation we can consolidate another sample
   4148	 * of the task size. This is done by storing the current PELT value
   4149	 * as ue.enqueued and by using this value to update the Exponential
   4150	 * Weighted Moving Average (EWMA):
   4151	 *
   4152	 *  ewma(t) = w *  task_util(p) + (1-w) * ewma(t-1)
   4153	 *          = w *  task_util(p) +         ewma(t-1)  - w * ewma(t-1)
   4154	 *          = w * (task_util(p) -         ewma(t-1)) +     ewma(t-1)
   4155	 *          = w * (      last_ewma_diff            ) +     ewma(t-1)
   4156	 *          = w * (last_ewma_diff  +  ewma(t-1) / w)
   4157	 *
   4158	 * Where 'w' is the weight of new samples, which is configured to be
   4159	 * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
   4160	 */
   4161	ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
   4162	ue.ewma  += last_ewma_diff;
   4163	ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
   4164done:
   4165	ue.enqueued |= UTIL_AVG_UNCHANGED;
   4166	WRITE_ONCE(p->se.avg.util_est, ue);
   4167
   4168	trace_sched_util_est_se_tp(&p->se);
   4169}
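
/*
 * Illustrative sketch (hypothetical helper, not built), assuming
 * UTIL_EST_WEIGHT_SHIFT == 2 so that w == 1/4: the shift sequence above is
 * just ewma(t) = ewma(t-1) + (sample - ewma(t-1)) / 4, e.g. an ewma of 400
 * and a new sample of 300 give 375.
 */
#if 0
static unsigned int example_util_est_ewma(unsigned int ewma, unsigned int sample)
{
	long diff = (long)sample - (long)ewma;	/* last_ewma_diff, e.g. -100 */

	ewma <<= 2;		/* 400  -> 1600			*/
	ewma += diff;		/* 1600 -> 1500			*/
	ewma >>= 2;		/* 1500 -> 375			*/

	return ewma;		/* == 0.25 * 300 + 0.75 * 400	*/
}
#endif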
   4170
   4171static inline int task_fits_capacity(struct task_struct *p,
   4172				     unsigned long capacity)
   4173{
   4174	return fits_capacity(uclamp_task_util(p), capacity);
   4175}
   4176
   4177static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
   4178{
   4179	if (!static_branch_unlikely(&sched_asym_cpucapacity))
   4180		return;
   4181
   4182	if (!p || p->nr_cpus_allowed == 1) {
   4183		rq->misfit_task_load = 0;
   4184		return;
   4185	}
   4186
   4187	if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
   4188		rq->misfit_task_load = 0;
   4189		return;
   4190	}
   4191
   4192	/*
   4193	 * Make sure that misfit_task_load will not be null even if
   4194	 * task_h_load() returns 0.
   4195	 */
   4196	rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
   4197}
   4198
   4199#else /* CONFIG_SMP */
   4200
   4201static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
   4202{
   4203	return true;
   4204}
   4205
   4206#define UPDATE_TG	0x0
   4207#define SKIP_AGE_LOAD	0x0
   4208#define DO_ATTACH	0x0
   4209
   4210static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
   4211{
   4212	cfs_rq_util_change(cfs_rq, 0);
   4213}
   4214
   4215static inline void remove_entity_load_avg(struct sched_entity *se) {}
   4216
   4217static inline void
   4218attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
   4219static inline void
   4220detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
   4221
   4222static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
   4223{
   4224	return 0;
   4225}
   4226
   4227static inline void
   4228util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
   4229
   4230static inline void
   4231util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
   4232
   4233static inline void
   4234util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
   4235		bool task_sleep) {}
   4236static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
   4237
   4238#endif /* CONFIG_SMP */
   4239
   4240static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
   4241{
   4242#ifdef CONFIG_SCHED_DEBUG
   4243	s64 d = se->vruntime - cfs_rq->min_vruntime;
   4244
   4245	if (d < 0)
   4246		d = -d;
   4247
   4248	if (d > 3*sysctl_sched_latency)
   4249		schedstat_inc(cfs_rq->nr_spread_over);
   4250#endif
   4251}
   4252
   4253static void
   4254place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
   4255{
   4256	u64 vruntime = cfs_rq->min_vruntime;
   4257
   4258	/*
   4259	 * The 'current' period is already promised to the current tasks,
   4260	 * however the extra weight of the new task will slow them down a
   4261	 * little, place the new task so that it fits in the slot that
   4262	 * stays open at the end.
   4263	 */
   4264	if (initial && sched_feat(START_DEBIT))
   4265		vruntime += sched_vslice(cfs_rq, se);
   4266
   4267	/* sleeps up to a single latency don't count. */
   4268	if (!initial) {
   4269		unsigned long thresh;
   4270
   4271		if (se_is_idle(se))
   4272			thresh = sysctl_sched_min_granularity;
   4273		else
   4274			thresh = sysctl_sched_latency;
   4275
   4276		/*
   4277		 * Halve their sleep time's effect, to allow
   4278		 * for a gentler effect of sleepers:
   4279		 */
   4280		if (sched_feat(GENTLE_FAIR_SLEEPERS))
   4281			thresh >>= 1;
   4282
   4283		vruntime -= thresh;
   4284	}
   4285
   4286	/* ensure we never gain time by being placed backwards. */
   4287	se->vruntime = max_vruntime(se->vruntime, vruntime);
   4288}
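
/*
 * Illustrative worked example: with sched_latency at, say, 6ms and
 * GENTLE_FAIR_SLEEPERS set, a task waking from a long sleep is placed at
 * min_vruntime minus 3ms worth of vruntime, i.e. it gets up to half a
 * latency period of credit over already-running tasks (idle entities only
 * get the min_granularity threshold).  The final max_vruntime() keeps a
 * task that slept only briefly, whose own vruntime is still ahead of that
 * point, from gaining anything at all.
 */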
   4289
   4290static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
   4291
   4292static inline bool cfs_bandwidth_used(void);
   4293
   4294/*
   4295 * MIGRATION
   4296 *
   4297 *	dequeue
   4298 *	  update_curr()
   4299 *	    update_min_vruntime()
   4300 *	  vruntime -= min_vruntime
   4301 *
   4302 *	enqueue
   4303 *	  update_curr()
   4304 *	    update_min_vruntime()
   4305 *	  vruntime += min_vruntime
   4306 *
   4307 * this way the vruntime transition between RQs is done when both
   4308 * min_vruntime are up-to-date.
   4309 *
   4310 * WAKEUP (remote)
   4311 *
   4312 *	->migrate_task_rq_fair() (p->state == TASK_WAKING)
   4313 *	  vruntime -= min_vruntime
   4314 *
   4315 *	enqueue
   4316 *	  update_curr()
   4317 *	    update_min_vruntime()
   4318 *	  vruntime += min_vruntime
   4319 *
    4320 * this way we may use a slightly stale min_vruntime on the originating
    4321 * CPU, but we do get an up-to-date min_vruntime on the destination CPU.
   4322 */
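
/*
 * Illustrative worked example: a task with se->vruntime == 1500 leaves a
 * CPU whose cfs_rq->min_vruntime == 1000; after "vruntime -= min_vruntime"
 * it only carries its relative lag of 500.  On a destination CPU with
 * min_vruntime == 8000, "vruntime += min_vruntime" places it at 8500,
 * i.e. still 500 ahead of the new queue's reference point, so the task is
 * neither boosted nor penalized by the two CPUs' unrelated vruntime clocks.
 */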
   4323
   4324static void
   4325enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
   4326{
   4327	bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
   4328	bool curr = cfs_rq->curr == se;
   4329
   4330	/*
   4331	 * If we're the current task, we must renormalise before calling
   4332	 * update_curr().
   4333	 */
   4334	if (renorm && curr)
   4335		se->vruntime += cfs_rq->min_vruntime;
   4336
   4337	update_curr(cfs_rq);
   4338
   4339	/*
   4340	 * Otherwise, renormalise after, such that we're placed at the current
   4341	 * moment in time, instead of some random moment in the past. Being
   4342	 * placed in the past could significantly boost this task to the
   4343	 * fairness detriment of existing tasks.
   4344	 */
   4345	if (renorm && !curr)
   4346		se->vruntime += cfs_rq->min_vruntime;
   4347
   4348	/*
   4349	 * When enqueuing a sched_entity, we must:
   4350	 *   - Update loads to have both entity and cfs_rq synced with now.
   4351	 *   - Add its load to cfs_rq->runnable_avg
   4352	 *   - For group_entity, update its weight to reflect the new share of
   4353	 *     its group cfs_rq
   4354	 *   - Add its new weight to cfs_rq->load.weight
   4355	 */
   4356	update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
   4357	se_update_runnable(se);
   4358	update_cfs_group(se);
   4359	account_entity_enqueue(cfs_rq, se);
   4360
   4361	if (flags & ENQUEUE_WAKEUP)
   4362		place_entity(cfs_rq, se, 0);
   4363
   4364	check_schedstat_required();
   4365	update_stats_enqueue_fair(cfs_rq, se, flags);
   4366	check_spread(cfs_rq, se);
   4367	if (!curr)
   4368		__enqueue_entity(cfs_rq, se);
   4369	se->on_rq = 1;
   4370
   4371	/*
    4372	 * When bandwidth control is enabled, cfs might have been removed
    4373	 * because a parent was throttled while cfs->nr_running > 1. Try to
    4374	 * add it unconditionally.
   4375	 */
   4376	if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
   4377		list_add_leaf_cfs_rq(cfs_rq);
   4378
   4379	if (cfs_rq->nr_running == 1)
   4380		check_enqueue_throttle(cfs_rq);
   4381}
   4382
   4383static void __clear_buddies_last(struct sched_entity *se)
   4384{
   4385	for_each_sched_entity(se) {
   4386		struct cfs_rq *cfs_rq = cfs_rq_of(se);
   4387		if (cfs_rq->last != se)
   4388			break;
   4389
   4390		cfs_rq->last = NULL;
   4391	}
   4392}
   4393
   4394static void __clear_buddies_next(struct sched_entity *se)
   4395{
   4396	for_each_sched_entity(se) {
   4397		struct cfs_rq *cfs_rq = cfs_rq_of(se);
   4398		if (cfs_rq->next != se)
   4399			break;
   4400
   4401		cfs_rq->next = NULL;
   4402	}
   4403}
   4404
   4405static void __clear_buddies_skip(struct sched_entity *se)
   4406{
   4407	for_each_sched_entity(se) {
   4408		struct cfs_rq *cfs_rq = cfs_rq_of(se);
   4409		if (cfs_rq->skip != se)
   4410			break;
   4411
   4412		cfs_rq->skip = NULL;
   4413	}
   4414}
   4415
   4416static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
   4417{
   4418	if (cfs_rq->last == se)
   4419		__clear_buddies_last(se);
   4420
   4421	if (cfs_rq->next == se)
   4422		__clear_buddies_next(se);
   4423
   4424	if (cfs_rq->skip == se)
   4425		__clear_buddies_skip(se);
   4426}
   4427
   4428static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
   4429
   4430static void
   4431dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
   4432{
   4433	/*
   4434	 * Update run-time statistics of the 'current'.
   4435	 */
   4436	update_curr(cfs_rq);
   4437
   4438	/*
   4439	 * When dequeuing a sched_entity, we must:
   4440	 *   - Update loads to have both entity and cfs_rq synced with now.
   4441	 *   - Subtract its load from the cfs_rq->runnable_avg.
   4442	 *   - Subtract its previous weight from cfs_rq->load.weight.
   4443	 *   - For group entity, update its weight to reflect the new share
   4444	 *     of its group cfs_rq.
   4445	 */
   4446	update_load_avg(cfs_rq, se, UPDATE_TG);
   4447	se_update_runnable(se);
   4448
   4449	update_stats_dequeue_fair(cfs_rq, se, flags);
   4450
   4451	clear_buddies(cfs_rq, se);
   4452
   4453	if (se != cfs_rq->curr)
   4454		__dequeue_entity(cfs_rq, se);
   4455	se->on_rq = 0;
   4456	account_entity_dequeue(cfs_rq, se);
   4457
   4458	/*
   4459	 * Normalize after update_curr(); which will also have moved
   4460	 * min_vruntime if @se is the one holding it back. But before doing
   4461	 * update_min_vruntime() again, which will discount @se's position and
   4462	 * can move min_vruntime forward still more.
   4463	 */
   4464	if (!(flags & DEQUEUE_SLEEP))
   4465		se->vruntime -= cfs_rq->min_vruntime;
   4466
   4467	/* return excess runtime on last dequeue */
   4468	return_cfs_rq_runtime(cfs_rq);
   4469
   4470	update_cfs_group(se);
   4471
   4472	/*
   4473	 * Now advance min_vruntime if @se was the entity holding it back,
   4474	 * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
   4475	 * put back on, and if we advance min_vruntime, we'll be placed back
   4476	 * further than we started -- ie. we'll be penalized.
   4477	 */
   4478	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
   4479		update_min_vruntime(cfs_rq);
   4480}
   4481
   4482/*
   4483 * Preempt the current task with a newly woken task if needed:
   4484 */
   4485static void
   4486check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
   4487{
   4488	unsigned long ideal_runtime, delta_exec;
   4489	struct sched_entity *se;
   4490	s64 delta;
   4491
   4492	ideal_runtime = sched_slice(cfs_rq, curr);
   4493	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
   4494	if (delta_exec > ideal_runtime) {
   4495		resched_curr(rq_of(cfs_rq));
   4496		/*
   4497		 * The current task ran long enough, ensure it doesn't get
   4498		 * re-elected due to buddy favours.
   4499		 */
   4500		clear_buddies(cfs_rq, curr);
   4501		return;
   4502	}
   4503
   4504	/*
   4505	 * Ensure that a task that missed wakeup preemption by a
   4506	 * narrow margin doesn't have to wait for a full slice.
   4507	 * This also mitigates buddy induced latencies under load.
   4508	 */
   4509	if (delta_exec < sysctl_sched_min_granularity)
   4510		return;
   4511
   4512	se = __pick_first_entity(cfs_rq);
   4513	delta = curr->vruntime - se->vruntime;
   4514
   4515	if (delta < 0)
   4516		return;
   4517
   4518	if (delta > ideal_runtime)
   4519		resched_curr(rq_of(cfs_rq));
   4520}
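
/*
 * Illustrative worked example, with hypothetical numbers: say sched_slice()
 * gives curr an ideal_runtime of 3ms.
 *
 *   - curr has run for 3.5ms since it was picked:  resched, slice used up;
 *   - curr has run for only 2ms, but its vruntime is already more than 3ms
 *     ahead of the leftmost entity:  resched as well, unless curr has run
 *     for less than sysctl_sched_min_granularity, in which case it is
 *     allowed to keep the CPU a little longer.
 */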
   4521
   4522static void
   4523set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
   4524{
   4525	clear_buddies(cfs_rq, se);
   4526
   4527	/* 'current' is not kept within the tree. */
   4528	if (se->on_rq) {
   4529		/*
   4530		 * Any task has to be enqueued before it get to execute on
   4531		 * a CPU. So account for the time it spent waiting on the
   4532		 * runqueue.
   4533		 */
   4534		update_stats_wait_end_fair(cfs_rq, se);
   4535		__dequeue_entity(cfs_rq, se);
   4536		update_load_avg(cfs_rq, se, UPDATE_TG);
   4537	}
   4538
   4539	update_stats_curr_start(cfs_rq, se);
   4540	cfs_rq->curr = se;
   4541
   4542	/*
   4543	 * Track our maximum slice length, if the CPU's load is at
   4544	 * least twice that of our own weight (i.e. dont track it
   4545	 * when there are only lesser-weight tasks around):
   4546	 */
   4547	if (schedstat_enabled() &&
   4548	    rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
   4549		struct sched_statistics *stats;
   4550
   4551		stats = __schedstats_from_se(se);
   4552		__schedstat_set(stats->slice_max,
   4553				max((u64)stats->slice_max,
   4554				    se->sum_exec_runtime - se->prev_sum_exec_runtime));
   4555	}
   4556
   4557	se->prev_sum_exec_runtime = se->sum_exec_runtime;
   4558}
   4559
   4560static int
   4561wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
   4562
   4563/*
   4564 * Pick the next process, keeping these things in mind, in this order:
   4565 * 1) keep things fair between processes/task groups
   4566 * 2) pick the "next" process, since someone really wants that to run
   4567 * 3) pick the "last" process, for cache locality
   4568 * 4) do not run the "skip" process, if something else is available
   4569 */
   4570static struct sched_entity *
   4571pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
   4572{
   4573	struct sched_entity *left = __pick_first_entity(cfs_rq);
   4574	struct sched_entity *se;
   4575
   4576	/*
   4577	 * If curr is set we have to see if its left of the leftmost entity
   4578	 * still in the tree, provided there was anything in the tree at all.
   4579	 */
   4580	if (!left || (curr && entity_before(curr, left)))
   4581		left = curr;
   4582
   4583	se = left; /* ideally we run the leftmost entity */
   4584
   4585	/*
   4586	 * Avoid running the skip buddy, if running something else can
   4587	 * be done without getting too unfair.
   4588	 */
   4589	if (cfs_rq->skip && cfs_rq->skip == se) {
   4590		struct sched_entity *second;
   4591
   4592		if (se == curr) {
   4593			second = __pick_first_entity(cfs_rq);
   4594		} else {
   4595			second = __pick_next_entity(se);
   4596			if (!second || (curr && entity_before(curr, second)))
   4597				second = curr;
   4598		}
   4599
   4600		if (second && wakeup_preempt_entity(second, left) < 1)
   4601			se = second;
   4602	}
   4603
   4604	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
   4605		/*
   4606		 * Someone really wants this to run. If it's not unfair, run it.
   4607		 */
   4608		se = cfs_rq->next;
   4609	} else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
   4610		/*
   4611		 * Prefer last buddy, try to return the CPU to a preempted task.
   4612		 */
   4613		se = cfs_rq->last;
   4614	}
   4615
   4616	return se;
   4617}
   4618
   4619static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
   4620
   4621static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
   4622{
   4623	/*
   4624	 * If still on the runqueue then deactivate_task()
   4625	 * was not called and update_curr() has to be done:
   4626	 */
   4627	if (prev->on_rq)
   4628		update_curr(cfs_rq);
   4629
   4630	/* throttle cfs_rqs exceeding runtime */
   4631	check_cfs_rq_runtime(cfs_rq);
   4632
   4633	check_spread(cfs_rq, prev);
   4634
   4635	if (prev->on_rq) {
   4636		update_stats_wait_start_fair(cfs_rq, prev);
   4637		/* Put 'current' back into the tree. */
   4638		__enqueue_entity(cfs_rq, prev);
   4639		/* in !on_rq case, update occurred at dequeue */
   4640		update_load_avg(cfs_rq, prev, 0);
   4641	}
   4642	cfs_rq->curr = NULL;
   4643}
   4644
   4645static void
   4646entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
   4647{
   4648	/*
   4649	 * Update run-time statistics of the 'current'.
   4650	 */
   4651	update_curr(cfs_rq);
   4652
   4653	/*
   4654	 * Ensure that runnable average is periodically updated.
   4655	 */
   4656	update_load_avg(cfs_rq, curr, UPDATE_TG);
   4657	update_cfs_group(curr);
   4658
   4659#ifdef CONFIG_SCHED_HRTICK
   4660	/*
   4661	 * queued ticks are scheduled to match the slice, so don't bother
   4662	 * validating it and just reschedule.
   4663	 */
   4664	if (queued) {
   4665		resched_curr(rq_of(cfs_rq));
   4666		return;
   4667	}
   4668	/*
   4669	 * don't let the period tick interfere with the hrtick preemption
   4670	 */
   4671	if (!sched_feat(DOUBLE_TICK) &&
   4672			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
   4673		return;
   4674#endif
   4675
   4676	if (cfs_rq->nr_running > 1)
   4677		check_preempt_tick(cfs_rq, curr);
   4678}
   4679
   4680
   4681/**************************************************
   4682 * CFS bandwidth control machinery
   4683 */
   4684
   4685#ifdef CONFIG_CFS_BANDWIDTH
   4686
   4687#ifdef CONFIG_JUMP_LABEL
   4688static struct static_key __cfs_bandwidth_used;
   4689
   4690static inline bool cfs_bandwidth_used(void)
   4691{
   4692	return static_key_false(&__cfs_bandwidth_used);
   4693}
   4694
   4695void cfs_bandwidth_usage_inc(void)
   4696{
   4697	static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
   4698}
   4699
   4700void cfs_bandwidth_usage_dec(void)
   4701{
   4702	static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
   4703}
   4704#else /* CONFIG_JUMP_LABEL */
   4705static bool cfs_bandwidth_used(void)
   4706{
   4707	return true;
   4708}
   4709
   4710void cfs_bandwidth_usage_inc(void) {}
   4711void cfs_bandwidth_usage_dec(void) {}
   4712#endif /* CONFIG_JUMP_LABEL */
   4713
   4714/*
   4715 * default period for cfs group bandwidth.
   4716 * default: 0.1s, units: nanoseconds
   4717 */
   4718static inline u64 default_cfs_period(void)
   4719{
   4720	return 100000000ULL;
   4721}
   4722
   4723static inline u64 sched_cfs_bandwidth_slice(void)
   4724{
   4725	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
   4726}
   4727
   4728/*
   4729 * Replenish runtime according to assigned quota. We use sched_clock_cpu
   4730 * directly instead of rq->clock to avoid adding additional synchronization
   4731 * around rq->lock.
   4732 *
   4733 * requires cfs_b->lock
   4734 */
   4735void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
   4736{
   4737	s64 runtime;
   4738
   4739	if (unlikely(cfs_b->quota == RUNTIME_INF))
   4740		return;
   4741
   4742	cfs_b->runtime += cfs_b->quota;
   4743	runtime = cfs_b->runtime_snap - cfs_b->runtime;
   4744	if (runtime > 0) {
   4745		cfs_b->burst_time += runtime;
   4746		cfs_b->nr_burst++;
   4747	}
   4748
   4749	cfs_b->runtime = min(cfs_b->runtime, cfs_b->quota + cfs_b->burst);
   4750	cfs_b->runtime_snap = cfs_b->runtime;
   4751}
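
/*
 * Illustrative worked example, with hypothetical numbers: quota == 50ms per
 * 100ms period and burst == 0.  Per-CPU cfs_rqs pull runtime from this
 * global pool in sched_cfs_bandwidth_slice() sized chunks, so the group can
 * consume at most 50ms of CPU time per period before its cfs_rqs are
 * throttled.  At each period edge the pool is refilled:
 *
 *	runtime += quota;				50ms added
 *	runtime  = min(runtime, quota + burst);		clamped, burst == 0
 *
 * so any runtime left unused beyond quota + burst is simply discarded,
 * while consumption beyond quota in the previous period is accounted as
 * burst_time/nr_burst above.
 */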
   4752
   4753static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
   4754{
   4755	return &tg->cfs_bandwidth;
   4756}
   4757
   4758/* returns 0 on failure to allocate runtime */
   4759static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
   4760				   struct cfs_rq *cfs_rq, u64 target_runtime)
   4761{
   4762	u64 min_amount, amount = 0;
   4763
   4764	lockdep_assert_held(&cfs_b->lock);
   4765
   4766	/* note: this is a positive sum as runtime_remaining <= 0 */
   4767	min_amount = target_runtime - cfs_rq->runtime_remaining;
   4768
   4769	if (cfs_b->quota == RUNTIME_INF)
   4770		amount = min_amount;
   4771	else {
   4772		start_cfs_bandwidth(cfs_b);
   4773
   4774		if (cfs_b->runtime > 0) {
   4775			amount = min(cfs_b->runtime, min_amount);
   4776			cfs_b->runtime -= amount;
   4777			cfs_b->idle = 0;
   4778		}
   4779	}
   4780
   4781	cfs_rq->runtime_remaining += amount;
   4782
   4783	return cfs_rq->runtime_remaining > 0;
   4784}
   4785
   4786/* returns 0 on failure to allocate runtime */
   4787static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
   4788{
   4789	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
   4790	int ret;
   4791
   4792	raw_spin_lock(&cfs_b->lock);
   4793	ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
   4794	raw_spin_unlock(&cfs_b->lock);
   4795
   4796	return ret;
   4797}
   4798
   4799static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
   4800{
   4801	/* dock delta_exec before expiring quota (as it could span periods) */
   4802	cfs_rq->runtime_remaining -= delta_exec;
   4803
   4804	if (likely(cfs_rq->runtime_remaining > 0))
   4805		return;
   4806
   4807	if (cfs_rq->throttled)
   4808		return;
   4809	/*
   4810	 * if we're unable to extend our runtime we resched so that the active
   4811	 * hierarchy can be throttled
   4812	 */
   4813	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
   4814		resched_curr(rq_of(cfs_rq));
   4815}
   4816
   4817static __always_inline
   4818void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
   4819{
   4820	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
   4821		return;
   4822
   4823	__account_cfs_rq_runtime(cfs_rq, delta_exec);
   4824}
   4825
   4826static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
   4827{
   4828	return cfs_bandwidth_used() && cfs_rq->throttled;
   4829}
   4830
   4831/* check whether cfs_rq, or any parent, is throttled */
   4832static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
   4833{
   4834	return cfs_bandwidth_used() && cfs_rq->throttle_count;
   4835}
   4836
   4837/*
   4838 * Ensure that neither of the group entities corresponding to src_cpu or
   4839 * dest_cpu are members of a throttled hierarchy when performing group
   4840 * load-balance operations.
   4841 */
   4842static inline int throttled_lb_pair(struct task_group *tg,
   4843				    int src_cpu, int dest_cpu)
   4844{
   4845	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
   4846
   4847	src_cfs_rq = tg->cfs_rq[src_cpu];
   4848	dest_cfs_rq = tg->cfs_rq[dest_cpu];
   4849
   4850	return throttled_hierarchy(src_cfs_rq) ||
   4851	       throttled_hierarchy(dest_cfs_rq);
   4852}
   4853
   4854static int tg_unthrottle_up(struct task_group *tg, void *data)
   4855{
   4856	struct rq *rq = data;
   4857	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
   4858
   4859	cfs_rq->throttle_count--;
   4860	if (!cfs_rq->throttle_count) {
   4861		cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
   4862					     cfs_rq->throttled_clock_pelt;
   4863
   4864		/* Add cfs_rq with load or one or more already running entities to the list */
   4865		if (!cfs_rq_is_decayed(cfs_rq))
   4866			list_add_leaf_cfs_rq(cfs_rq);
   4867	}
   4868
   4869	return 0;
   4870}
   4871
   4872static int tg_throttle_down(struct task_group *tg, void *data)
   4873{
   4874	struct rq *rq = data;
   4875	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
   4876
   4877	/* group is entering throttled state, stop time */
   4878	if (!cfs_rq->throttle_count) {
   4879		cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
   4880		list_del_leaf_cfs_rq(cfs_rq);
   4881	}
   4882	cfs_rq->throttle_count++;
   4883
   4884	return 0;
   4885}
   4886
   4887static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
   4888{
   4889	struct rq *rq = rq_of(cfs_rq);
   4890	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
   4891	struct sched_entity *se;
   4892	long task_delta, idle_task_delta, dequeue = 1;
   4893
   4894	raw_spin_lock(&cfs_b->lock);
   4895	/* This will start the period timer if necessary */
   4896	if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
   4897		/*
   4898		 * We have raced with bandwidth becoming available, and if we
   4899		 * actually throttled the timer might not unthrottle us for an
   4900		 * entire period. We additionally needed to make sure that any
   4901		 * subsequent check_cfs_rq_runtime calls agree not to throttle
   4902		 * us, as we may commit to do cfs put_prev+pick_next, so we ask
   4903		 * for 1ns of runtime rather than just check cfs_b.
   4904		 */
   4905		dequeue = 0;
   4906	} else {
   4907		list_add_tail_rcu(&cfs_rq->throttled_list,
   4908				  &cfs_b->throttled_cfs_rq);
   4909	}
   4910	raw_spin_unlock(&cfs_b->lock);
   4911
   4912	if (!dequeue)
   4913		return false;  /* Throttle no longer required. */
   4914
   4915	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
   4916
   4917	/* freeze hierarchy runnable averages while throttled */
   4918	rcu_read_lock();
   4919	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
   4920	rcu_read_unlock();
   4921
   4922	task_delta = cfs_rq->h_nr_running;
   4923	idle_task_delta = cfs_rq->idle_h_nr_running;
   4924	for_each_sched_entity(se) {
   4925		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
   4926		/* throttled entity or throttle-on-deactivate */
   4927		if (!se->on_rq)
   4928			goto done;
   4929
   4930		dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
   4931
   4932		if (cfs_rq_is_idle(group_cfs_rq(se)))
   4933			idle_task_delta = cfs_rq->h_nr_running;
   4934
   4935		qcfs_rq->h_nr_running -= task_delta;
   4936		qcfs_rq->idle_h_nr_running -= idle_task_delta;
   4937
   4938		if (qcfs_rq->load.weight) {
   4939			/* Avoid re-evaluating load for this entity: */
   4940			se = parent_entity(se);
   4941			break;
   4942		}
   4943	}
   4944
   4945	for_each_sched_entity(se) {
   4946		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
   4947		/* throttled entity or throttle-on-deactivate */
   4948		if (!se->on_rq)
   4949			goto done;
   4950
   4951		update_load_avg(qcfs_rq, se, 0);
   4952		se_update_runnable(se);
   4953
   4954		if (cfs_rq_is_idle(group_cfs_rq(se)))
   4955			idle_task_delta = cfs_rq->h_nr_running;
   4956
   4957		qcfs_rq->h_nr_running -= task_delta;
   4958		qcfs_rq->idle_h_nr_running -= idle_task_delta;
   4959	}
   4960
    4961	/* At this point se is NULL and we are at root level */
   4962	sub_nr_running(rq, task_delta);
   4963
   4964done:
   4965	/*
   4966	 * Note: distribution will already see us throttled via the
   4967	 * throttled-list.  rq->lock protects completion.
   4968	 */
   4969	cfs_rq->throttled = 1;
   4970	cfs_rq->throttled_clock = rq_clock(rq);
   4971	return true;
   4972}
   4973
   4974void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
   4975{
   4976	struct rq *rq = rq_of(cfs_rq);
   4977	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
   4978	struct sched_entity *se;
   4979	long task_delta, idle_task_delta;
   4980
   4981	se = cfs_rq->tg->se[cpu_of(rq)];
   4982
   4983	cfs_rq->throttled = 0;
   4984
   4985	update_rq_clock(rq);
   4986
   4987	raw_spin_lock(&cfs_b->lock);
   4988	cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
   4989	list_del_rcu(&cfs_rq->throttled_list);
   4990	raw_spin_unlock(&cfs_b->lock);
   4991
   4992	/* update hierarchical throttle state */
   4993	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
   4994
   4995	/* Nothing to run but something to decay (on_list)? Complete the branch */
   4996	if (!cfs_rq->load.weight) {
   4997		if (cfs_rq->on_list)
   4998			goto unthrottle_throttle;
   4999		return;
   5000	}
   5001
   5002	task_delta = cfs_rq->h_nr_running;
   5003	idle_task_delta = cfs_rq->idle_h_nr_running;
   5004	for_each_sched_entity(se) {
   5005		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
   5006
   5007		if (se->on_rq)
   5008			break;
   5009		enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
   5010
   5011		if (cfs_rq_is_idle(group_cfs_rq(se)))
   5012			idle_task_delta = cfs_rq->h_nr_running;
   5013
   5014		qcfs_rq->h_nr_running += task_delta;
   5015		qcfs_rq->idle_h_nr_running += idle_task_delta;
   5016
   5017		/* end evaluation on encountering a throttled cfs_rq */
   5018		if (cfs_rq_throttled(qcfs_rq))
   5019			goto unthrottle_throttle;
   5020	}
   5021
   5022	for_each_sched_entity(se) {
   5023		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
   5024
   5025		update_load_avg(qcfs_rq, se, UPDATE_TG);
   5026		se_update_runnable(se);
   5027
   5028		if (cfs_rq_is_idle(group_cfs_rq(se)))
   5029			idle_task_delta = cfs_rq->h_nr_running;
   5030
   5031		qcfs_rq->h_nr_running += task_delta;
   5032		qcfs_rq->idle_h_nr_running += idle_task_delta;
   5033
   5034		/* end evaluation on encountering a throttled cfs_rq */
   5035		if (cfs_rq_throttled(qcfs_rq))
   5036			goto unthrottle_throttle;
   5037
   5038		/*
   5039		 * One parent has been throttled and cfs_rq removed from the
   5040		 * list. Add it back to not break the leaf list.
   5041		 */
   5042		if (throttled_hierarchy(qcfs_rq))
   5043			list_add_leaf_cfs_rq(qcfs_rq);
   5044	}
   5045
    5046	/* At this point se is NULL and we are at root level */
   5047	add_nr_running(rq, task_delta);
   5048
   5049unthrottle_throttle:
   5050	/*
   5051	 * The cfs_rq_throttled() breaks in the above iteration can result in
   5052	 * incomplete leaf list maintenance, resulting in triggering the
   5053	 * assertion below.
   5054	 */
   5055	for_each_sched_entity(se) {
   5056		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
   5057
   5058		if (list_add_leaf_cfs_rq(qcfs_rq))
   5059			break;
   5060	}
   5061
   5062	assert_list_leaf_cfs_rq(rq);
   5063
   5064	/* Determine whether we need to wake up potentially idle CPU: */
   5065	if (rq->curr == rq->idle && rq->cfs.nr_running)
   5066		resched_curr(rq);
   5067}
   5068
   5069static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
   5070{
   5071	struct cfs_rq *cfs_rq;
   5072	u64 runtime, remaining = 1;
   5073
   5074	rcu_read_lock();
   5075	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
   5076				throttled_list) {
   5077		struct rq *rq = rq_of(cfs_rq);
   5078		struct rq_flags rf;
   5079
   5080		rq_lock_irqsave(rq, &rf);
   5081		if (!cfs_rq_throttled(cfs_rq))
   5082			goto next;
   5083
   5084		/* By the above check, this should never be true */
   5085		SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
   5086
   5087		raw_spin_lock(&cfs_b->lock);
   5088		runtime = -cfs_rq->runtime_remaining + 1;
   5089		if (runtime > cfs_b->runtime)
   5090			runtime = cfs_b->runtime;
   5091		cfs_b->runtime -= runtime;
   5092		remaining = cfs_b->runtime;
   5093		raw_spin_unlock(&cfs_b->lock);
   5094
   5095		cfs_rq->runtime_remaining += runtime;
   5096
   5097		/* we check whether we're throttled above */
   5098		if (cfs_rq->runtime_remaining > 0)
   5099			unthrottle_cfs_rq(cfs_rq);
   5100
   5101next:
   5102		rq_unlock_irqrestore(rq, &rf);
   5103
   5104		if (!remaining)
   5105			break;
   5106	}
   5107	rcu_read_unlock();
   5108}
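
        /*
         * Roughly: each throttled cfs_rq is topped up to exactly 1ns above
         * zero (runtime = -runtime_remaining + 1), charged against the global
         * pool, and unthrottled; it then has to pull a fresh slice via
         * account_cfs_rq_runtime() almost as soon as it runs again.  The walk
         * stops early once cfs_b->runtime is exhausted.
         */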
   5109
   5110/*
   5111 * Responsible for refilling a task_group's bandwidth and unthrottling its
   5112 * cfs_rqs as appropriate. If there has been no activity within the last
   5113 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
   5114 * used to track this state.
   5115 */
   5116static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
   5117{
   5118	int throttled;
   5119
   5120	/* no need to continue the timer with no bandwidth constraint */
   5121	if (cfs_b->quota == RUNTIME_INF)
   5122		goto out_deactivate;
   5123
   5124	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
   5125	cfs_b->nr_periods += overrun;
   5126
   5127	/* Refill extra burst quota even if cfs_b->idle */
   5128	__refill_cfs_bandwidth_runtime(cfs_b);
   5129
   5130	/*
   5131	 * idle depends on !throttled (for the case of a large deficit), and if
   5132	 * we're going inactive then everything else can be deferred
   5133	 */
   5134	if (cfs_b->idle && !throttled)
   5135		goto out_deactivate;
   5136
   5137	if (!throttled) {
   5138		/* mark as potentially idle for the upcoming period */
   5139		cfs_b->idle = 1;
   5140		return 0;
   5141	}
   5142
   5143	/* account preceding periods in which throttling occurred */
   5144	cfs_b->nr_throttled += overrun;
   5145
   5146	/*
   5147	 * This check is repeated as we release cfs_b->lock while we unthrottle.
   5148	 */
   5149	while (throttled && cfs_b->runtime > 0) {
   5150		raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
   5151		/* we can't nest cfs_b->lock while distributing bandwidth */
   5152		distribute_cfs_runtime(cfs_b);
   5153		raw_spin_lock_irqsave(&cfs_b->lock, flags);
   5154
   5155		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
   5156	}
   5157
   5158	/*
   5159	 * While we are ensured activity in the period following an
   5160	 * unthrottle, this also covers the case in which the new bandwidth is
   5161	 * insufficient to cover the existing bandwidth deficit.  (Forcing the
   5162	 * timer to remain active while there are any throttled entities.)
   5163	 */
   5164	cfs_b->idle = 0;
   5165
   5166	return 0;
   5167
   5168out_deactivate:
   5169	return 1;
   5170}
   5171
   5172/* a cfs_rq won't donate quota below this amount */
   5173static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
   5174/* minimum remaining period time to redistribute slack quota */
   5175static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
   5176/* how long we wait to gather additional slack before distributing */
   5177static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
   5178
   5179/*
   5180 * Are we near the end of the current quota period?
   5181 *
   5182 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
   5183 * hrtimer base being cleared by hrtimer_start. In the case of
   5184 * migrate_hrtimers, base is never cleared, so we are fine.
   5185 */
   5186static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
   5187{
   5188	struct hrtimer *refresh_timer = &cfs_b->period_timer;
   5189	s64 remaining;
   5190
   5191	/* if the call-back is running a quota refresh is already occurring */
   5192	if (hrtimer_callback_running(refresh_timer))
   5193		return 1;
   5194
   5195	/* is a quota refresh about to occur? */
   5196	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
   5197	if (remaining < (s64)min_expire)
   5198		return 1;
   5199
   5200	return 0;
   5201}
   5202
   5203static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
   5204{
   5205	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
   5206
   5207	/* if there's a quota refresh soon don't bother with slack */
   5208	if (runtime_refresh_within(cfs_b, min_left))
   5209		return;
   5210
   5211	/* don't push forwards an existing deferred unthrottle */
   5212	if (cfs_b->slack_started)
   5213		return;
   5214	cfs_b->slack_started = true;
   5215
   5216	hrtimer_start(&cfs_b->slack_timer,
   5217			ns_to_ktime(cfs_bandwidth_slack_period),
   5218			HRTIMER_MODE_REL);
   5219}
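
        /*
         * e.g. with the constants above (5ms slack period, 2ms minimum
         * expiration), arming the slack timer is skipped whenever the period
         * timer is due to fire within the next 7ms, since that refresh will
         * redistribute runtime soon enough anyway.
         */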
   5220
   5221/* we know any runtime found here is valid as update_curr() precedes return */
   5222static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
   5223{
   5224	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
   5225	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
   5226
   5227	if (slack_runtime <= 0)
   5228		return;
   5229
   5230	raw_spin_lock(&cfs_b->lock);
   5231	if (cfs_b->quota != RUNTIME_INF) {
   5232		cfs_b->runtime += slack_runtime;
   5233
   5234		/* we are under rq->lock, defer unthrottling using a timer */
   5235		if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
   5236		    !list_empty(&cfs_b->throttled_cfs_rq))
   5237			start_cfs_slack_bandwidth(cfs_b);
   5238	}
   5239	raw_spin_unlock(&cfs_b->lock);
   5240
   5241	/* even if it's not valid for return we don't want to try again */
   5242	cfs_rq->runtime_remaining -= slack_runtime;
   5243}
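
        /*
         * For illustration: a cfs_rq going idle while holding 3ms of local
         * runtime returns 2ms to the global pool and keeps min_cfs_rq_runtime
         * (1ms) for itself; if enough slack accumulates while other cfs_rqs
         * are throttled, the slack timer above is armed to hand it out.
         */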
   5244
   5245static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
   5246{
   5247	if (!cfs_bandwidth_used())
   5248		return;
   5249
   5250	if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
   5251		return;
   5252
   5253	__return_cfs_rq_runtime(cfs_rq);
   5254}
   5255
   5256/*
   5257 * This is done with a timer (instead of inline with bandwidth return) since
   5258 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
   5259 */
   5260static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
   5261{
   5262	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
   5263	unsigned long flags;
   5264
   5265	/* confirm we're still not at a refresh boundary */
   5266	raw_spin_lock_irqsave(&cfs_b->lock, flags);
   5267	cfs_b->slack_started = false;
   5268
   5269	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
   5270		raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
   5271		return;
   5272	}
   5273
   5274	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
   5275		runtime = cfs_b->runtime;
   5276
   5277	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
   5278
   5279	if (!runtime)
   5280		return;
   5281
   5282	distribute_cfs_runtime(cfs_b);
   5283}
   5284
   5285/*
   5286 * When a group wakes up we want to make sure that its quota is not already
   5287 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
    5288 * runtime as update_curr() throttling cannot trigger until it's on-rq.
   5289 */
   5290static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
   5291{
   5292	if (!cfs_bandwidth_used())
   5293		return;
   5294
   5295	/* an active group must be handled by the update_curr()->put() path */
   5296	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
   5297		return;
   5298
   5299	/* ensure the group is not already throttled */
   5300	if (cfs_rq_throttled(cfs_rq))
   5301		return;
   5302
   5303	/* update runtime allocation */
   5304	account_cfs_rq_runtime(cfs_rq, 0);
   5305	if (cfs_rq->runtime_remaining <= 0)
   5306		throttle_cfs_rq(cfs_rq);
   5307}
   5308
   5309static void sync_throttle(struct task_group *tg, int cpu)
   5310{
   5311	struct cfs_rq *pcfs_rq, *cfs_rq;
   5312
   5313	if (!cfs_bandwidth_used())
   5314		return;
   5315
   5316	if (!tg->parent)
   5317		return;
   5318
   5319	cfs_rq = tg->cfs_rq[cpu];
   5320	pcfs_rq = tg->parent->cfs_rq[cpu];
   5321
   5322	cfs_rq->throttle_count = pcfs_rq->throttle_count;
   5323	cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu));
   5324}
   5325
   5326/* conditionally throttle active cfs_rq's from put_prev_entity() */
   5327static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
   5328{
   5329	if (!cfs_bandwidth_used())
   5330		return false;
   5331
   5332	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
   5333		return false;
   5334
   5335	/*
    5336	 * It's possible for a throttled entity to be forced into a running
    5337	 * state (e.g. set_curr_task); in this case we're finished.
   5338	 */
   5339	if (cfs_rq_throttled(cfs_rq))
   5340		return true;
   5341
   5342	return throttle_cfs_rq(cfs_rq);
   5343}
   5344
   5345static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
   5346{
   5347	struct cfs_bandwidth *cfs_b =
   5348		container_of(timer, struct cfs_bandwidth, slack_timer);
   5349
   5350	do_sched_cfs_slack_timer(cfs_b);
   5351
   5352	return HRTIMER_NORESTART;
   5353}
   5354
   5355extern const u64 max_cfs_quota_period;
   5356
   5357static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
   5358{
   5359	struct cfs_bandwidth *cfs_b =
   5360		container_of(timer, struct cfs_bandwidth, period_timer);
   5361	unsigned long flags;
   5362	int overrun;
   5363	int idle = 0;
   5364	int count = 0;
   5365
   5366	raw_spin_lock_irqsave(&cfs_b->lock, flags);
   5367	for (;;) {
   5368		overrun = hrtimer_forward_now(timer, cfs_b->period);
   5369		if (!overrun)
   5370			break;
   5371
   5372		idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
   5373
   5374		if (++count > 3) {
   5375			u64 new, old = ktime_to_ns(cfs_b->period);
   5376
   5377			/*
   5378			 * Grow period by a factor of 2 to avoid losing precision.
   5379			 * Precision loss in the quota/period ratio can cause __cfs_schedulable
   5380			 * to fail.
   5381			 */
   5382			new = old * 2;
   5383			if (new < max_cfs_quota_period) {
   5384				cfs_b->period = ns_to_ktime(new);
   5385				cfs_b->quota *= 2;
   5386				cfs_b->burst *= 2;
   5387
   5388				pr_warn_ratelimited(
   5389	"cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
   5390					smp_processor_id(),
   5391					div_u64(new, NSEC_PER_USEC),
   5392					div_u64(cfs_b->quota, NSEC_PER_USEC));
   5393			} else {
   5394				pr_warn_ratelimited(
   5395	"cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
   5396					smp_processor_id(),
   5397					div_u64(old, NSEC_PER_USEC),
   5398					div_u64(cfs_b->quota, NSEC_PER_USEC));
   5399			}
   5400
   5401			/* reset count so we don't come right back in here */
   5402			count = 0;
   5403		}
   5404	}
   5405	if (idle)
   5406		cfs_b->period_active = 0;
   5407	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
   5408
   5409	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
   5410}
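
        /*
         * Example of the scaling above: a 100us period that keeps overrunning
         * is doubled to 200us, 400us, ... (quota and burst are doubled in
         * step, so the configured bandwidth ratio is preserved) until doubling
         * it again would reach max_cfs_quota_period.
         */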
   5411
   5412void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
   5413{
   5414	raw_spin_lock_init(&cfs_b->lock);
   5415	cfs_b->runtime = 0;
   5416	cfs_b->quota = RUNTIME_INF;
   5417	cfs_b->period = ns_to_ktime(default_cfs_period());
   5418	cfs_b->burst = 0;
   5419
   5420	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
   5421	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
   5422	cfs_b->period_timer.function = sched_cfs_period_timer;
   5423	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
   5424	cfs_b->slack_timer.function = sched_cfs_slack_timer;
   5425	cfs_b->slack_started = false;
   5426}
   5427
   5428static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
   5429{
   5430	cfs_rq->runtime_enabled = 0;
   5431	INIT_LIST_HEAD(&cfs_rq->throttled_list);
   5432}
   5433
   5434void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
   5435{
   5436	lockdep_assert_held(&cfs_b->lock);
   5437
   5438	if (cfs_b->period_active)
   5439		return;
   5440
   5441	cfs_b->period_active = 1;
   5442	hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
   5443	hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
   5444}
   5445
   5446static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
   5447{
   5448	/* init_cfs_bandwidth() was not called */
   5449	if (!cfs_b->throttled_cfs_rq.next)
   5450		return;
   5451
   5452	hrtimer_cancel(&cfs_b->period_timer);
   5453	hrtimer_cancel(&cfs_b->slack_timer);
   5454}
   5455
   5456/*
   5457 * Both these CPU hotplug callbacks race against unregister_fair_sched_group()
   5458 *
   5459 * The race is harmless, since modifying bandwidth settings of unhooked group
   5460 * bits doesn't do much.
   5461 */
   5462
   5463/* cpu online callback */
   5464static void __maybe_unused update_runtime_enabled(struct rq *rq)
   5465{
   5466	struct task_group *tg;
   5467
   5468	lockdep_assert_rq_held(rq);
   5469
   5470	rcu_read_lock();
   5471	list_for_each_entry_rcu(tg, &task_groups, list) {
   5472		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
   5473		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
   5474
   5475		raw_spin_lock(&cfs_b->lock);
   5476		cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
   5477		raw_spin_unlock(&cfs_b->lock);
   5478	}
   5479	rcu_read_unlock();
   5480}
   5481
   5482/* cpu offline callback */
   5483static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
   5484{
   5485	struct task_group *tg;
   5486
   5487	lockdep_assert_rq_held(rq);
   5488
   5489	rcu_read_lock();
   5490	list_for_each_entry_rcu(tg, &task_groups, list) {
   5491		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
   5492
   5493		if (!cfs_rq->runtime_enabled)
   5494			continue;
   5495
   5496		/*
   5497		 * clock_task is not advancing so we just need to make sure
   5498		 * there's some valid quota amount
   5499		 */
   5500		cfs_rq->runtime_remaining = 1;
   5501		/*
   5502		 * Offline rq is schedulable till CPU is completely disabled
   5503		 * in take_cpu_down(), so we prevent new cfs throttling here.
   5504		 */
   5505		cfs_rq->runtime_enabled = 0;
   5506
   5507		if (cfs_rq_throttled(cfs_rq))
   5508			unthrottle_cfs_rq(cfs_rq);
   5509	}
   5510	rcu_read_unlock();
   5511}
   5512
   5513#else /* CONFIG_CFS_BANDWIDTH */
   5514
   5515static inline bool cfs_bandwidth_used(void)
   5516{
   5517	return false;
   5518}
   5519
   5520static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
   5521static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
   5522static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
   5523static inline void sync_throttle(struct task_group *tg, int cpu) {}
   5524static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
   5525
   5526static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
   5527{
   5528	return 0;
   5529}
   5530
   5531static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
   5532{
   5533	return 0;
   5534}
   5535
   5536static inline int throttled_lb_pair(struct task_group *tg,
   5537				    int src_cpu, int dest_cpu)
   5538{
   5539	return 0;
   5540}
   5541
   5542void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
   5543
   5544#ifdef CONFIG_FAIR_GROUP_SCHED
   5545static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
   5546#endif
   5547
   5548static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
   5549{
   5550	return NULL;
   5551}
   5552static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
   5553static inline void update_runtime_enabled(struct rq *rq) {}
   5554static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
   5555
   5556#endif /* CONFIG_CFS_BANDWIDTH */
   5557
   5558/**************************************************
   5559 * CFS operations on tasks:
   5560 */
   5561
   5562#ifdef CONFIG_SCHED_HRTICK
   5563static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
   5564{
   5565	struct sched_entity *se = &p->se;
   5566	struct cfs_rq *cfs_rq = cfs_rq_of(se);
   5567
   5568	SCHED_WARN_ON(task_rq(p) != rq);
   5569
   5570	if (rq->cfs.h_nr_running > 1) {
   5571		u64 slice = sched_slice(cfs_rq, se);
   5572		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
   5573		s64 delta = slice - ran;
   5574
   5575		if (delta < 0) {
   5576			if (task_current(rq, p))
   5577				resched_curr(rq);
   5578			return;
   5579		}
   5580		hrtick_start(rq, delta);
   5581	}
   5582}
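
        /*
         * e.g. with more than one fair task queued: if sched_slice() gives
         * the current task a 3ms slice and it has already run 1ms of it, the
         * hrtick is programmed 2ms out so preemption lands on the slice
         * boundary; if the slice is already used up (delta < 0), the CPU is
         * rescheduled immediately.
         */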
   5583
   5584/*
   5585 * called from enqueue/dequeue and updates the hrtick when the
   5586 * current task is from our class and nr_running is low enough
   5587 * to matter.
   5588 */
   5589static void hrtick_update(struct rq *rq)
   5590{
   5591	struct task_struct *curr = rq->curr;
   5592
   5593	if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class)
   5594		return;
   5595
   5596	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
   5597		hrtick_start_fair(rq, curr);
   5598}
   5599#else /* !CONFIG_SCHED_HRTICK */
   5600static inline void
   5601hrtick_start_fair(struct rq *rq, struct task_struct *p)
   5602{
   5603}
   5604
   5605static inline void hrtick_update(struct rq *rq)
   5606{
   5607}
   5608#endif
   5609
   5610#ifdef CONFIG_SMP
   5611static inline bool cpu_overutilized(int cpu)
   5612{
   5613	return !fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu));
   5614}
   5615
   5616static inline void update_overutilized_status(struct rq *rq)
   5617{
   5618	if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
   5619		WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
   5620		trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
   5621	}
   5622}
   5623#else
   5624static inline void update_overutilized_status(struct rq *rq) { }
   5625#endif
   5626
   5627/* Runqueue only has SCHED_IDLE tasks enqueued */
   5628static int sched_idle_rq(struct rq *rq)
   5629{
   5630	return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
   5631			rq->nr_running);
   5632}
   5633
   5634/*
   5635 * Returns true if cfs_rq only has SCHED_IDLE entities enqueued. Note the use
   5636 * of idle_nr_running, which does not consider idle descendants of normal
   5637 * entities.
   5638 */
   5639static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq)
   5640{
   5641	return cfs_rq->nr_running &&
   5642		cfs_rq->nr_running == cfs_rq->idle_nr_running;
   5643}
   5644
   5645#ifdef CONFIG_SMP
   5646static int sched_idle_cpu(int cpu)
   5647{
   5648	return sched_idle_rq(cpu_rq(cpu));
   5649}
   5650#endif
   5651
   5652/*
   5653 * The enqueue_task method is called before nr_running is
   5654 * increased. Here we update the fair scheduling stats and
   5655 * then put the task into the rbtree:
   5656 */
   5657static void
   5658enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
   5659{
   5660	struct cfs_rq *cfs_rq;
   5661	struct sched_entity *se = &p->se;
   5662	int idle_h_nr_running = task_has_idle_policy(p);
   5663	int task_new = !(flags & ENQUEUE_WAKEUP);
   5664
   5665	/*
   5666	 * The code below (indirectly) updates schedutil which looks at
   5667	 * the cfs_rq utilization to select a frequency.
   5668	 * Let's add the task's estimated utilization to the cfs_rq's
   5669	 * estimated utilization, before we update schedutil.
   5670	 */
   5671	util_est_enqueue(&rq->cfs, p);
   5672
   5673	/*
   5674	 * If in_iowait is set, the code below may not trigger any cpufreq
   5675	 * utilization updates, so do it here explicitly with the IOWAIT flag
   5676	 * passed.
   5677	 */
   5678	if (p->in_iowait)
   5679		cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
   5680
   5681	for_each_sched_entity(se) {
   5682		if (se->on_rq)
   5683			break;
   5684		cfs_rq = cfs_rq_of(se);
   5685		enqueue_entity(cfs_rq, se, flags);
   5686
   5687		cfs_rq->h_nr_running++;
   5688		cfs_rq->idle_h_nr_running += idle_h_nr_running;
   5689
   5690		if (cfs_rq_is_idle(cfs_rq))
   5691			idle_h_nr_running = 1;
   5692
   5693		/* end evaluation on encountering a throttled cfs_rq */
   5694		if (cfs_rq_throttled(cfs_rq))
   5695			goto enqueue_throttle;
   5696
   5697		flags = ENQUEUE_WAKEUP;
   5698	}
   5699
   5700	for_each_sched_entity(se) {
   5701		cfs_rq = cfs_rq_of(se);
   5702
   5703		update_load_avg(cfs_rq, se, UPDATE_TG);
   5704		se_update_runnable(se);
   5705		update_cfs_group(se);
   5706
   5707		cfs_rq->h_nr_running++;
   5708		cfs_rq->idle_h_nr_running += idle_h_nr_running;
   5709
   5710		if (cfs_rq_is_idle(cfs_rq))
   5711			idle_h_nr_running = 1;
   5712
   5713		/* end evaluation on encountering a throttled cfs_rq */
   5714		if (cfs_rq_throttled(cfs_rq))
   5715			goto enqueue_throttle;
   5716
    5717		/*
    5718		 * One parent has been throttled and cfs_rq removed from the
    5719		 * list. Add it back to not break the leaf list.
    5720		 */
    5721		if (throttled_hierarchy(cfs_rq))
    5722			list_add_leaf_cfs_rq(cfs_rq);
   5723	}
   5724
    5725	/* At this point se is NULL and we are at root level */
   5726	add_nr_running(rq, 1);
   5727
   5728	/*
   5729	 * Since new tasks are assigned an initial util_avg equal to
   5730	 * half of the spare capacity of their CPU, tiny tasks have the
   5731	 * ability to cross the overutilized threshold, which will
   5732	 * result in the load balancer ruining all the task placement
   5733	 * done by EAS. As a way to mitigate that effect, do not account
   5734	 * for the first enqueue operation of new tasks during the
   5735	 * overutilized flag detection.
   5736	 *
   5737	 * A better way of solving this problem would be to wait for
   5738	 * the PELT signals of tasks to converge before taking them
   5739	 * into account, but that is not straightforward to implement,
   5740	 * and the following generally works well enough in practice.
   5741	 */
   5742	if (!task_new)
   5743		update_overutilized_status(rq);
   5744
   5745enqueue_throttle:
   5746	if (cfs_bandwidth_used()) {
   5747		/*
   5748		 * When bandwidth control is enabled; the cfs_rq_throttled()
   5749		 * breaks in the above iteration can result in incomplete
   5750		 * leaf list maintenance, resulting in triggering the assertion
   5751		 * below.
   5752		 */
   5753		for_each_sched_entity(se) {
   5754			cfs_rq = cfs_rq_of(se);
   5755
   5756			if (list_add_leaf_cfs_rq(cfs_rq))
   5757				break;
   5758		}
   5759	}
   5760
   5761	assert_list_leaf_cfs_rq(rq);
   5762
   5763	hrtick_update(rq);
   5764}
   5765
   5766static void set_next_buddy(struct sched_entity *se);
   5767
   5768/*
   5769 * The dequeue_task method is called before nr_running is
   5770 * decreased. We remove the task from the rbtree and
   5771 * update the fair scheduling stats:
   5772 */
   5773static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
   5774{
   5775	struct cfs_rq *cfs_rq;
   5776	struct sched_entity *se = &p->se;
   5777	int task_sleep = flags & DEQUEUE_SLEEP;
   5778	int idle_h_nr_running = task_has_idle_policy(p);
   5779	bool was_sched_idle = sched_idle_rq(rq);
   5780
   5781	util_est_dequeue(&rq->cfs, p);
   5782
   5783	for_each_sched_entity(se) {
   5784		cfs_rq = cfs_rq_of(se);
   5785		dequeue_entity(cfs_rq, se, flags);
   5786
   5787		cfs_rq->h_nr_running--;
   5788		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
   5789
   5790		if (cfs_rq_is_idle(cfs_rq))
   5791			idle_h_nr_running = 1;
   5792
   5793		/* end evaluation on encountering a throttled cfs_rq */
   5794		if (cfs_rq_throttled(cfs_rq))
   5795			goto dequeue_throttle;
   5796
   5797		/* Don't dequeue parent if it has other entities besides us */
   5798		if (cfs_rq->load.weight) {
   5799			/* Avoid re-evaluating load for this entity: */
   5800			se = parent_entity(se);
   5801			/*
   5802			 * Bias pick_next to pick a task from this cfs_rq, as
   5803			 * p is sleeping when it is within its sched_slice.
   5804			 */
   5805			if (task_sleep && se && !throttled_hierarchy(cfs_rq))
   5806				set_next_buddy(se);
   5807			break;
   5808		}
   5809		flags |= DEQUEUE_SLEEP;
   5810	}
   5811
   5812	for_each_sched_entity(se) {
   5813		cfs_rq = cfs_rq_of(se);
   5814
   5815		update_load_avg(cfs_rq, se, UPDATE_TG);
   5816		se_update_runnable(se);
   5817		update_cfs_group(se);
   5818
   5819		cfs_rq->h_nr_running--;
   5820		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
   5821
   5822		if (cfs_rq_is_idle(cfs_rq))
   5823			idle_h_nr_running = 1;
   5824
   5825		/* end evaluation on encountering a throttled cfs_rq */
   5826		if (cfs_rq_throttled(cfs_rq))
   5827			goto dequeue_throttle;
   5828
   5829	}
   5830
    5831	/* At this point se is NULL and we are at root level */
   5832	sub_nr_running(rq, 1);
   5833
   5834	/* balance early to pull high priority tasks */
   5835	if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
   5836		rq->next_balance = jiffies;
   5837
   5838dequeue_throttle:
   5839	util_est_update(&rq->cfs, p, task_sleep);
   5840	hrtick_update(rq);
   5841}
   5842
   5843#ifdef CONFIG_SMP
   5844
   5845/* Working cpumask for: load_balance, load_balance_newidle. */
   5846DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
   5847DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
   5848
   5849#ifdef CONFIG_NO_HZ_COMMON
   5850
   5851static struct {
   5852	cpumask_var_t idle_cpus_mask;
   5853	atomic_t nr_cpus;
    5854	int has_blocked;		/* Idle CPUs have blocked load */
   5855	int needs_update;		/* Newly idle CPUs need their next_balance collated */
   5856	unsigned long next_balance;     /* in jiffy units */
   5857	unsigned long next_blocked;	/* Next update of blocked load in jiffies */
   5858} nohz ____cacheline_aligned;
   5859
   5860#endif /* CONFIG_NO_HZ_COMMON */
   5861
   5862static unsigned long cpu_load(struct rq *rq)
   5863{
   5864	return cfs_rq_load_avg(&rq->cfs);
   5865}
   5866
   5867/*
   5868 * cpu_load_without - compute CPU load without any contributions from *p
   5869 * @cpu: the CPU which load is requested
   5870 * @p: the task which load should be discounted
   5871 *
   5872 * The load of a CPU is defined by the load of tasks currently enqueued on that
   5873 * CPU as well as tasks which are currently sleeping after an execution on that
   5874 * CPU.
   5875 *
   5876 * This method returns the load of the specified CPU by discounting the load of
   5877 * the specified task, whenever the task is currently contributing to the CPU
   5878 * load.
   5879 */
   5880static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
   5881{
   5882	struct cfs_rq *cfs_rq;
   5883	unsigned int load;
   5884
   5885	/* Task has no contribution or is new */
   5886	if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
   5887		return cpu_load(rq);
   5888
   5889	cfs_rq = &rq->cfs;
   5890	load = READ_ONCE(cfs_rq->avg.load_avg);
   5891
    5892	/* Discount task's load from CPU's load */
   5893	lsub_positive(&load, task_h_load(p));
   5894
   5895	return load;
   5896}
   5897
   5898static unsigned long cpu_runnable(struct rq *rq)
   5899{
   5900	return cfs_rq_runnable_avg(&rq->cfs);
   5901}
   5902
   5903static unsigned long cpu_runnable_without(struct rq *rq, struct task_struct *p)
   5904{
   5905	struct cfs_rq *cfs_rq;
   5906	unsigned int runnable;
   5907
   5908	/* Task has no contribution or is new */
   5909	if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
   5910		return cpu_runnable(rq);
   5911
   5912	cfs_rq = &rq->cfs;
   5913	runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
   5914
   5915	/* Discount task's runnable from CPU's runnable */
   5916	lsub_positive(&runnable, p->se.avg.runnable_avg);
   5917
   5918	return runnable;
   5919}
   5920
   5921static unsigned long capacity_of(int cpu)
   5922{
   5923	return cpu_rq(cpu)->cpu_capacity;
   5924}
   5925
   5926static void record_wakee(struct task_struct *p)
   5927{
   5928	/*
    5929	 * Only decay a single time; tasks that have less than 1 wakeup per
   5930	 * jiffy will not have built up many flips.
   5931	 */
   5932	if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
   5933		current->wakee_flips >>= 1;
   5934		current->wakee_flip_decay_ts = jiffies;
   5935	}
   5936
   5937	if (current->last_wakee != p) {
   5938		current->last_wakee = p;
   5939		current->wakee_flips++;
   5940	}
   5941}
   5942
   5943/*
   5944 * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
   5945 *
   5946 * A waker of many should wake a different task than the one last awakened
   5947 * at a frequency roughly N times higher than one of its wakees.
   5948 *
    5949 * In order to determine whether we should let the load spread vs consolidate
    5950 * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
    5951 * partner, and a factor of llc_size higher frequency in the other.
   5952 *
   5953 * With both conditions met, we can be relatively sure that the relationship is
   5954 * non-monogamous, with partner count exceeding socket size.
   5955 *
    5956 * Whether waker/wakee are client/server, worker/dispatcher, interrupt source
    5957 * or whatever is irrelevant; the spread criterion is simply that the apparent
    5958 * partner count exceeds the socket size.
   5959 */
   5960static int wake_wide(struct task_struct *p)
   5961{
   5962	unsigned int master = current->wakee_flips;
   5963	unsigned int slave = p->wakee_flips;
   5964	int factor = __this_cpu_read(sd_llc_size);
   5965
   5966	if (master < slave)
   5967		swap(master, slave);
   5968	if (slave < factor || master < slave * factor)
   5969		return 0;
   5970	return 1;
   5971}
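
        /*
         * Worked example with sd_llc_size == 4: current->wakee_flips == 20
         * waking a partner with wakee_flips == 5 satisfies slave >= factor
         * and master >= slave * factor, so we return 1 and skip the affine
         * fast path; a quiet 1:1 pair (say 3 and 2 flips) returns 0 because
         * slave < factor, and stays on the affine path.
         */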
   5972
   5973/*
   5974 * The purpose of wake_affine() is to quickly determine on which CPU we can run
   5975 * soonest. For the purpose of speed we only consider the waking and previous
   5976 * CPU.
   5977 *
    5978 * wake_affine_idle() - only considers 'now'; it checks if the waking CPU is
    5979 *			cache-affine and is (or will be) idle.
   5980 *
   5981 * wake_affine_weight() - considers the weight to reflect the average
   5982 *			  scheduling latency of the CPUs. This seems to work
   5983 *			  for the overloaded case.
   5984 */
   5985static int
   5986wake_affine_idle(int this_cpu, int prev_cpu, int sync)
   5987{
   5988	/*
   5989	 * If this_cpu is idle, it implies the wakeup is from interrupt
   5990	 * context. Only allow the move if cache is shared. Otherwise an
   5991	 * interrupt intensive workload could force all tasks onto one
   5992	 * node depending on the IO topology or IRQ affinity settings.
   5993	 *
   5994	 * If the prev_cpu is idle and cache affine then avoid a migration.
   5995	 * There is no guarantee that the cache hot data from an interrupt
   5996	 * is more important than cache hot data on the prev_cpu and from
   5997	 * a cpufreq perspective, it's better to have higher utilisation
   5998	 * on one CPU.
   5999	 */
   6000	if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
   6001		return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
   6002
   6003	if (sync && cpu_rq(this_cpu)->nr_running == 1)
   6004		return this_cpu;
   6005
   6006	if (available_idle_cpu(prev_cpu))
   6007		return prev_cpu;
   6008
   6009	return nr_cpumask_bits;
   6010}
   6011
   6012static int
   6013wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
   6014		   int this_cpu, int prev_cpu, int sync)
   6015{
   6016	s64 this_eff_load, prev_eff_load;
   6017	unsigned long task_load;
   6018
   6019	this_eff_load = cpu_load(cpu_rq(this_cpu));
   6020
   6021	if (sync) {
   6022		unsigned long current_load = task_h_load(current);
   6023
   6024		if (current_load > this_eff_load)
   6025			return this_cpu;
   6026
   6027		this_eff_load -= current_load;
   6028	}
   6029
   6030	task_load = task_h_load(p);
   6031
   6032	this_eff_load += task_load;
   6033	if (sched_feat(WA_BIAS))
   6034		this_eff_load *= 100;
   6035	this_eff_load *= capacity_of(prev_cpu);
   6036
   6037	prev_eff_load = cpu_load(cpu_rq(prev_cpu));
   6038	prev_eff_load -= task_load;
   6039	if (sched_feat(WA_BIAS))
   6040		prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
   6041	prev_eff_load *= capacity_of(this_cpu);
   6042
   6043	/*
   6044	 * If sync, adjust the weight of prev_eff_load such that if
   6045	 * prev_eff == this_eff that select_idle_sibling() will consider
   6046	 * stacking the wakee on top of the waker if no other CPU is
   6047	 * idle.
   6048	 */
   6049	if (sync)
   6050		prev_eff_load += 1;
   6051
   6052	return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
   6053}
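
        /*
         * Concretely, with WA_BIAS and sd->imbalance_pct == 117 (typical for
         * an LLC domain) the comparison above boils down to:
         *
         *   (this_load + task_load) * 100 * capacity(prev_cpu)
         *       < (prev_load - task_load) * 108 * capacity(this_cpu)
         *
         * with current's contribution subtracted from this_load first on a
         * sync wakeup, and prev_eff_load nudged by 1 so that a tie prefers
         * the waking CPU.
         */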
   6054
   6055static int wake_affine(struct sched_domain *sd, struct task_struct *p,
   6056		       int this_cpu, int prev_cpu, int sync)
   6057{
   6058	int target = nr_cpumask_bits;
   6059
   6060	if (sched_feat(WA_IDLE))
   6061		target = wake_affine_idle(this_cpu, prev_cpu, sync);
   6062
   6063	if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
   6064		target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
   6065
   6066	schedstat_inc(p->stats.nr_wakeups_affine_attempts);
   6067	if (target == nr_cpumask_bits)
   6068		return prev_cpu;
   6069
   6070	schedstat_inc(sd->ttwu_move_affine);
   6071	schedstat_inc(p->stats.nr_wakeups_affine);
   6072	return target;
   6073}
   6074
   6075static struct sched_group *
   6076find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
   6077
   6078/*
   6079 * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
   6080 */
   6081static int
   6082find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
   6083{
   6084	unsigned long load, min_load = ULONG_MAX;
   6085	unsigned int min_exit_latency = UINT_MAX;
   6086	u64 latest_idle_timestamp = 0;
   6087	int least_loaded_cpu = this_cpu;
   6088	int shallowest_idle_cpu = -1;
   6089	int i;
   6090
   6091	/* Check if we have any choice: */
   6092	if (group->group_weight == 1)
   6093		return cpumask_first(sched_group_span(group));
   6094
   6095	/* Traverse only the allowed CPUs */
   6096	for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
   6097		struct rq *rq = cpu_rq(i);
   6098
   6099		if (!sched_core_cookie_match(rq, p))
   6100			continue;
   6101
   6102		if (sched_idle_cpu(i))
   6103			return i;
   6104
   6105		if (available_idle_cpu(i)) {
   6106			struct cpuidle_state *idle = idle_get_state(rq);
   6107			if (idle && idle->exit_latency < min_exit_latency) {
   6108				/*
   6109				 * We give priority to a CPU whose idle state
   6110				 * has the smallest exit latency irrespective
   6111				 * of any idle timestamp.
   6112				 */
   6113				min_exit_latency = idle->exit_latency;
   6114				latest_idle_timestamp = rq->idle_stamp;
   6115				shallowest_idle_cpu = i;
   6116			} else if ((!idle || idle->exit_latency == min_exit_latency) &&
   6117				   rq->idle_stamp > latest_idle_timestamp) {
   6118				/*
   6119				 * If equal or no active idle state, then
   6120				 * the most recently idled CPU might have
   6121				 * a warmer cache.
   6122				 */
   6123				latest_idle_timestamp = rq->idle_stamp;
   6124				shallowest_idle_cpu = i;
   6125			}
   6126		} else if (shallowest_idle_cpu == -1) {
   6127			load = cpu_load(cpu_rq(i));
   6128			if (load < min_load) {
   6129				min_load = load;
   6130				least_loaded_cpu = i;
   6131			}
   6132		}
   6133	}
   6134
   6135	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
   6136}
   6137
   6138static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
   6139				  int cpu, int prev_cpu, int sd_flag)
   6140{
   6141	int new_cpu = cpu;
   6142
   6143	if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
   6144		return prev_cpu;
   6145
   6146	/*
   6147	 * We need task's util for cpu_util_without, sync it up to
   6148	 * prev_cpu's last_update_time.
   6149	 */
   6150	if (!(sd_flag & SD_BALANCE_FORK))
   6151		sync_entity_load_avg(&p->se);
   6152
   6153	while (sd) {
   6154		struct sched_group *group;
   6155		struct sched_domain *tmp;
   6156		int weight;
   6157
   6158		if (!(sd->flags & sd_flag)) {
   6159			sd = sd->child;
   6160			continue;
   6161		}
   6162
   6163		group = find_idlest_group(sd, p, cpu);
   6164		if (!group) {
   6165			sd = sd->child;
   6166			continue;
   6167		}
   6168
   6169		new_cpu = find_idlest_group_cpu(group, p, cpu);
   6170		if (new_cpu == cpu) {
   6171			/* Now try balancing at a lower domain level of 'cpu': */
   6172			sd = sd->child;
   6173			continue;
   6174		}
   6175
   6176		/* Now try balancing at a lower domain level of 'new_cpu': */
   6177		cpu = new_cpu;
   6178		weight = sd->span_weight;
   6179		sd = NULL;
   6180		for_each_domain(cpu, tmp) {
   6181			if (weight <= tmp->span_weight)
   6182				break;
   6183			if (tmp->flags & sd_flag)
   6184				sd = tmp;
   6185		}
   6186	}
   6187
   6188	return new_cpu;
   6189}
   6190
   6191static inline int __select_idle_cpu(int cpu, struct task_struct *p)
   6192{
   6193	if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
   6194	    sched_cpu_cookie_match(cpu_rq(cpu), p))
   6195		return cpu;
   6196
   6197	return -1;
   6198}
   6199
   6200#ifdef CONFIG_SCHED_SMT
   6201DEFINE_STATIC_KEY_FALSE(sched_smt_present);
   6202EXPORT_SYMBOL_GPL(sched_smt_present);
   6203
   6204static inline void set_idle_cores(int cpu, int val)
   6205{
   6206	struct sched_domain_shared *sds;
   6207
   6208	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
   6209	if (sds)
   6210		WRITE_ONCE(sds->has_idle_cores, val);
   6211}
   6212
   6213static inline bool test_idle_cores(int cpu, bool def)
   6214{
   6215	struct sched_domain_shared *sds;
   6216
   6217	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
   6218	if (sds)
   6219		return READ_ONCE(sds->has_idle_cores);
   6220
   6221	return def;
   6222}
   6223
   6224/*
   6225 * Scans the local SMT mask to see if the entire core is idle, and records this
   6226 * information in sd_llc_shared->has_idle_cores.
   6227 *
   6228 * Since SMT siblings share all cache levels, inspecting this limited remote
   6229 * state should be fairly cheap.
   6230 */
   6231void __update_idle_core(struct rq *rq)
   6232{
   6233	int core = cpu_of(rq);
   6234	int cpu;
   6235
   6236	rcu_read_lock();
   6237	if (test_idle_cores(core, true))
   6238		goto unlock;
   6239
   6240	for_each_cpu(cpu, cpu_smt_mask(core)) {
   6241		if (cpu == core)
   6242			continue;
   6243
   6244		if (!available_idle_cpu(cpu))
   6245			goto unlock;
   6246	}
   6247
   6248	set_idle_cores(core, 1);
   6249unlock:
   6250	rcu_read_unlock();
   6251}
   6252
   6253/*
   6254 * Scan the entire LLC domain for idle cores; this dynamically switches off if
   6255 * there are no idle cores left in the system; tracked through
   6256 * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
   6257 */
   6258static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
   6259{
   6260	bool idle = true;
   6261	int cpu;
   6262
   6263	if (!static_branch_likely(&sched_smt_present))
   6264		return __select_idle_cpu(core, p);
   6265
   6266	for_each_cpu(cpu, cpu_smt_mask(core)) {
   6267		if (!available_idle_cpu(cpu)) {
   6268			idle = false;
   6269			if (*idle_cpu == -1) {
   6270				if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->cpus_ptr)) {
   6271					*idle_cpu = cpu;
   6272					break;
   6273				}
   6274				continue;
   6275			}
   6276			break;
   6277		}
   6278		if (*idle_cpu == -1 && cpumask_test_cpu(cpu, p->cpus_ptr))
   6279			*idle_cpu = cpu;
   6280	}
   6281
   6282	if (idle)
   6283		return core;
   6284
   6285	cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
   6286	return -1;
   6287}
   6288
   6289/*
   6290 * Scan the local SMT mask for idle CPUs.
   6291 */
   6292static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
   6293{
   6294	int cpu;
   6295
   6296	for_each_cpu(cpu, cpu_smt_mask(target)) {
   6297		if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
   6298		    !cpumask_test_cpu(cpu, sched_domain_span(sd)))
   6299			continue;
   6300		if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
   6301			return cpu;
   6302	}
   6303
   6304	return -1;
   6305}
   6306
   6307#else /* CONFIG_SCHED_SMT */
   6308
   6309static inline void set_idle_cores(int cpu, int val)
   6310{
   6311}
   6312
   6313static inline bool test_idle_cores(int cpu, bool def)
   6314{
   6315	return def;
   6316}
   6317
   6318static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
   6319{
   6320	return __select_idle_cpu(core, p);
   6321}
   6322
   6323static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
   6324{
   6325	return -1;
   6326}
   6327
   6328#endif /* CONFIG_SCHED_SMT */
   6329
   6330/*
   6331 * Scan the LLC domain for idle CPUs; this is dynamically regulated by
   6332 * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
   6333 * average idle time for this rq (as found in rq->avg_idle).
   6334 */
   6335static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target)
   6336{
   6337	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
   6338	int i, cpu, idle_cpu = -1, nr = INT_MAX;
   6339	struct rq *this_rq = this_rq();
   6340	int this = smp_processor_id();
   6341	struct sched_domain *this_sd;
   6342	u64 time = 0;
   6343
   6344	this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
   6345	if (!this_sd)
   6346		return -1;
   6347
   6348	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
   6349
   6350	if (sched_feat(SIS_PROP) && !has_idle_core) {
   6351		u64 avg_cost, avg_idle, span_avg;
   6352		unsigned long now = jiffies;
   6353
   6354		/*
   6355		 * If we're busy, the assumption that the last idle period
   6356		 * predicts the future is flawed; age away the remaining
   6357		 * predicted idle time.
   6358		 */
   6359		if (unlikely(this_rq->wake_stamp < now)) {
   6360			while (this_rq->wake_stamp < now && this_rq->wake_avg_idle) {
   6361				this_rq->wake_stamp++;
   6362				this_rq->wake_avg_idle >>= 1;
   6363			}
   6364		}
   6365
   6366		avg_idle = this_rq->wake_avg_idle;
   6367		avg_cost = this_sd->avg_scan_cost + 1;
   6368
   6369		span_avg = sd->span_weight * avg_idle;
   6370		if (span_avg > 4*avg_cost)
   6371			nr = div_u64(span_avg, avg_cost);
   6372		else
   6373			nr = 4;
   6374
   6375		time = cpu_clock(this);
   6376	}
   6377
   6378	for_each_cpu_wrap(cpu, cpus, target + 1) {
   6379		if (has_idle_core) {
   6380			i = select_idle_core(p, cpu, cpus, &idle_cpu);
   6381			if ((unsigned int)i < nr_cpumask_bits)
   6382				return i;
   6383
   6384		} else {
   6385			if (!--nr)
   6386				return -1;
   6387			idle_cpu = __select_idle_cpu(cpu, p);
   6388			if ((unsigned int)idle_cpu < nr_cpumask_bits)
   6389				break;
   6390		}
   6391	}
   6392
   6393	if (has_idle_core)
   6394		set_idle_cores(target, false);
   6395
   6396	if (sched_feat(SIS_PROP) && !has_idle_core) {
   6397		time = cpu_clock(this) - time;
   6398
   6399		/*
   6400		 * Account for the scan cost of wakeups against the average
   6401		 * idle time.
   6402		 */
   6403		this_rq->wake_avg_idle -= min(this_rq->wake_avg_idle, time);
   6404
   6405		update_avg(&this_sd->avg_scan_cost, time);
   6406	}
   6407
   6408	return idle_cpu;
   6409}
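
        /*
         * Rough SIS_PROP numbers (no idle core known): with span_weight == 16,
         * wake_avg_idle == 10us and avg_scan_cost == 25us, span_avg == 160us
         * exceeds 4 * avg_cost, so at most nr == 160 / 25 == 6 CPUs get
         * probed; an even shorter average idle time falls back to the floor
         * of 4.
         */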
   6410
   6411/*
   6412 * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
   6413 * the task fits. If no CPU is big enough, but there are idle ones, try to
   6414 * maximize capacity.
   6415 */
   6416static int
   6417select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
   6418{
   6419	unsigned long task_util, best_cap = 0;
   6420	int cpu, best_cpu = -1;
   6421	struct cpumask *cpus;
   6422
   6423	cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
   6424	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
   6425
   6426	task_util = uclamp_task_util(p);
   6427
   6428	for_each_cpu_wrap(cpu, cpus, target) {
   6429		unsigned long cpu_cap = capacity_of(cpu);
   6430
   6431		if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
   6432			continue;
   6433		if (fits_capacity(task_util, cpu_cap))
   6434			return cpu;
   6435
   6436		if (cpu_cap > best_cap) {
   6437			best_cap = cpu_cap;
   6438			best_cpu = cpu;
   6439		}
   6440	}
   6441
   6442	return best_cpu;
   6443}
   6444
   6445static inline bool asym_fits_capacity(unsigned long task_util, int cpu)
   6446{
   6447	if (static_branch_unlikely(&sched_asym_cpucapacity))
   6448		return fits_capacity(task_util, capacity_of(cpu));
   6449
   6450	return true;
   6451}
   6452
   6453/*
   6454 * Try and locate an idle core/thread in the LLC cache domain.
   6455 */
   6456static int select_idle_sibling(struct task_struct *p, int prev, int target)
   6457{
   6458	bool has_idle_core = false;
   6459	struct sched_domain *sd;
   6460	unsigned long task_util;
   6461	int i, recent_used_cpu;
   6462
   6463	/*
    6464	 * On asymmetric systems, update the task utilization because we will
    6465	 * check that the task fits the CPU's capacity.
   6466	 */
   6467	if (static_branch_unlikely(&sched_asym_cpucapacity)) {
   6468		sync_entity_load_avg(&p->se);
   6469		task_util = uclamp_task_util(p);
   6470	}
   6471
   6472	/*
   6473	 * per-cpu select_idle_mask usage
   6474	 */
   6475	lockdep_assert_irqs_disabled();
   6476
   6477	if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
   6478	    asym_fits_capacity(task_util, target))
   6479		return target;
   6480
   6481	/*
   6482	 * If the previous CPU is cache affine and idle, don't be stupid:
   6483	 */
   6484	if (prev != target && cpus_share_cache(prev, target) &&
   6485	    (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
   6486	    asym_fits_capacity(task_util, prev))
   6487		return prev;
   6488
   6489	/*
   6490	 * Allow a per-cpu kthread to stack with the wakee if the
    6491	 * kworker thread and the task's previous CPU are the same.
   6492	 * The assumption is that the wakee queued work for the
   6493	 * per-cpu kthread that is now complete and the wakeup is
   6494	 * essentially a sync wakeup. An obvious example of this
   6495	 * pattern is IO completions.
   6496	 */
   6497	if (is_per_cpu_kthread(current) &&
   6498	    in_task() &&
   6499	    prev == smp_processor_id() &&
   6500	    this_rq()->nr_running <= 1 &&
   6501	    asym_fits_capacity(task_util, prev)) {
   6502		return prev;
   6503	}
   6504
   6505	/* Check a recently used CPU as a potential idle candidate: */
   6506	recent_used_cpu = p->recent_used_cpu;
   6507	p->recent_used_cpu = prev;
   6508	if (recent_used_cpu != prev &&
   6509	    recent_used_cpu != target &&
   6510	    cpus_share_cache(recent_used_cpu, target) &&
   6511	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
   6512	    cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
   6513	    asym_fits_capacity(task_util, recent_used_cpu)) {
   6514		return recent_used_cpu;
   6515	}
   6516
   6517	/*
   6518	 * For asymmetric CPU capacity systems, our domain of interest is
   6519	 * sd_asym_cpucapacity rather than sd_llc.
   6520	 */
   6521	if (static_branch_unlikely(&sched_asym_cpucapacity)) {
   6522		sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
   6523		/*
   6524		 * On an asymmetric CPU capacity system where an exclusive
   6525		 * cpuset defines a symmetric island (i.e. one unique
   6526		 * capacity_orig value through the cpuset), the key will be set
   6527		 * but the CPUs within that cpuset will not have a domain with
   6528		 * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
   6529		 * capacity path.
   6530		 */
   6531		if (sd) {
   6532			i = select_idle_capacity(p, sd, target);
   6533			return ((unsigned)i < nr_cpumask_bits) ? i : target;
   6534		}
   6535	}
   6536
   6537	sd = rcu_dereference(per_cpu(sd_llc, target));
   6538	if (!sd)
   6539		return target;
   6540
   6541	if (sched_smt_active()) {
   6542		has_idle_core = test_idle_cores(target, false);
   6543
   6544		if (!has_idle_core && cpus_share_cache(prev, target)) {
   6545			i = select_idle_smt(p, sd, prev);
   6546			if ((unsigned int)i < nr_cpumask_bits)
   6547				return i;
   6548		}
   6549	}
   6550
   6551	i = select_idle_cpu(p, sd, has_idle_core, target);
   6552	if ((unsigned)i < nr_cpumask_bits)
   6553		return i;
   6554
   6555	return target;
   6556}
   6557
   6558/*
   6559 * Predicts what cpu_util(@cpu) would return if @p was removed from @cpu
   6560 * (@dst_cpu = -1) or migrated to @dst_cpu.
   6561 */
   6562static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
   6563{
   6564	struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
   6565	unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);
   6566
   6567	/*
   6568	 * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its
   6569	 * contribution. If @p migrates from another CPU to @cpu add its
   6570	 * contribution. In all the other cases @cpu is not impacted by the
   6571	 * migration so its util_avg is already correct.
   6572	 */
   6573	if (task_cpu(p) == cpu && dst_cpu != cpu)
   6574		lsub_positive(&util, task_util(p));
   6575	else if (task_cpu(p) != cpu && dst_cpu == cpu)
   6576		util += task_util(p);
   6577
   6578	if (sched_feat(UTIL_EST)) {
   6579		unsigned long util_est;
   6580
   6581		util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
   6582
   6583		/*
   6584		 * During wake-up @p isn't enqueued yet and doesn't contribute
   6585		 * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued.
   6586		 * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p
   6587		 * has been enqueued.
   6588		 *
   6589		 * During exec (@dst_cpu = -1) @p is enqueued and does
   6590		 * contribute to cpu_rq(cpu)->cfs.util_est.enqueued.
   6591		 * Remove it to "simulate" cpu_util without @p's contribution.
   6592		 *
   6593		 * Despite the task_on_rq_queued(@p) check there is still a
   6594		 * small window for a possible race when an exec
   6595		 * select_task_rq_fair() races with LB's detach_task().
   6596		 *
   6597		 *   detach_task()
   6598		 *     deactivate_task()
   6599		 *       p->on_rq = TASK_ON_RQ_MIGRATING;
   6600		 *       -------------------------------- A
   6601		 *       dequeue_task()                    \
   6602		 *         dequeue_task_fair()              + Race Time
   6603		 *           util_est_dequeue()            /
   6604		 *       -------------------------------- B
   6605		 *
   6606		 * The additional check "current == p" is required to further
   6607		 * reduce the race window.
   6608		 */
   6609		if (dst_cpu == cpu)
   6610			util_est += _task_util_est(p);
   6611		else if (unlikely(task_on_rq_queued(p) || current == p))
   6612			lsub_positive(&util_est, _task_util_est(p));
   6613
   6614		util = max(util, util_est);
   6615	}
   6616
   6617	return min(util, capacity_orig_of(cpu));
   6618}
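
        /*
         * e.g. for a task with util_avg == 128 whose task_cpu() is @cpu,
         * cpu_util_next(cpu, p, -1) reports @cpu's utilization with those 128
         * removed, while cpu_util_next(other, p, other) reports @other's
         * utilization with 128 added on top; either result is clamped to the
         * CPU's capacity_orig.
         */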
   6619
   6620/*
   6621 * cpu_util_without: compute cpu utilization without any contributions from *p
   6622 * @cpu: the CPU which utilization is requested
   6623 * @p: the task which utilization should be discounted
   6624 *
   6625 * The utilization of a CPU is defined by the utilization of tasks currently
   6626 * enqueued on that CPU as well as tasks which are currently sleeping after an
   6627 * execution on that CPU.
   6628 *
   6629 * This method returns the utilization of the specified CPU by discounting the
   6630 * utilization of the specified task, whenever the task is currently
   6631 * contributing to the CPU utilization.
   6632 */
   6633static unsigned long cpu_util_without(int cpu, struct task_struct *p)
   6634{
   6635	/* Task has no contribution or is new */
   6636	if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
   6637		return cpu_util_cfs(cpu);
   6638
   6639	return cpu_util_next(cpu, p, -1);
   6640}
   6641
   6642/*
   6643 * compute_energy(): Estimates the energy that @pd would consume if @p was
   6644 * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
   6645 * landscape of @pd's CPUs after the task migration, and uses the Energy Model
   6646 * to compute what would be the energy if we decided to actually migrate that
   6647 * task.
   6648 */
   6649static long
   6650compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
   6651{
   6652	struct cpumask *pd_mask = perf_domain_span(pd);
   6653	unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
   6654	unsigned long max_util = 0, sum_util = 0;
   6655	unsigned long _cpu_cap = cpu_cap;
   6656	int cpu;
   6657
   6658	_cpu_cap -= arch_scale_thermal_pressure(cpumask_first(pd_mask));
   6659
   6660	/*
   6661	 * The capacity state of CPUs of the current rd can be driven by CPUs
   6662	 * of another rd if they belong to the same pd. So, account for the
   6663	 * utilization of these CPUs too by masking pd with cpu_online_mask
   6664	 * instead of the rd span.
   6665	 *
   6666	 * If an entire pd is outside of the current rd, it will not appear in
   6667	 * its pd list and will not be accounted by compute_energy().
   6668	 */
   6669	for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
   6670		unsigned long util_freq = cpu_util_next(cpu, p, dst_cpu);
   6671		unsigned long cpu_util, util_running = util_freq;
   6672		struct task_struct *tsk = NULL;
   6673
   6674		/*
   6675		 * When @p is placed on @cpu:
   6676		 *
   6677		 * util_running = max(cpu_util, cpu_util_est) +
   6678		 *		  max(task_util, _task_util_est)
   6679		 *
   6680		 * while cpu_util_next is: max(cpu_util + task_util,
   6681		 *			       cpu_util_est + _task_util_est)
   6682		 */
   6683		if (cpu == dst_cpu) {
   6684			tsk = p;
   6685			util_running =
   6686				cpu_util_next(cpu, p, -1) + task_util_est(p);
   6687		}
   6688
   6689		/*
   6690		 * Busy time computation: utilization clamping is not
   6691		 * required since the ratio (sum_util / cpu_capacity)
   6692		 * is already enough to scale the EM reported power
   6693		 * consumption at the (eventually clamped) cpu_capacity.
   6694		 */
   6695		cpu_util = effective_cpu_util(cpu, util_running, cpu_cap,
   6696					      ENERGY_UTIL, NULL);
   6697
   6698		sum_util += min(cpu_util, _cpu_cap);
   6699
   6700		/*
   6701		 * Performance domain frequency: utilization clamping
   6702		 * must be considered since it affects the selection
   6703		 * of the performance domain frequency.
   6704		 * NOTE: in case RT tasks are running, by default the
   6705		 * FREQUENCY_UTIL's utilization can be max OPP.
   6706		 */
   6707		cpu_util = effective_cpu_util(cpu, util_freq, cpu_cap,
   6708					      FREQUENCY_UTIL, tsk);
   6709		max_util = max(max_util, min(cpu_util, _cpu_cap));
   6710	}
   6711
   6712	return em_cpu_energy(pd->em_pd, max_util, sum_util, _cpu_cap);
   6713}
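/*
 * An illustrative note on how this is used (a sketch, not additional
 * semantics): find_energy_efficient_cpu() below calls compute_energy()
 * once per performance domain with dst_cpu == -1 to get the "base"
 * energy of the pd without @p, and once per candidate CPU. The
 * difference between the two values, in the Energy Model's abstract
 * units, is the estimated cost of placing @p on that candidate:
 *
 *	base  = compute_energy(p, -1, pd);
 *	delta = compute_energy(p, candidate_cpu, pd) - base;
 *
 * (candidate_cpu here is just a placeholder name.)
 */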
   6714
   6715/*
   6716 * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
   6717 * waking task. find_energy_efficient_cpu() looks for the CPU with maximum
   6718 * spare capacity in each performance domain and uses it as a potential
   6719 * candidate to execute the task. Then, it uses the Energy Model to figure
   6720 * out which of the CPU candidates is the most energy-efficient.
   6721 *
   6722 * The rationale for this heuristic is as follows. In a performance domain,
   6723 * all the most energy efficient CPU candidates (according to the Energy
   6724 * Model) are those for which we'll request a low frequency. When there are
   6725 * several CPUs for which the frequency request will be the same, we don't
   6726 * have enough data to break the tie between them, because the Energy Model
   6727 * only includes active power costs. With this model, if we assume that
   6728 * frequency requests follow utilization (e.g. using schedutil), the CPU with
   6729 * the maximum spare capacity in a performance domain is guaranteed to be among
   6730 * the best candidates of the performance domain.
   6731 *
   6732 * In practice, it could be preferable from an energy standpoint to pack
   6733 * small tasks on a CPU in order to let other CPUs go in deeper idle states,
   6734 * but that could also hurt our chances to go cluster idle, and we have no
   6735 * ways to tell with the current Energy Model if this is actually a good
   6736 * idea or not. So, find_energy_efficient_cpu() basically favors
   6737 * cluster-packing, and spreading inside a cluster. That should at least be
   6738 * a good thing for latency, and this is consistent with the idea that most
   6739 * of the energy savings of EAS come from the asymmetry of the system, and
   6740 * not so much from breaking the tie between identical CPUs. That's also the
   6741 * reason why EAS is enabled in the topology code only for systems where
   6742 * SD_ASYM_CPUCAPACITY is set.
   6743 *
   6744 * NOTE: Forkees are not accepted in the energy-aware wake-up path because
   6745 * they don't have any useful utilization data yet and it's not possible to
   6746 * forecast their impact on energy consumption. Consequently, they will be
   6747 * placed by find_idlest_cpu() on the least loaded CPU, which might turn out
   6748 * to be energy-inefficient in some use-cases. The alternative would be to
   6749 * bias new tasks towards specific types of CPUs first, or to try to infer
   6750 * their util_avg from the parent task, but those heuristics could hurt
   6751 * other use-cases too. So, until someone finds a better way to solve this,
   6752 * let's keep things simple by re-using the existing slow path.
   6753 */
   6754static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
   6755{
   6756	unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
   6757	struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
   6758	int cpu, best_energy_cpu = prev_cpu, target = -1;
   6759	unsigned long cpu_cap, util, base_energy = 0;
   6760	struct sched_domain *sd;
   6761	struct perf_domain *pd;
   6762
   6763	rcu_read_lock();
   6764	pd = rcu_dereference(rd->pd);
   6765	if (!pd || READ_ONCE(rd->overutilized))
   6766		goto unlock;
   6767
   6768	/*
   6769	 * Energy-aware wake-up happens on the lowest sched_domain starting
   6770	 * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
   6771	 */
   6772	sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
   6773	while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
   6774		sd = sd->parent;
   6775	if (!sd)
   6776		goto unlock;
   6777
   6778	target = prev_cpu;
   6779
   6780	sync_entity_load_avg(&p->se);
   6781	if (!task_util_est(p))
   6782		goto unlock;
   6783
   6784	for (; pd; pd = pd->next) {
   6785		unsigned long cur_delta, spare_cap, max_spare_cap = 0;
   6786		bool compute_prev_delta = false;
   6787		unsigned long base_energy_pd;
   6788		int max_spare_cap_cpu = -1;
   6789
   6790		for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
   6791			if (!cpumask_test_cpu(cpu, p->cpus_ptr))
   6792				continue;
   6793
   6794			util = cpu_util_next(cpu, p, cpu);
   6795			cpu_cap = capacity_of(cpu);
   6796			spare_cap = cpu_cap;
   6797			lsub_positive(&spare_cap, util);
   6798
   6799			/*
   6800			 * Skip CPUs that cannot satisfy the capacity request.
   6801			 * IOW, placing the task there would make the CPU
   6802			 * overutilized. Take uclamp into account to see how
   6803			 * much capacity we can get out of the CPU; this is
   6804			 * aligned with sched_cpu_util().
   6805			 */
   6806			util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
   6807			if (!fits_capacity(util, cpu_cap))
   6808				continue;
   6809
   6810			if (cpu == prev_cpu) {
   6811				/* Always use prev_cpu as a candidate. */
   6812				compute_prev_delta = true;
   6813			} else if (spare_cap > max_spare_cap) {
   6814				/*
   6815				 * Find the CPU with the maximum spare capacity
   6816				 * in the performance domain.
   6817				 */
   6818				max_spare_cap = spare_cap;
   6819				max_spare_cap_cpu = cpu;
   6820			}
   6821		}
   6822
   6823		if (max_spare_cap_cpu < 0 && !compute_prev_delta)
   6824			continue;
   6825
   6826		/* Compute the 'base' energy of the pd, without @p */
   6827		base_energy_pd = compute_energy(p, -1, pd);
   6828		base_energy += base_energy_pd;
   6829
   6830		/* Evaluate the energy impact of using prev_cpu. */
   6831		if (compute_prev_delta) {
   6832			prev_delta = compute_energy(p, prev_cpu, pd);
   6833			if (prev_delta < base_energy_pd)
   6834				goto unlock;
   6835			prev_delta -= base_energy_pd;
   6836			best_delta = min(best_delta, prev_delta);
   6837		}
   6838
   6839		/* Evaluate the energy impact of using max_spare_cap_cpu. */
   6840		if (max_spare_cap_cpu >= 0) {
   6841			cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
   6842			if (cur_delta < base_energy_pd)
   6843				goto unlock;
   6844			cur_delta -= base_energy_pd;
   6845			if (cur_delta < best_delta) {
   6846				best_delta = cur_delta;
   6847				best_energy_cpu = max_spare_cap_cpu;
   6848			}
   6849		}
   6850	}
   6851	rcu_read_unlock();
   6852
   6853	/*
   6854	 * Pick the best CPU if prev_cpu cannot be used, or if it saves at
   6855	 * least 6% of the energy used by prev_cpu.
   6856	 */
   6857	if ((prev_delta == ULONG_MAX) ||
   6858	    (prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
   6859		target = best_energy_cpu;
   6860
   6861	return target;
   6862
   6863unlock:
   6864	rcu_read_unlock();
   6865
   6866	return target;
   6867}
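/*
 * A minimal sketch (not used by the scheduler itself) restating the "6%"
 * test above: the threshold is a shift by 4, so best_delta must undercut
 * prev_delta by more than (prev_delta + base_energy) / 16, i.e. ~6.25%.
 * The helper name below is hypothetical.
 */
static inline bool example_prefers_best_energy_cpu(unsigned long prev_delta,
						   unsigned long best_delta,
						   unsigned long base_energy)
{
	/* prev_cpu was never a valid candidate: take the best CPU */
	if (prev_delta == ULONG_MAX)
		return true;

	/*
	 * E.g. prev_delta = 1000, base_energy = 600, best_delta = 880:
	 * the saving is 120 > (1600 >> 4) = 100, so the best CPU wins.
	 */
	return (prev_delta - best_delta) > ((prev_delta + base_energy) >> 4);
}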
   6868
   6869/*
   6870 * select_task_rq_fair: Select target runqueue for the waking task in domains
   6871 * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
   6872 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
   6873 *
   6874 * Balances load by selecting the idlest CPU in the idlest group, or under
   6875 * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
   6876 *
   6877 * Returns the target CPU number.
   6878 */
   6879static int
   6880select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
   6881{
   6882	int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
   6883	struct sched_domain *tmp, *sd = NULL;
   6884	int cpu = smp_processor_id();
   6885	int new_cpu = prev_cpu;
   6886	int want_affine = 0;
   6887	/* SD_flags and WF_flags share the first nibble */
   6888	int sd_flag = wake_flags & 0xF;
   6889
   6890	/*
   6891	 * required for stable ->cpus_allowed
   6892	 */
   6893	lockdep_assert_held(&p->pi_lock);
   6894	if (wake_flags & WF_TTWU) {
   6895		record_wakee(p);
   6896
   6897		if (sched_energy_enabled()) {
   6898			new_cpu = find_energy_efficient_cpu(p, prev_cpu);
   6899			if (new_cpu >= 0)
   6900				return new_cpu;
   6901			new_cpu = prev_cpu;
   6902		}
   6903
   6904		want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
   6905	}
   6906
   6907	rcu_read_lock();
   6908	for_each_domain(cpu, tmp) {
   6909		/*
   6910		 * If both 'cpu' and 'prev_cpu' are part of this domain,
   6911		 * cpu is a valid SD_WAKE_AFFINE target.
   6912		 */
   6913		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
   6914		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
   6915			if (cpu != prev_cpu)
   6916				new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
   6917
   6918			sd = NULL; /* Prefer wake_affine over balance flags */
   6919			break;
   6920		}
   6921
   6922		/*
   6923		 * Usually only true for WF_EXEC and WF_FORK, as sched_domains
   6924		 * usually do not have SD_BALANCE_WAKE set. That means wakeup
   6925		 * will usually go to the fast path.
   6926		 */
   6927		if (tmp->flags & sd_flag)
   6928			sd = tmp;
   6929		else if (!want_affine)
   6930			break;
   6931	}
   6932
   6933	if (unlikely(sd)) {
   6934		/* Slow path */
   6935		new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
   6936	} else if (wake_flags & WF_TTWU) { /* XXX always ? */
   6937		/* Fast path */
   6938		new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
   6939	}
   6940	rcu_read_unlock();
   6941
   6942	return new_cpu;
   6943}
   6944
   6945static void detach_entity_cfs_rq(struct sched_entity *se);
   6946
   6947/*
   6948 * Called immediately before a task is migrated to a new CPU; task_cpu(p) and
   6949 * cfs_rq_of(p) references at time of call are still valid and identify the
   6950 * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
   6951 */
   6952static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
   6953{
   6954	/*
   6955	 * As blocked tasks retain absolute vruntime the migration needs to
   6956	 * deal with this by subtracting the old and adding the new
   6957	 * min_vruntime -- the latter is done by enqueue_entity() when placing
   6958	 * the task on the new runqueue.
   6959	 */
   6960	if (READ_ONCE(p->__state) == TASK_WAKING) {
   6961		struct sched_entity *se = &p->se;
   6962		struct cfs_rq *cfs_rq = cfs_rq_of(se);
   6963		u64 min_vruntime;
   6964
   6965#ifndef CONFIG_64BIT
   6966		u64 min_vruntime_copy;
   6967
   6968		do {
   6969			min_vruntime_copy = cfs_rq->min_vruntime_copy;
   6970			smp_rmb();
   6971			min_vruntime = cfs_rq->min_vruntime;
   6972		} while (min_vruntime != min_vruntime_copy);
   6973#else
   6974		min_vruntime = cfs_rq->min_vruntime;
   6975#endif
   6976
   6977		se->vruntime -= min_vruntime;
   6978	}
   6979
   6980	if (p->on_rq == TASK_ON_RQ_MIGRATING) {
   6981		/*
   6982		 * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
   6983		 * rq->lock and can modify state directly.
   6984		 */
   6985		lockdep_assert_rq_held(task_rq(p));
   6986		detach_entity_cfs_rq(&p->se);
   6987
   6988	} else {
   6989		/*
   6990		 * We are supposed to update the task to "current" time, so
   6991		 * that it's up to date and ready to go to the new CPU/cfs_rq.
   6992		 * But we have difficulty in getting what the current time is,
   6993		 * so simply throw away the out-of-date time. This will result
   6994		 * in the wakee task being less decayed, but giving the wakee
   6995		 * more load does not sound too bad.
   6996		 */
   6997		remove_entity_load_avg(&p->se);
   6998	}
   6999
   7000	/* Tell new CPU we are migrated */
   7001	p->se.avg.last_update_time = 0;
   7002
   7003	/* We have migrated, no longer consider this task hot */
   7004	p->se.exec_start = 0;
   7005
   7006	update_scan_period(p, new_cpu);
   7007}
   7008
   7009static void task_dead_fair(struct task_struct *p)
   7010{
   7011	remove_entity_load_avg(&p->se);
   7012}
   7013
   7014static int
   7015balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
   7016{
   7017	if (rq->nr_running)
   7018		return 1;
   7019
   7020	return newidle_balance(rq, rf) != 0;
   7021}
   7022#endif /* CONFIG_SMP */
   7023
   7024static unsigned long wakeup_gran(struct sched_entity *se)
   7025{
   7026	unsigned long gran = sysctl_sched_wakeup_granularity;
   7027
   7028	/*
   7029	 * Since it's curr that is running now, convert the gran from
   7030	 * real-time to virtual-time in its units.
   7031	 *
   7032	 * By using 'se' instead of 'curr' we penalize light tasks, so
   7033	 * they get preempted easier. That is, if 'se' < 'curr' then
   7034	 * the resulting gran will be larger, therefore penalizing the
   7035	 * lighter task; if, on the other hand, 'se' > 'curr' then the
   7036	 * resulting gran will be smaller, again penalizing the lighter task.
   7037	 *
   7038	 * This is especially important for buddies when the leftmost
   7039	 * task is higher priority than the buddy.
   7040	 */
   7041	return calc_delta_fair(gran, se);
   7042}
   7043
   7044/*
   7045 * Should 'se' preempt 'curr'.
   7046 *
   7047 *             |s1
   7048 *        |s2
   7049 *   |s3
   7050 *         g
   7051 *      |<--->|c
   7052 *
   7053 *  w(c, s1) = -1
   7054 *  w(c, s2) =  0
   7055 *  w(c, s3) =  1
   7056 *
   7057 */
   7058static int
   7059wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
   7060{
   7061	s64 gran, vdiff = curr->vruntime - se->vruntime;
   7062
   7063	if (vdiff <= 0)
   7064		return -1;
   7065
   7066	gran = wakeup_gran(se);
   7067	if (vdiff > gran)
   7068		return 1;
   7069
   7070	return 0;
   7071}
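/*
 * A worked example of the picture above (illustrative values only):
 * with a virtual-time gran of 4ms and curr->vruntime = 100ms,
 *
 *   se->vruntime = 105ms: vdiff = -5ms <= 0        -> return -1  (s1)
 *   se->vruntime =  98ms: vdiff =  2ms, 0 < 2 <= 4 -> return  0  (s2)
 *   se->vruntime =  94ms: vdiff =  6ms > 4         -> return  1  (s3)
 *
 * Only the last case makes check_preempt_wakeup() below actually
 * reschedule curr.
 */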
   7072
   7073static void set_last_buddy(struct sched_entity *se)
   7074{
   7075	for_each_sched_entity(se) {
   7076		if (SCHED_WARN_ON(!se->on_rq))
   7077			return;
   7078		if (se_is_idle(se))
   7079			return;
   7080		cfs_rq_of(se)->last = se;
   7081	}
   7082}
   7083
   7084static void set_next_buddy(struct sched_entity *se)
   7085{
   7086	for_each_sched_entity(se) {
   7087		if (SCHED_WARN_ON(!se->on_rq))
   7088			return;
   7089		if (se_is_idle(se))
   7090			return;
   7091		cfs_rq_of(se)->next = se;
   7092	}
   7093}
   7094
   7095static void set_skip_buddy(struct sched_entity *se)
   7096{
   7097	for_each_sched_entity(se)
   7098		cfs_rq_of(se)->skip = se;
   7099}
   7100
   7101/*
   7102 * Preempt the current task with a newly woken task if needed:
   7103 */
   7104static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
   7105{
   7106	struct task_struct *curr = rq->curr;
   7107	struct sched_entity *se = &curr->se, *pse = &p->se;
   7108	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
   7109	int scale = cfs_rq->nr_running >= sched_nr_latency;
   7110	int next_buddy_marked = 0;
   7111	int cse_is_idle, pse_is_idle;
   7112
   7113	if (unlikely(se == pse))
   7114		return;
   7115
   7116	/*
   7117	 * This is possible from callers such as attach_tasks(), in which we
   7118	 * unconditionally check_preempt_curr() after an enqueue (which may have
   7119	 * led to a throttle).  This both saves work and prevents false
   7120	 * next-buddy nomination below.
   7121	 */
   7122	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
   7123		return;
   7124
   7125	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
   7126		set_next_buddy(pse);
   7127		next_buddy_marked = 1;
   7128	}
   7129
   7130	/*
   7131	 * We can come here with TIF_NEED_RESCHED already set from new task
   7132	 * wake up path.
   7133	 *
   7134	 * Note: this also catches the edge-case of curr being in a throttled
   7135	 * group (e.g. via set_curr_task), since update_curr() (in the
   7136	 * enqueue of curr) will have resulted in resched being set.  This
   7137	 * prevents us from potentially nominating it as a false LAST_BUDDY
   7138	 * below.
   7139	 */
   7140	if (test_tsk_need_resched(curr))
   7141		return;
   7142
   7143	/* Idle tasks are by definition preempted by non-idle tasks. */
   7144	if (unlikely(task_has_idle_policy(curr)) &&
   7145	    likely(!task_has_idle_policy(p)))
   7146		goto preempt;
   7147
   7148	/*
   7149	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
   7150	 * is driven by the tick):
   7151	 */
   7152	if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
   7153		return;
   7154
   7155	find_matching_se(&se, &pse);
   7156	BUG_ON(!pse);
   7157
   7158	cse_is_idle = se_is_idle(se);
   7159	pse_is_idle = se_is_idle(pse);
   7160
   7161	/*
   7162	 * Preempt an idle group in favor of a non-idle group (and don't preempt
   7163	 * in the inverse case).
   7164	 */
   7165	if (cse_is_idle && !pse_is_idle)
   7166		goto preempt;
   7167	if (cse_is_idle != pse_is_idle)
   7168		return;
   7169
   7170	update_curr(cfs_rq_of(se));
   7171	if (wakeup_preempt_entity(se, pse) == 1) {
   7172		/*
   7173		 * Bias pick_next to pick the sched entity that is
   7174		 * triggering this preemption.
   7175		 */
   7176		if (!next_buddy_marked)
   7177			set_next_buddy(pse);
   7178		goto preempt;
   7179	}
   7180
   7181	return;
   7182
   7183preempt:
   7184	resched_curr(rq);
   7185	/*
   7186	 * Only set the backward buddy when the current task is still
   7187	 * on the rq. This can happen when a wakeup gets interleaved
   7188	 * with schedule on the ->pre_schedule() or idle_balance()
   7189	 * point, either of which can drop the rq lock.
   7190	 *
   7191	 * Also, during early boot the idle thread is in the fair class;
   7192	 * for obvious reasons it's a bad idea to schedule back to it.
   7193	 */
   7194	if (unlikely(!se->on_rq || curr == rq->idle))
   7195		return;
   7196
   7197	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
   7198		set_last_buddy(se);
   7199}
   7200
   7201#ifdef CONFIG_SMP
   7202static struct task_struct *pick_task_fair(struct rq *rq)
   7203{
   7204	struct sched_entity *se;
   7205	struct cfs_rq *cfs_rq;
   7206
   7207again:
   7208	cfs_rq = &rq->cfs;
   7209	if (!cfs_rq->nr_running)
   7210		return NULL;
   7211
   7212	do {
   7213		struct sched_entity *curr = cfs_rq->curr;
   7214
   7215		/* When we pick for a remote RQ, we'll not have done put_prev_entity() */
   7216		if (curr) {
   7217			if (curr->on_rq)
   7218				update_curr(cfs_rq);
   7219			else
   7220				curr = NULL;
   7221
   7222			if (unlikely(check_cfs_rq_runtime(cfs_rq)))
   7223				goto again;
   7224		}
   7225
   7226		se = pick_next_entity(cfs_rq, curr);
   7227		cfs_rq = group_cfs_rq(se);
   7228	} while (cfs_rq);
   7229
   7230	return task_of(se);
   7231}
   7232#endif
   7233
   7234struct task_struct *
   7235pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
   7236{
   7237	struct cfs_rq *cfs_rq = &rq->cfs;
   7238	struct sched_entity *se;
   7239	struct task_struct *p;
   7240	int new_tasks;
   7241
   7242again:
   7243	if (!sched_fair_runnable(rq))
   7244		goto idle;
   7245
   7246#ifdef CONFIG_FAIR_GROUP_SCHED
   7247	if (!prev || prev->sched_class != &fair_sched_class)
   7248		goto simple;
   7249
   7250	/*
   7251	 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
   7252	 * likely that a next task is from the same cgroup as the current.
   7253	 *
   7254	 * Therefore attempt to avoid putting and setting the entire cgroup
   7255	 * hierarchy, only change the part that actually changes.
   7256	 */
   7257
   7258	do {
   7259		struct sched_entity *curr = cfs_rq->curr;
   7260
   7261		/*
   7262		 * Since we got here without doing put_prev_entity() we also
   7263		 * have to consider cfs_rq->curr. If it is still a runnable
   7264		 * entity, update_curr() will update its vruntime, otherwise
   7265		 * forget we've ever seen it.
   7266		 */
   7267		if (curr) {
   7268			if (curr->on_rq)
   7269				update_curr(cfs_rq);
   7270			else
   7271				curr = NULL;
   7272
   7273			/*
   7274			 * This call to check_cfs_rq_runtime() will do the
   7275			 * throttle and dequeue its entity in the parent(s).
   7276			 * Therefore the nr_running test will indeed
   7277			 * be correct.
   7278			 */
   7279			if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
   7280				cfs_rq = &rq->cfs;
   7281
   7282				if (!cfs_rq->nr_running)
   7283					goto idle;
   7284
   7285				goto simple;
   7286			}
   7287		}
   7288
   7289		se = pick_next_entity(cfs_rq, curr);
   7290		cfs_rq = group_cfs_rq(se);
   7291	} while (cfs_rq);
   7292
   7293	p = task_of(se);
   7294
   7295	/*
   7296	 * Since we haven't yet done put_prev_entity(), if the selected task
   7297	 * is a different task than the one we started out with, try to
   7298	 * touch the least number of cfs_rqs.
   7299	 */
   7300	if (prev != p) {
   7301		struct sched_entity *pse = &prev->se;
   7302
   7303		while (!(cfs_rq = is_same_group(se, pse))) {
   7304			int se_depth = se->depth;
   7305			int pse_depth = pse->depth;
   7306
   7307			if (se_depth <= pse_depth) {
   7308				put_prev_entity(cfs_rq_of(pse), pse);
   7309				pse = parent_entity(pse);
   7310			}
   7311			if (se_depth >= pse_depth) {
   7312				set_next_entity(cfs_rq_of(se), se);
   7313				se = parent_entity(se);
   7314			}
   7315		}
   7316
   7317		put_prev_entity(cfs_rq, pse);
   7318		set_next_entity(cfs_rq, se);
   7319	}
   7320
   7321	goto done;
   7322simple:
   7323#endif
   7324	if (prev)
   7325		put_prev_task(rq, prev);
   7326
   7327	do {
   7328		se = pick_next_entity(cfs_rq, NULL);
   7329		set_next_entity(cfs_rq, se);
   7330		cfs_rq = group_cfs_rq(se);
   7331	} while (cfs_rq);
   7332
   7333	p = task_of(se);
   7334
   7335done: __maybe_unused;
   7336#ifdef CONFIG_SMP
   7337	/*
   7338	 * Move the next running task to the front of
   7339	 * the list, so our cfs_tasks list becomes an
   7340	 * MRU one.
   7341	 */
   7342	list_move(&p->se.group_node, &rq->cfs_tasks);
   7343#endif
   7344
   7345	if (hrtick_enabled_fair(rq))
   7346		hrtick_start_fair(rq, p);
   7347
   7348	update_misfit_status(p, rq);
   7349
   7350	return p;
   7351
   7352idle:
   7353	if (!rf)
   7354		return NULL;
   7355
   7356	new_tasks = newidle_balance(rq, rf);
   7357
   7358	/*
   7359	 * Because newidle_balance() releases (and re-acquires) rq->lock, it is
   7360	 * possible for any higher priority task to appear. In that case we
   7361	 * must re-start the pick_next_entity() loop.
   7362	 */
   7363	if (new_tasks < 0)
   7364		return RETRY_TASK;
   7365
   7366	if (new_tasks > 0)
   7367		goto again;
   7368
   7369	/*
   7370	 * rq is about to be idle, check if we need to update the
   7371	 * lost_idle_time of clock_pelt
   7372	 */
   7373	update_idle_rq_clock_pelt(rq);
   7374
   7375	return NULL;
   7376}
   7377
   7378static struct task_struct *__pick_next_task_fair(struct rq *rq)
   7379{
   7380	return pick_next_task_fair(rq, NULL, NULL);
   7381}
   7382
   7383/*
   7384 * Account for a descheduled task:
   7385 */
   7386static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
   7387{
   7388	struct sched_entity *se = &prev->se;
   7389	struct cfs_rq *cfs_rq;
   7390
   7391	for_each_sched_entity(se) {
   7392		cfs_rq = cfs_rq_of(se);
   7393		put_prev_entity(cfs_rq, se);
   7394	}
   7395}
   7396
   7397/*
   7398 * sched_yield() is very simple
   7399 *
   7400 * The magic of dealing with the ->skip buddy is in pick_next_entity.
   7401 */
   7402static void yield_task_fair(struct rq *rq)
   7403{
   7404	struct task_struct *curr = rq->curr;
   7405	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
   7406	struct sched_entity *se = &curr->se;
   7407
   7408	/*
   7409	 * Are we the only task in the tree?
   7410	 */
   7411	if (unlikely(rq->nr_running == 1))
   7412		return;
   7413
   7414	clear_buddies(cfs_rq, se);
   7415
   7416	if (curr->policy != SCHED_BATCH) {
   7417		update_rq_clock(rq);
   7418		/*
   7419		 * Update run-time statistics of the 'current'.
   7420		 */
   7421		update_curr(cfs_rq);
   7422		/*
   7423		 * Tell update_rq_clock() that we've just updated,
   7424		 * so we don't do microscopic update in schedule()
   7425		 * and double the fastpath cost.
   7426		 */
   7427		rq_clock_skip_update(rq);
   7428	}
   7429
   7430	set_skip_buddy(se);
   7431}
   7432
   7433static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
   7434{
   7435	struct sched_entity *se = &p->se;
   7436
   7437	/* throttled hierarchies are not runnable */
   7438	if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
   7439		return false;
   7440
   7441	/* Tell the scheduler that we'd really like pse to run next. */
   7442	set_next_buddy(se);
   7443
   7444	yield_task_fair(rq);
   7445
   7446	return true;
   7447}
   7448
   7449#ifdef CONFIG_SMP
   7450/**************************************************
   7451 * Fair scheduling class load-balancing methods.
   7452 *
   7453 * BASICS
   7454 *
   7455 * The purpose of load-balancing is to achieve the same basic fairness the
   7456 * per-CPU scheduler provides, namely provide a proportional amount of compute
   7457 * time to each task. This is expressed in the following equation:
   7458 *
   7459 *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1)
   7460 *
   7461 * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight
   7462 * W_i,0 is defined as:
   7463 *
   7464 *   W_i,0 = \Sum_j w_i,j                                             (2)
   7465 *
   7466 * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight
   7467 * is derived from the nice value as per sched_prio_to_weight[].
   7468 *
   7469 * The weight average is an exponential decay average of the instantaneous
   7470 * weight:
   7471 *
   7472 *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
   7473 *
   7474 * C_i is the compute capacity of CPU i, typically it is the
   7475 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
   7476 * can also include other factors [XXX].
   7477 *
   7478 * To achieve this balance we define a measure of imbalance which follows
   7479 * directly from (1):
   7480 *
   7481 *   imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j }    (4)
   7482 *
   7483 * We then move tasks around to minimize the imbalance. In the continuous
   7484 * function space it is obvious this converges; in the discrete case we get
   7485 * a few fun cases generally called infeasible weight scenarios.
   7486 *
   7487 * [XXX expand on:
   7488 *     - infeasible weights;
   7489 *     - local vs global optima in the discrete case. ]
   7490 *
   7491 *
   7492 * SCHED DOMAINS
   7493 *
   7494 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
   7495 * for all i,j solution, we create a tree of CPUs that follows the hardware
   7496 * topology where each level pairs two lower groups (or better). This results
   7497 * in O(log n) layers. Furthermore we reduce the number of CPUs going up the
   7498 * tree to only the first of the previous level and we decrease the frequency
   7499 * of load-balance at each level inv. proportional to the number of CPUs in
   7500 * the groups.
   7501 *
   7502 * This yields:
   7503 *
   7504 *     log_2 n     1     n
   7505 *   \Sum       { --- * --- * 2^i } = O(n)                            (5)
   7506 *     i = 0      2^i   2^i
   7507 *                               `- size of each group
   7508 *         |         |     `- number of CPUs doing load-balance
   7509 *         |         `- freq
   7510 *         `- sum over all levels
   7511 *
   7512 * Coupled with a limit on how many tasks we can migrate every balance pass,
   7513 * this makes (5) the runtime complexity of the balancer.
   7514 *
   7515 * An important property here is that each CPU is still (indirectly) connected
   7516 * to every other CPU in at most O(log n) steps:
   7517 *
   7518 * The adjacency matrix of the resulting graph is given by:
   7519 *
   7520 *             log_2 n
   7521 *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
   7522 *             k = 0
   7523 *
   7524 * And you'll find that:
   7525 *
   7526 *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
   7527 *
   7528 * Showing there's indeed a path between every CPU in at most O(log n) steps.
   7529 * The task movement gives a factor of O(m), giving a convergence complexity
   7530 * of:
   7531 *
   7532 *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8)
   7533 *
   7534 *
   7535 * WORK CONSERVING
   7536 *
   7537 * In order to avoid CPUs going idle while there's still work to do, new idle
   7538 * balancing is more aggressive and has the newly idle CPU iterate up the domain
   7539 * tree itself instead of relying on other CPUs to bring it work.
   7540 *
   7541 * This adds some complexity to both (5) and (8) but it reduces the total idle
   7542 * time.
   7543 *
   7544 * [XXX more?]
   7545 *
   7546 *
   7547 * CGROUPS
   7548 *
   7549 * Cgroups make a horror show out of (2), instead of a simple sum we get:
   7550 *
   7551 *                                s_k,i
   7552 *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
   7553 *                                 S_k
   7554 *
   7555 * Where
   7556 *
   7557 *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10)
   7558 *
   7559 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i.
   7560 *
   7561 * The big problem is S_k: it's a global sum needed to compute a local (W_i)
   7562 * property.
   7563 *
   7564 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
   7565 *      rewrite all of this once again.]
   7566 */
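/*
 * A worked example of the imbalance measure (4), with illustrative
 * numbers: take two CPUs of equal capacity C_0 = C_1 = 1024 and
 * instantaneous weights W_0 = 3072, W_1 = 1024, so W_0/C_0 = 3,
 * W_1/C_1 = 1 and avg(W/C) = 2. Then
 *
 *   imb_0,1 = max{2, 3} - min{2, 1} = 3 - 1 = 2
 *
 * and the balancer moves weight from CPU 0 towards CPU 1 until both
 * ratios meet the average, at which point (1) holds.
 */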
   7567
   7568static unsigned long __read_mostly max_load_balance_interval = HZ/10;
   7569
   7570enum fbq_type { regular, remote, all };
   7571
   7572/*
   7573 * 'group_type' describes the group of CPUs at the moment of load balancing.
   7574 *
   7575 * The enum is ordered by pulling priority, with the group with lowest priority
   7576 * first so the group_type can simply be compared when selecting the busiest
   7577 * group. See update_sd_pick_busiest().
   7578 */
   7579enum group_type {
   7580	/* The group has spare capacity that can be used to run more tasks.  */
   7581	group_has_spare = 0,
   7582	/*
   7583	 * The group is fully used and the tasks don't compete for more CPU
   7584	 * cycles. Nevertheless, some tasks might wait before running.
   7585	 */
   7586	group_fully_busy,
   7587	/*
   7588	 * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity
   7589	 * and must be migrated to a more powerful CPU.
   7590	 */
   7591	group_misfit_task,
   7592	/*
   7593	 * SD_ASYM_PACKING only: One local CPU with higher capacity is available,
   7594	 * and the task should be migrated to it instead of running on the
   7595	 * current CPU.
   7596	 */
   7597	group_asym_packing,
   7598	/*
   7599	 * The tasks' affinity constraints previously prevented the scheduler
   7600	 * from balancing the load across the system.
   7601	 */
   7602	group_imbalanced,
   7603	/*
   7604	 * The CPU is overloaded and can't provide expected CPU cycles to all
   7605	 * tasks.
   7606	 */
   7607	group_overloaded
   7608};
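/*
 * Because the values above are ordered by pulling priority, selecting
 * the busiest group can rely on a plain integer comparison; e.g. a
 * group_overloaded group (5) wins over a group_misfit_task one (2).
 * A sketch of the shape of that test when picking the busiest group:
 *
 *	if (sgs->group_type > busiest->group_type)
 *		return true;
 */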
   7609
   7610enum migration_type {
   7611	migrate_load = 0,
   7612	migrate_util,
   7613	migrate_task,
   7614	migrate_misfit
   7615};
   7616
   7617#define LBF_ALL_PINNED	0x01
   7618#define LBF_NEED_BREAK	0x02
   7619#define LBF_DST_PINNED  0x04
   7620#define LBF_SOME_PINNED	0x08
   7621#define LBF_ACTIVE_LB	0x10
   7622
   7623struct lb_env {
   7624	struct sched_domain	*sd;
   7625
   7626	struct rq		*src_rq;
   7627	int			src_cpu;
   7628
   7629	int			dst_cpu;
   7630	struct rq		*dst_rq;
   7631
   7632	struct cpumask		*dst_grpmask;
   7633	int			new_dst_cpu;
   7634	enum cpu_idle_type	idle;
   7635	long			imbalance;
   7636	/* The set of CPUs under consideration for load-balancing */
   7637	struct cpumask		*cpus;
   7638
   7639	unsigned int		flags;
   7640
   7641	unsigned int		loop;
   7642	unsigned int		loop_break;
   7643	unsigned int		loop_max;
   7644
   7645	enum fbq_type		fbq_type;
   7646	enum migration_type	migration_type;
   7647	struct list_head	tasks;
   7648};
   7649
   7650/*
   7651 * Is this task likely cache-hot:
   7652 */
   7653static int task_hot(struct task_struct *p, struct lb_env *env)
   7654{
   7655	s64 delta;
   7656
   7657	lockdep_assert_rq_held(env->src_rq);
   7658
   7659	if (p->sched_class != &fair_sched_class)
   7660		return 0;
   7661
   7662	if (unlikely(task_has_idle_policy(p)))
   7663		return 0;
   7664
   7665	/* SMT siblings share cache */
   7666	if (env->sd->flags & SD_SHARE_CPUCAPACITY)
   7667		return 0;
   7668
   7669	/*
   7670	 * Buddy candidates are cache hot:
   7671	 */
   7672	if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
   7673			(&p->se == cfs_rq_of(&p->se)->next ||
   7674			 &p->se == cfs_rq_of(&p->se)->last))
   7675		return 1;
   7676
   7677	if (sysctl_sched_migration_cost == -1)
   7678		return 1;
   7679
   7680	/*
   7681	 * Don't migrate task if the task's cookie does not match
   7682	 * with the destination CPU's core cookie.
   7683	 */
   7684	if (!sched_core_cookie_match(cpu_rq(env->dst_cpu), p))
   7685		return 1;
   7686
   7687	if (sysctl_sched_migration_cost == 0)
   7688		return 0;
   7689
   7690	delta = rq_clock_task(env->src_rq) - p->se.exec_start;
   7691
   7692	return delta < (s64)sysctl_sched_migration_cost;
   7693}
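/*
 * An illustrative example, assuming the usual 0.5 msec default for
 * sysctl_sched_migration_cost: a fair task that last started running
 * 0.2 msec ago on src_rq (as per se.exec_start) gives
 * delta = 0.2 msec < 0.5 msec and is treated as cache hot, while one
 * whose exec_start lies 2 msec in the past is not.
 */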
   7694
   7695#ifdef CONFIG_NUMA_BALANCING
   7696/*
   7697 * Returns 1, if task migration degrades locality
   7698 * Returns 0, if task migration improves locality, i.e. migration is preferred.
   7699 * Returns -1, if task migration is not affected by locality.
   7700 */
   7701static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
   7702{
   7703	struct numa_group *numa_group = rcu_dereference(p->numa_group);
   7704	unsigned long src_weight, dst_weight;
   7705	int src_nid, dst_nid, dist;
   7706
   7707	if (!static_branch_likely(&sched_numa_balancing))
   7708		return -1;
   7709
   7710	if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
   7711		return -1;
   7712
   7713	src_nid = cpu_to_node(env->src_cpu);
   7714	dst_nid = cpu_to_node(env->dst_cpu);
   7715
   7716	if (src_nid == dst_nid)
   7717		return -1;
   7718
   7719	/* Migrating away from the preferred node is always bad. */
   7720	if (src_nid == p->numa_preferred_nid) {
   7721		if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
   7722			return 1;
   7723		else
   7724			return -1;
   7725	}
   7726
   7727	/* Encourage migration to the preferred node. */
   7728	if (dst_nid == p->numa_preferred_nid)
   7729		return 0;
   7730
   7731	/* Leaving a core idle is often worse than degrading locality. */
   7732	if (env->idle == CPU_IDLE)
   7733		return -1;
   7734
   7735	dist = node_distance(src_nid, dst_nid);
   7736	if (numa_group) {
   7737		src_weight = group_weight(p, src_nid, dist);
   7738		dst_weight = group_weight(p, dst_nid, dist);
   7739	} else {
   7740		src_weight = task_weight(p, src_nid, dist);
   7741		dst_weight = task_weight(p, dst_nid, dist);
   7742	}
   7743
   7744	return dst_weight < src_weight;
   7745}
   7746
   7747#else
   7748static inline int migrate_degrades_locality(struct task_struct *p,
   7749					     struct lb_env *env)
   7750{
   7751	return -1;
   7752}
   7753#endif
   7754
   7755/*
   7756 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
   7757 */
   7758static
   7759int can_migrate_task(struct task_struct *p, struct lb_env *env)
   7760{
   7761	int tsk_cache_hot;
   7762
   7763	lockdep_assert_rq_held(env->src_rq);
   7764
   7765	/*
   7766	 * We do not migrate tasks that are:
   7767	 * 1) throttled_lb_pair, or
   7768	 * 2) cannot be migrated to this CPU due to cpus_ptr, or
   7769	 * 3) running (obviously), or
   7770	 * 4) are cache-hot on their current CPU.
   7771	 */
   7772	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
   7773		return 0;
   7774
   7775	/* Disregard pcpu kthreads; they are where they need to be. */
   7776	if (kthread_is_per_cpu(p))
   7777		return 0;
   7778
   7779	if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
   7780		int cpu;
   7781
   7782		schedstat_inc(p->stats.nr_failed_migrations_affine);
   7783
   7784		env->flags |= LBF_SOME_PINNED;
   7785
   7786		/*
   7787		 * Remember if this task can be migrated to any other CPU in
   7788		 * our sched_group. We may want to revisit it if we couldn't
   7789		 * meet load balance goals by pulling other tasks on src_cpu.
   7790		 *
   7791		 * Avoid computing new_dst_cpu
   7792		 * - for NEWLY_IDLE
   7793		 * - if we have already computed one in current iteration
   7794		 * - if it's an active balance
   7795		 */
   7796		if (env->idle == CPU_NEWLY_IDLE ||
   7797		    env->flags & (LBF_DST_PINNED | LBF_ACTIVE_LB))
   7798			return 0;
   7799
   7800		/* Prevent to re-select dst_cpu via env's CPUs: */
   7801		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
   7802			if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
   7803				env->flags |= LBF_DST_PINNED;
   7804				env->new_dst_cpu = cpu;
   7805				break;
   7806			}
   7807		}
   7808
   7809		return 0;
   7810	}
   7811
   7812	/* Record that we found at least one task that could run on dst_cpu */
   7813	env->flags &= ~LBF_ALL_PINNED;
   7814
   7815	if (task_running(env->src_rq, p)) {
   7816		schedstat_inc(p->stats.nr_failed_migrations_running);
   7817		return 0;
   7818	}
   7819
   7820	/*
   7821	 * Aggressive migration if:
   7822	 * 1) active balance
   7823	 * 2) destination numa is preferred
   7824	 * 3) task is cache cold, or
   7825	 * 4) too many balance attempts have failed.
   7826	 */
   7827	if (env->flags & LBF_ACTIVE_LB)
   7828		return 1;
   7829
   7830	tsk_cache_hot = migrate_degrades_locality(p, env);
   7831	if (tsk_cache_hot == -1)
   7832		tsk_cache_hot = task_hot(p, env);
   7833
   7834	if (tsk_cache_hot <= 0 ||
   7835	    env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
   7836		if (tsk_cache_hot == 1) {
   7837			schedstat_inc(env->sd->lb_hot_gained[env->idle]);
   7838			schedstat_inc(p->stats.nr_forced_migrations);
   7839		}
   7840		return 1;
   7841	}
   7842
   7843	schedstat_inc(p->stats.nr_failed_migrations_hot);
   7844	return 0;
   7845}
   7846
   7847/*
   7848 * detach_task() -- detach the task for the migration specified in env
   7849 */
   7850static void detach_task(struct task_struct *p, struct lb_env *env)
   7851{
   7852	lockdep_assert_rq_held(env->src_rq);
   7853
   7854	deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
   7855	set_task_cpu(p, env->dst_cpu);
   7856}
   7857
   7858/*
   7859 * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
   7860 * part of active balancing operations within "domain".
   7861 *
   7862 * Returns a task if successful and NULL otherwise.
   7863 */
   7864static struct task_struct *detach_one_task(struct lb_env *env)
   7865{
   7866	struct task_struct *p;
   7867
   7868	lockdep_assert_rq_held(env->src_rq);
   7869
   7870	list_for_each_entry_reverse(p,
   7871			&env->src_rq->cfs_tasks, se.group_node) {
   7872		if (!can_migrate_task(p, env))
   7873			continue;
   7874
   7875		detach_task(p, env);
   7876
   7877		/*
   7878		 * Right now, this is only the second place where
   7879		 * lb_gained[env->idle] is updated (other is detach_tasks)
   7880		 * so we can safely collect stats here rather than
   7881		 * inside detach_tasks().
   7882		 */
   7883		schedstat_inc(env->sd->lb_gained[env->idle]);
   7884		return p;
   7885	}
   7886	return NULL;
   7887}
   7888
   7889static const unsigned int sched_nr_migrate_break = 32;
   7890
   7891/*
   7892 * detach_tasks() -- tries to detach up to imbalance load/util/tasks from
   7893 * busiest_rq, as part of a balancing operation within domain "sd".
   7894 *
   7895 * Returns number of detached tasks if successful and 0 otherwise.
   7896 */
   7897static int detach_tasks(struct lb_env *env)
   7898{
   7899	struct list_head *tasks = &env->src_rq->cfs_tasks;
   7900	unsigned long util, load;
   7901	struct task_struct *p;
   7902	int detached = 0;
   7903
   7904	lockdep_assert_rq_held(env->src_rq);
   7905
   7906	/*
   7907	 * Source run queue has been emptied by another CPU, clear
   7908	 * LBF_ALL_PINNED flag as we will not test any task.
   7909	 */
   7910	if (env->src_rq->nr_running <= 1) {
   7911		env->flags &= ~LBF_ALL_PINNED;
   7912		return 0;
   7913	}
   7914
   7915	if (env->imbalance <= 0)
   7916		return 0;
   7917
   7918	while (!list_empty(tasks)) {
   7919		/*
   7920		 * We don't want to steal all the tasks, otherwise we may be
   7921		 * treated likewise, which could at worst lead to a livelock.
   7922		 */
   7923		if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
   7924			break;
   7925
   7926		p = list_last_entry(tasks, struct task_struct, se.group_node);
   7927
   7928		env->loop++;
   7929		/* We've more or less seen every task there is, call it quits */
   7930		if (env->loop > env->loop_max)
   7931			break;
   7932
   7933		/* take a breather every nr_migrate tasks */
   7934		if (env->loop > env->loop_break) {
   7935			env->loop_break += sched_nr_migrate_break;
   7936			env->flags |= LBF_NEED_BREAK;
   7937			break;
   7938		}
   7939
   7940		if (!can_migrate_task(p, env))
   7941			goto next;
   7942
   7943		switch (env->migration_type) {
   7944		case migrate_load:
   7945			/*
   7946			 * Depending of the number of CPUs and tasks and the
   7947			 * cgroup hierarchy, task_h_load() can return a null
   7948			 * value. Make sure that env->imbalance decreases
   7949			 * otherwise detach_tasks() will stop only after
   7950			 * detaching up to loop_max tasks.
   7951			 */
   7952			load = max_t(unsigned long, task_h_load(p), 1);
   7953
   7954			if (sched_feat(LB_MIN) &&
   7955			    load < 16 && !env->sd->nr_balance_failed)
   7956				goto next;
   7957
   7958			/*
   7959			 * Make sure that we don't migrate too much load.
   7960			 * Nevertheless, let's relax the constraint if the
   7961			 * scheduler fails to find a good waiting task to
   7962			 * migrate.
   7963			 */
   7964			if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance)
   7965				goto next;
   7966
   7967			env->imbalance -= load;
   7968			break;
   7969
   7970		case migrate_util:
   7971			util = task_util_est(p);
   7972
   7973			if (util > env->imbalance)
   7974				goto next;
   7975
   7976			env->imbalance -= util;
   7977			break;
   7978
   7979		case migrate_task:
   7980			env->imbalance--;
   7981			break;
   7982
   7983		case migrate_misfit:
   7984			/* This is not a misfit task */
   7985			if (task_fits_capacity(p, capacity_of(env->src_cpu)))
   7986				goto next;
   7987
   7988			env->imbalance = 0;
   7989			break;
   7990		}
   7991
   7992		detach_task(p, env);
   7993		list_add(&p->se.group_node, &env->tasks);
   7994
   7995		detached++;
   7996
   7997#ifdef CONFIG_PREEMPTION
   7998		/*
   7999		 * NEWIDLE balancing is a source of latency, so preemptible
   8000		 * kernels will stop after the first task is detached to minimize
   8001		 * the critical section.
   8002		 */
   8003		if (env->idle == CPU_NEWLY_IDLE)
   8004			break;
   8005#endif
   8006
   8007		/*
   8008		 * We only want to steal up to the prescribed amount of
   8009		 * load/util/tasks.
   8010		 */
   8011		if (env->imbalance <= 0)
   8012			break;
   8013
   8014		continue;
   8015next:
   8016		list_move(&p->se.group_node, tasks);
   8017	}
   8018
   8019	/*
   8020	 * Right now, this is one of only two places we collect this stat
   8021	 * so we can safely collect detach_one_task() stats here rather
   8022	 * than inside detach_one_task().
   8023	 */
   8024	schedstat_add(env->sd->lb_gained[env->idle], detached);
   8025
   8026	return detached;
   8027}
   8028
   8029/*
   8030 * attach_task() -- attach the task detached by detach_task() to its new rq.
   8031 */
   8032static void attach_task(struct rq *rq, struct task_struct *p)
   8033{
   8034	lockdep_assert_rq_held(rq);
   8035
   8036	BUG_ON(task_rq(p) != rq);
   8037	activate_task(rq, p, ENQUEUE_NOCLOCK);
   8038	check_preempt_curr(rq, p, 0);
   8039}
   8040
   8041/*
   8042 * attach_one_task() -- attaches the task returned from detach_one_task() to
   8043 * its new rq.
   8044 */
   8045static void attach_one_task(struct rq *rq, struct task_struct *p)
   8046{
   8047	struct rq_flags rf;
   8048
   8049	rq_lock(rq, &rf);
   8050	update_rq_clock(rq);
   8051	attach_task(rq, p);
   8052	rq_unlock(rq, &rf);
   8053}
   8054
   8055/*
   8056 * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
   8057 * new rq.
   8058 */
   8059static void attach_tasks(struct lb_env *env)
   8060{
   8061	struct list_head *tasks = &env->tasks;
   8062	struct task_struct *p;
   8063	struct rq_flags rf;
   8064
   8065	rq_lock(env->dst_rq, &rf);
   8066	update_rq_clock(env->dst_rq);
   8067
   8068	while (!list_empty(tasks)) {
   8069		p = list_first_entry(tasks, struct task_struct, se.group_node);
   8070		list_del_init(&p->se.group_node);
   8071
   8072		attach_task(env->dst_rq, p);
   8073	}
   8074
   8075	rq_unlock(env->dst_rq, &rf);
   8076}
   8077
   8078#ifdef CONFIG_NO_HZ_COMMON
   8079static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
   8080{
   8081	if (cfs_rq->avg.load_avg)
   8082		return true;
   8083
   8084	if (cfs_rq->avg.util_avg)
   8085		return true;
   8086
   8087	return false;
   8088}
   8089
   8090static inline bool others_have_blocked(struct rq *rq)
   8091{
   8092	if (READ_ONCE(rq->avg_rt.util_avg))
   8093		return true;
   8094
   8095	if (READ_ONCE(rq->avg_dl.util_avg))
   8096		return true;
   8097
   8098	if (thermal_load_avg(rq))
   8099		return true;
   8100
   8101#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
   8102	if (READ_ONCE(rq->avg_irq.util_avg))
   8103		return true;
   8104#endif
   8105
   8106	return false;
   8107}
   8108
   8109static inline void update_blocked_load_tick(struct rq *rq)
   8110{
   8111	WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies);
   8112}
   8113
   8114static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
   8115{
   8116	if (!has_blocked)
   8117		rq->has_blocked_load = 0;
   8118}
   8119#else
   8120static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
   8121static inline bool others_have_blocked(struct rq *rq) { return false; }
   8122static inline void update_blocked_load_tick(struct rq *rq) {}
   8123static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
   8124#endif
   8125
   8126static bool __update_blocked_others(struct rq *rq, bool *done)
   8127{
   8128	const struct sched_class *curr_class;
   8129	u64 now = rq_clock_pelt(rq);
   8130	unsigned long thermal_pressure;
   8131	bool decayed;
   8132
   8133	/*
   8134	 * update_load_avg() can call cpufreq_update_util(). Make sure that RT,
   8135	 * DL and IRQ signals have been updated before updating CFS.
   8136	 */
   8137	curr_class = rq->curr->sched_class;
   8138
   8139	thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
   8140
   8141	decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
   8142		  update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
   8143		  update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) |
   8144		  update_irq_load_avg(rq, 0);
   8145
   8146	if (others_have_blocked(rq))
   8147		*done = false;
   8148
   8149	return decayed;
   8150}
   8151
   8152#ifdef CONFIG_FAIR_GROUP_SCHED
   8153
   8154static bool __update_blocked_fair(struct rq *rq, bool *done)
   8155{
   8156	struct cfs_rq *cfs_rq, *pos;
   8157	bool decayed = false;
   8158	int cpu = cpu_of(rq);
   8159
   8160	/*
   8161	 * Iterates the task_group tree in a bottom up fashion, see
   8162	 * list_add_leaf_cfs_rq() for details.
   8163	 */
   8164	for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
   8165		struct sched_entity *se;
   8166
   8167		if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
   8168			update_tg_load_avg(cfs_rq);
   8169
   8170			if (cfs_rq == &rq->cfs)
   8171				decayed = true;
   8172		}
   8173
   8174		/* Propagate pending load changes to the parent, if any: */
   8175		se = cfs_rq->tg->se[cpu];
   8176		if (se && !skip_blocked_update(se))
   8177			update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
   8178
   8179		/*
   8180		 * There can be a lot of idle CPU cgroups.  Don't let fully
   8181		 * decayed cfs_rqs linger on the list.
   8182		 */
   8183		if (cfs_rq_is_decayed(cfs_rq))
   8184			list_del_leaf_cfs_rq(cfs_rq);
   8185
   8186		/* Don't need periodic decay once load/util_avg are null */
   8187		if (cfs_rq_has_blocked(cfs_rq))
   8188			*done = false;
   8189	}
   8190
   8191	return decayed;
   8192}
   8193
   8194/*
   8195 * Compute the hierarchical load factor for cfs_rq and all its ascendants.
   8196 * This needs to be done in a top-down fashion because the load of a child
   8197 * group is a fraction of its parent's load.
   8198 */
   8199static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
   8200{
   8201	struct rq *rq = rq_of(cfs_rq);
   8202	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
   8203	unsigned long now = jiffies;
   8204	unsigned long load;
   8205
   8206	if (cfs_rq->last_h_load_update == now)
   8207		return;
   8208
   8209	WRITE_ONCE(cfs_rq->h_load_next, NULL);
   8210	for_each_sched_entity(se) {
   8211		cfs_rq = cfs_rq_of(se);
   8212		WRITE_ONCE(cfs_rq->h_load_next, se);
   8213		if (cfs_rq->last_h_load_update == now)
   8214			break;
   8215	}
   8216
   8217	if (!se) {
   8218		cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
   8219		cfs_rq->last_h_load_update = now;
   8220	}
   8221
   8222	while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
   8223		load = cfs_rq->h_load;
   8224		load = div64_ul(load * se->avg.load_avg,
   8225			cfs_rq_load_avg(cfs_rq) + 1);
   8226		cfs_rq = group_cfs_rq(se);
   8227		cfs_rq->h_load = load;
   8228		cfs_rq->last_h_load_update = now;
   8229	}
   8230}
   8231
   8232static unsigned long task_h_load(struct task_struct *p)
   8233{
   8234	struct cfs_rq *cfs_rq = task_cfs_rq(p);
   8235
   8236	update_cfs_rq_h_load(cfs_rq);
   8237	return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
   8238			cfs_rq_load_avg(cfs_rq) + 1);
   8239}
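/*
 * A worked example with illustrative numbers: assume the root cfs_rq has
 * h_load = load_avg = 1024 and cgroup A's group entity contributes
 * load_avg = 512 of it; A's cfs_rq then gets
 * h_load ~= 1024 * 512 / 1024 = 512. A task in A with load_avg = 256,
 * where A's cfs_rq load_avg is 512, reports
 * task_h_load() ~= 512 * 256 / 512 = 256, i.e. its share of its group's
 * share of the root (the "+ 1" terms above only guard against division
 * by zero).
 */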
   8240#else
   8241static bool __update_blocked_fair(struct rq *rq, bool *done)
   8242{
   8243	struct cfs_rq *cfs_rq = &rq->cfs;
   8244	bool decayed;
   8245
   8246	decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
   8247	if (cfs_rq_has_blocked(cfs_rq))
   8248		*done = false;
   8249
   8250	return decayed;
   8251}
   8252
   8253static unsigned long task_h_load(struct task_struct *p)
   8254{
   8255	return p->se.avg.load_avg;
   8256}
   8257#endif
   8258
   8259static void update_blocked_averages(int cpu)
   8260{
   8261	bool decayed = false, done = true;
   8262	struct rq *rq = cpu_rq(cpu);
   8263	struct rq_flags rf;
   8264
   8265	rq_lock_irqsave(rq, &rf);
   8266	update_blocked_load_tick(rq);
   8267	update_rq_clock(rq);
   8268
   8269	decayed |= __update_blocked_others(rq, &done);
   8270	decayed |= __update_blocked_fair(rq, &done);
   8271
   8272	update_blocked_load_status(rq, !done);
   8273	if (decayed)
   8274		cpufreq_update_util(rq, 0);
   8275	rq_unlock_irqrestore(rq, &rf);
   8276}
   8277
   8278/********** Helpers for find_busiest_group ************************/
   8279
   8280/*
   8281 * sg_lb_stats - stats of a sched_group required for load_balancing
   8282 */
   8283struct sg_lb_stats {
   8284	unsigned long avg_load; /* Avg load across the CPUs of the group */
   8285	unsigned long group_load; /* Total load over the CPUs of the group */
   8286	unsigned long group_capacity;
   8287	unsigned long group_util; /* Total utilization over the CPUs of the group */
   8288	unsigned long group_runnable; /* Total runnable time over the CPUs of the group */
   8289	unsigned int sum_nr_running; /* Nr of tasks running in the group */
   8290	unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
   8291	unsigned int idle_cpus;
   8292	unsigned int group_weight;
   8293	enum group_type group_type;
   8294	unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
   8295	unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
   8296#ifdef CONFIG_NUMA_BALANCING
   8297	unsigned int nr_numa_running;
   8298	unsigned int nr_preferred_running;
   8299#endif
   8300};
   8301
   8302/*
   8303 * sd_lb_stats - Structure to store the statistics of a sched_domain
   8304 *		 during load balancing.
   8305 */
   8306struct sd_lb_stats {
   8307	struct sched_group *busiest;	/* Busiest group in this sd */
   8308	struct sched_group *local;	/* Local group in this sd */
   8309	unsigned long total_load;	/* Total load of all groups in sd */
   8310	unsigned long total_capacity;	/* Total capacity of all groups in sd */
   8311	unsigned long avg_load;	/* Average load across all groups in sd */
   8312	unsigned int prefer_sibling; /* tasks should go to sibling first */
   8313
   8314	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
   8315	struct sg_lb_stats local_stat;	/* Statistics of the local group */
   8316};
   8317
   8318static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
   8319{
   8320	/*
   8321	 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
   8322	 * local_stat because update_sg_lb_stats() does a full clear/assignment.
   8323	 * We must however set busiest_stat::group_type and
   8324	 * busiest_stat::idle_cpus to the worst busiest group because
   8325	 * update_sd_pick_busiest() reads these before assignment.
   8326	 */
   8327	*sds = (struct sd_lb_stats){
   8328		.busiest = NULL,
   8329		.local = NULL,
   8330		.total_load = 0UL,
   8331		.total_capacity = 0UL,
   8332		.busiest_stat = {
   8333			.idle_cpus = UINT_MAX,
   8334			.group_type = group_has_spare,
   8335		},
   8336	};
   8337}
   8338
   8339static unsigned long scale_rt_capacity(int cpu)
   8340{
   8341	struct rq *rq = cpu_rq(cpu);
   8342	unsigned long max = arch_scale_cpu_capacity(cpu);
   8343	unsigned long used, free;
   8344	unsigned long irq;
   8345
   8346	irq = cpu_util_irq(rq);
   8347
   8348	if (unlikely(irq >= max))
   8349		return 1;
   8350
   8351	/*
   8352	 * avg_rt.util_avg and avg_dl.util_avg track binary signals
   8353	 * (running and not running) with weights 0 and 1024 respectively.
   8354	 * avg_thermal.load_avg tracks thermal pressure and the weighted
   8355	 * average uses the actual delta max capacity(load).
   8356	 */
   8357	used = READ_ONCE(rq->avg_rt.util_avg);
   8358	used += READ_ONCE(rq->avg_dl.util_avg);
   8359	used += thermal_load_avg(rq);
   8360
   8361	if (unlikely(used >= max))
   8362		return 1;
   8363
   8364	free = max - used;
   8365
   8366	return scale_irq_capacity(free, irq, max);
   8367}
   8368
   8369static void update_cpu_capacity(struct sched_domain *sd, int cpu)
   8370{
   8371	unsigned long capacity = scale_rt_capacity(cpu);
   8372	struct sched_group *sdg = sd->groups;
   8373
   8374	cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
   8375
   8376	if (!capacity)
   8377		capacity = 1;
   8378
   8379	cpu_rq(cpu)->cpu_capacity = capacity;
   8380	trace_sched_cpu_capacity_tp(cpu_rq(cpu));
   8381
   8382	sdg->sgc->capacity = capacity;
   8383	sdg->sgc->min_capacity = capacity;
   8384	sdg->sgc->max_capacity = capacity;
   8385}
   8386
   8387void update_group_capacity(struct sched_domain *sd, int cpu)
   8388{
   8389	struct sched_domain *child = sd->child;
   8390	struct sched_group *group, *sdg = sd->groups;
   8391	unsigned long capacity, min_capacity, max_capacity;
   8392	unsigned long interval;
   8393
   8394	interval = msecs_to_jiffies(sd->balance_interval);
   8395	interval = clamp(interval, 1UL, max_load_balance_interval);
   8396	sdg->sgc->next_update = jiffies + interval;
   8397
   8398	if (!child) {
   8399		update_cpu_capacity(sd, cpu);
   8400		return;
   8401	}
   8402
   8403	capacity = 0;
   8404	min_capacity = ULONG_MAX;
   8405	max_capacity = 0;
   8406
   8407	if (child->flags & SD_OVERLAP) {
   8408		/*
   8409		 * SD_OVERLAP domains cannot assume that child groups
   8410		 * span the current group.
   8411		 */
   8412
   8413		for_each_cpu(cpu, sched_group_span(sdg)) {
   8414			unsigned long cpu_cap = capacity_of(cpu);
   8415
   8416			capacity += cpu_cap;
   8417			min_capacity = min(cpu_cap, min_capacity);
   8418			max_capacity = max(cpu_cap, max_capacity);
   8419		}
   8420	} else  {
   8421		/*
   8422		 * !SD_OVERLAP domains can assume that child groups
   8423		 * span the current group.
   8424		 */
   8425
   8426		group = child->groups;
   8427		do {
   8428			struct sched_group_capacity *sgc = group->sgc;
   8429
   8430			capacity += sgc->capacity;
   8431			min_capacity = min(sgc->min_capacity, min_capacity);
   8432			max_capacity = max(sgc->max_capacity, max_capacity);
   8433			group = group->next;
   8434		} while (group != child->groups);
   8435	}
   8436
   8437	sdg->sgc->capacity = capacity;
   8438	sdg->sgc->min_capacity = min_capacity;
   8439	sdg->sgc->max_capacity = max_capacity;
   8440}
   8441
   8442/*
   8443 * Check whether the capacity of the rq has been noticeably reduced by side
   8444 * activity. The imbalance_pct is used for the threshold.
    8445 * Return true if the capacity is reduced.
   8446 */
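        /*
         * For example, with the common default imbalance_pct of 117 this
         * returns true once cpu_capacity has dropped below ~85% of
         * cpu_capacity_orig, i.e. once more than ~15% of the CPU is consumed
         * by other classes, IRQs or thermal pressure.
         */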
   8447static inline int
   8448check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
   8449{
   8450	return ((rq->cpu_capacity * sd->imbalance_pct) <
   8451				(rq->cpu_capacity_orig * 100));
   8452}
   8453
   8454/*
   8455 * Check whether a rq has a misfit task and if it looks like we can actually
   8456 * help that task: we can migrate the task to a CPU of higher capacity, or
   8457 * the task's current CPU is heavily pressured.
   8458 */
   8459static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
   8460{
   8461	return rq->misfit_task_load &&
   8462		(rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
   8463		 check_cpu_capacity(rq, sd));
   8464}
   8465
   8466/*
   8467 * Group imbalance indicates (and tries to solve) the problem where balancing
   8468 * groups is inadequate due to ->cpus_ptr constraints.
   8469 *
   8470 * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
   8471 * cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
   8472 * Something like:
   8473 *
   8474 *	{ 0 1 2 3 } { 4 5 6 7 }
   8475 *	        *     * * *
   8476 *
   8477 * If we were to balance group-wise we'd place two tasks in the first group and
   8478 * two tasks in the second group. Clearly this is undesired as it will overload
   8479 * cpu 3 and leave one of the CPUs in the second group unused.
   8480 *
   8481 * The current solution to this issue is detecting the skew in the first group
   8482 * by noticing the lower domain failed to reach balance and had difficulty
   8483 * moving tasks due to affinity constraints.
   8484 *
   8485 * When this is so detected; this group becomes a candidate for busiest; see
   8486 * update_sd_pick_busiest(). And calculate_imbalance() and
   8487 * find_busiest_group() avoid some of the usual balance conditions to allow it
   8488 * to create an effective group imbalance.
   8489 *
   8490 * This is a somewhat tricky proposition since the next run might not find the
   8491 * group imbalance and decide the groups need to be balanced again. A most
   8492 * subtle and fragile situation.
   8493 */
   8494
   8495static inline int sg_imbalanced(struct sched_group *group)
   8496{
   8497	return group->sgc->imbalance;
   8498}
   8499
   8500/*
   8501 * group_has_capacity returns true if the group has spare capacity that could
   8502 * be used by some tasks.
    8503 * We consider that a group has spare capacity if the number of tasks is
    8504 * smaller than the number of CPUs or if the utilization is lower than the
    8505 * available capacity for CFS tasks.
    8506 * For the latter, we use a threshold to stabilize the state, to take into
    8507 * account the variance of the tasks' load and to return true only if the
    8508 * available capacity is meaningful for the load balancer.
    8509 * As an example, an available capacity of 1% can appear but it doesn't
    8510 * provide any benefit to the load balancer.
   8511 */
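        /*
         * For example, with an imbalance_pct of 117 a group running at least
         * as many tasks as it has CPUs is still reported as having capacity as
         * long as group_runnable does not exceed ~117% of group_capacity and
         * group_util stays below ~85% of group_capacity.
         */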
   8512static inline bool
   8513group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
   8514{
   8515	if (sgs->sum_nr_running < sgs->group_weight)
   8516		return true;
   8517
   8518	if ((sgs->group_capacity * imbalance_pct) <
   8519			(sgs->group_runnable * 100))
   8520		return false;
   8521
   8522	if ((sgs->group_capacity * 100) >
   8523			(sgs->group_util * imbalance_pct))
   8524		return true;
   8525
   8526	return false;
   8527}
   8528
   8529/*
   8530 *  group_is_overloaded returns true if the group has more tasks than it can
   8531 *  handle.
    8532 *  group_is_overloaded is not equal to !group_has_capacity because a group
    8533 *  with exactly the right number of tasks has no spare capacity left, but it
    8534 *  is not overloaded, so both group_has_capacity and group_is_overloaded
    8535 *  return false.
   8536 */
   8537static inline bool
   8538group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
   8539{
   8540	if (sgs->sum_nr_running <= sgs->group_weight)
   8541		return false;
   8542
   8543	if ((sgs->group_capacity * 100) <
   8544			(sgs->group_util * imbalance_pct))
   8545		return true;
   8546
   8547	if ((sgs->group_capacity * imbalance_pct) <
   8548			(sgs->group_runnable * 100))
   8549		return true;
   8550
   8551	return false;
   8552}
   8553
   8554static inline enum
   8555group_type group_classify(unsigned int imbalance_pct,
   8556			  struct sched_group *group,
   8557			  struct sg_lb_stats *sgs)
   8558{
   8559	if (group_is_overloaded(imbalance_pct, sgs))
   8560		return group_overloaded;
   8561
   8562	if (sg_imbalanced(group))
   8563		return group_imbalanced;
   8564
   8565	if (sgs->group_asym_packing)
   8566		return group_asym_packing;
   8567
   8568	if (sgs->group_misfit_task_load)
   8569		return group_misfit_task;
   8570
   8571	if (!group_has_capacity(imbalance_pct, sgs))
   8572		return group_fully_busy;
   8573
   8574	return group_has_spare;
   8575}
   8576
   8577/**
   8578 * asym_smt_can_pull_tasks - Check whether the load balancing CPU can pull tasks
   8579 * @dst_cpu:	Destination CPU of the load balancing
   8580 * @sds:	Load-balancing data with statistics of the local group
   8581 * @sgs:	Load-balancing statistics of the candidate busiest group
   8582 * @sg:		The candidate busiest group
   8583 *
   8584 * Check the state of the SMT siblings of both @sds::local and @sg and decide
   8585 * if @dst_cpu can pull tasks.
   8586 *
   8587 * If @dst_cpu does not have SMT siblings, it can pull tasks if two or more of
   8588 * the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks
   8589 * only if @dst_cpu has higher priority.
   8590 *
   8591 * If both @dst_cpu and @sg have SMT siblings, and @sg has exactly one more
   8592 * busy CPU than @sds::local, let @dst_cpu pull tasks if it has higher priority.
   8593 * Bigger imbalances in the number of busy CPUs will be dealt with in
   8594 * update_sd_pick_busiest().
   8595 *
   8596 * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings
   8597 * of @dst_cpu are idle and @sg has lower priority.
   8598 *
   8599 * Return: true if @dst_cpu can pull tasks, false otherwise.
   8600 */
   8601static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds,
   8602				    struct sg_lb_stats *sgs,
   8603				    struct sched_group *sg)
   8604{
   8605#ifdef CONFIG_SCHED_SMT
   8606	bool local_is_smt, sg_is_smt;
   8607	int sg_busy_cpus;
   8608
   8609	local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY;
   8610	sg_is_smt = sg->flags & SD_SHARE_CPUCAPACITY;
   8611
   8612	sg_busy_cpus = sgs->group_weight - sgs->idle_cpus;
   8613
   8614	if (!local_is_smt) {
   8615		/*
   8616		 * If we are here, @dst_cpu is idle and does not have SMT
   8617		 * siblings. Pull tasks if candidate group has two or more
   8618		 * busy CPUs.
   8619		 */
   8620		if (sg_busy_cpus >= 2) /* implies sg_is_smt */
   8621			return true;
   8622
   8623		/*
   8624		 * @dst_cpu does not have SMT siblings. @sg may have SMT
   8625		 * siblings and only one is busy. In such case, @dst_cpu
   8626		 * can help if it has higher priority and is idle (i.e.,
   8627		 * it has no running tasks).
   8628		 */
   8629		return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
   8630	}
   8631
   8632	/* @dst_cpu has SMT siblings. */
   8633
   8634	if (sg_is_smt) {
   8635		int local_busy_cpus = sds->local->group_weight -
   8636				      sds->local_stat.idle_cpus;
   8637		int busy_cpus_delta = sg_busy_cpus - local_busy_cpus;
   8638
   8639		if (busy_cpus_delta == 1)
   8640			return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
   8641
   8642		return false;
   8643	}
   8644
   8645	/*
   8646	 * @sg does not have SMT siblings. Ensure that @sds::local does not end
   8647	 * up with more than one busy SMT sibling and only pull tasks if there
    8648	 * are no busy CPUs (i.e., no CPU has running tasks).
   8649	 */
   8650	if (!sds->local_stat.sum_nr_running)
   8651		return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
   8652
   8653	return false;
   8654#else
   8655	/* Always return false so that callers deal with non-SMT cases. */
   8656	return false;
   8657#endif
   8658}
   8659
   8660static inline bool
   8661sched_asym(struct lb_env *env, struct sd_lb_stats *sds,  struct sg_lb_stats *sgs,
   8662	   struct sched_group *group)
   8663{
   8664	/* Only do SMT checks if either local or candidate have SMT siblings */
   8665	if ((sds->local->flags & SD_SHARE_CPUCAPACITY) ||
   8666	    (group->flags & SD_SHARE_CPUCAPACITY))
   8667		return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group);
   8668
   8669	return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
   8670}
   8671
   8672/**
   8673 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
   8674 * @env: The load balancing environment.
   8675 * @sds: Load-balancing data with statistics of the local group.
   8676 * @group: sched_group whose statistics are to be updated.
   8677 * @sgs: variable to hold the statistics for this group.
   8678 * @sg_status: Holds flag indicating the status of the sched_group
   8679 */
   8680static inline void update_sg_lb_stats(struct lb_env *env,
   8681				      struct sd_lb_stats *sds,
   8682				      struct sched_group *group,
   8683				      struct sg_lb_stats *sgs,
   8684				      int *sg_status)
   8685{
   8686	int i, nr_running, local_group;
   8687
   8688	memset(sgs, 0, sizeof(*sgs));
   8689
   8690	local_group = group == sds->local;
   8691
   8692	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
   8693		struct rq *rq = cpu_rq(i);
   8694
   8695		sgs->group_load += cpu_load(rq);
   8696		sgs->group_util += cpu_util_cfs(i);
   8697		sgs->group_runnable += cpu_runnable(rq);
   8698		sgs->sum_h_nr_running += rq->cfs.h_nr_running;
   8699
   8700		nr_running = rq->nr_running;
   8701		sgs->sum_nr_running += nr_running;
   8702
   8703		if (nr_running > 1)
   8704			*sg_status |= SG_OVERLOAD;
   8705
   8706		if (cpu_overutilized(i))
   8707			*sg_status |= SG_OVERUTILIZED;
   8708
   8709#ifdef CONFIG_NUMA_BALANCING
   8710		sgs->nr_numa_running += rq->nr_numa_running;
   8711		sgs->nr_preferred_running += rq->nr_preferred_running;
   8712#endif
   8713		/*
   8714		 * No need to call idle_cpu() if nr_running is not 0
   8715		 */
   8716		if (!nr_running && idle_cpu(i)) {
   8717			sgs->idle_cpus++;
   8718			/* Idle cpu can't have misfit task */
   8719			continue;
   8720		}
   8721
   8722		if (local_group)
   8723			continue;
   8724
   8725		/* Check for a misfit task on the cpu */
   8726		if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
   8727		    sgs->group_misfit_task_load < rq->misfit_task_load) {
   8728			sgs->group_misfit_task_load = rq->misfit_task_load;
   8729			*sg_status |= SG_OVERLOAD;
   8730		}
   8731	}
   8732
   8733	sgs->group_capacity = group->sgc->capacity;
   8734
   8735	sgs->group_weight = group->group_weight;
   8736
   8737	/* Check if dst CPU is idle and preferred to this group */
   8738	if (!local_group && env->sd->flags & SD_ASYM_PACKING &&
   8739	    env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running &&
   8740	    sched_asym(env, sds, sgs, group)) {
   8741		sgs->group_asym_packing = 1;
   8742	}
   8743
   8744	sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
   8745
   8746	/* Computing avg_load makes sense only when group is overloaded */
   8747	if (sgs->group_type == group_overloaded)
   8748		sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
   8749				sgs->group_capacity;
   8750}
   8751
   8752/**
   8753 * update_sd_pick_busiest - return 1 on busiest group
   8754 * @env: The load balancing environment.
   8755 * @sds: sched_domain statistics
   8756 * @sg: sched_group candidate to be checked for being the busiest
   8757 * @sgs: sched_group statistics
   8758 *
   8759 * Determine if @sg is a busier group than the previously selected
   8760 * busiest group.
   8761 *
   8762 * Return: %true if @sg is a busier group than the previously selected
   8763 * busiest group. %false otherwise.
   8764 */
   8765static bool update_sd_pick_busiest(struct lb_env *env,
   8766				   struct sd_lb_stats *sds,
   8767				   struct sched_group *sg,
   8768				   struct sg_lb_stats *sgs)
   8769{
   8770	struct sg_lb_stats *busiest = &sds->busiest_stat;
   8771
   8772	/* Make sure that there is at least one task to pull */
   8773	if (!sgs->sum_h_nr_running)
   8774		return false;
   8775
   8776	/*
   8777	 * Don't try to pull misfit tasks we can't help.
   8778	 * We can use max_capacity here as reduction in capacity on some
   8779	 * CPUs in the group should either be possible to resolve
   8780	 * internally or be covered by avg_load imbalance (eventually).
   8781	 */
   8782	if (sgs->group_type == group_misfit_task &&
   8783	    (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
   8784	     sds->local_stat.group_type != group_has_spare))
   8785		return false;
   8786
   8787	if (sgs->group_type > busiest->group_type)
   8788		return true;
   8789
   8790	if (sgs->group_type < busiest->group_type)
   8791		return false;
   8792
   8793	/*
   8794	 * The candidate and the current busiest group are the same type of
    8795	 * group. Let's check which one is the busiest according to the type.
   8796	 */
   8797
   8798	switch (sgs->group_type) {
   8799	case group_overloaded:
   8800		/* Select the overloaded group with highest avg_load. */
   8801		if (sgs->avg_load <= busiest->avg_load)
   8802			return false;
   8803		break;
   8804
   8805	case group_imbalanced:
   8806		/*
   8807		 * Select the 1st imbalanced group as we don't have any way to
   8808		 * choose one more than another.
   8809		 */
   8810		return false;
   8811
   8812	case group_asym_packing:
    8813		/* Prefer to move work away from the lowest priority CPU */
   8814		if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
   8815			return false;
   8816		break;
   8817
   8818	case group_misfit_task:
   8819		/*
   8820		 * If we have more than one misfit sg go with the biggest
   8821		 * misfit.
   8822		 */
   8823		if (sgs->group_misfit_task_load < busiest->group_misfit_task_load)
   8824			return false;
   8825		break;
   8826
   8827	case group_fully_busy:
   8828		/*
   8829		 * Select the fully busy group with highest avg_load. In
   8830		 * theory, there is no need to pull task from such kind of
   8831		 * group because tasks have all compute capacity that they need
   8832		 * but we can still improve the overall throughput by reducing
   8833		 * contention when accessing shared HW resources.
   8834		 *
   8835		 * XXX for now avg_load is not computed and always 0 so we
   8836		 * select the 1st one.
   8837		 */
   8838		if (sgs->avg_load <= busiest->avg_load)
   8839			return false;
   8840		break;
   8841
   8842	case group_has_spare:
   8843		/*
    8844		 * Select the non-overloaded group with the lowest number of
    8845		 * idle CPUs and the highest number of running tasks. We could
    8846		 * also compare the spare capacity, which is more stable, but a
    8847		 * group can end up with less spare capacity yet more idle CPUs,
    8848		 * which means less opportunity to pull tasks.
   8849		 */
   8850		if (sgs->idle_cpus > busiest->idle_cpus)
   8851			return false;
   8852		else if ((sgs->idle_cpus == busiest->idle_cpus) &&
   8853			 (sgs->sum_nr_running <= busiest->sum_nr_running))
   8854			return false;
   8855
   8856		break;
   8857	}
   8858
   8859	/*
   8860	 * Candidate sg has no more than one task per CPU and has higher
   8861	 * per-CPU capacity. Migrating tasks to less capable CPUs may harm
   8862	 * throughput. Maximize throughput, power/energy consequences are not
   8863	 * considered.
   8864	 */
   8865	if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
   8866	    (sgs->group_type <= group_fully_busy) &&
   8867	    (capacity_greater(sg->sgc->min_capacity, capacity_of(env->dst_cpu))))
   8868		return false;
   8869
   8870	return true;
   8871}
   8872
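        /*
         * Classify a group or runqueue for NUMA-aware balancing: "regular" if
         * it also runs non-NUMA tasks, "remote" if it runs NUMA tasks on the
         * wrong node, "all" otherwise. See the comment in find_busiest_queue()
         * for how this classification is used.
         */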
   8873#ifdef CONFIG_NUMA_BALANCING
   8874static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
   8875{
   8876	if (sgs->sum_h_nr_running > sgs->nr_numa_running)
   8877		return regular;
   8878	if (sgs->sum_h_nr_running > sgs->nr_preferred_running)
   8879		return remote;
   8880	return all;
   8881}
   8882
   8883static inline enum fbq_type fbq_classify_rq(struct rq *rq)
   8884{
   8885	if (rq->nr_running > rq->nr_numa_running)
   8886		return regular;
   8887	if (rq->nr_running > rq->nr_preferred_running)
   8888		return remote;
   8889	return all;
   8890}
   8891#else
   8892static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
   8893{
   8894	return all;
   8895}
   8896
   8897static inline enum fbq_type fbq_classify_rq(struct rq *rq)
   8898{
   8899	return regular;
   8900}
   8901#endif /* CONFIG_NUMA_BALANCING */
   8902
   8903
   8904struct sg_lb_stats;
   8905
   8906/*
   8907 * task_running_on_cpu - return 1 if @p is running on @cpu.
   8908 */
   8909
   8910static unsigned int task_running_on_cpu(int cpu, struct task_struct *p)
   8911{
   8912	/* Task has no contribution or is new */
   8913	if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
   8914		return 0;
   8915
   8916	if (task_on_rq_queued(p))
   8917		return 1;
   8918
   8919	return 0;
   8920}
   8921
   8922/**
   8923 * idle_cpu_without - would a given CPU be idle without p ?
   8924 * @cpu: the processor on which idleness is tested.
   8925 * @p: task which should be ignored.
   8926 *
   8927 * Return: 1 if the CPU would be idle. 0 otherwise.
   8928 */
   8929static int idle_cpu_without(int cpu, struct task_struct *p)
   8930{
   8931	struct rq *rq = cpu_rq(cpu);
   8932
   8933	if (rq->curr != rq->idle && rq->curr != p)
   8934		return 0;
   8935
   8936	/*
   8937	 * rq->nr_running can't be used but an updated version without the
   8938	 * impact of p on cpu must be used instead. The updated nr_running
    8939	 * must be computed and tested before calling idle_cpu_without().
   8940	 */
   8941
   8942#ifdef CONFIG_SMP
   8943	if (rq->ttwu_pending)
   8944		return 0;
   8945#endif
   8946
   8947	return 1;
   8948}
   8949
   8950/*
   8951 * update_sg_wakeup_stats - Update sched_group's statistics for wakeup.
   8952 * @sd: The sched_domain level to look for idlest group.
   8953 * @group: sched_group whose statistics are to be updated.
   8954 * @sgs: variable to hold the statistics for this group.
   8955 * @p: The task for which we look for the idlest group/CPU.
   8956 */
   8957static inline void update_sg_wakeup_stats(struct sched_domain *sd,
   8958					  struct sched_group *group,
   8959					  struct sg_lb_stats *sgs,
   8960					  struct task_struct *p)
   8961{
   8962	int i, nr_running;
   8963
   8964	memset(sgs, 0, sizeof(*sgs));
   8965
   8966	for_each_cpu(i, sched_group_span(group)) {
   8967		struct rq *rq = cpu_rq(i);
   8968		unsigned int local;
   8969
   8970		sgs->group_load += cpu_load_without(rq, p);
   8971		sgs->group_util += cpu_util_without(i, p);
   8972		sgs->group_runnable += cpu_runnable_without(rq, p);
   8973		local = task_running_on_cpu(i, p);
   8974		sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;
   8975
   8976		nr_running = rq->nr_running - local;
   8977		sgs->sum_nr_running += nr_running;
   8978
   8979		/*
   8980		 * No need to call idle_cpu_without() if nr_running is not 0
   8981		 */
   8982		if (!nr_running && idle_cpu_without(i, p))
   8983			sgs->idle_cpus++;
   8984
   8985	}
   8986
   8987	/* Check if task fits in the group */
   8988	if (sd->flags & SD_ASYM_CPUCAPACITY &&
   8989	    !task_fits_capacity(p, group->sgc->max_capacity)) {
   8990		sgs->group_misfit_task_load = 1;
   8991	}
   8992
   8993	sgs->group_capacity = group->sgc->capacity;
   8994
   8995	sgs->group_weight = group->group_weight;
   8996
   8997	sgs->group_type = group_classify(sd->imbalance_pct, group, sgs);
   8998
   8999	/*
   9000	 * Computing avg_load makes sense only when group is fully busy or
   9001	 * overloaded
   9002	 */
   9003	if (sgs->group_type == group_fully_busy ||
   9004		sgs->group_type == group_overloaded)
   9005		sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
   9006				sgs->group_capacity;
   9007}
   9008
   9009static bool update_pick_idlest(struct sched_group *idlest,
   9010			       struct sg_lb_stats *idlest_sgs,
   9011			       struct sched_group *group,
   9012			       struct sg_lb_stats *sgs)
   9013{
   9014	if (sgs->group_type < idlest_sgs->group_type)
   9015		return true;
   9016
   9017	if (sgs->group_type > idlest_sgs->group_type)
   9018		return false;
   9019
   9020	/*
   9021	 * The candidate and the current idlest group are the same type of
    9022	 * group. Let's check which one is the idlest according to the type.
   9023	 */
   9024
   9025	switch (sgs->group_type) {
   9026	case group_overloaded:
   9027	case group_fully_busy:
   9028		/* Select the group with lowest avg_load. */
   9029		if (idlest_sgs->avg_load <= sgs->avg_load)
   9030			return false;
   9031		break;
   9032
   9033	case group_imbalanced:
   9034	case group_asym_packing:
   9035		/* Those types are not used in the slow wakeup path */
   9036		return false;
   9037
   9038	case group_misfit_task:
   9039		/* Select group with the highest max capacity */
   9040		if (idlest->sgc->max_capacity >= group->sgc->max_capacity)
   9041			return false;
   9042		break;
   9043
   9044	case group_has_spare:
   9045		/* Select group with most idle CPUs */
   9046		if (idlest_sgs->idle_cpus > sgs->idle_cpus)
   9047			return false;
   9048
   9049		/* Select group with lowest group_util */
   9050		if (idlest_sgs->idle_cpus == sgs->idle_cpus &&
   9051			idlest_sgs->group_util <= sgs->group_util)
   9052			return false;
   9053
   9054		break;
   9055	}
   9056
   9057	return true;
   9058}
   9059
   9060/*
    9061 * Allow a NUMA imbalance if the number of busy CPUs is below imb_numa_nr.
   9062 * This is an approximation as the number of running tasks may not be
   9063 * related to the number of busy CPUs due to sched_setaffinity.
   9064 */
   9065static inline bool allow_numa_imbalance(int running, int imb_numa_nr)
   9066{
   9067	return running <= imb_numa_nr;
   9068}
   9069
   9070/*
   9071 * find_idlest_group() finds and returns the least busy CPU group within the
   9072 * domain.
   9073 *
   9074 * Assumes p is allowed on at least one CPU in sd.
   9075 */
   9076static struct sched_group *
   9077find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
   9078{
   9079	struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
   9080	struct sg_lb_stats local_sgs, tmp_sgs;
   9081	struct sg_lb_stats *sgs;
   9082	unsigned long imbalance;
   9083	struct sg_lb_stats idlest_sgs = {
   9084			.avg_load = UINT_MAX,
   9085			.group_type = group_overloaded,
   9086	};
   9087
   9088	do {
   9089		int local_group;
   9090
   9091		/* Skip over this group if it has no CPUs allowed */
   9092		if (!cpumask_intersects(sched_group_span(group),
   9093					p->cpus_ptr))
   9094			continue;
   9095
   9096		/* Skip over this group if no cookie matched */
   9097		if (!sched_group_cookie_match(cpu_rq(this_cpu), p, group))
   9098			continue;
   9099
   9100		local_group = cpumask_test_cpu(this_cpu,
   9101					       sched_group_span(group));
   9102
   9103		if (local_group) {
   9104			sgs = &local_sgs;
   9105			local = group;
   9106		} else {
   9107			sgs = &tmp_sgs;
   9108		}
   9109
   9110		update_sg_wakeup_stats(sd, group, sgs, p);
   9111
   9112		if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) {
   9113			idlest = group;
   9114			idlest_sgs = *sgs;
   9115		}
   9116
   9117	} while (group = group->next, group != sd->groups);
   9118
   9119
   9120	/* There is no idlest group to push tasks to */
   9121	if (!idlest)
   9122		return NULL;
   9123
   9124	/* The local group has been skipped because of CPU affinity */
   9125	if (!local)
   9126		return idlest;
   9127
   9128	/*
   9129	 * If the local group is idler than the selected idlest group
   9130	 * don't try and push the task.
   9131	 */
   9132	if (local_sgs.group_type < idlest_sgs.group_type)
   9133		return NULL;
   9134
   9135	/*
   9136	 * If the local group is busier than the selected idlest group
   9137	 * try and push the task.
   9138	 */
   9139	if (local_sgs.group_type > idlest_sgs.group_type)
   9140		return idlest;
   9141
   9142	switch (local_sgs.group_type) {
   9143	case group_overloaded:
   9144	case group_fully_busy:
   9145
   9146		/* Calculate allowed imbalance based on load */
   9147		imbalance = scale_load_down(NICE_0_LOAD) *
   9148				(sd->imbalance_pct-100) / 100;
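        		/*
        		 * i.e. (imbalance_pct - 100) percent of one nice-0 task's
        		 * load; e.g. ~17% of NICE_0_LOAD with an imbalance_pct of 117.
        		 */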
   9149
   9150		/*
   9151		 * When comparing groups across NUMA domains, it's possible for
   9152		 * the local domain to be very lightly loaded relative to the
   9153		 * remote domains but "imbalance" skews the comparison making
   9154		 * remote CPUs look much more favourable. When considering
   9155		 * cross-domain, add imbalance to the load on the remote node
   9156		 * and consider staying local.
   9157		 */
   9158
   9159		if ((sd->flags & SD_NUMA) &&
   9160		    ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load))
   9161			return NULL;
   9162
   9163		/*
   9164		 * If the local group is less loaded than the selected
   9165		 * idlest group don't try and push any tasks.
   9166		 */
   9167		if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance))
   9168			return NULL;
   9169
   9170		if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load)
   9171			return NULL;
   9172		break;
   9173
   9174	case group_imbalanced:
   9175	case group_asym_packing:
    9176		/* Those types are not used in the slow wakeup path */
   9177		return NULL;
   9178
   9179	case group_misfit_task:
   9180		/* Select group with the highest max capacity */
   9181		if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
   9182			return NULL;
   9183		break;
   9184
   9185	case group_has_spare:
   9186		if (sd->flags & SD_NUMA) {
   9187#ifdef CONFIG_NUMA_BALANCING
   9188			int idlest_cpu;
   9189			/*
   9190			 * If there is spare capacity at NUMA, try to select
   9191			 * the preferred node
   9192			 */
   9193			if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
   9194				return NULL;
   9195
   9196			idlest_cpu = cpumask_first(sched_group_span(idlest));
   9197			if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
   9198				return idlest;
   9199#endif
   9200			/*
   9201			 * Otherwise, keep the task close to the wakeup source
   9202			 * and improve locality if the number of running tasks
   9203			 * would remain below threshold where an imbalance is
   9204			 * allowed. If there is a real need of migration,
   9205			 * periodic load balance will take care of it.
   9206			 */
   9207			if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, sd->imb_numa_nr))
   9208				return NULL;
   9209		}
   9210
   9211		/*
    9212		 * Select the group with the highest number of idle CPUs. We
    9213		 * could also compare the utilization, which is more stable, but
    9214		 * a group can end up with less spare capacity yet more idle
    9215		 * CPUs, which means more opportunity to run tasks.
   9216		 */
   9217		if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus)
   9218			return NULL;
   9219		break;
   9220	}
   9221
   9222	return idlest;
   9223}
   9224
   9225/**
   9226 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
   9227 * @env: The load balancing environment.
   9228 * @sds: variable to hold the statistics for this sched_domain.
   9229 */
   9230
   9231static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
   9232{
   9233	struct sched_domain *child = env->sd->child;
   9234	struct sched_group *sg = env->sd->groups;
   9235	struct sg_lb_stats *local = &sds->local_stat;
   9236	struct sg_lb_stats tmp_sgs;
   9237	int sg_status = 0;
   9238
   9239	do {
   9240		struct sg_lb_stats *sgs = &tmp_sgs;
   9241		int local_group;
   9242
   9243		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
   9244		if (local_group) {
   9245			sds->local = sg;
   9246			sgs = local;
   9247
   9248			if (env->idle != CPU_NEWLY_IDLE ||
   9249			    time_after_eq(jiffies, sg->sgc->next_update))
   9250				update_group_capacity(env->sd, env->dst_cpu);
   9251		}
   9252
   9253		update_sg_lb_stats(env, sds, sg, sgs, &sg_status);
   9254
   9255		if (local_group)
   9256			goto next_group;
   9257
   9258
   9259		if (update_sd_pick_busiest(env, sds, sg, sgs)) {
   9260			sds->busiest = sg;
   9261			sds->busiest_stat = *sgs;
   9262		}
   9263
   9264next_group:
   9265		/* Now, start updating sd_lb_stats */
   9266		sds->total_load += sgs->group_load;
   9267		sds->total_capacity += sgs->group_capacity;
   9268
   9269		sg = sg->next;
   9270	} while (sg != env->sd->groups);
   9271
   9272	/* Tag domain that child domain prefers tasks go to siblings first */
   9273	sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
   9274
   9275
   9276	if (env->sd->flags & SD_NUMA)
   9277		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
   9278
   9279	if (!env->sd->parent) {
   9280		struct root_domain *rd = env->dst_rq->rd;
   9281
   9282		/* update overload indicator if we are at root domain */
   9283		WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
   9284
   9285		/* Update over-utilization (tipping point, U >= 0) indicator */
   9286		WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
   9287		trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
   9288	} else if (sg_status & SG_OVERUTILIZED) {
   9289		struct root_domain *rd = env->dst_rq->rd;
   9290
   9291		WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
   9292		trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
   9293	}
   9294}
   9295
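        /* NUMA imbalance, in tasks, tolerated on a lightly loaded destination */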
   9296#define NUMA_IMBALANCE_MIN 2
   9297
   9298static inline long adjust_numa_imbalance(int imbalance,
   9299				int dst_running, int imb_numa_nr)
   9300{
   9301	if (!allow_numa_imbalance(dst_running, imb_numa_nr))
   9302		return imbalance;
   9303
   9304	/*
   9305	 * Allow a small imbalance based on a simple pair of communicating
   9306	 * tasks that remain local when the destination is lightly loaded.
   9307	 */
   9308	if (imbalance <= NUMA_IMBALANCE_MIN)
   9309		return 0;
   9310
   9311	return imbalance;
   9312}
   9313
   9314/**
   9315 * calculate_imbalance - Calculate the amount of imbalance present within the
   9316 *			 groups of a given sched_domain during load balance.
   9317 * @env: load balance environment
   9318 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
   9319 */
   9320static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
   9321{
   9322	struct sg_lb_stats *local, *busiest;
   9323
   9324	local = &sds->local_stat;
   9325	busiest = &sds->busiest_stat;
   9326
   9327	if (busiest->group_type == group_misfit_task) {
   9328		/* Set imbalance to allow misfit tasks to be balanced. */
   9329		env->migration_type = migrate_misfit;
   9330		env->imbalance = 1;
   9331		return;
   9332	}
   9333
   9334	if (busiest->group_type == group_asym_packing) {
   9335		/*
   9336		 * In case of asym capacity, we will try to migrate all load to
   9337		 * the preferred CPU.
   9338		 */
   9339		env->migration_type = migrate_task;
   9340		env->imbalance = busiest->sum_h_nr_running;
   9341		return;
   9342	}
   9343
   9344	if (busiest->group_type == group_imbalanced) {
   9345		/*
   9346		 * In the group_imb case we cannot rely on group-wide averages
   9347		 * to ensure CPU-load equilibrium, try to move any task to fix
   9348		 * the imbalance. The next load balance will take care of
   9349		 * balancing back the system.
   9350		 */
   9351		env->migration_type = migrate_task;
   9352		env->imbalance = 1;
   9353		return;
   9354	}
   9355
   9356	/*
   9357	 * Try to use spare capacity of local group without overloading it or
   9358	 * emptying busiest.
   9359	 */
   9360	if (local->group_type == group_has_spare) {
   9361		if ((busiest->group_type > group_fully_busy) &&
   9362		    !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) {
   9363			/*
   9364			 * If busiest is overloaded, try to fill spare
   9365			 * capacity. This might end up creating spare capacity
   9366			 * in busiest or busiest still being overloaded but
   9367			 * there is no simple way to directly compute the
   9368			 * amount of load to migrate in order to balance the
   9369			 * system.
   9370			 */
   9371			env->migration_type = migrate_util;
   9372			env->imbalance = max(local->group_capacity, local->group_util) -
   9373					 local->group_util;
   9374
   9375			/*
   9376			 * In some cases, the group's utilization is max or even
   9377			 * higher than capacity because of migrations but the
   9378			 * local CPU is (newly) idle. There is at least one
   9379			 * waiting task in this overloaded busiest group. Let's
   9380			 * try to pull it.
   9381			 */
   9382			if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) {
   9383				env->migration_type = migrate_task;
   9384				env->imbalance = 1;
   9385			}
   9386
   9387			return;
   9388		}
   9389
   9390		if (busiest->group_weight == 1 || sds->prefer_sibling) {
   9391			unsigned int nr_diff = busiest->sum_nr_running;
   9392			/*
   9393			 * When prefer sibling, evenly spread running tasks on
   9394			 * groups.
   9395			 */
   9396			env->migration_type = migrate_task;
   9397			lsub_positive(&nr_diff, local->sum_nr_running);
   9398			env->imbalance = nr_diff >> 1;
   9399		} else {
   9400
   9401			/*
    9402			 * If there is no overload, we just want to even out the
    9403			 * number of idle CPUs.
   9404			 */
   9405			env->migration_type = migrate_task;
   9406			env->imbalance = max_t(long, 0, (local->idle_cpus -
   9407						 busiest->idle_cpus) >> 1);
   9408		}
   9409
   9410		/* Consider allowing a small imbalance between NUMA groups */
   9411		if (env->sd->flags & SD_NUMA) {
   9412			env->imbalance = adjust_numa_imbalance(env->imbalance,
   9413				local->sum_nr_running + 1, env->sd->imb_numa_nr);
   9414		}
   9415
   9416		return;
   9417	}
   9418
   9419	/*
   9420	 * Local is fully busy but has to take more load to relieve the
   9421	 * busiest group
   9422	 */
   9423	if (local->group_type < group_overloaded) {
   9424		/*
   9425		 * Local will become overloaded so the avg_load metrics are
   9426		 * finally needed.
   9427		 */
   9428
   9429		local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
   9430				  local->group_capacity;
   9431
   9432		/*
   9433		 * If the local group is more loaded than the selected
   9434		 * busiest group don't try to pull any tasks.
   9435		 */
   9436		if (local->avg_load >= busiest->avg_load) {
   9437			env->imbalance = 0;
   9438			return;
   9439		}
   9440
   9441		sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
   9442				sds->total_capacity;
   9443	}
   9444
   9445	/*
    9446	 * Both groups are or will become overloaded and we're trying to get all
   9447	 * the CPUs to the average_load, so we don't want to push ourselves
   9448	 * above the average load, nor do we wish to reduce the max loaded CPU
   9449	 * below the average load. At the same time, we also don't want to
   9450	 * reduce the group load below the group capacity. Thus we look for
   9451	 * the minimum possible imbalance.
   9452	 */
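        	/*
        	 * avg_load values are scaled by SCHED_CAPACITY_SCALE (load per unit
        	 * of capacity), so multiplying the delta by a group's capacity and
        	 * dividing by SCHED_CAPACITY_SCALE below converts it back into an
        	 * amount of load to move.
        	 */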
   9453	env->migration_type = migrate_load;
   9454	env->imbalance = min(
   9455		(busiest->avg_load - sds->avg_load) * busiest->group_capacity,
   9456		(sds->avg_load - local->avg_load) * local->group_capacity
   9457	) / SCHED_CAPACITY_SCALE;
   9458}
   9459
   9460/******* find_busiest_group() helpers end here *********************/
   9461
   9462/*
   9463 * Decision matrix according to the local and busiest group type:
   9464 *
   9465 * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
   9466 * has_spare        nr_idle   balanced   N/A    N/A  balanced   balanced
   9467 * fully_busy       nr_idle   nr_idle    N/A    N/A  balanced   balanced
   9468 * misfit_task      force     N/A        N/A    N/A  N/A        N/A
   9469 * asym_packing     force     force      N/A    N/A  force      force
   9470 * imbalanced       force     force      N/A    N/A  force      force
   9471 * overloaded       force     force      N/A    N/A  force      avg_load
   9472 *
   9473 * N/A :      Not Applicable because already filtered while updating
   9474 *            statistics.
   9475 * balanced : The system is balanced for these 2 groups.
   9476 * force :    Calculate the imbalance as load migration is probably needed.
   9477 * avg_load : Only if imbalance is significant enough.
   9478 * nr_idle :  dst_cpu is not busy and the number of idle CPUs is quite
    9479 *            different between the groups.
   9480 */
   9481
   9482/**
   9483 * find_busiest_group - Returns the busiest group within the sched_domain
   9484 * if there is an imbalance.
   9485 * @env: The load balancing environment.
   9486 *
   9487 * Also calculates the amount of runnable load which should be moved
   9488 * to restore balance.
   9489 *
   9490 * Return:	- The busiest group if imbalance exists.
   9491 */
   9492static struct sched_group *find_busiest_group(struct lb_env *env)
   9493{
   9494	struct sg_lb_stats *local, *busiest;
   9495	struct sd_lb_stats sds;
   9496
   9497	init_sd_lb_stats(&sds);
   9498
   9499	/*
   9500	 * Compute the various statistics relevant for load balancing at
   9501	 * this level.
   9502	 */
   9503	update_sd_lb_stats(env, &sds);
   9504
   9505	if (sched_energy_enabled()) {
   9506		struct root_domain *rd = env->dst_rq->rd;
   9507
   9508		if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
   9509			goto out_balanced;
   9510	}
   9511
   9512	local = &sds.local_stat;
   9513	busiest = &sds.busiest_stat;
   9514
   9515	/* There is no busy sibling group to pull tasks from */
   9516	if (!sds.busiest)
   9517		goto out_balanced;
   9518
   9519	/* Misfit tasks should be dealt with regardless of the avg load */
   9520	if (busiest->group_type == group_misfit_task)
   9521		goto force_balance;
   9522
   9523	/* ASYM feature bypasses nice load balance check */
   9524	if (busiest->group_type == group_asym_packing)
   9525		goto force_balance;
   9526
   9527	/*
   9528	 * If the busiest group is imbalanced the below checks don't
   9529	 * work because they assume all things are equal, which typically
   9530	 * isn't true due to cpus_ptr constraints and the like.
   9531	 */
   9532	if (busiest->group_type == group_imbalanced)
   9533		goto force_balance;
   9534
   9535	/*
   9536	 * If the local group is busier than the selected busiest group
   9537	 * don't try and pull any tasks.
   9538	 */
   9539	if (local->group_type > busiest->group_type)
   9540		goto out_balanced;
   9541
   9542	/*
   9543	 * When groups are overloaded, use the avg_load to ensure fairness
   9544	 * between tasks.
   9545	 */
   9546	if (local->group_type == group_overloaded) {
   9547		/*
   9548		 * If the local group is more loaded than the selected
   9549		 * busiest group don't try to pull any tasks.
   9550		 */
   9551		if (local->avg_load >= busiest->avg_load)
   9552			goto out_balanced;
   9553
   9554		/* XXX broken for overlapping NUMA groups */
   9555		sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) /
   9556				sds.total_capacity;
   9557
   9558		/*
   9559		 * Don't pull any tasks if this group is already above the
   9560		 * domain average load.
   9561		 */
   9562		if (local->avg_load >= sds.avg_load)
   9563			goto out_balanced;
   9564
   9565		/*
   9566		 * If the busiest group is more loaded, use imbalance_pct to be
   9567		 * conservative.
   9568		 */
   9569		if (100 * busiest->avg_load <=
   9570				env->sd->imbalance_pct * local->avg_load)
   9571			goto out_balanced;
   9572	}
   9573
   9574	/* Try to move all excess tasks to child's sibling domain */
   9575	if (sds.prefer_sibling && local->group_type == group_has_spare &&
   9576	    busiest->sum_nr_running > local->sum_nr_running + 1)
   9577		goto force_balance;
   9578
   9579	if (busiest->group_type != group_overloaded) {
   9580		if (env->idle == CPU_NOT_IDLE)
   9581			/*
   9582			 * If the busiest group is not overloaded (and as a
   9583			 * result the local one too) but this CPU is already
   9584			 * busy, let another idle CPU try to pull task.
   9585			 */
   9586			goto out_balanced;
   9587
   9588		if (busiest->group_weight > 1 &&
   9589		    local->idle_cpus <= (busiest->idle_cpus + 1))
   9590			/*
   9591			 * If the busiest group is not overloaded
   9592			 * and there is no imbalance between this and busiest
   9593			 * group wrt idle CPUs, it is balanced. The imbalance
   9594			 * becomes significant if the diff is greater than 1
    9595			 * otherwise we might end up just moving the imbalance
    9596			 * to another group. Of course this applies only if
   9597			 * there is more than 1 CPU per group.
   9598			 */
   9599			goto out_balanced;
   9600
   9601		if (busiest->sum_h_nr_running == 1)
   9602			/*
   9603			 * busiest doesn't have any tasks waiting to run
   9604			 */
   9605			goto out_balanced;
   9606	}
   9607
   9608force_balance:
   9609	/* Looks like there is an imbalance. Compute it */
   9610	calculate_imbalance(env, &sds);
   9611	return env->imbalance ? sds.busiest : NULL;
   9612
   9613out_balanced:
   9614	env->imbalance = 0;
   9615	return NULL;
   9616}
   9617
   9618/*
   9619 * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
   9620 */
   9621static struct rq *find_busiest_queue(struct lb_env *env,
   9622				     struct sched_group *group)
   9623{
   9624	struct rq *busiest = NULL, *rq;
   9625	unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
   9626	unsigned int busiest_nr = 0;
   9627	int i;
   9628
   9629	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
   9630		unsigned long capacity, load, util;
   9631		unsigned int nr_running;
   9632		enum fbq_type rt;
   9633
   9634		rq = cpu_rq(i);
   9635		rt = fbq_classify_rq(rq);
   9636
   9637		/*
   9638		 * We classify groups/runqueues into three groups:
   9639		 *  - regular: there are !numa tasks
   9640		 *  - remote:  there are numa tasks that run on the 'wrong' node
   9641		 *  - all:     there is no distinction
   9642		 *
   9643		 * In order to avoid migrating ideally placed numa tasks,
    9644		 * ignore those when there are better options.
   9645		 *
   9646		 * If we ignore the actual busiest queue to migrate another
   9647		 * task, the next balance pass can still reduce the busiest
   9648		 * queue by moving tasks around inside the node.
   9649		 *
   9650		 * If we cannot move enough load due to this classification
   9651		 * the next pass will adjust the group classification and
   9652		 * allow migration of more tasks.
   9653		 *
   9654		 * Both cases only affect the total convergence complexity.
   9655		 */
   9656		if (rt > env->fbq_type)
   9657			continue;
   9658
   9659		nr_running = rq->cfs.h_nr_running;
   9660		if (!nr_running)
   9661			continue;
   9662
   9663		capacity = capacity_of(i);
   9664
   9665		/*
   9666		 * For ASYM_CPUCAPACITY domains, don't pick a CPU that could
   9667		 * eventually lead to active_balancing high->low capacity.
   9668		 * Higher per-CPU capacity is considered better than balancing
   9669		 * average load.
   9670		 */
   9671		if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
   9672		    !capacity_greater(capacity_of(env->dst_cpu), capacity) &&
   9673		    nr_running == 1)
   9674			continue;
   9675
   9676		/* Make sure we only pull tasks from a CPU of lower priority */
   9677		if ((env->sd->flags & SD_ASYM_PACKING) &&
   9678		    sched_asym_prefer(i, env->dst_cpu) &&
   9679		    nr_running == 1)
   9680			continue;
   9681
   9682		switch (env->migration_type) {
   9683		case migrate_load:
   9684			/*
   9685			 * When comparing with load imbalance, use cpu_load()
   9686			 * which is not scaled with the CPU capacity.
   9687			 */
   9688			load = cpu_load(rq);
   9689
   9690			if (nr_running == 1 && load > env->imbalance &&
   9691			    !check_cpu_capacity(rq, env->sd))
   9692				break;
   9693
   9694			/*
   9695			 * For the load comparisons with the other CPUs,
   9696			 * consider the cpu_load() scaled with the CPU
   9697			 * capacity, so that the load can be moved away
   9698			 * from the CPU that is potentially running at a
   9699			 * lower capacity.
   9700			 *
   9701			 * Thus we're looking for max(load_i / capacity_i),
   9702			 * crosswise multiplication to rid ourselves of the
   9703			 * division works out to:
   9704			 * load_i * capacity_j > load_j * capacity_i;
   9705			 * where j is our previous maximum.
   9706			 */
   9707			if (load * busiest_capacity > busiest_load * capacity) {
   9708				busiest_load = load;
   9709				busiest_capacity = capacity;
   9710				busiest = rq;
   9711			}
   9712			break;
   9713
   9714		case migrate_util:
   9715			util = cpu_util_cfs(i);
   9716
   9717			/*
   9718			 * Don't try to pull utilization from a CPU with one
   9719			 * running task. Whatever its utilization, we will fail
    9720			 * to detach the task.
   9721			 */
   9722			if (nr_running <= 1)
   9723				continue;
   9724
   9725			if (busiest_util < util) {
   9726				busiest_util = util;
   9727				busiest = rq;
   9728			}
   9729			break;
   9730
   9731		case migrate_task:
   9732			if (busiest_nr < nr_running) {
   9733				busiest_nr = nr_running;
   9734				busiest = rq;
   9735			}
   9736			break;
   9737
   9738		case migrate_misfit:
   9739			/*
   9740			 * For ASYM_CPUCAPACITY domains with misfit tasks we
   9741			 * simply seek the "biggest" misfit task.
   9742			 */
   9743			if (rq->misfit_task_load > busiest_load) {
   9744				busiest_load = rq->misfit_task_load;
   9745				busiest = rq;
   9746			}
   9747
   9748			break;
   9749
   9750		}
   9751	}
   9752
   9753	return busiest;
   9754}
   9755
   9756/*
   9757 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
    9758 * any value works so long as it is large enough.
   9759 */
   9760#define MAX_PINNED_INTERVAL	512
   9761
   9762static inline bool
   9763asym_active_balance(struct lb_env *env)
   9764{
   9765	/*
   9766	 * ASYM_PACKING needs to force migrate tasks from busy but
   9767	 * lower priority CPUs in order to pack all tasks in the
   9768	 * highest priority CPUs.
   9769	 */
   9770	return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
   9771	       sched_asym_prefer(env->dst_cpu, env->src_cpu);
   9772}
   9773
   9774static inline bool
   9775imbalanced_active_balance(struct lb_env *env)
   9776{
   9777	struct sched_domain *sd = env->sd;
   9778
   9779	/*
    9780	 * The imbalanced case includes the case of pinned tasks preventing a fair
    9781	 * distribution of the load on the system, but also the even distribution of
    9782	 * threads on a system with spare capacity.
   9783	 */
   9784	if ((env->migration_type == migrate_task) &&
   9785	    (sd->nr_balance_failed > sd->cache_nice_tries+2))
   9786		return 1;
   9787
   9788	return 0;
   9789}
   9790
   9791static int need_active_balance(struct lb_env *env)
   9792{
   9793	struct sched_domain *sd = env->sd;
   9794
   9795	if (asym_active_balance(env))
   9796		return 1;
   9797
   9798	if (imbalanced_active_balance(env))
   9799		return 1;
   9800
   9801	/*
   9802	 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
   9803	 * It's worth migrating the task if the src_cpu's capacity is reduced
    9804	 * because of other sched_class tasks or IRQs, provided more capacity
    9805	 * stays available on dst_cpu.
   9806	 */
   9807	if ((env->idle != CPU_NOT_IDLE) &&
   9808	    (env->src_rq->cfs.h_nr_running == 1)) {
   9809		if ((check_cpu_capacity(env->src_rq, sd)) &&
   9810		    (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
   9811			return 1;
   9812	}
   9813
   9814	if (env->migration_type == migrate_misfit)
   9815		return 1;
   9816
   9817	return 0;
   9818}
   9819
   9820static int active_load_balance_cpu_stop(void *data);
   9821
   9822static int should_we_balance(struct lb_env *env)
   9823{
   9824	struct sched_group *sg = env->sd->groups;
   9825	int cpu;
   9826
   9827	/*
   9828	 * Ensure the balancing environment is consistent; can happen
   9829	 * when the softirq triggers 'during' hotplug.
   9830	 */
   9831	if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
   9832		return 0;
   9833
   9834	/*
   9835	 * In the newly idle case, we will allow all the CPUs
   9836	 * to do the newly idle load balance.
   9837	 */
   9838	if (env->idle == CPU_NEWLY_IDLE)
   9839		return 1;
   9840
   9841	/* Try to find first idle CPU */
   9842	for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
   9843		if (!idle_cpu(cpu))
   9844			continue;
   9845
   9846		/* Are we the first idle CPU? */
   9847		return cpu == env->dst_cpu;
   9848	}
   9849
   9850	/* Are we the first CPU of this group ? */
   9851	return group_balance_cpu(sg) == env->dst_cpu;
   9852}
   9853
   9854/*
   9855 * Check this_cpu to ensure it is balanced within domain. Attempt to move
   9856 * tasks if there is an imbalance.
   9857 */
   9858static int load_balance(int this_cpu, struct rq *this_rq,
   9859			struct sched_domain *sd, enum cpu_idle_type idle,
   9860			int *continue_balancing)
   9861{
   9862	int ld_moved, cur_ld_moved, active_balance = 0;
   9863	struct sched_domain *sd_parent = sd->parent;
   9864	struct sched_group *group;
   9865	struct rq *busiest;
   9866	struct rq_flags rf;
   9867	struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
   9868
   9869	struct lb_env env = {
   9870		.sd		= sd,
   9871		.dst_cpu	= this_cpu,
   9872		.dst_rq		= this_rq,
   9873		.dst_grpmask    = sched_group_span(sd->groups),
   9874		.idle		= idle,
   9875		.loop_break	= sched_nr_migrate_break,
   9876		.cpus		= cpus,
   9877		.fbq_type	= all,
   9878		.tasks		= LIST_HEAD_INIT(env.tasks),
   9879	};
   9880
   9881	cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
   9882
   9883	schedstat_inc(sd->lb_count[idle]);
   9884
   9885redo:
   9886	if (!should_we_balance(&env)) {
   9887		*continue_balancing = 0;
   9888		goto out_balanced;
   9889	}
   9890
   9891	group = find_busiest_group(&env);
   9892	if (!group) {
   9893		schedstat_inc(sd->lb_nobusyg[idle]);
   9894		goto out_balanced;
   9895	}
   9896
   9897	busiest = find_busiest_queue(&env, group);
   9898	if (!busiest) {
   9899		schedstat_inc(sd->lb_nobusyq[idle]);
   9900		goto out_balanced;
   9901	}
   9902
   9903	BUG_ON(busiest == env.dst_rq);
   9904
   9905	schedstat_add(sd->lb_imbalance[idle], env.imbalance);
   9906
   9907	env.src_cpu = busiest->cpu;
   9908	env.src_rq = busiest;
   9909
   9910	ld_moved = 0;
   9911	/* Clear this flag as soon as we find a pullable task */
   9912	env.flags |= LBF_ALL_PINNED;
   9913	if (busiest->nr_running > 1) {
   9914		/*
   9915		 * Attempt to move tasks. If find_busiest_group has found
   9916		 * an imbalance but busiest->nr_running <= 1, the group is
   9917		 * still unbalanced. ld_moved simply stays zero, so it is
   9918		 * correctly treated as an imbalance.
   9919		 */
   9920		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
   9921
   9922more_balance:
   9923		rq_lock_irqsave(busiest, &rf);
   9924		update_rq_clock(busiest);
   9925
   9926		/*
   9927		 * cur_ld_moved - load moved in current iteration
   9928		 * ld_moved     - cumulative load moved across iterations
   9929		 */
   9930		cur_ld_moved = detach_tasks(&env);
   9931
   9932		/*
   9933		 * We've detached some tasks from busiest_rq. Every
   9934		 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
   9935		 * unlock busiest->lock, and we are able to be sure
   9936		 * that nobody can manipulate the tasks in parallel.
   9937		 * See task_rq_lock() family for the details.
   9938		 */
   9939
   9940		rq_unlock(busiest, &rf);
   9941
   9942		if (cur_ld_moved) {
   9943			attach_tasks(&env);
   9944			ld_moved += cur_ld_moved;
   9945		}
   9946
   9947		local_irq_restore(rf.flags);
   9948
   9949		if (env.flags & LBF_NEED_BREAK) {
   9950			env.flags &= ~LBF_NEED_BREAK;
   9951			goto more_balance;
   9952		}
   9953
   9954		/*
   9955		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
   9956		 * us and move them to an alternate dst_cpu in our sched_group
   9957		 * where they can run. The upper limit on how many times we
   9958		 * iterate on same src_cpu is dependent on number of CPUs in our
   9959		 * sched_group.
   9960		 *
   9961		 * This changes load balance semantics a bit on who can move
   9962		 * load to a given_cpu. In addition to the given_cpu itself
    9963		 * (or an ilb_cpu acting on its behalf where given_cpu is
   9964		 * nohz-idle), we now have balance_cpu in a position to move
   9965		 * load to given_cpu. In rare situations, this may cause
   9966		 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
   9967		 * _independently_ and at _same_ time to move some load to
   9968		 * given_cpu) causing excess load to be moved to given_cpu.
   9969		 * This however should not happen so much in practice and
   9970		 * moreover subsequent load balance cycles should correct the
   9971		 * excess load moved.
   9972		 */
   9973		if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
   9974
    9975			/* Prevent dst_cpu from being re-selected via env's CPUs */
   9976			__cpumask_clear_cpu(env.dst_cpu, env.cpus);
   9977
   9978			env.dst_rq	 = cpu_rq(env.new_dst_cpu);
   9979			env.dst_cpu	 = env.new_dst_cpu;
   9980			env.flags	&= ~LBF_DST_PINNED;
   9981			env.loop	 = 0;
   9982			env.loop_break	 = sched_nr_migrate_break;
   9983
   9984			/*
   9985			 * Go back to "more_balance" rather than "redo" since we
   9986			 * need to continue with same src_cpu.
   9987			 */
   9988			goto more_balance;
   9989		}
   9990
   9991		/*
   9992		 * We failed to reach balance because of affinity.
   9993		 */
   9994		if (sd_parent) {
   9995			int *group_imbalance = &sd_parent->groups->sgc->imbalance;
   9996
   9997			if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
   9998				*group_imbalance = 1;
   9999		}
  10000
  10001		/* All tasks on this runqueue were pinned by CPU affinity */
  10002		if (unlikely(env.flags & LBF_ALL_PINNED)) {
  10003			__cpumask_clear_cpu(cpu_of(busiest), cpus);
  10004			/*
  10005			 * Attempting to continue load balancing at the current
  10006			 * sched_domain level only makes sense if there are
  10007			 * active CPUs remaining as possible busiest CPUs to
  10008			 * pull load from which are not contained within the
  10009			 * destination group that is receiving any migrated
  10010			 * load.
  10011			 */
  10012			if (!cpumask_subset(cpus, env.dst_grpmask)) {
  10013				env.loop = 0;
  10014				env.loop_break = sched_nr_migrate_break;
  10015				goto redo;
  10016			}
  10017			goto out_all_pinned;
  10018		}
  10019	}
  10020
  10021	if (!ld_moved) {
  10022		schedstat_inc(sd->lb_failed[idle]);
  10023		/*
  10024		 * Increment the failure counter only on periodic balance.
  10025		 * We do not want newidle balance, which can be very
   10026		 * frequent, to pollute the failure counter, causing
  10027		 * excessive cache_hot migrations and active balances.
  10028		 */
  10029		if (idle != CPU_NEWLY_IDLE)
  10030			sd->nr_balance_failed++;
  10031
  10032		if (need_active_balance(&env)) {
  10033			unsigned long flags;
  10034
  10035			raw_spin_rq_lock_irqsave(busiest, flags);
  10036
  10037			/*
   10038			 * Don't kick the active_load_balance_cpu_stop
   10039			 * if the curr task on the busiest CPU can't be
  10040			 * moved to this_cpu:
  10041			 */
  10042			if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
  10043				raw_spin_rq_unlock_irqrestore(busiest, flags);
  10044				goto out_one_pinned;
  10045			}
  10046
  10047			/* Record that we found at least one task that could run on this_cpu */
  10048			env.flags &= ~LBF_ALL_PINNED;
  10049
  10050			/*
  10051			 * ->active_balance synchronizes accesses to
  10052			 * ->active_balance_work.  Once set, it's cleared
  10053			 * only after active load balance is finished.
  10054			 */
  10055			if (!busiest->active_balance) {
  10056				busiest->active_balance = 1;
  10057				busiest->push_cpu = this_cpu;
  10058				active_balance = 1;
  10059			}
  10060			raw_spin_rq_unlock_irqrestore(busiest, flags);
  10061
  10062			if (active_balance) {
  10063				stop_one_cpu_nowait(cpu_of(busiest),
  10064					active_load_balance_cpu_stop, busiest,
  10065					&busiest->active_balance_work);
  10066			}
  10067		}
  10068	} else {
  10069		sd->nr_balance_failed = 0;
  10070	}
  10071
  10072	if (likely(!active_balance) || need_active_balance(&env)) {
  10073		/* We were unbalanced, so reset the balancing interval */
  10074		sd->balance_interval = sd->min_interval;
  10075	}
  10076
  10077	goto out;
  10078
  10079out_balanced:
  10080	/*
  10081	 * We reach balance although we may have faced some affinity
  10082	 * constraints. Clear the imbalance flag only if other tasks got
  10083	 * a chance to move and fix the imbalance.
  10084	 */
  10085	if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
  10086		int *group_imbalance = &sd_parent->groups->sgc->imbalance;
  10087
  10088		if (*group_imbalance)
  10089			*group_imbalance = 0;
  10090	}
  10091
  10092out_all_pinned:
  10093	/*
  10094	 * We reach balance because all tasks are pinned at this level so
   10095	 * we can't migrate them. Leave the imbalance flag set so the parent
   10096	 * level can try to migrate them.
  10097	 */
  10098	schedstat_inc(sd->lb_balanced[idle]);
  10099
  10100	sd->nr_balance_failed = 0;
  10101
  10102out_one_pinned:
  10103	ld_moved = 0;
  10104
  10105	/*
  10106	 * newidle_balance() disregards balance intervals, so we could
  10107	 * repeatedly reach this code, which would lead to balance_interval
  10108	 * skyrocketing in a short amount of time. Skip the balance_interval
  10109	 * increase logic to avoid that.
  10110	 */
  10111	if (env.idle == CPU_NEWLY_IDLE)
  10112		goto out;
  10113
  10114	/* tune up the balancing interval */
  10115	if ((env.flags & LBF_ALL_PINNED &&
  10116	     sd->balance_interval < MAX_PINNED_INTERVAL) ||
  10117	    sd->balance_interval < sd->max_interval)
  10118		sd->balance_interval *= 2;
  10119out:
  10120	return ld_moved;
  10121}
  10122
  10123static inline unsigned long
  10124get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
  10125{
  10126	unsigned long interval = sd->balance_interval;
  10127
  10128	if (cpu_busy)
  10129		interval *= sd->busy_factor;
  10130
  10131	/* scale ms to jiffies */
  10132	interval = msecs_to_jiffies(interval);
  10133
  10134	/*
  10135	 * Reduce likelihood of busy balancing at higher domains racing with
  10136	 * balancing at lower domains by preventing their balancing periods
  10137	 * from being multiples of each other.
  10138	 */
  10139	if (cpu_busy)
  10140		interval -= 1;
  10141
  10142	interval = clamp(interval, 1UL, max_load_balance_interval);
  10143
  10144	return interval;
  10145}
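        /*
         * Editor's note, an illustrative calculation (values assumed, not
         * taken from this source): with busy_factor = 16, HZ = 250 and
         * per-level balance_interval values of 8, 16 and 32 ms, the busy
         * intervals would be 128/256/512 ms = 32/64/128 jiffies, i.e. exact
         * multiples of each other. The "interval -= 1" above turns them into
         * 31/63/127 jiffies, so busy balancing at the different domain levels
         * stops lining up on the same jiffy. The result is then clamped to
         * [1, max_load_balance_interval].
         */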
  10146
  10147static inline void
  10148update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
  10149{
  10150	unsigned long interval, next;
  10151
  10152	/* used by idle balance, so cpu_busy = 0 */
  10153	interval = get_sd_balance_interval(sd, 0);
  10154	next = sd->last_balance + interval;
  10155
  10156	if (time_after(*next_balance, next))
  10157		*next_balance = next;
  10158}
  10159
  10160/*
  10161 * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
  10162 * running tasks off the busiest CPU onto idle CPUs. It requires at
  10163 * least 1 task to be running on each physical CPU where possible, and
  10164 * avoids physical / logical imbalances.
  10165 */
  10166static int active_load_balance_cpu_stop(void *data)
  10167{
  10168	struct rq *busiest_rq = data;
  10169	int busiest_cpu = cpu_of(busiest_rq);
  10170	int target_cpu = busiest_rq->push_cpu;
  10171	struct rq *target_rq = cpu_rq(target_cpu);
  10172	struct sched_domain *sd;
  10173	struct task_struct *p = NULL;
  10174	struct rq_flags rf;
  10175
  10176	rq_lock_irq(busiest_rq, &rf);
  10177	/*
  10178	 * Between queueing the stop-work and running it is a hole in which
  10179	 * CPUs can become inactive. We should not move tasks from or to
  10180	 * inactive CPUs.
  10181	 */
  10182	if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
  10183		goto out_unlock;
  10184
  10185	/* Make sure the requested CPU hasn't gone down in the meantime: */
  10186	if (unlikely(busiest_cpu != smp_processor_id() ||
  10187		     !busiest_rq->active_balance))
  10188		goto out_unlock;
  10189
  10190	/* Is there any task to move? */
  10191	if (busiest_rq->nr_running <= 1)
  10192		goto out_unlock;
  10193
  10194	/*
   10195	 * This condition is "impossible"; if it occurs
  10196	 * we need to fix it. Originally reported by
  10197	 * Bjorn Helgaas on a 128-CPU setup.
  10198	 */
  10199	BUG_ON(busiest_rq == target_rq);
  10200
  10201	/* Search for an sd spanning us and the target CPU. */
  10202	rcu_read_lock();
  10203	for_each_domain(target_cpu, sd) {
  10204		if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
  10205			break;
  10206	}
  10207
  10208	if (likely(sd)) {
  10209		struct lb_env env = {
  10210			.sd		= sd,
  10211			.dst_cpu	= target_cpu,
  10212			.dst_rq		= target_rq,
  10213			.src_cpu	= busiest_rq->cpu,
  10214			.src_rq		= busiest_rq,
  10215			.idle		= CPU_IDLE,
  10216			.flags		= LBF_ACTIVE_LB,
  10217		};
  10218
  10219		schedstat_inc(sd->alb_count);
  10220		update_rq_clock(busiest_rq);
  10221
  10222		p = detach_one_task(&env);
  10223		if (p) {
  10224			schedstat_inc(sd->alb_pushed);
  10225			/* Active balancing done, reset the failure counter. */
  10226			sd->nr_balance_failed = 0;
  10227		} else {
  10228			schedstat_inc(sd->alb_failed);
  10229		}
  10230	}
  10231	rcu_read_unlock();
  10232out_unlock:
  10233	busiest_rq->active_balance = 0;
  10234	rq_unlock(busiest_rq, &rf);
  10235
  10236	if (p)
  10237		attach_one_task(target_rq, p);
  10238
  10239	local_irq_enable();
  10240
  10241	return 0;
  10242}
  10243
  10244static DEFINE_SPINLOCK(balancing);
  10245
  10246/*
  10247 * Scale the max load_balance interval with the number of CPUs in the system.
  10248 * This trades load-balance latency on larger machines for less cross talk.
  10249 */
  10250void update_max_interval(void)
  10251{
  10252	max_load_balance_interval = HZ*num_online_cpus()/10;
  10253}
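        /*
         * Editor's note, an illustrative calculation (values assumed): with
         * HZ = 250 and 8 online CPUs, max_load_balance_interval becomes
         * 250 * 8 / 10 = 200 jiffies, roughly 800 ms, so bigger machines are
         * allowed to back off further between load-balance attempts.
         */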
  10254
  10255static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
  10256{
  10257	if (cost > sd->max_newidle_lb_cost) {
  10258		/*
   10259		 * Track the max cost of a domain to make sure not to delay the
  10260		 * next wakeup on the CPU.
  10261		 */
  10262		sd->max_newidle_lb_cost = cost;
  10263		sd->last_decay_max_lb_cost = jiffies;
  10264	} else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) {
  10265		/*
  10266		 * Decay the newidle max times by ~1% per second to ensure that
  10267		 * it is not outdated and the current max cost is actually
  10268		 * shorter.
  10269		 */
  10270		sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256;
  10271		sd->last_decay_max_lb_cost = jiffies;
  10272
  10273		return true;
  10274	}
  10275
  10276	return false;
  10277}
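        /*
         * Editor's note on the decay above: 253/256 is roughly 0.9883, so each
         * decay step (at most one per second of regular visits) removes a bit
         * over 1% of the tracked max. As an assumed example, after about 60
         * such steps the remembered max_newidle_lb_cost has dropped to roughly
         * half (0.9883^60 is about 0.49) of its peak value.
         */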
  10278
  10279/*
  10280 * It checks each scheduling domain to see if it is due to be balanced,
  10281 * and initiates a balancing operation if so.
  10282 *
  10283 * Balancing parameters are set up in init_sched_domains.
  10284 */
  10285static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
  10286{
  10287	int continue_balancing = 1;
  10288	int cpu = rq->cpu;
  10289	int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
  10290	unsigned long interval;
  10291	struct sched_domain *sd;
  10292	/* Earliest time when we have to do rebalance again */
  10293	unsigned long next_balance = jiffies + 60*HZ;
  10294	int update_next_balance = 0;
  10295	int need_serialize, need_decay = 0;
  10296	u64 max_cost = 0;
  10297
  10298	rcu_read_lock();
  10299	for_each_domain(cpu, sd) {
  10300		/*
  10301		 * Decay the newidle max times here because this is a regular
  10302		 * visit to all the domains.
  10303		 */
  10304		need_decay = update_newidle_cost(sd, 0);
  10305		max_cost += sd->max_newidle_lb_cost;
  10306
  10307		/*
  10308		 * Stop the load balance at this level. There is another
  10309		 * CPU in our sched group which is doing load balancing more
  10310		 * actively.
  10311		 */
  10312		if (!continue_balancing) {
  10313			if (need_decay)
  10314				continue;
  10315			break;
  10316		}
  10317
  10318		interval = get_sd_balance_interval(sd, busy);
  10319
  10320		need_serialize = sd->flags & SD_SERIALIZE;
  10321		if (need_serialize) {
  10322			if (!spin_trylock(&balancing))
  10323				goto out;
  10324		}
  10325
  10326		if (time_after_eq(jiffies, sd->last_balance + interval)) {
  10327			if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
  10328				/*
  10329				 * The LBF_DST_PINNED logic could have changed
  10330				 * env->dst_cpu, so we can't know our idle
  10331				 * state even if we migrated tasks. Update it.
  10332				 */
  10333				idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
  10334				busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
  10335			}
  10336			sd->last_balance = jiffies;
  10337			interval = get_sd_balance_interval(sd, busy);
  10338		}
  10339		if (need_serialize)
  10340			spin_unlock(&balancing);
  10341out:
  10342		if (time_after(next_balance, sd->last_balance + interval)) {
  10343			next_balance = sd->last_balance + interval;
  10344			update_next_balance = 1;
  10345		}
  10346	}
  10347	if (need_decay) {
  10348		/*
  10349		 * Ensure the rq-wide value also decays but keep it at a
  10350		 * reasonable floor to avoid funnies with rq->avg_idle.
  10351		 */
  10352		rq->max_idle_balance_cost =
  10353			max((u64)sysctl_sched_migration_cost, max_cost);
  10354	}
  10355	rcu_read_unlock();
  10356
  10357	/*
  10358	 * next_balance will be updated only when there is a need.
   10359	 * When the CPU is attached to a null domain, for example, it will not be
  10360	 * updated.
  10361	 */
  10362	if (likely(update_next_balance))
  10363		rq->next_balance = next_balance;
  10364
  10365}
  10366
  10367static inline int on_null_domain(struct rq *rq)
  10368{
  10369	return unlikely(!rcu_dereference_sched(rq->sd));
  10370}
  10371
  10372#ifdef CONFIG_NO_HZ_COMMON
  10373/*
  10374 * idle load balancing details
   10375 * - When one of the busy CPUs notices that idle rebalancing may be
   10376 *   needed, it will kick the idle load balancer, which then does idle
   10377 *   load balancing for all the idle CPUs.
   10378 * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED is not
   10379 *   set anywhere yet.
  10380 */
  10381
  10382static inline int find_new_ilb(void)
  10383{
  10384	int ilb;
  10385	const struct cpumask *hk_mask;
  10386
  10387	hk_mask = housekeeping_cpumask(HK_TYPE_MISC);
  10388
  10389	for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) {
  10390
  10391		if (ilb == smp_processor_id())
  10392			continue;
  10393
  10394		if (idle_cpu(ilb))
  10395			return ilb;
  10396	}
  10397
  10398	return nr_cpu_ids;
  10399}
  10400
  10401/*
  10402 * Kick a CPU to do the nohz balancing, if it is time for it. We pick any
  10403 * idle CPU in the HK_TYPE_MISC housekeeping set (if there is one).
  10404 */
  10405static void kick_ilb(unsigned int flags)
  10406{
  10407	int ilb_cpu;
  10408
  10409	/*
   10410	 * Increase nohz.next_balance only if a full ilb is triggered, but
   10411	 * not if we only update stats.
  10412	 */
  10413	if (flags & NOHZ_BALANCE_KICK)
  10414		nohz.next_balance = jiffies+1;
  10415
  10416	ilb_cpu = find_new_ilb();
  10417
  10418	if (ilb_cpu >= nr_cpu_ids)
  10419		return;
  10420
  10421	/*
  10422	 * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets
  10423	 * the first flag owns it; cleared by nohz_csd_func().
  10424	 */
  10425	flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
  10426	if (flags & NOHZ_KICK_MASK)
  10427		return;
  10428
  10429	/*
  10430	 * This way we generate an IPI on the target CPU which
  10431	 * is idle. And the softirq performing nohz idle load balance
  10432	 * will be run before returning from the IPI.
  10433	 */
  10434	smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
  10435}
  10436
  10437/*
  10438 * Current decision point for kicking the idle load balancer in the presence
  10439 * of idle CPUs in the system.
  10440 */
  10441static void nohz_balancer_kick(struct rq *rq)
  10442{
  10443	unsigned long now = jiffies;
  10444	struct sched_domain_shared *sds;
  10445	struct sched_domain *sd;
  10446	int nr_busy, i, cpu = rq->cpu;
  10447	unsigned int flags = 0;
  10448
  10449	if (unlikely(rq->idle_balance))
  10450		return;
  10451
  10452	/*
   10453	 * We may have recently been in ticked or tickless idle mode. At the
   10454	 * first busy tick after returning from idle, we will update the busy stats.
  10455	 */
  10456	nohz_balance_exit_idle(rq);
  10457
  10458	/*
  10459	 * None are in tickless mode and hence no need for NOHZ idle load
  10460	 * balancing.
  10461	 */
  10462	if (likely(!atomic_read(&nohz.nr_cpus)))
  10463		return;
  10464
  10465	if (READ_ONCE(nohz.has_blocked) &&
  10466	    time_after(now, READ_ONCE(nohz.next_blocked)))
  10467		flags = NOHZ_STATS_KICK;
  10468
  10469	if (time_before(now, nohz.next_balance))
  10470		goto out;
  10471
  10472	if (rq->nr_running >= 2) {
  10473		flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
  10474		goto out;
  10475	}
  10476
  10477	rcu_read_lock();
  10478
  10479	sd = rcu_dereference(rq->sd);
  10480	if (sd) {
  10481		/*
  10482		 * If there's a CFS task and the current CPU has reduced
   10483		 * capacity, kick the ILB to see if there's a better CPU to run
  10484		 * on.
  10485		 */
  10486		if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
  10487			flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
  10488			goto unlock;
  10489		}
  10490	}
  10491
  10492	sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
  10493	if (sd) {
  10494		/*
   10495		 * When ASYM_PACKING, see if there's a more preferred CPU
  10496		 * currently idle; in which case, kick the ILB to move tasks
  10497		 * around.
  10498		 */
  10499		for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
  10500			if (sched_asym_prefer(i, cpu)) {
  10501				flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
  10502				goto unlock;
  10503			}
  10504		}
  10505	}
  10506
  10507	sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
  10508	if (sd) {
  10509		/*
   10510		 * When ASYM_CPUCAPACITY, see if there's a higher capacity CPU
  10511		 * to run the misfit task on.
  10512		 */
  10513		if (check_misfit_status(rq, sd)) {
  10514			flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
  10515			goto unlock;
  10516		}
  10517
  10518		/*
  10519		 * For asymmetric systems, we do not want to nicely balance
   10520		 * cache use; instead we want to embrace asymmetry and only
  10521		 * ensure tasks have enough CPU capacity.
  10522		 *
  10523		 * Skip the LLC logic because it's not relevant in that case.
  10524		 */
  10525		goto unlock;
  10526	}
  10527
  10528	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
  10529	if (sds) {
  10530		/*
  10531		 * If there is an imbalance between LLC domains (IOW we could
  10532		 * increase the overall cache use), we need some less-loaded LLC
  10533		 * domain to pull some load. Likewise, we may need to spread
  10534		 * load within the current LLC domain (e.g. packed SMT cores but
  10535		 * other CPUs are idle). We can't really know from here how busy
  10536		 * the others are - so just get a nohz balance going if it looks
  10537		 * like this LLC domain has tasks we could move.
  10538		 */
  10539		nr_busy = atomic_read(&sds->nr_busy_cpus);
  10540		if (nr_busy > 1) {
  10541			flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
  10542			goto unlock;
  10543		}
  10544	}
  10545unlock:
  10546	rcu_read_unlock();
  10547out:
  10548	if (READ_ONCE(nohz.needs_update))
  10549		flags |= NOHZ_NEXT_KICK;
  10550
  10551	if (flags)
  10552		kick_ilb(flags);
  10553}
  10554
  10555static void set_cpu_sd_state_busy(int cpu)
  10556{
  10557	struct sched_domain *sd;
  10558
  10559	rcu_read_lock();
  10560	sd = rcu_dereference(per_cpu(sd_llc, cpu));
  10561
  10562	if (!sd || !sd->nohz_idle)
  10563		goto unlock;
  10564	sd->nohz_idle = 0;
  10565
  10566	atomic_inc(&sd->shared->nr_busy_cpus);
  10567unlock:
  10568	rcu_read_unlock();
  10569}
  10570
  10571void nohz_balance_exit_idle(struct rq *rq)
  10572{
  10573	SCHED_WARN_ON(rq != this_rq());
  10574
  10575	if (likely(!rq->nohz_tick_stopped))
  10576		return;
  10577
  10578	rq->nohz_tick_stopped = 0;
  10579	cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
  10580	atomic_dec(&nohz.nr_cpus);
  10581
  10582	set_cpu_sd_state_busy(rq->cpu);
  10583}
  10584
  10585static void set_cpu_sd_state_idle(int cpu)
  10586{
  10587	struct sched_domain *sd;
  10588
  10589	rcu_read_lock();
  10590	sd = rcu_dereference(per_cpu(sd_llc, cpu));
  10591
  10592	if (!sd || sd->nohz_idle)
  10593		goto unlock;
  10594	sd->nohz_idle = 1;
  10595
  10596	atomic_dec(&sd->shared->nr_busy_cpus);
  10597unlock:
  10598	rcu_read_unlock();
  10599}
  10600
  10601/*
  10602 * This routine will record that the CPU is going idle with tick stopped.
  10603 * This info will be used in performing idle load balancing in the future.
  10604 */
  10605void nohz_balance_enter_idle(int cpu)
  10606{
  10607	struct rq *rq = cpu_rq(cpu);
  10608
  10609	SCHED_WARN_ON(cpu != smp_processor_id());
  10610
  10611	/* If this CPU is going down, then nothing needs to be done: */
  10612	if (!cpu_active(cpu))
  10613		return;
  10614
  10615	/* Spare idle load balancing on CPUs that don't want to be disturbed: */
  10616	if (!housekeeping_cpu(cpu, HK_TYPE_SCHED))
  10617		return;
  10618
  10619	/*
   10620	 * Can be set safely without rq->lock held.
   10621	 * If a clear happens, it will have evaluated the last additions, because
   10622	 * rq->lock is held during both the check and the clear.
  10623	 */
  10624	rq->has_blocked_load = 1;
  10625
  10626	/*
  10627	 * The tick is still stopped but load could have been added in the
   10628	 * meantime. We set the nohz.has_blocked flag to trigger a check of the
   10629	 * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
   10630	 * of nohz.has_blocked can only happen after checking the new load.
  10631	 */
  10632	if (rq->nohz_tick_stopped)
  10633		goto out;
  10634
  10635	/* If we're a completely isolated CPU, we don't play: */
  10636	if (on_null_domain(rq))
  10637		return;
  10638
  10639	rq->nohz_tick_stopped = 1;
  10640
  10641	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
  10642	atomic_inc(&nohz.nr_cpus);
  10643
  10644	/*
  10645	 * Ensures that if nohz_idle_balance() fails to observe our
  10646	 * @idle_cpus_mask store, it must observe the @has_blocked
  10647	 * and @needs_update stores.
  10648	 */
  10649	smp_mb__after_atomic();
  10650
  10651	set_cpu_sd_state_idle(cpu);
  10652
  10653	WRITE_ONCE(nohz.needs_update, 1);
  10654out:
  10655	/*
   10656	 * Each time a CPU enters idle, we assume that it has blocked load and
   10657	 * enable the periodic update of the load of idle CPUs.
  10658	 */
  10659	WRITE_ONCE(nohz.has_blocked, 1);
  10660}
  10661
  10662static bool update_nohz_stats(struct rq *rq)
  10663{
  10664	unsigned int cpu = rq->cpu;
  10665
  10666	if (!rq->has_blocked_load)
  10667		return false;
  10668
  10669	if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
  10670		return false;
  10671
  10672	if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick)))
  10673		return true;
  10674
  10675	update_blocked_averages(cpu);
  10676
  10677	return rq->has_blocked_load;
  10678}
  10679
  10680/*
  10681 * Internal function that runs load balance for all idle cpus. The load balance
  10682 * can be a simple update of blocked load or a complete load balance with
   10683 * task movement, depending on the flags.
  10684 */
  10685static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
  10686			       enum cpu_idle_type idle)
  10687{
  10688	/* Earliest time when we have to do rebalance again */
  10689	unsigned long now = jiffies;
  10690	unsigned long next_balance = now + 60*HZ;
  10691	bool has_blocked_load = false;
  10692	int update_next_balance = 0;
  10693	int this_cpu = this_rq->cpu;
  10694	int balance_cpu;
  10695	struct rq *rq;
  10696
  10697	SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
  10698
  10699	/*
  10700	 * We assume there will be no idle load after this update and clear
   10701	 * the has_blocked flag. If a CPU enters idle in the meantime, it will
   10702	 * set the has_blocked flag and trigger another update of idle load.
   10703	 * Because a CPU that becomes idle is added to idle_cpus_mask before
   10704	 * setting the flag, we are sure not to clear the state and not to
   10705	 * check the load of an idle CPU.
  10706	 *
  10707	 * Same applies to idle_cpus_mask vs needs_update.
  10708	 */
  10709	if (flags & NOHZ_STATS_KICK)
  10710		WRITE_ONCE(nohz.has_blocked, 0);
  10711	if (flags & NOHZ_NEXT_KICK)
  10712		WRITE_ONCE(nohz.needs_update, 0);
  10713
  10714	/*
  10715	 * Ensures that if we miss the CPU, we must see the has_blocked
  10716	 * store from nohz_balance_enter_idle().
  10717	 */
  10718	smp_mb();
  10719
  10720	/*
   10721	 * Start with the next CPU after this_cpu so we will end with this_cpu and
   10722	 * give other idle CPUs a chance to pull load.
  10723	 */
   10724	for_each_cpu_wrap(balance_cpu, nohz.idle_cpus_mask, this_cpu+1) {
  10725		if (!idle_cpu(balance_cpu))
  10726			continue;
  10727
  10728		/*
  10729		 * If this CPU gets work to do, stop the load balancing
  10730		 * work being done for other CPUs. Next load
  10731		 * balancing owner will pick it up.
  10732		 */
  10733		if (need_resched()) {
  10734			if (flags & NOHZ_STATS_KICK)
  10735				has_blocked_load = true;
  10736			if (flags & NOHZ_NEXT_KICK)
  10737				WRITE_ONCE(nohz.needs_update, 1);
  10738			goto abort;
  10739		}
  10740
  10741		rq = cpu_rq(balance_cpu);
  10742
  10743		if (flags & NOHZ_STATS_KICK)
  10744			has_blocked_load |= update_nohz_stats(rq);
  10745
  10746		/*
  10747		 * If time for next balance is due,
  10748		 * do the balance.
  10749		 */
  10750		if (time_after_eq(jiffies, rq->next_balance)) {
  10751			struct rq_flags rf;
  10752
  10753			rq_lock_irqsave(rq, &rf);
  10754			update_rq_clock(rq);
  10755			rq_unlock_irqrestore(rq, &rf);
  10756
  10757			if (flags & NOHZ_BALANCE_KICK)
  10758				rebalance_domains(rq, CPU_IDLE);
  10759		}
  10760
  10761		if (time_after(next_balance, rq->next_balance)) {
  10762			next_balance = rq->next_balance;
  10763			update_next_balance = 1;
  10764		}
  10765	}
  10766
  10767	/*
  10768	 * next_balance will be updated only when there is a need.
   10769	 * When the CPU is attached to a null domain, for example, it will not be
  10770	 * updated.
  10771	 */
  10772	if (likely(update_next_balance))
  10773		nohz.next_balance = next_balance;
  10774
  10775	if (flags & NOHZ_STATS_KICK)
  10776		WRITE_ONCE(nohz.next_blocked,
  10777			   now + msecs_to_jiffies(LOAD_AVG_PERIOD));
  10778
  10779abort:
  10780	/* There is still blocked load, enable periodic update */
  10781	if (has_blocked_load)
  10782		WRITE_ONCE(nohz.has_blocked, 1);
  10783}
  10784
  10785/*
   10786 * In the CONFIG_NO_HZ_COMMON case, the CPU kicked for idle balancing will
   10787 * do the rebalancing for all the CPUs whose scheduler ticks are stopped.
  10788 */
  10789static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
  10790{
  10791	unsigned int flags = this_rq->nohz_idle_balance;
  10792
  10793	if (!flags)
  10794		return false;
  10795
  10796	this_rq->nohz_idle_balance = 0;
  10797
  10798	if (idle != CPU_IDLE)
  10799		return false;
  10800
  10801	_nohz_idle_balance(this_rq, flags, idle);
  10802
  10803	return true;
  10804}
  10805
  10806/*
  10807 * Check if we need to run the ILB for updating blocked load before entering
  10808 * idle state.
  10809 */
  10810void nohz_run_idle_balance(int cpu)
  10811{
  10812	unsigned int flags;
  10813
  10814	flags = atomic_fetch_andnot(NOHZ_NEWILB_KICK, nohz_flags(cpu));
  10815
  10816	/*
   10817	 * Update the blocked load only if no SCHED_SOFTIRQ is about to
   10818	 * happen that would (with NOHZ_STATS_KICK set) do the same update.
  10819	 */
  10820	if ((flags == NOHZ_NEWILB_KICK) && !need_resched())
  10821		_nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK, CPU_IDLE);
  10822}
  10823
  10824static void nohz_newidle_balance(struct rq *this_rq)
  10825{
  10826	int this_cpu = this_rq->cpu;
  10827
  10828	/*
  10829	 * This CPU doesn't want to be disturbed by scheduler
  10830	 * housekeeping
  10831	 */
  10832	if (!housekeeping_cpu(this_cpu, HK_TYPE_SCHED))
  10833		return;
  10834
   10835	/* Will wake up very soon. No time for doing anything else */
  10836	if (this_rq->avg_idle < sysctl_sched_migration_cost)
  10837		return;
  10838
   10839	/* Don't need to update blocked load of idle CPUs */
  10840	if (!READ_ONCE(nohz.has_blocked) ||
  10841	    time_before(jiffies, READ_ONCE(nohz.next_blocked)))
  10842		return;
  10843
  10844	/*
  10845	 * Set the need to trigger ILB in order to update blocked load
  10846	 * before entering idle state.
  10847	 */
  10848	atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu));
  10849}
  10850
  10851#else /* !CONFIG_NO_HZ_COMMON */
  10852static inline void nohz_balancer_kick(struct rq *rq) { }
  10853
  10854static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
  10855{
  10856	return false;
  10857}
  10858
  10859static inline void nohz_newidle_balance(struct rq *this_rq) { }
  10860#endif /* CONFIG_NO_HZ_COMMON */
  10861
  10862/*
  10863 * newidle_balance is called by schedule() if this_cpu is about to become
  10864 * idle. Attempts to pull tasks from other CPUs.
  10865 *
  10866 * Returns:
  10867 *   < 0 - we released the lock and there are !fair tasks present
  10868 *     0 - failed, no new tasks
  10869 *   > 0 - success, new (fair) tasks present
  10870 */
  10871static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
  10872{
  10873	unsigned long next_balance = jiffies + HZ;
  10874	int this_cpu = this_rq->cpu;
  10875	u64 t0, t1, curr_cost = 0;
  10876	struct sched_domain *sd;
  10877	int pulled_task = 0;
  10878
  10879	update_misfit_status(NULL, this_rq);
  10880
  10881	/*
  10882	 * There is a task waiting to run. No need to search for one.
  10883	 * Return 0; the task will be enqueued when switching to idle.
  10884	 */
  10885	if (this_rq->ttwu_pending)
  10886		return 0;
  10887
  10888	/*
  10889	 * We must set idle_stamp _before_ calling idle_balance(), such that we
  10890	 * measure the duration of idle_balance() as idle time.
  10891	 */
  10892	this_rq->idle_stamp = rq_clock(this_rq);
  10893
  10894	/*
  10895	 * Do not pull tasks towards !active CPUs...
  10896	 */
  10897	if (!cpu_active(this_cpu))
  10898		return 0;
  10899
  10900	/*
  10901	 * This is OK, because current is on_cpu, which avoids it being picked
   10902	 * for load balance; preemption/IRQs are still disabled, avoiding
   10903	 * further scheduler activity on it, and we're being very careful to
   10904	 * re-start the picking loop.
  10905	 */
  10906	rq_unpin_lock(this_rq, rf);
  10907
  10908	rcu_read_lock();
  10909	sd = rcu_dereference_check_sched_domain(this_rq->sd);
  10910
  10911	if (!READ_ONCE(this_rq->rd->overload) ||
  10912	    (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
  10913
  10914		if (sd)
  10915			update_next_balance(sd, &next_balance);
  10916		rcu_read_unlock();
  10917
  10918		goto out;
  10919	}
  10920	rcu_read_unlock();
  10921
  10922	raw_spin_rq_unlock(this_rq);
  10923
  10924	t0 = sched_clock_cpu(this_cpu);
  10925	update_blocked_averages(this_cpu);
  10926
  10927	rcu_read_lock();
  10928	for_each_domain(this_cpu, sd) {
  10929		int continue_balancing = 1;
  10930		u64 domain_cost;
  10931
  10932		update_next_balance(sd, &next_balance);
  10933
  10934		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
  10935			break;
  10936
  10937		if (sd->flags & SD_BALANCE_NEWIDLE) {
  10938
  10939			pulled_task = load_balance(this_cpu, this_rq,
  10940						   sd, CPU_NEWLY_IDLE,
  10941						   &continue_balancing);
  10942
  10943			t1 = sched_clock_cpu(this_cpu);
  10944			domain_cost = t1 - t0;
  10945			update_newidle_cost(sd, domain_cost);
  10946
  10947			curr_cost += domain_cost;
  10948			t0 = t1;
  10949		}
  10950
  10951		/*
  10952		 * Stop searching for tasks to pull if there are
  10953		 * now runnable tasks on this rq.
  10954		 */
  10955		if (pulled_task || this_rq->nr_running > 0 ||
  10956		    this_rq->ttwu_pending)
  10957			break;
  10958	}
  10959	rcu_read_unlock();
  10960
  10961	raw_spin_rq_lock(this_rq);
  10962
  10963	if (curr_cost > this_rq->max_idle_balance_cost)
  10964		this_rq->max_idle_balance_cost = curr_cost;
  10965
  10966	/*
   10967	 * While browsing the domains we released the rq lock; a task could
  10968	 * have been enqueued in the meantime. Since we're not going idle,
  10969	 * pretend we pulled a task.
  10970	 */
  10971	if (this_rq->cfs.h_nr_running && !pulled_task)
  10972		pulled_task = 1;
  10973
   10974	/* Is there a task of a higher priority class? */
  10975	if (this_rq->nr_running != this_rq->cfs.h_nr_running)
  10976		pulled_task = -1;
  10977
  10978out:
  10979	/* Move the next balance forward */
  10980	if (time_after(this_rq->next_balance, next_balance))
  10981		this_rq->next_balance = next_balance;
  10982
  10983	if (pulled_task)
  10984		this_rq->idle_stamp = 0;
  10985	else
  10986		nohz_newidle_balance(this_rq);
  10987
  10988	rq_repin_lock(this_rq, rf);
  10989
  10990	return pulled_task;
  10991}
  10992
  10993/*
  10994 * run_rebalance_domains is triggered when needed from the scheduler tick.
  10995 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
  10996 */
  10997static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
  10998{
  10999	struct rq *this_rq = this_rq();
  11000	enum cpu_idle_type idle = this_rq->idle_balance ?
  11001						CPU_IDLE : CPU_NOT_IDLE;
  11002
  11003	/*
  11004	 * If this CPU has a pending nohz_balance_kick, then do the
  11005	 * balancing on behalf of the other idle CPUs whose ticks are
  11006	 * stopped. Do nohz_idle_balance *before* rebalance_domains to
  11007	 * give the idle CPUs a chance to load balance. Else we may
  11008	 * load balance only within the local sched_domain hierarchy
  11009	 * and abort nohz_idle_balance altogether if we pull some load.
  11010	 */
  11011	if (nohz_idle_balance(this_rq, idle))
  11012		return;
  11013
  11014	/* normal load balance */
  11015	update_blocked_averages(this_rq->cpu);
  11016	rebalance_domains(this_rq, idle);
  11017}
  11018
  11019/*
  11020 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
  11021 */
  11022void trigger_load_balance(struct rq *rq)
  11023{
  11024	/*
   11025	 * Don't need to rebalance while attached to the NULL domain or
   11026	 * while the runqueue's CPU is not active.
  11027	 */
  11028	if (unlikely(on_null_domain(rq) || !cpu_active(cpu_of(rq))))
  11029		return;
  11030
  11031	if (time_after_eq(jiffies, rq->next_balance))
  11032		raise_softirq(SCHED_SOFTIRQ);
  11033
  11034	nohz_balancer_kick(rq);
  11035}
  11036
  11037static void rq_online_fair(struct rq *rq)
  11038{
  11039	update_sysctl();
  11040
  11041	update_runtime_enabled(rq);
  11042}
  11043
  11044static void rq_offline_fair(struct rq *rq)
  11045{
  11046	update_sysctl();
  11047
  11048	/* Ensure any throttled groups are reachable by pick_next_task */
  11049	unthrottle_offline_cfs_rqs(rq);
  11050}
  11051
  11052#endif /* CONFIG_SMP */
  11053
  11054#ifdef CONFIG_SCHED_CORE
  11055static inline bool
  11056__entity_slice_used(struct sched_entity *se, int min_nr_tasks)
  11057{
  11058	u64 slice = sched_slice(cfs_rq_of(se), se);
  11059	u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
  11060
  11061	return (rtime * min_nr_tasks > slice);
  11062}
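        /*
         * Editor's note: the check above is "rtime * min_nr_tasks > slice",
         * i.e. rtime > slice / min_nr_tasks without the integer division. As
         * an assumed example, with a 3 ms slice and min_nr_tasks = 2 the
         * entity counts as having used its share once it has run for more
         * than 1.5 ms since it was last scheduled in.
         */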
  11063
  11064#define MIN_NR_TASKS_DURING_FORCEIDLE	2
  11065static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
  11066{
  11067	if (!sched_core_enabled(rq))
  11068		return;
  11069
  11070	/*
   11071	 * If the runqueue has only one task, which has used up its slice,
   11072	 * and the sibling is forced idle, then trigger a reschedule to
   11073	 * give the forced-idle task a chance.
  11074	 *
  11075	 * sched_slice() considers only this active rq and it gets the
  11076	 * whole slice. But during force idle, we have siblings acting
  11077	 * like a single runqueue and hence we need to consider runnable
  11078	 * tasks on this CPU and the forced idle CPU. Ideally, we should
  11079	 * go through the forced idle rq, but that would be a perf hit.
  11080	 * We can assume that the forced idle CPU has at least
  11081	 * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
  11082	 * if we need to give up the CPU.
  11083	 */
  11084	if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 &&
  11085	    __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
  11086		resched_curr(rq);
  11087}
  11088
  11089/*
  11090 * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed.
  11091 */
  11092static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle)
  11093{
  11094	for_each_sched_entity(se) {
  11095		struct cfs_rq *cfs_rq = cfs_rq_of(se);
  11096
  11097		if (forceidle) {
  11098			if (cfs_rq->forceidle_seq == fi_seq)
  11099				break;
  11100			cfs_rq->forceidle_seq = fi_seq;
  11101		}
  11102
  11103		cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
  11104	}
  11105}
  11106
  11107void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi)
  11108{
  11109	struct sched_entity *se = &p->se;
  11110
  11111	if (p->sched_class != &fair_sched_class)
  11112		return;
  11113
  11114	se_fi_update(se, rq->core->core_forceidle_seq, in_fi);
  11115}
  11116
  11117bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
  11118{
  11119	struct rq *rq = task_rq(a);
  11120	struct sched_entity *sea = &a->se;
  11121	struct sched_entity *seb = &b->se;
  11122	struct cfs_rq *cfs_rqa;
  11123	struct cfs_rq *cfs_rqb;
  11124	s64 delta;
  11125
  11126	SCHED_WARN_ON(task_rq(b)->core != rq->core);
  11127
  11128#ifdef CONFIG_FAIR_GROUP_SCHED
  11129	/*
  11130	 * Find an se in the hierarchy for tasks a and b, such that the se's
  11131	 * are immediate siblings.
  11132	 */
  11133	while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
  11134		int sea_depth = sea->depth;
  11135		int seb_depth = seb->depth;
  11136
  11137		if (sea_depth >= seb_depth)
  11138			sea = parent_entity(sea);
  11139		if (sea_depth <= seb_depth)
  11140			seb = parent_entity(seb);
  11141	}
  11142
  11143	se_fi_update(sea, rq->core->core_forceidle_seq, in_fi);
  11144	se_fi_update(seb, rq->core->core_forceidle_seq, in_fi);
  11145
  11146	cfs_rqa = sea->cfs_rq;
  11147	cfs_rqb = seb->cfs_rq;
  11148#else
  11149	cfs_rqa = &task_rq(a)->cfs;
  11150	cfs_rqb = &task_rq(b)->cfs;
  11151#endif
  11152
  11153	/*
  11154	 * Find delta after normalizing se's vruntime with its cfs_rq's
  11155	 * min_vruntime_fi, which would have been updated in prior calls
  11156	 * to se_fi_update().
  11157	 */
  11158	delta = (s64)(sea->vruntime - seb->vruntime) +
  11159		(s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
  11160
  11161	return delta > 0;
  11162}
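        /*
         * Editor's note, an illustrative calculation (numbers assumed): if
         * sea->vruntime = 100 with cfs_rqa->min_vruntime_fi = 90, and
         * seb->vruntime = 205 with cfs_rqb->min_vruntime_fi = 200, the
         * normalized vruntimes are 10 and 5, so delta = 10 - 5 = 5 > 0 and
         * the function reports that a has less priority than b.
         */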
  11163#else
  11164static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
  11165#endif
  11166
  11167/*
  11168 * scheduler tick hitting a task of our scheduling class.
  11169 *
  11170 * NOTE: This function can be called remotely by the tick offload that
   11171 * goes along with full dynticks. Therefore no local assumption can be made
  11172 * and everything must be accessed through the @rq and @curr passed in
  11173 * parameters.
  11174 */
  11175static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
  11176{
  11177	struct cfs_rq *cfs_rq;
  11178	struct sched_entity *se = &curr->se;
  11179
  11180	for_each_sched_entity(se) {
  11181		cfs_rq = cfs_rq_of(se);
  11182		entity_tick(cfs_rq, se, queued);
  11183	}
  11184
  11185	if (static_branch_unlikely(&sched_numa_balancing))
  11186		task_tick_numa(rq, curr);
  11187
  11188	update_misfit_status(curr, rq);
  11189	update_overutilized_status(task_rq(curr));
  11190
  11191	task_tick_core(rq, curr);
  11192}
  11193
  11194/*
  11195 * called on fork with the child task as argument from the parent's context
  11196 *  - child not yet on the tasklist
  11197 *  - preemption disabled
  11198 */
  11199static void task_fork_fair(struct task_struct *p)
  11200{
  11201	struct cfs_rq *cfs_rq;
  11202	struct sched_entity *se = &p->se, *curr;
  11203	struct rq *rq = this_rq();
  11204	struct rq_flags rf;
  11205
  11206	rq_lock(rq, &rf);
  11207	update_rq_clock(rq);
  11208
  11209	cfs_rq = task_cfs_rq(current);
  11210	curr = cfs_rq->curr;
  11211	if (curr) {
  11212		update_curr(cfs_rq);
  11213		se->vruntime = curr->vruntime;
  11214	}
  11215	place_entity(cfs_rq, se, 1);
  11216
  11217	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
  11218		/*
  11219		 * Upon rescheduling, sched_class::put_prev_task() will place
  11220		 * 'current' within the tree based on its new key value.
  11221		 */
  11222		swap(curr->vruntime, se->vruntime);
  11223		resched_curr(rq);
  11224	}
  11225
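        	/*
        	 * Editor's note: make the child's vruntime relative to this
        	 * cfs_rq's min_vruntime; the min_vruntime of whichever cfs_rq the
        	 * child is eventually enqueued on is added back on the
        	 * wake_up_new_task() enqueue path.
        	 */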
  11226	se->vruntime -= cfs_rq->min_vruntime;
  11227	rq_unlock(rq, &rf);
  11228}
  11229
  11230/*
  11231 * Priority of the task has changed. Check to see if we preempt
  11232 * the current task.
  11233 */
  11234static void
  11235prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
  11236{
  11237	if (!task_on_rq_queued(p))
  11238		return;
  11239
  11240	if (rq->cfs.nr_running == 1)
  11241		return;
  11242
  11243	/*
  11244	 * Reschedule if we are currently running on this runqueue and
  11245	 * our priority decreased, or if we are not currently running on
  11246	 * this runqueue and our priority is higher than the current's
  11247	 */
  11248	if (task_current(rq, p)) {
  11249		if (p->prio > oldprio)
  11250			resched_curr(rq);
  11251	} else
  11252		check_preempt_curr(rq, p, 0);
  11253}
  11254
  11255static inline bool vruntime_normalized(struct task_struct *p)
  11256{
  11257	struct sched_entity *se = &p->se;
  11258
  11259	/*
  11260	 * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
  11261	 * the dequeue_entity(.flags=0) will already have normalized the
  11262	 * vruntime.
  11263	 */
  11264	if (p->on_rq)
  11265		return true;
  11266
  11267	/*
  11268	 * When !on_rq, vruntime of the task has usually NOT been normalized.
  11269	 * But there are some cases where it has already been normalized:
  11270	 *
   11271	 * - A forked child which is waiting to be woken up by
   11272	 *   wake_up_new_task().
   11273	 * - A task which has been woken up by try_to_wake_up() and is
   11274	 *   waiting to actually be woken up by sched_ttwu_pending().
  11275	 */
  11276	if (!se->sum_exec_runtime ||
  11277	    (READ_ONCE(p->__state) == TASK_WAKING && p->sched_remote_wakeup))
  11278		return true;
  11279
  11280	return false;
  11281}
  11282
  11283#ifdef CONFIG_FAIR_GROUP_SCHED
  11284/*
   11285 * Propagate the changes of the sched_entity across the tg tree to make them
   11286 * visible to the root.
  11287 */
  11288static void propagate_entity_cfs_rq(struct sched_entity *se)
  11289{
  11290	struct cfs_rq *cfs_rq;
  11291
  11292	list_add_leaf_cfs_rq(cfs_rq_of(se));
  11293
  11294	/* Start to propagate at parent */
  11295	se = se->parent;
  11296
  11297	for_each_sched_entity(se) {
  11298		cfs_rq = cfs_rq_of(se);
  11299
   11300		if (!cfs_rq_throttled(cfs_rq)) {
  11301			update_load_avg(cfs_rq, se, UPDATE_TG);
  11302			list_add_leaf_cfs_rq(cfs_rq);
  11303			continue;
  11304		}
  11305
  11306		if (list_add_leaf_cfs_rq(cfs_rq))
  11307			break;
  11308	}
  11309}
  11310#else
  11311static void propagate_entity_cfs_rq(struct sched_entity *se) { }
  11312#endif
  11313
  11314static void detach_entity_cfs_rq(struct sched_entity *se)
  11315{
  11316	struct cfs_rq *cfs_rq = cfs_rq_of(se);
  11317
  11318	/* Catch up with the cfs_rq and remove our load when we leave */
  11319	update_load_avg(cfs_rq, se, 0);
  11320	detach_entity_load_avg(cfs_rq, se);
  11321	update_tg_load_avg(cfs_rq);
  11322	propagate_entity_cfs_rq(se);
  11323}
  11324
  11325static void attach_entity_cfs_rq(struct sched_entity *se)
  11326{
  11327	struct cfs_rq *cfs_rq = cfs_rq_of(se);
  11328
  11329#ifdef CONFIG_FAIR_GROUP_SCHED
  11330	/*
   11331	 * Since the real depth could have been changed (only the FAIR
   11332	 * class maintains the depth value), reset the depth properly.
  11333	 */
  11334	se->depth = se->parent ? se->parent->depth + 1 : 0;
  11335#endif
  11336
  11337	/* Synchronize entity with its cfs_rq */
  11338	update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
  11339	attach_entity_load_avg(cfs_rq, se);
  11340	update_tg_load_avg(cfs_rq);
  11341	propagate_entity_cfs_rq(se);
  11342}
  11343
  11344static void detach_task_cfs_rq(struct task_struct *p)
  11345{
  11346	struct sched_entity *se = &p->se;
  11347	struct cfs_rq *cfs_rq = cfs_rq_of(se);
  11348
  11349	if (!vruntime_normalized(p)) {
  11350		/*
  11351		 * Fix up our vruntime so that the current sleep doesn't
  11352		 * cause 'unlimited' sleep bonus.
  11353		 */
  11354		place_entity(cfs_rq, se, 0);
  11355		se->vruntime -= cfs_rq->min_vruntime;
  11356	}
  11357
  11358	detach_entity_cfs_rq(se);
  11359}
  11360
  11361static void attach_task_cfs_rq(struct task_struct *p)
  11362{
  11363	struct sched_entity *se = &p->se;
  11364	struct cfs_rq *cfs_rq = cfs_rq_of(se);
  11365
  11366	attach_entity_cfs_rq(se);
  11367
  11368	if (!vruntime_normalized(p))
  11369		se->vruntime += cfs_rq->min_vruntime;
  11370}
  11371
  11372static void switched_from_fair(struct rq *rq, struct task_struct *p)
  11373{
  11374	detach_task_cfs_rq(p);
  11375}
  11376
  11377static void switched_to_fair(struct rq *rq, struct task_struct *p)
  11378{
  11379	attach_task_cfs_rq(p);
  11380
  11381	if (task_on_rq_queued(p)) {
  11382		/*
  11383		 * We were most likely switched from sched_rt, so
  11384		 * kick off the schedule if running, otherwise just see
  11385		 * if we can still preempt the current task.
  11386		 */
  11387		if (task_current(rq, p))
  11388			resched_curr(rq);
  11389		else
  11390			check_preempt_curr(rq, p, 0);
  11391	}
  11392}
  11393
  11394/* Account for a task changing its policy or group.
  11395 *
  11396 * This routine is mostly called to set cfs_rq->curr field when a task
  11397 * migrates between groups/classes.
  11398 */
  11399static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
  11400{
  11401	struct sched_entity *se = &p->se;
  11402
  11403#ifdef CONFIG_SMP
  11404	if (task_on_rq_queued(p)) {
  11405		/*
  11406		 * Move the next running task to the front of the list, so our
   11407		 * cfs_tasks list becomes an MRU one.
  11408		 */
  11409		list_move(&se->group_node, &rq->cfs_tasks);
  11410	}
  11411#endif
  11412
  11413	for_each_sched_entity(se) {
  11414		struct cfs_rq *cfs_rq = cfs_rq_of(se);
  11415
  11416		set_next_entity(cfs_rq, se);
  11417		/* ensure bandwidth has been allocated on our new cfs_rq */
  11418		account_cfs_rq_runtime(cfs_rq, 0);
  11419	}
  11420}
  11421
  11422void init_cfs_rq(struct cfs_rq *cfs_rq)
  11423{
  11424	cfs_rq->tasks_timeline = RB_ROOT_CACHED;
  11425	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
  11426#ifndef CONFIG_64BIT
  11427	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
  11428#endif
  11429#ifdef CONFIG_SMP
  11430	raw_spin_lock_init(&cfs_rq->removed.lock);
  11431#endif
  11432}
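        /*
         * Editor's note: min_vruntime starts at (u64)(-(1LL << 20)), i.e.
         * about a millisecond's worth of nanoseconds below the 64-bit wrap
         * point, so the signed vruntime comparisons have to cope with
         * wraparound soon after boot rather than only after a very long
         * uptime.
         */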
  11433
  11434#ifdef CONFIG_FAIR_GROUP_SCHED
  11435static void task_set_group_fair(struct task_struct *p)
  11436{
  11437	struct sched_entity *se = &p->se;
  11438
  11439	set_task_rq(p, task_cpu(p));
  11440	se->depth = se->parent ? se->parent->depth + 1 : 0;
  11441}
  11442
  11443static void task_move_group_fair(struct task_struct *p)
  11444{
  11445	detach_task_cfs_rq(p);
  11446	set_task_rq(p, task_cpu(p));
  11447
  11448#ifdef CONFIG_SMP
  11449	/* Tell se's cfs_rq has been changed -- migrated */
  11450	p->se.avg.last_update_time = 0;
  11451#endif
  11452	attach_task_cfs_rq(p);
  11453}
  11454
  11455static void task_change_group_fair(struct task_struct *p, int type)
  11456{
  11457	switch (type) {
  11458	case TASK_SET_GROUP:
  11459		task_set_group_fair(p);
  11460		break;
  11461
  11462	case TASK_MOVE_GROUP:
  11463		task_move_group_fair(p);
  11464		break;
  11465	}
  11466}
  11467
  11468void free_fair_sched_group(struct task_group *tg)
  11469{
  11470	int i;
  11471
  11472	for_each_possible_cpu(i) {
  11473		if (tg->cfs_rq)
  11474			kfree(tg->cfs_rq[i]);
  11475		if (tg->se)
  11476			kfree(tg->se[i]);
  11477	}
  11478
  11479	kfree(tg->cfs_rq);
  11480	kfree(tg->se);
  11481}
  11482
  11483int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
  11484{
  11485	struct sched_entity *se;
  11486	struct cfs_rq *cfs_rq;
  11487	int i;
  11488
  11489	tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
  11490	if (!tg->cfs_rq)
  11491		goto err;
  11492	tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL);
  11493	if (!tg->se)
  11494		goto err;
  11495
  11496	tg->shares = NICE_0_LOAD;
  11497
  11498	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
  11499
  11500	for_each_possible_cpu(i) {
  11501		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
  11502				      GFP_KERNEL, cpu_to_node(i));
  11503		if (!cfs_rq)
  11504			goto err;
  11505
  11506		se = kzalloc_node(sizeof(struct sched_entity_stats),
  11507				  GFP_KERNEL, cpu_to_node(i));
  11508		if (!se)
  11509			goto err_free_rq;
  11510
  11511		init_cfs_rq(cfs_rq);
  11512		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
  11513		init_entity_runnable_average(se);
  11514	}
  11515
  11516	return 1;
  11517
  11518err_free_rq:
  11519	kfree(cfs_rq);
  11520err:
  11521	return 0;
  11522}
  11523
  11524void online_fair_sched_group(struct task_group *tg)
  11525{
  11526	struct sched_entity *se;
  11527	struct rq_flags rf;
  11528	struct rq *rq;
  11529	int i;
  11530
  11531	for_each_possible_cpu(i) {
  11532		rq = cpu_rq(i);
  11533		se = tg->se[i];
  11534		rq_lock_irq(rq, &rf);
  11535		update_rq_clock(rq);
  11536		attach_entity_cfs_rq(se);
  11537		sync_throttle(tg, i);
  11538		rq_unlock_irq(rq, &rf);
  11539	}
  11540}
  11541
  11542void unregister_fair_sched_group(struct task_group *tg)
  11543{
  11544	unsigned long flags;
  11545	struct rq *rq;
  11546	int cpu;
  11547
  11548	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
  11549
  11550	for_each_possible_cpu(cpu) {
  11551		if (tg->se[cpu])
  11552			remove_entity_load_avg(tg->se[cpu]);
  11553
  11554		/*
   11555		 * Only empty task groups can be destroyed, so we can speculatively
  11556		 * check on_list without danger of it being re-added.
  11557		 */
  11558		if (!tg->cfs_rq[cpu]->on_list)
  11559			continue;
  11560
  11561		rq = cpu_rq(cpu);
  11562
  11563		raw_spin_rq_lock_irqsave(rq, flags);
  11564		list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
  11565		raw_spin_rq_unlock_irqrestore(rq, flags);
  11566	}
  11567}
  11568
  11569void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
  11570			struct sched_entity *se, int cpu,
  11571			struct sched_entity *parent)
  11572{
  11573	struct rq *rq = cpu_rq(cpu);
  11574
  11575	cfs_rq->tg = tg;
  11576	cfs_rq->rq = rq;
  11577	init_cfs_rq_runtime(cfs_rq);
  11578
  11579	tg->cfs_rq[cpu] = cfs_rq;
  11580	tg->se[cpu] = se;
  11581
  11582	/* se could be NULL for root_task_group */
  11583	if (!se)
  11584		return;
  11585
  11586	if (!parent) {
  11587		se->cfs_rq = &rq->cfs;
  11588		se->depth = 0;
  11589	} else {
  11590		se->cfs_rq = parent->my_q;
  11591		se->depth = parent->depth + 1;
  11592	}
  11593
  11594	se->my_q = cfs_rq;
  11595	/* guarantee group entities always have weight */
  11596	update_load_set(&se->load, NICE_0_LOAD);
  11597	se->parent = parent;
  11598}
  11599
  11600static DEFINE_MUTEX(shares_mutex);
  11601
  11602static int __sched_group_set_shares(struct task_group *tg, unsigned long shares)
  11603{
  11604	int i;
  11605
  11606	lockdep_assert_held(&shares_mutex);
  11607
  11608	/*
  11609	 * We can't change the weight of the root cgroup.
  11610	 */
  11611	if (!tg->se[0])
  11612		return -EINVAL;
  11613
  11614	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
  11615
  11616	if (tg->shares == shares)
  11617		return 0;
  11618
  11619	tg->shares = shares;
  11620	for_each_possible_cpu(i) {
  11621		struct rq *rq = cpu_rq(i);
  11622		struct sched_entity *se = tg->se[i];
  11623		struct rq_flags rf;
  11624
  11625		/* Propagate contribution to hierarchy */
  11626		rq_lock_irqsave(rq, &rf);
  11627		update_rq_clock(rq);
  11628		for_each_sched_entity(se) {
  11629			update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
  11630			update_cfs_group(se);
  11631		}
  11632		rq_unlock_irqrestore(rq, &rf);
  11633	}
  11634
  11635	return 0;
  11636}
  11637
  11638int sched_group_set_shares(struct task_group *tg, unsigned long shares)
  11639{
  11640	int ret;
  11641
  11642	mutex_lock(&shares_mutex);
  11643	if (tg_is_idle(tg))
  11644		ret = -EINVAL;
  11645	else
  11646		ret = __sched_group_set_shares(tg, shares);
  11647	mutex_unlock(&shares_mutex);
  11648
  11649	return ret;
  11650}
  11651
  11652int sched_group_set_idle(struct task_group *tg, long idle)
  11653{
  11654	int i;
  11655
  11656	if (tg == &root_task_group)
  11657		return -EINVAL;
  11658
  11659	if (idle < 0 || idle > 1)
  11660		return -EINVAL;
  11661
  11662	mutex_lock(&shares_mutex);
  11663
  11664	if (tg->idle == idle) {
  11665		mutex_unlock(&shares_mutex);
  11666		return 0;
  11667	}
  11668
  11669	tg->idle = idle;
  11670
  11671	for_each_possible_cpu(i) {
  11672		struct rq *rq = cpu_rq(i);
  11673		struct sched_entity *se = tg->se[i];
  11674		struct cfs_rq *parent_cfs_rq, *grp_cfs_rq = tg->cfs_rq[i];
  11675		bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
  11676		long idle_task_delta;
  11677		struct rq_flags rf;
  11678
  11679		rq_lock_irqsave(rq, &rf);
  11680
  11681		grp_cfs_rq->idle = idle;
  11682		if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
  11683			goto next_cpu;
  11684
  11685		if (se->on_rq) {
  11686			parent_cfs_rq = cfs_rq_of(se);
  11687			if (cfs_rq_is_idle(grp_cfs_rq))
  11688				parent_cfs_rq->idle_nr_running++;
  11689			else
  11690				parent_cfs_rq->idle_nr_running--;
  11691		}
  11692
  11693		idle_task_delta = grp_cfs_rq->h_nr_running -
  11694				  grp_cfs_rq->idle_h_nr_running;
  11695		if (!cfs_rq_is_idle(grp_cfs_rq))
  11696			idle_task_delta *= -1;
  11697
  11698		for_each_sched_entity(se) {
  11699			struct cfs_rq *cfs_rq = cfs_rq_of(se);
  11700
  11701			if (!se->on_rq)
  11702				break;
  11703
  11704			cfs_rq->idle_h_nr_running += idle_task_delta;
  11705
  11706			/* Already accounted at parent level and above. */
  11707			if (cfs_rq_is_idle(cfs_rq))
  11708				break;
  11709		}
  11710
  11711next_cpu:
  11712		rq_unlock_irqrestore(rq, &rf);
  11713	}
  11714
  11715	/* Idle groups have minimum weight. */
  11716	if (tg_is_idle(tg))
  11717		__sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO));
  11718	else
  11719		__sched_group_set_shares(tg, NICE_0_LOAD);
  11720
  11721	mutex_unlock(&shares_mutex);
  11722	return 0;
  11723}
  11724
  11725#else /* CONFIG_FAIR_GROUP_SCHED */
  11726
  11727void free_fair_sched_group(struct task_group *tg) { }
  11728
  11729int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
  11730{
  11731	return 1;
  11732}
  11733
  11734void online_fair_sched_group(struct task_group *tg) { }
  11735
  11736void unregister_fair_sched_group(struct task_group *tg) { }
  11737
  11738#endif /* CONFIG_FAIR_GROUP_SCHED */
  11739
  11740
  11741static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
  11742{
  11743	struct sched_entity *se = &task->se;
  11744	unsigned int rr_interval = 0;
  11745
  11746	/*
  11747	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
  11748	 * idle runqueue:
  11749	 */
  11750	if (rq->cfs.load.weight)
  11751		rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
  11752
  11753	return rr_interval;
  11754}
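        /*
         * Editor's note, an illustrative calculation (values assumed): with a
         * computed sched_slice() of 6 ms and HZ = 250 (4 ms per jiffy),
         * NS_TO_JIFFIES() yields an rr_interval of 1 jiffy; this is the value
         * that the sched_rr_get_interval() syscall ends up reporting for a
         * SCHED_OTHER task on a non-idle runqueue.
         */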
  11755
  11756/*
  11757 * All the scheduling class methods:
  11758 */
  11759DEFINE_SCHED_CLASS(fair) = {
  11760
  11761	.enqueue_task		= enqueue_task_fair,
  11762	.dequeue_task		= dequeue_task_fair,
  11763	.yield_task		= yield_task_fair,
  11764	.yield_to_task		= yield_to_task_fair,
  11765
  11766	.check_preempt_curr	= check_preempt_wakeup,
  11767
  11768	.pick_next_task		= __pick_next_task_fair,
  11769	.put_prev_task		= put_prev_task_fair,
  11770	.set_next_task          = set_next_task_fair,
  11771
  11772#ifdef CONFIG_SMP
  11773	.balance		= balance_fair,
  11774	.pick_task		= pick_task_fair,
  11775	.select_task_rq		= select_task_rq_fair,
  11776	.migrate_task_rq	= migrate_task_rq_fair,
  11777
  11778	.rq_online		= rq_online_fair,
  11779	.rq_offline		= rq_offline_fair,
  11780
  11781	.task_dead		= task_dead_fair,
  11782	.set_cpus_allowed	= set_cpus_allowed_common,
  11783#endif
  11784
  11785	.task_tick		= task_tick_fair,
  11786	.task_fork		= task_fork_fair,
  11787
  11788	.prio_changed		= prio_changed_fair,
  11789	.switched_from		= switched_from_fair,
  11790	.switched_to		= switched_to_fair,
  11791
  11792	.get_rr_interval	= get_rr_interval_fair,
  11793
  11794	.update_curr		= update_curr_fair,
  11795
  11796#ifdef CONFIG_FAIR_GROUP_SCHED
  11797	.task_change_group	= task_change_group_fair,
  11798#endif
  11799
  11800#ifdef CONFIG_UCLAMP_TASK
  11801	.uclamp_enabled		= 1,
  11802#endif
  11803};
  11804
  11805#ifdef CONFIG_SCHED_DEBUG
  11806void print_cfs_stats(struct seq_file *m, int cpu)
  11807{
  11808	struct cfs_rq *cfs_rq, *pos;
  11809
  11810	rcu_read_lock();
  11811	for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
  11812		print_cfs_rq(m, cpu, cfs_rq);
  11813	rcu_read_unlock();
  11814}
  11815
  11816#ifdef CONFIG_NUMA_BALANCING
  11817void show_numa_stats(struct task_struct *p, struct seq_file *m)
  11818{
  11819	int node;
  11820	unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
  11821	struct numa_group *ng;
  11822
  11823	rcu_read_lock();
  11824	ng = rcu_dereference(p->numa_group);
  11825	for_each_online_node(node) {
  11826		if (p->numa_faults) {
  11827			tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
  11828			tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
  11829		}
  11830		if (ng) {
   11831			gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
  11832			gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
  11833		}
  11834		print_numa_stats(m, node, tsf, tpf, gsf, gpf);
  11835	}
  11836	rcu_read_unlock();
  11837}
  11838#endif /* CONFIG_NUMA_BALANCING */
  11839#endif /* CONFIG_SCHED_DEBUG */
  11840
  11841__init void init_sched_fair_class(void)
  11842{
  11843#ifdef CONFIG_SMP
  11844	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
  11845
  11846#ifdef CONFIG_NO_HZ_COMMON
  11847	nohz.next_balance = jiffies;
  11848	nohz.next_blocked = jiffies;
  11849	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
  11850#endif
  11851#endif /* SMP */
  11852
  11853}