cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

topology.c (68262B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Scheduler topology setup/handling methods
      4 */
      5
      6DEFINE_MUTEX(sched_domains_mutex);
      7
      8/* Protected by sched_domains_mutex: */
      9static cpumask_var_t sched_domains_tmpmask;
     10static cpumask_var_t sched_domains_tmpmask2;
     11
     12#ifdef CONFIG_SCHED_DEBUG
     13
     14static int __init sched_debug_setup(char *str)
     15{
     16	sched_debug_verbose = true;
     17
     18	return 0;
     19}
     20early_param("sched_verbose", sched_debug_setup);
     21
     22static inline bool sched_debug(void)
     23{
     24	return sched_debug_verbose;
     25}
     26
     27#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
     28const struct sd_flag_debug sd_flag_debug[] = {
     29#include <linux/sched/sd_flags.h>
     30};
     31#undef SD_FLAG
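       /*
        * Illustrative expansion of the construct above, assuming
        * <linux/sched/sd_flags.h> contains an entry such as:
        *
        *	SD_FLAG(SD_BALANCE_NEWIDLE, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
        *
        * The SD_FLAG() definition turns it into:
        *
        *	[__SD_BALANCE_NEWIDLE] = {
        *		.meta_flags = SDF_SHARED_CHILD | SDF_NEEDS_GROUPS,
        *		.name       = "SD_BALANCE_NEWIDLE",
        *	},
        *
        * so sd_flag_debug[] becomes a table of flag names and meta flags
        * indexed by flag bit number, as used by sched_domain_debug_one().
        */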
     32
     33static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
     34				  struct cpumask *groupmask)
     35{
     36	struct sched_group *group = sd->groups;
     37	unsigned long flags = sd->flags;
     38	unsigned int idx;
     39
     40	cpumask_clear(groupmask);
     41
     42	printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
     43	printk(KERN_CONT "span=%*pbl level=%s\n",
     44	       cpumask_pr_args(sched_domain_span(sd)), sd->name);
     45
     46	if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
     47		printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
     48	}
     49	if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) {
     50		printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
     51	}
     52
     53	for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
     54		unsigned int flag = BIT(idx);
     55		unsigned int meta_flags = sd_flag_debug[idx].meta_flags;
     56
     57		if ((meta_flags & SDF_SHARED_CHILD) && sd->child &&
     58		    !(sd->child->flags & flag))
     59			printk(KERN_ERR "ERROR: flag %s set here but not in child\n",
     60			       sd_flag_debug[idx].name);
     61
     62		if ((meta_flags & SDF_SHARED_PARENT) && sd->parent &&
     63		    !(sd->parent->flags & flag))
     64			printk(KERN_ERR "ERROR: flag %s set here but not in parent\n",
     65			       sd_flag_debug[idx].name);
     66	}
     67
     68	printk(KERN_DEBUG "%*s groups:", level + 1, "");
     69	do {
     70		if (!group) {
     71			printk("\n");
     72			printk(KERN_ERR "ERROR: group is NULL\n");
     73			break;
     74		}
     75
     76		if (cpumask_empty(sched_group_span(group))) {
     77			printk(KERN_CONT "\n");
     78			printk(KERN_ERR "ERROR: empty group\n");
     79			break;
     80		}
     81
     82		if (!(sd->flags & SD_OVERLAP) &&
     83		    cpumask_intersects(groupmask, sched_group_span(group))) {
     84			printk(KERN_CONT "\n");
     85			printk(KERN_ERR "ERROR: repeated CPUs\n");
     86			break;
     87		}
     88
     89		cpumask_or(groupmask, groupmask, sched_group_span(group));
     90
     91		printk(KERN_CONT " %d:{ span=%*pbl",
     92				group->sgc->id,
     93				cpumask_pr_args(sched_group_span(group)));
     94
     95		if ((sd->flags & SD_OVERLAP) &&
     96		    !cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
     97			printk(KERN_CONT " mask=%*pbl",
     98				cpumask_pr_args(group_balance_mask(group)));
     99		}
    100
    101		if (group->sgc->capacity != SCHED_CAPACITY_SCALE)
    102			printk(KERN_CONT " cap=%lu", group->sgc->capacity);
    103
    104		if (group == sd->groups && sd->child &&
    105		    !cpumask_equal(sched_domain_span(sd->child),
    106				   sched_group_span(group))) {
    107			printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n");
    108		}
    109
    110		printk(KERN_CONT " }");
    111
    112		group = group->next;
    113
    114		if (group != sd->groups)
    115			printk(KERN_CONT ",");
    116
    117	} while (group != sd->groups);
    118	printk(KERN_CONT "\n");
    119
    120	if (!cpumask_equal(sched_domain_span(sd), groupmask))
    121		printk(KERN_ERR "ERROR: groups don't span domain->span\n");
    122
    123	if (sd->parent &&
    124	    !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
    125		printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
    126	return 0;
    127}
    128
    129static void sched_domain_debug(struct sched_domain *sd, int cpu)
    130{
    131	int level = 0;
    132
    133	if (!sched_debug_verbose)
    134		return;
    135
    136	if (!sd) {
    137		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
    138		return;
    139	}
    140
    141	printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu);
    142
    143	for (;;) {
    144		if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
    145			break;
    146		level++;
    147		sd = sd->parent;
    148		if (!sd)
    149			break;
    150	}
    151}
    152#else /* !CONFIG_SCHED_DEBUG */
    153
    154# define sched_debug_verbose 0
    155# define sched_domain_debug(sd, cpu) do { } while (0)
    156static inline bool sched_debug(void)
    157{
    158	return false;
    159}
    160#endif /* CONFIG_SCHED_DEBUG */
    161
    162/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
    163#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) |
    164static const unsigned int SD_DEGENERATE_GROUPS_MASK =
    165#include <linux/sched/sd_flags.h>
    1660;
    167#undef SD_FLAG
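       /*
        * A sketch of the expansion, using the same example entry as above:
        * each SD_FLAG(name, mflags) line becomes
        *
        *	(SD_BALANCE_NEWIDLE * !!((SDF_SHARED_CHILD | SDF_NEEDS_GROUPS) & SDF_NEEDS_GROUPS)) |
        *
        * i.e. the flag value itself when SDF_NEEDS_GROUPS is set and 0
        * otherwise, with the trailing "0;" terminating the OR chain, so
        * SD_DEGENERATE_GROUPS_MASK collects every flag that needs groups.
        */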
    168
    169static int sd_degenerate(struct sched_domain *sd)
    170{
    171	if (cpumask_weight(sched_domain_span(sd)) == 1)
    172		return 1;
    173
    174	/* Following flags need at least 2 groups */
    175	if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) &&
    176	    (sd->groups != sd->groups->next))
    177		return 0;
    178
    179	/* Following flags don't use groups */
    180	if (sd->flags & (SD_WAKE_AFFINE))
    181		return 0;
    182
    183	return 1;
    184}
    185
    186static int
    187sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
    188{
    189	unsigned long cflags = sd->flags, pflags = parent->flags;
    190
    191	if (sd_degenerate(parent))
    192		return 1;
    193
    194	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
    195		return 0;
    196
    197	/* Flags needing groups don't count if only 1 group in parent */
    198	if (parent->groups == parent->groups->next)
    199		pflags &= ~SD_DEGENERATE_GROUPS_MASK;
    200
    201	if (~cflags & pflags)
    202		return 0;
    203
    204	return 1;
    205}
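       /*
        * Example (a sketch): on a machine without SMT, each SMT-level domain
        * from default_topology spans a single CPU, so
        * cpumask_weight(sched_domain_span(sd)) == 1 and sd_degenerate()
        * reports it as degenerate. Likewise, a parent that spans exactly the
        * same CPUs as its child and ends up with a single group has all
        * SD_DEGENERATE_GROUPS_MASK bits ignored and, if it sets no flags the
        * child lacks, sd_parent_degenerate() lets cpu_attach_domain() remove it.
        */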
    206
    207#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
    208DEFINE_STATIC_KEY_FALSE(sched_energy_present);
    209static unsigned int sysctl_sched_energy_aware = 1;
    210DEFINE_MUTEX(sched_energy_mutex);
    211bool sched_energy_update;
    212
    213void rebuild_sched_domains_energy(void)
    214{
    215	mutex_lock(&sched_energy_mutex);
    216	sched_energy_update = true;
    217	rebuild_sched_domains();
    218	sched_energy_update = false;
    219	mutex_unlock(&sched_energy_mutex);
    220}
    221
    222#ifdef CONFIG_PROC_SYSCTL
    223static int sched_energy_aware_handler(struct ctl_table *table, int write,
    224		void *buffer, size_t *lenp, loff_t *ppos)
    225{
    226	int ret, state;
    227
    228	if (write && !capable(CAP_SYS_ADMIN))
    229		return -EPERM;
    230
    231	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
    232	if (!ret && write) {
    233		state = static_branch_unlikely(&sched_energy_present);
    234		if (state != sysctl_sched_energy_aware)
    235			rebuild_sched_domains_energy();
    236	}
    237
    238	return ret;
    239}
    240
    241static struct ctl_table sched_energy_aware_sysctls[] = {
    242	{
    243		.procname       = "sched_energy_aware",
    244		.data           = &sysctl_sched_energy_aware,
    245		.maxlen         = sizeof(unsigned int),
    246		.mode           = 0644,
    247		.proc_handler   = sched_energy_aware_handler,
    248		.extra1         = SYSCTL_ZERO,
    249		.extra2         = SYSCTL_ONE,
    250	},
    251	{}
    252};
    253
    254static int __init sched_energy_aware_sysctl_init(void)
    255{
    256	register_sysctl_init("kernel", sched_energy_aware_sysctls);
    257	return 0;
    258}
    259
    260late_initcall(sched_energy_aware_sysctl_init);
    261#endif
    262
    263static void free_pd(struct perf_domain *pd)
    264{
    265	struct perf_domain *tmp;
    266
    267	while (pd) {
    268		tmp = pd->next;
    269		kfree(pd);
    270		pd = tmp;
    271	}
    272}
    273
    274static struct perf_domain *find_pd(struct perf_domain *pd, int cpu)
    275{
    276	while (pd) {
    277		if (cpumask_test_cpu(cpu, perf_domain_span(pd)))
    278			return pd;
    279		pd = pd->next;
    280	}
    281
    282	return NULL;
    283}
    284
    285static struct perf_domain *pd_init(int cpu)
    286{
    287	struct em_perf_domain *obj = em_cpu_get(cpu);
    288	struct perf_domain *pd;
    289
    290	if (!obj) {
    291		if (sched_debug())
    292			pr_info("%s: no EM found for CPU%d\n", __func__, cpu);
    293		return NULL;
    294	}
    295
    296	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
    297	if (!pd)
    298		return NULL;
    299	pd->em_pd = obj;
    300
    301	return pd;
    302}
    303
    304static void perf_domain_debug(const struct cpumask *cpu_map,
    305						struct perf_domain *pd)
    306{
    307	if (!sched_debug() || !pd)
    308		return;
    309
    310	printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map));
    311
    312	while (pd) {
    313		printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }",
    314				cpumask_first(perf_domain_span(pd)),
    315				cpumask_pr_args(perf_domain_span(pd)),
    316				em_pd_nr_perf_states(pd->em_pd));
    317		pd = pd->next;
    318	}
    319
    320	printk(KERN_CONT "\n");
    321}
    322
    323static void destroy_perf_domain_rcu(struct rcu_head *rp)
    324{
    325	struct perf_domain *pd;
    326
    327	pd = container_of(rp, struct perf_domain, rcu);
    328	free_pd(pd);
    329}
    330
    331static void sched_energy_set(bool has_eas)
    332{
    333	if (!has_eas && static_branch_unlikely(&sched_energy_present)) {
    334		if (sched_debug())
    335			pr_info("%s: stopping EAS\n", __func__);
    336		static_branch_disable_cpuslocked(&sched_energy_present);
    337	} else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
    338		if (sched_debug())
    339			pr_info("%s: starting EAS\n", __func__);
    340		static_branch_enable_cpuslocked(&sched_energy_present);
    341	}
    342}
    343
    344/*
    345 * EAS can be used on a root domain if it meets all the following conditions:
    346 *    1. an Energy Model (EM) is available;
    347 *    2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy;
    348 *    3. no SMT is detected;
    349 *    4. the EM complexity is low enough to keep scheduling overheads low;
    350 *    5. schedutil is driving the frequency of all CPUs of the rd;
    351 *    6. frequency invariance support is present.
    352 *
    353 * The complexity of the Energy Model is defined as:
    354 *
    355 *              C = nr_pd * (nr_cpus + nr_ps)
    356 *
    357 * with parameters defined as:
    358 *  - nr_pd:    the number of performance domains
    359 *  - nr_cpus:  the number of CPUs
    360 *  - nr_ps:    the sum of the number of performance states of all performance
    361 *              domains (for example, on a system with 2 performance domains,
    362 *              with 10 performance states each, nr_ps = 2 * 10 = 20).
    363 *
    364 * It is generally not a good idea to use such a model in the wake-up path on
    365 * very complex platforms because of the associated scheduling overheads. The
    366 * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs
    367 * with per-CPU DVFS and less than 8 performance states each, for example.
    368 */
    369#define EM_MAX_COMPLEXITY 2048
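       /*
        * Worked example of the complexity check in build_perf_domains()
        * below (numbers chosen for illustration): with per-CPU DVFS on 16
        * CPUs and 7 performance states per domain, nr_pd = 16, nr_cpus = 16
        * and nr_ps = 16 * 7 = 112, so
        *
        *	C = nr_pd * (nr_cpus + nr_ps) = 16 * (16 + 112) = 2048
        *
        * which does not exceed EM_MAX_COMPLEXITY and EAS may be enabled.
        * With 8 states per domain, C = 16 * (16 + 128) = 2304 and the
        * check fails, matching the "less than 8" bound mentioned above.
        */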
    370
    371extern struct cpufreq_governor schedutil_gov;
    372static bool build_perf_domains(const struct cpumask *cpu_map)
    373{
    374	int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map);
    375	struct perf_domain *pd = NULL, *tmp;
    376	int cpu = cpumask_first(cpu_map);
    377	struct root_domain *rd = cpu_rq(cpu)->rd;
    378	struct cpufreq_policy *policy;
    379	struct cpufreq_governor *gov;
    380
    381	if (!sysctl_sched_energy_aware)
    382		goto free;
    383
    384	/* EAS is enabled for asymmetric CPU capacity topologies. */
    385	if (!per_cpu(sd_asym_cpucapacity, cpu)) {
    386		if (sched_debug()) {
    387			pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
    388					cpumask_pr_args(cpu_map));
    389		}
    390		goto free;
    391	}
    392
    393	/* EAS definitely does *not* handle SMT */
    394	if (sched_smt_active()) {
    395		pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n",
    396			cpumask_pr_args(cpu_map));
    397		goto free;
    398	}
    399
    400	if (!arch_scale_freq_invariant()) {
    401		if (sched_debug()) {
    402			pr_warn("rd %*pbl: Disabling EAS: frequency-invariant load tracking not yet supported",
    403				cpumask_pr_args(cpu_map));
    404		}
    405		goto free;
    406	}
    407
    408	for_each_cpu(i, cpu_map) {
    409		/* Skip already covered CPUs. */
    410		if (find_pd(pd, i))
    411			continue;
    412
    413		/* Do not attempt EAS if schedutil is not being used. */
    414		policy = cpufreq_cpu_get(i);
    415		if (!policy)
    416			goto free;
    417		gov = policy->governor;
    418		cpufreq_cpu_put(policy);
    419		if (gov != &schedutil_gov) {
    420			if (rd->pd)
    421				pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n",
    422						cpumask_pr_args(cpu_map));
    423			goto free;
    424		}
    425
    426		/* Create the new pd and add it to the local list. */
    427		tmp = pd_init(i);
    428		if (!tmp)
    429			goto free;
    430		tmp->next = pd;
    431		pd = tmp;
    432
    433		/*
    434		 * Count performance domains and performance states for the
    435		 * complexity check.
    436		 */
    437		nr_pd++;
    438		nr_ps += em_pd_nr_perf_states(pd->em_pd);
    439	}
    440
    441	/* Bail out if the Energy Model complexity is too high. */
    442	if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) {
    443		WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n",
    444						cpumask_pr_args(cpu_map));
    445		goto free;
    446	}
    447
    448	perf_domain_debug(cpu_map, pd);
    449
    450	/* Attach the new list of performance domains to the root domain. */
    451	tmp = rd->pd;
    452	rcu_assign_pointer(rd->pd, pd);
    453	if (tmp)
    454		call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
    455
    456	return !!pd;
    457
    458free:
    459	free_pd(pd);
    460	tmp = rd->pd;
    461	rcu_assign_pointer(rd->pd, NULL);
    462	if (tmp)
    463		call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
    464
    465	return false;
    466}
    467#else
    468static void free_pd(struct perf_domain *pd) { }
    469#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
    470
    471static void free_rootdomain(struct rcu_head *rcu)
    472{
    473	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
    474
    475	cpupri_cleanup(&rd->cpupri);
    476	cpudl_cleanup(&rd->cpudl);
    477	free_cpumask_var(rd->dlo_mask);
    478	free_cpumask_var(rd->rto_mask);
    479	free_cpumask_var(rd->online);
    480	free_cpumask_var(rd->span);
    481	free_pd(rd->pd);
    482	kfree(rd);
    483}
    484
    485void rq_attach_root(struct rq *rq, struct root_domain *rd)
    486{
    487	struct root_domain *old_rd = NULL;
    488	unsigned long flags;
    489
    490	raw_spin_rq_lock_irqsave(rq, flags);
    491
    492	if (rq->rd) {
    493		old_rd = rq->rd;
    494
    495		if (cpumask_test_cpu(rq->cpu, old_rd->online))
    496			set_rq_offline(rq);
    497
    498		cpumask_clear_cpu(rq->cpu, old_rd->span);
    499
    500		/*
    501		 * If we don't want to free the old_rd yet then
    502		 * set old_rd to NULL to skip the freeing later
    503		 * in this function:
    504		 */
    505		if (!atomic_dec_and_test(&old_rd->refcount))
    506			old_rd = NULL;
    507	}
    508
    509	atomic_inc(&rd->refcount);
    510	rq->rd = rd;
    511
    512	cpumask_set_cpu(rq->cpu, rd->span);
    513	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
    514		set_rq_online(rq);
    515
    516	raw_spin_rq_unlock_irqrestore(rq, flags);
    517
    518	if (old_rd)
    519		call_rcu(&old_rd->rcu, free_rootdomain);
    520}
    521
    522void sched_get_rd(struct root_domain *rd)
    523{
    524	atomic_inc(&rd->refcount);
    525}
    526
    527void sched_put_rd(struct root_domain *rd)
    528{
    529	if (!atomic_dec_and_test(&rd->refcount))
    530		return;
    531
    532	call_rcu(&rd->rcu, free_rootdomain);
    533}
    534
    535static int init_rootdomain(struct root_domain *rd)
    536{
    537	if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
    538		goto out;
    539	if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
    540		goto free_span;
    541	if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
    542		goto free_online;
    543	if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
    544		goto free_dlo_mask;
    545
    546#ifdef HAVE_RT_PUSH_IPI
    547	rd->rto_cpu = -1;
    548	raw_spin_lock_init(&rd->rto_lock);
    549	rd->rto_push_work = IRQ_WORK_INIT_HARD(rto_push_irq_work_func);
    550#endif
    551
    552	rd->visit_gen = 0;
    553	init_dl_bw(&rd->dl_bw);
    554	if (cpudl_init(&rd->cpudl) != 0)
    555		goto free_rto_mask;
    556
    557	if (cpupri_init(&rd->cpupri) != 0)
    558		goto free_cpudl;
    559	return 0;
    560
    561free_cpudl:
    562	cpudl_cleanup(&rd->cpudl);
    563free_rto_mask:
    564	free_cpumask_var(rd->rto_mask);
    565free_dlo_mask:
    566	free_cpumask_var(rd->dlo_mask);
    567free_online:
    568	free_cpumask_var(rd->online);
    569free_span:
    570	free_cpumask_var(rd->span);
    571out:
    572	return -ENOMEM;
    573}
    574
    575/*
    576 * By default the system creates a single root-domain with all CPUs as
    577 * members (mimicking the global state we have today).
    578 */
    579struct root_domain def_root_domain;
    580
    581void init_defrootdomain(void)
    582{
    583	init_rootdomain(&def_root_domain);
    584
    585	atomic_set(&def_root_domain.refcount, 1);
    586}
    587
    588static struct root_domain *alloc_rootdomain(void)
    589{
    590	struct root_domain *rd;
    591
    592	rd = kzalloc(sizeof(*rd), GFP_KERNEL);
    593	if (!rd)
    594		return NULL;
    595
    596	if (init_rootdomain(rd) != 0) {
    597		kfree(rd);
    598		return NULL;
    599	}
    600
    601	return rd;
    602}
    603
    604static void free_sched_groups(struct sched_group *sg, int free_sgc)
    605{
    606	struct sched_group *tmp, *first;
    607
    608	if (!sg)
    609		return;
    610
    611	first = sg;
    612	do {
    613		tmp = sg->next;
    614
    615		if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
    616			kfree(sg->sgc);
    617
    618		if (atomic_dec_and_test(&sg->ref))
    619			kfree(sg);
    620		sg = tmp;
    621	} while (sg != first);
    622}
    623
    624static void destroy_sched_domain(struct sched_domain *sd)
    625{
    626	/*
    627	 * A normal sched domain may have multiple group references; an
    628	 * overlapping domain, having private groups, has only one. Iterate,
    629	 * dropping group/capacity references and freeing where none remain.
    630	 */
    631	free_sched_groups(sd->groups, 1);
    632
    633	if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
    634		kfree(sd->shared);
    635	kfree(sd);
    636}
    637
    638static void destroy_sched_domains_rcu(struct rcu_head *rcu)
    639{
    640	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
    641
    642	while (sd) {
    643		struct sched_domain *parent = sd->parent;
    644		destroy_sched_domain(sd);
    645		sd = parent;
    646	}
    647}
    648
    649static void destroy_sched_domains(struct sched_domain *sd)
    650{
    651	if (sd)
    652		call_rcu(&sd->rcu, destroy_sched_domains_rcu);
    653}
    654
    655/*
    656 * Keep a special pointer to the highest sched_domain that has
    657 * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain), as this
    658 * allows us to avoid some pointer chasing in select_idle_sibling().
    659 *
    660 * Also keep a unique ID per domain (we use the first CPU number in
    661 * the cpumask of the domain), this allows us to quickly tell if
    662 * two CPUs are in the same cache domain, see cpus_share_cache().
    663 */
    664DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
    665DEFINE_PER_CPU(int, sd_llc_size);
    666DEFINE_PER_CPU(int, sd_llc_id);
    667DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
    668DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
    669DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
    670DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
    671DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
    672
    673static void update_top_cache_domain(int cpu)
    674{
    675	struct sched_domain_shared *sds = NULL;
    676	struct sched_domain *sd;
    677	int id = cpu;
    678	int size = 1;
    679
    680	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
    681	if (sd) {
    682		id = cpumask_first(sched_domain_span(sd));
    683		size = cpumask_weight(sched_domain_span(sd));
    684		sds = sd->shared;
    685	}
    686
    687	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
    688	per_cpu(sd_llc_size, cpu) = size;
    689	per_cpu(sd_llc_id, cpu) = id;
    690	rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
    691
    692	sd = lowest_flag_domain(cpu, SD_NUMA);
    693	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
    694
    695	sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
    696	rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
    697
    698	sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
    699	rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
    700}
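       /*
        * For example (CPU numbering assumed): if CPUs 0-3 share their last
        * level cache, the highest SD_SHARE_PKG_RESOURCES domain of each of
        * them spans 0-3, so after update_top_cache_domain() all four CPUs
        * have sd_llc_id == 0 and sd_llc_size == 4; cpus_share_cache() can
        * then simply compare two sd_llc_id values instead of walking the
        * domain tree.
        */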
    701
    702/*
    703 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
    704 * hold the hotplug lock.
    705 */
    706static void
    707cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
    708{
    709	struct rq *rq = cpu_rq(cpu);
    710	struct sched_domain *tmp;
    711
    712	/* Remove the sched domains which do not contribute to scheduling. */
    713	for (tmp = sd; tmp; ) {
    714		struct sched_domain *parent = tmp->parent;
    715		if (!parent)
    716			break;
    717
    718		if (sd_parent_degenerate(tmp, parent)) {
    719			tmp->parent = parent->parent;
    720			if (parent->parent)
    721				parent->parent->child = tmp;
    722			/*
    723			 * Transfer SD_PREFER_SIBLING down in case of a
    724			 * degenerate parent; the spans match for this
    725			 * so the property transfers.
    726			 */
    727			if (parent->flags & SD_PREFER_SIBLING)
    728				tmp->flags |= SD_PREFER_SIBLING;
    729			destroy_sched_domain(parent);
    730		} else
    731			tmp = tmp->parent;
    732	}
    733
    734	if (sd && sd_degenerate(sd)) {
    735		tmp = sd;
    736		sd = sd->parent;
    737		destroy_sched_domain(tmp);
    738		if (sd) {
    739			struct sched_group *sg = sd->groups;
    740
    741			/*
    742			 * sched groups hold the flags of the child sched
    743			 * domain for convenience. Clear such flags since
    744			 * the child is being destroyed.
    745			 */
    746			do {
    747				sg->flags = 0;
    748			} while (sg != sd->groups);
    749
    750			sd->child = NULL;
    751		}
    752	}
    753
    754	sched_domain_debug(sd, cpu);
    755
    756	rq_attach_root(rq, rd);
    757	tmp = rq->sd;
    758	rcu_assign_pointer(rq->sd, sd);
    759	dirty_sched_domain_sysctl(cpu);
    760	destroy_sched_domains(tmp);
    761
    762	update_top_cache_domain(cpu);
    763}
    764
    765struct s_data {
    766	struct sched_domain * __percpu *sd;
    767	struct root_domain	*rd;
    768};
    769
    770enum s_alloc {
    771	sa_rootdomain,
    772	sa_sd,
    773	sa_sd_storage,
    774	sa_none,
    775};
    776
    777/*
    778 * Return the canonical balance CPU for this group; this is the first CPU
    779 * of this group that's also in the balance mask.
    780 *
    781 * The balance mask contains all those CPUs that could actually end up at this
    782 * group. See build_balance_mask().
    783 *
    784 * Also see should_we_balance().
    785 */
    786int group_balance_cpu(struct sched_group *sg)
    787{
    788	return cpumask_first(group_balance_mask(sg));
    789}
    790
    791
    792/*
    793 * NUMA topology (first read the regular topology blurb below)
    794 *
    795 * Given a node-distance table, for example:
    796 *
    797 *   node   0   1   2   3
    798 *     0:  10  20  30  20
    799 *     1:  20  10  20  30
    800 *     2:  30  20  10  20
    801 *     3:  20  30  20  10
    802 *
    803 * which represents a 4 node ring topology like:
    804 *
    805 *   0 ----- 1
    806 *   |       |
    807 *   |       |
    808 *   |       |
    809 *   3 ----- 2
    810 *
    811 * We want to construct domains and groups to represent this. The way we go
    812 * about doing this is to build the domains on 'hops'. For each NUMA level we
    813 * construct the mask of all nodes reachable in @level hops.
    814 *
    815 * For the above NUMA topology that gives 3 levels:
    816 *
    817 * NUMA-2	0-3		0-3		0-3		0-3
    818 *  groups:	{0-1,3},{1-3}	{0-2},{0,2-3}	{1-3},{0-1,3}	{0,2-3},{0-2}
    819 *
    820 * NUMA-1	0-1,3		0-2		1-3		0,2-3
    821 *  groups:	{0},{1},{3}	{0},{1},{2}	{1},{2},{3}	{0},{2},{3}
    822 *
    823 * NUMA-0	0		1		2		3
    824 *
    825 *
    826 * As can be seen, things don't nicely line up as with the regular topology.
    827 * When we iterate a domain in child domain chunks some nodes can be
    828 * represented multiple times -- hence the "overlap" naming for this part of
    829 * the topology.
    830 *
    831 * In order to minimize this overlap, we only build enough groups to cover the
    832 * domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3.
    833 *
    834 * Because:
    835 *
    836 *  - the first group of each domain is its child domain; this
    837 *    gets us the first 0-1,3
    838 *  - the only uncovered node is 2, whose child domain is 1-3.
    839 *
    840 * However, because of the overlap, computing a unique CPU for each group is
    841 * more complicated. Consider for instance the groups of NODE-1 NUMA-2, both
    842 * groups include the CPUs of Node-0, while those CPUs would not in fact ever
    843 * end up at those groups (they would end up in group: 0-1,3).
    844 *
    845 * To correct this we have to introduce the group balance mask. This mask
    846 * will contain those CPUs in the group that can reach this group given the
    847 * (child) domain tree.
    848 *
    849 * With this we can once again compute balance_cpu and sched_group_capacity
    850 * relations.
    851 *
    852 * XXX include words on how balance_cpu is unique and therefore can be
    853 * used for sched_group_capacity links.
    854 *
    855 *
    856 * Another 'interesting' topology is:
    857 *
    858 *   node   0   1   2   3
    859 *     0:  10  20  20  30
    860 *     1:  20  10  20  20
    861 *     2:  20  20  10  20
    862 *     3:  30  20  20  10
    863 *
    864 * Which looks a little like:
    865 *
    866 *   0 ----- 1
    867 *   |     / |
    868 *   |   /   |
    869 *   | /     |
    870 *   2 ----- 3
    871 *
    872 * This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3
    873 * are not.
    874 *
    875 * This leads to a few particularly weird cases where the number of
    876 * sched_domains is not the same for each CPU. Consider:
    877 *
    878 * NUMA-2	0-3						0-3
    879 *  groups:	{0-2},{1-3}					{1-3},{0-2}
    880 *
    881 * NUMA-1	0-2		0-3		0-3		1-3
    882 *
    883 * NUMA-0	0		1		2		3
    884 *
    885 */
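       /*
        * In mask form (a sketch for the 4-node ring above, node indices as
        * in the distance table): the per-level masks later built by
        * sched_init_numa() in sched_domains_numa_masks[level][node] are
        *
        *	level 0 (distance 10): node 0 -> CPUs of node  0
        *	level 1 (distance 20): node 0 -> CPUs of nodes 0,1,3
        *	level 2 (distance 30): node 0 -> CPUs of nodes 0-3
        *
        * which is exactly the NUMA-0/1/2 column for node 0 above.
        */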
    886
    887
    888/*
    889 * Build the balance mask; it contains only those CPUs that can arrive at this
    890 * group and should be considered to continue balancing.
    891 *
    892 * We do this during the group creation pass, so the group information
    893 * isn't complete yet. However, since each group represents a (child) domain, we
    894 * can fully construct this using the sched_domain bits (which are already
    895 * complete).
    896 */
    897static void
    898build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask)
    899{
    900	const struct cpumask *sg_span = sched_group_span(sg);
    901	struct sd_data *sdd = sd->private;
    902	struct sched_domain *sibling;
    903	int i;
    904
    905	cpumask_clear(mask);
    906
    907	for_each_cpu(i, sg_span) {
    908		sibling = *per_cpu_ptr(sdd->sd, i);
    909
    910		/*
    911		 * Can happen in the asymmetric case, where these siblings are
    912		 * unused. The mask will not be empty because those CPUs that
    913		 * do have the top domain _should_ span the domain.
    914		 */
    915		if (!sibling->child)
    916			continue;
    917
    918		/* If we would not end up here, we can't continue from here */
    919		if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
    920			continue;
    921
    922		cpumask_set_cpu(i, mask);
    923	}
    924
    925	/* We must not have empty masks here */
    926	WARN_ON_ONCE(cpumask_empty(mask));
    927}
    928
    929/*
    930 * XXX: This creates per-node group entries; since the load-balancer will
    931 * immediately access remote memory to construct this group's load-balance
    932 * statistics, having the groups node local is of dubious benefit.
    933 */
    934static struct sched_group *
    935build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
    936{
    937	struct sched_group *sg;
    938	struct cpumask *sg_span;
    939
    940	sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
    941			GFP_KERNEL, cpu_to_node(cpu));
    942
    943	if (!sg)
    944		return NULL;
    945
    946	sg_span = sched_group_span(sg);
    947	if (sd->child) {
    948		cpumask_copy(sg_span, sched_domain_span(sd->child));
    949		sg->flags = sd->child->flags;
    950	} else {
    951		cpumask_copy(sg_span, sched_domain_span(sd));
    952	}
    953
    954	atomic_inc(&sg->ref);
    955	return sg;
    956}
    957
    958static void init_overlap_sched_group(struct sched_domain *sd,
    959				     struct sched_group *sg)
    960{
    961	struct cpumask *mask = sched_domains_tmpmask2;
    962	struct sd_data *sdd = sd->private;
    963	struct cpumask *sg_span;
    964	int cpu;
    965
    966	build_balance_mask(sd, sg, mask);
    967	cpu = cpumask_first(mask);
    968
    969	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
    970	if (atomic_inc_return(&sg->sgc->ref) == 1)
    971		cpumask_copy(group_balance_mask(sg), mask);
    972	else
    973		WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask));
    974
    975	/*
    976	 * Initialize sgc->capacity such that even if we mess up the
    977	 * domains and no possible iteration will get us here, we won't
    978	 * die on a /0 trap.
    979	 */
    980	sg_span = sched_group_span(sg);
    981	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
    982	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
    983	sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
    984}
    985
    986static struct sched_domain *
    987find_descended_sibling(struct sched_domain *sd, struct sched_domain *sibling)
    988{
    989	/*
    990	 * The proper descendant would be the one whose child won't span out
    991	 * of sd
    992	 */
    993	while (sibling->child &&
    994	       !cpumask_subset(sched_domain_span(sibling->child),
    995			       sched_domain_span(sd)))
    996		sibling = sibling->child;
    997
    998	/*
    999	 * As we are referencing sgc across different topology levels, we need
   1000	 * to go down to skip those sched_domains which don't contribute to
   1001	 * scheduling because they will be degenerated in cpu_attach_domain().
   1002	 */
   1003	while (sibling->child &&
   1004	       cpumask_equal(sched_domain_span(sibling->child),
   1005			     sched_domain_span(sibling)))
   1006		sibling = sibling->child;
   1007
   1008	return sibling;
   1009}
   1010
   1011static int
   1012build_overlap_sched_groups(struct sched_domain *sd, int cpu)
   1013{
   1014	struct sched_group *first = NULL, *last = NULL, *sg;
   1015	const struct cpumask *span = sched_domain_span(sd);
   1016	struct cpumask *covered = sched_domains_tmpmask;
   1017	struct sd_data *sdd = sd->private;
   1018	struct sched_domain *sibling;
   1019	int i;
   1020
   1021	cpumask_clear(covered);
   1022
   1023	for_each_cpu_wrap(i, span, cpu) {
   1024		struct cpumask *sg_span;
   1025
   1026		if (cpumask_test_cpu(i, covered))
   1027			continue;
   1028
   1029		sibling = *per_cpu_ptr(sdd->sd, i);
   1030
   1031		/*
   1032		 * Asymmetric node setups can result in situations where the
   1033		 * domain tree is of unequal depth; make sure to skip domains
   1034		 * that already cover the entire range.
   1035		 *
   1036		 * In that case build_sched_domains() will have terminated the
   1037		 * iteration early and our sibling sd spans will be empty.
   1038		 * Domains should always include the CPU they're built on, so
   1039		 * check that.
   1040		 */
   1041		if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
   1042			continue;
   1043
   1044		/*
   1045		 * Usually we build the sched_group from the sibling's child sched_domain.
   1046		 * But for machines whose NUMA diameter is 3 or above, we move
   1047		 * to build sched_group by sibling's proper descendant's child
   1048		 * domain because sibling's child sched_domain will span out of
   1049		 * the sched_domain being built as below.
   1050		 *
   1051		 * Smallest diameter=3 topology is:
   1052		 *
   1053		 *   node   0   1   2   3
   1054		 *     0:  10  20  30  40
   1055		 *     1:  20  10  20  30
   1056		 *     2:  30  20  10  20
   1057		 *     3:  40  30  20  10
   1058		 *
   1059		 *   0 --- 1 --- 2 --- 3
   1060		 *
   1061		 * NUMA-3       0-3             N/A             N/A             0-3
   1062		 *  groups:     {0-2},{1-3}                                     {1-3},{0-2}
   1063		 *
   1064		 * NUMA-2       0-2             0-3             0-3             1-3
   1065		 *  groups:     {0-1},{1-3}     {0-2},{2-3}     {1-3},{0-1}     {2-3},{0-2}
   1066		 *
   1067		 * NUMA-1       0-1             0-2             1-3             2-3
   1068		 *  groups:     {0},{1}         {1},{2},{0}     {2},{3},{1}     {3},{2}
   1069		 *
   1070		 * NUMA-0       0               1               2               3
   1071		 *
   1072		 * The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the
   1073		 * group span isn't a subset of the domain span.
   1074		 */
   1075		if (sibling->child &&
   1076		    !cpumask_subset(sched_domain_span(sibling->child), span))
   1077			sibling = find_descended_sibling(sd, sibling);
   1078
   1079		sg = build_group_from_child_sched_domain(sibling, cpu);
   1080		if (!sg)
   1081			goto fail;
   1082
   1083		sg_span = sched_group_span(sg);
   1084		cpumask_or(covered, covered, sg_span);
   1085
   1086		init_overlap_sched_group(sibling, sg);
   1087
   1088		if (!first)
   1089			first = sg;
   1090		if (last)
   1091			last->next = sg;
   1092		last = sg;
   1093		last->next = first;
   1094	}
   1095	sd->groups = first;
   1096
   1097	return 0;
   1098
   1099fail:
   1100	free_sched_groups(first, 0);
   1101
   1102	return -ENOMEM;
   1103}
   1104
   1105
   1106/*
   1107 * Package topology (also see the load-balance blurb in fair.c)
   1108 *
   1109 * The scheduler builds a tree structure to represent a number of important
   1110 * topology features. By default (default_topology[]) these include:
   1111 *
   1112 *  - Simultaneous multithreading (SMT)
   1113 *  - Multi-Core Cache (MC)
   1114 *  - Package (DIE)
   1115 *
   1116 * Where the last one more or less denotes everything up to a NUMA node.
   1117 *
   1118 * The tree consists of 3 primary data structures:
   1119 *
   1120 *	sched_domain -> sched_group -> sched_group_capacity
   1121 *	    ^ ^             ^ ^
   1122 *          `-'             `-'
   1123 *
   1124 * The sched_domains are per-CPU and have a two way link (parent & child) and
   1125 * denote the ever growing mask of CPUs belonging to that level of topology.
   1126 *
   1127 * Each sched_domain has a circular (double) linked list of sched_group's, each
   1128 * denoting the domains of the level below (or individual CPUs in case of the
   1129 * first domain level). The sched_group linked by a sched_domain includes the
   1130 * CPU of that sched_domain [*].
   1131 *
   1132 * Take for instance a 2 threaded, 2 core, 2 cache cluster part:
   1133 *
   1134 * CPU   0   1   2   3   4   5   6   7
   1135 *
   1136 * DIE  [                             ]
   1137 * MC   [             ] [             ]
   1138 * SMT  [     ] [     ] [     ] [     ]
   1139 *
   1140 *  - or -
   1141 *
   1142 * DIE  0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
   1143 * MC	0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
   1144 * SMT  0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
   1145 *
   1146 * CPU   0   1   2   3   4   5   6   7
   1147 *
   1148 * One way to think about it is: sched_domain moves you up and down among these
   1149 * topology levels, while sched_group moves you sideways through it, at child
   1150 * domain granularity.
   1151 *
   1152 * sched_group_capacity ensures each unique sched_group has shared storage.
   1153 *
   1154 * There are two related construction problems, both of which require a CPU
   1155 * that uniquely identifies each group (for a given domain):
   1156 *
   1157 *  - The first is the balance_cpu (see should_we_balance() and the
   1158 *    load-balance blurb in fair.c); for each group we only want 1 CPU to
   1159 *    continue balancing at a higher domain.
   1160 *
   1161 *  - The second is the sched_group_capacity; we want all identical groups
   1162 *    to share a single sched_group_capacity.
   1163 *
   1164 * These topologies are exclusive by construction: it is impossible
   1165 * for an SMT thread to belong to multiple cores, and for cores to
   1166 * be part of multiple caches. There is a very clear and unique location
   1167 * for each CPU in the hierarchy.
   1168 *
   1169 * Therefore computing a unique CPU for each group is trivial (the iteration
   1170 * mask is redundant and set to all 1s; all CPUs in a group will end up at _that_
   1171 * group); we can simply pick the first CPU in each group.
   1172 *
   1173 *
   1174 * [*] in other words, the first group of each domain is its child domain.
   1175 */
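       /*
        * The circular group list described above is walked the same way
        * throughout this file (see e.g. init_sched_groups_capacity()):
        *
        *	struct sched_group *sg = sd->groups;
        *	do {
        *		...
        *		sg = sg->next;
        *	} while (sg != sd->groups);
        *
        * i.e. starting from the group that contains the domain's own CPU
        * and going sideways until we come back around to it.
        */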
   1176
   1177static struct sched_group *get_group(int cpu, struct sd_data *sdd)
   1178{
   1179	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
   1180	struct sched_domain *child = sd->child;
   1181	struct sched_group *sg;
   1182	bool already_visited;
   1183
   1184	if (child)
   1185		cpu = cpumask_first(sched_domain_span(child));
   1186
   1187	sg = *per_cpu_ptr(sdd->sg, cpu);
   1188	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
   1189
   1190	/* Increase refcounts for claim_allocations: */
   1191	already_visited = atomic_inc_return(&sg->ref) > 1;
   1192	/* sgc visits should follow a similar trend as sg */
   1193	WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1));
   1194
   1195	/* If we have already visited that group, it's already initialized. */
   1196	if (already_visited)
   1197		return sg;
   1198
   1199	if (child) {
   1200		cpumask_copy(sched_group_span(sg), sched_domain_span(child));
   1201		cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
   1202		sg->flags = child->flags;
   1203	} else {
   1204		cpumask_set_cpu(cpu, sched_group_span(sg));
   1205		cpumask_set_cpu(cpu, group_balance_mask(sg));
   1206	}
   1207
   1208	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
   1209	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
   1210	sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
   1211
   1212	return sg;
   1213}
   1214
   1215/*
   1216 * build_sched_groups will build a circular linked list of the groups
   1217 * covered by the given span, will set each group's ->cpumask correctly,
   1218 * and will initialize their ->sgc.
   1219 *
   1220 * Assumes the sched_domain tree is fully constructed
   1221 */
   1222static int
   1223build_sched_groups(struct sched_domain *sd, int cpu)
   1224{
   1225	struct sched_group *first = NULL, *last = NULL;
   1226	struct sd_data *sdd = sd->private;
   1227	const struct cpumask *span = sched_domain_span(sd);
   1228	struct cpumask *covered;
   1229	int i;
   1230
   1231	lockdep_assert_held(&sched_domains_mutex);
   1232	covered = sched_domains_tmpmask;
   1233
   1234	cpumask_clear(covered);
   1235
   1236	for_each_cpu_wrap(i, span, cpu) {
   1237		struct sched_group *sg;
   1238
   1239		if (cpumask_test_cpu(i, covered))
   1240			continue;
   1241
   1242		sg = get_group(i, sdd);
   1243
   1244		cpumask_or(covered, covered, sched_group_span(sg));
   1245
   1246		if (!first)
   1247			first = sg;
   1248		if (last)
   1249			last->next = sg;
   1250		last = sg;
   1251	}
   1252	last->next = first;
   1253	sd->groups = first;
   1254
   1255	return 0;
   1256}
   1257
   1258/*
   1259 * Initialize sched groups cpu_capacity.
   1260 *
   1261 * cpu_capacity indicates the capacity of a sched group, which is used while
   1262 * distributing the load between different sched groups in a sched domain.
   1263 * Typically cpu_capacity for all the groups in a sched domain will be the same
   1264 * unless there are asymmetries in the topology. If there are asymmetries, the
   1265 * group having more cpu_capacity will pick up more load compared to the
   1266 * group having less cpu_capacity.
   1267 */
   1268static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
   1269{
   1270	struct sched_group *sg = sd->groups;
   1271
   1272	WARN_ON(!sg);
   1273
   1274	do {
   1275		int cpu, max_cpu = -1;
   1276
   1277		sg->group_weight = cpumask_weight(sched_group_span(sg));
   1278
   1279		if (!(sd->flags & SD_ASYM_PACKING))
   1280			goto next;
   1281
   1282		for_each_cpu(cpu, sched_group_span(sg)) {
   1283			if (max_cpu < 0)
   1284				max_cpu = cpu;
   1285			else if (sched_asym_prefer(cpu, max_cpu))
   1286				max_cpu = cpu;
   1287		}
   1288		sg->asym_prefer_cpu = max_cpu;
   1289
   1290next:
   1291		sg = sg->next;
   1292	} while (sg != sd->groups);
   1293
   1294	if (cpu != group_balance_cpu(sg))
   1295		return;
   1296
   1297	update_group_capacity(sd, cpu);
   1298}
   1299
   1300/*
   1301 * Asymmetric CPU capacity bits
   1302 */
   1303struct asym_cap_data {
   1304	struct list_head link;
   1305	unsigned long capacity;
   1306	unsigned long cpus[];
   1307};
   1308
   1309/*
   1310 * Set of available CPUs grouped by their corresponding capacities
   1311 * Each list entry contains a CPU mask reflecting CPUs that share the same
   1312 * capacity.
   1313 * The lifespan of data is unlimited.
   1314 */
   1315static LIST_HEAD(asym_cap_list);
   1316
   1317#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus)
   1318
   1319/*
   1320 * Verify whether there is any CPU capacity asymmetry in a given sched domain.
   1321 * Provides sd_flags reflecting the asymmetry scope.
   1322 */
   1323static inline int
   1324asym_cpu_capacity_classify(const struct cpumask *sd_span,
   1325			   const struct cpumask *cpu_map)
   1326{
   1327	struct asym_cap_data *entry;
   1328	int count = 0, miss = 0;
   1329
   1330	/*
   1331	 * Count how many unique CPU capacities this domain spans across
   1332	 * (compare the sched_domain CPU mask with the masks representing the
   1333	 * available CPU capacities). Take into account CPUs that might be offline:
   1334	 * skip those.
   1335	 */
   1336	list_for_each_entry(entry, &asym_cap_list, link) {
   1337		if (cpumask_intersects(sd_span, cpu_capacity_span(entry)))
   1338			++count;
   1339		else if (cpumask_intersects(cpu_map, cpu_capacity_span(entry)))
   1340			++miss;
   1341	}
   1342
   1343	WARN_ON_ONCE(!count && !list_empty(&asym_cap_list));
   1344
   1345	/* No asymmetry detected */
   1346	if (count < 2)
   1347		return 0;
   1348	/* Some of the available CPU capacity values have not been detected */
   1349	if (miss)
   1350		return SD_ASYM_CPUCAPACITY;
   1351
   1352	/* Full asymmetry */
   1353	return SD_ASYM_CPUCAPACITY | SD_ASYM_CPUCAPACITY_FULL;
   1354
   1355}
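       /*
        * Worked example (capacity values assumed): with CPUs 0-3 at capacity
        * 512 and CPUs 4-7 at capacity 1024, asym_cap_list holds two entries.
        * A domain spanning 0-7 intersects both: count == 2, miss == 0, so it
        * gets SD_ASYM_CPUCAPACITY | SD_ASYM_CPUCAPACITY_FULL. A lower domain
        * spanning only 0-3 sees count == 1 and gets no asymmetry flags. If a
        * capacity value present in cpu_map does not intersect the domain's
        * span, miss != 0 and only SD_ASYM_CPUCAPACITY is returned.
        */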
   1356
   1357static inline void asym_cpu_capacity_update_data(int cpu)
   1358{
   1359	unsigned long capacity = arch_scale_cpu_capacity(cpu);
   1360	struct asym_cap_data *entry = NULL;
   1361
   1362	list_for_each_entry(entry, &asym_cap_list, link) {
   1363		if (capacity == entry->capacity)
   1364			goto done;
   1365	}
   1366
   1367	entry = kzalloc(sizeof(*entry) + cpumask_size(), GFP_KERNEL);
   1368	if (WARN_ONCE(!entry, "Failed to allocate memory for asymmetry data\n"))
   1369		return;
   1370	entry->capacity = capacity;
   1371	list_add(&entry->link, &asym_cap_list);
   1372done:
   1373	__cpumask_set_cpu(cpu, cpu_capacity_span(entry));
   1374}
   1375
   1376/*
   1377 * Build-up/update the list of CPUs grouped by their capacities.
   1378 * An update requires an explicit request to rebuild sched domains
   1379 * with state indicating CPU topology changes.
   1380 */
   1381static void asym_cpu_capacity_scan(void)
   1382{
   1383	struct asym_cap_data *entry, *next;
   1384	int cpu;
   1385
   1386	list_for_each_entry(entry, &asym_cap_list, link)
   1387		cpumask_clear(cpu_capacity_span(entry));
   1388
   1389	for_each_cpu_and(cpu, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN))
   1390		asym_cpu_capacity_update_data(cpu);
   1391
   1392	list_for_each_entry_safe(entry, next, &asym_cap_list, link) {
   1393		if (cpumask_empty(cpu_capacity_span(entry))) {
   1394			list_del(&entry->link);
   1395			kfree(entry);
   1396		}
   1397	}
   1398
   1399	/*
   1400	 * Only one capacity value has been detected i.e. this system is symmetric.
   1401	 * No need to keep this data around.
   1402	 */
   1403	if (list_is_singular(&asym_cap_list)) {
   1404		entry = list_first_entry(&asym_cap_list, typeof(*entry), link);
   1405		list_del(&entry->link);
   1406		kfree(entry);
   1407	}
   1408}
   1409
   1410/*
   1411 * Initializers for schedule domains
   1412 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
   1413 */
   1414
   1415static int default_relax_domain_level = -1;
   1416int sched_domain_level_max;
   1417
   1418static int __init setup_relax_domain_level(char *str)
   1419{
   1420	if (kstrtoint(str, 0, &default_relax_domain_level))
   1421		pr_warn("Unable to set relax_domain_level\n");
   1422
   1423	return 1;
   1424}
   1425__setup("relax_domain_level=", setup_relax_domain_level);
   1426
   1427static void set_domain_attribute(struct sched_domain *sd,
   1428				 struct sched_domain_attr *attr)
   1429{
   1430	int request;
   1431
   1432	if (!attr || attr->relax_domain_level < 0) {
   1433		if (default_relax_domain_level < 0)
   1434			return;
   1435		request = default_relax_domain_level;
   1436	} else
   1437		request = attr->relax_domain_level;
   1438
   1439	if (sd->level > request) {
   1440		/* Turn off idle balance on this domain: */
   1441		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
   1442	}
   1443}
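       /*
        * For example (boot parameter assumed): booting with
        * "relax_domain_level=1" sets default_relax_domain_level to 1, so
        * every domain with sd->level > 1 gets SD_BALANCE_WAKE and
        * SD_BALANCE_NEWIDLE cleared, i.e. wake/newidle balancing is only
        * attempted at the lower levels of the hierarchy.
        */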
   1444
   1445static void __sdt_free(const struct cpumask *cpu_map);
   1446static int __sdt_alloc(const struct cpumask *cpu_map);
   1447
   1448static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
   1449				 const struct cpumask *cpu_map)
   1450{
   1451	switch (what) {
   1452	case sa_rootdomain:
   1453		if (!atomic_read(&d->rd->refcount))
   1454			free_rootdomain(&d->rd->rcu);
   1455		fallthrough;
   1456	case sa_sd:
   1457		free_percpu(d->sd);
   1458		fallthrough;
   1459	case sa_sd_storage:
   1460		__sdt_free(cpu_map);
   1461		fallthrough;
   1462	case sa_none:
   1463		break;
   1464	}
   1465}
   1466
   1467static enum s_alloc
   1468__visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
   1469{
   1470	memset(d, 0, sizeof(*d));
   1471
   1472	if (__sdt_alloc(cpu_map))
   1473		return sa_sd_storage;
   1474	d->sd = alloc_percpu(struct sched_domain *);
   1475	if (!d->sd)
   1476		return sa_sd_storage;
   1477	d->rd = alloc_rootdomain();
   1478	if (!d->rd)
   1479		return sa_sd;
   1480
   1481	return sa_rootdomain;
   1482}
   1483
   1484/*
   1485 * NULL the sd_data elements we've used to build the sched_domain and
   1486 * sched_group structure so that the subsequent __free_domain_allocs()
   1487 * will not free the data we're using.
   1488 */
   1489static void claim_allocations(int cpu, struct sched_domain *sd)
   1490{
   1491	struct sd_data *sdd = sd->private;
   1492
   1493	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
   1494	*per_cpu_ptr(sdd->sd, cpu) = NULL;
   1495
   1496	if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
   1497		*per_cpu_ptr(sdd->sds, cpu) = NULL;
   1498
   1499	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
   1500		*per_cpu_ptr(sdd->sg, cpu) = NULL;
   1501
   1502	if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
   1503		*per_cpu_ptr(sdd->sgc, cpu) = NULL;
   1504}
   1505
   1506#ifdef CONFIG_NUMA
   1507enum numa_topology_type sched_numa_topology_type;
   1508
   1509static int			sched_domains_numa_levels;
   1510static int			sched_domains_curr_level;
   1511
   1512int				sched_max_numa_distance;
   1513static int			*sched_domains_numa_distance;
   1514static struct cpumask		***sched_domains_numa_masks;
   1515#endif
   1516
   1517/*
   1518 * SD_flags allowed in topology descriptions.
   1519 *
   1520 * These flags are purely descriptive of the topology and do not prescribe
   1521 * behaviour. Behaviour is artificial and mapped in the below sd_init()
   1522 * function:
   1523 *
   1524 *   SD_SHARE_CPUCAPACITY   - describes SMT topologies
   1525 *   SD_SHARE_PKG_RESOURCES - describes shared caches
   1526 *   SD_NUMA                - describes NUMA topologies
   1527 *
   1528 * Odd one out: besides describing the topology, it also prescribes the
   1529 * desired behaviour that goes along with it:
   1530 *
   1531 *   SD_ASYM_PACKING        - describes SMT quirks
   1532 */
   1533#define TOPOLOGY_SD_FLAGS		\
   1534	(SD_SHARE_CPUCAPACITY	|	\
   1535	 SD_SHARE_PKG_RESOURCES |	\
   1536	 SD_NUMA		|	\
   1537	 SD_ASYM_PACKING)
   1538
   1539static struct sched_domain *
   1540sd_init(struct sched_domain_topology_level *tl,
   1541	const struct cpumask *cpu_map,
   1542	struct sched_domain *child, int cpu)
   1543{
   1544	struct sd_data *sdd = &tl->data;
   1545	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
   1546	int sd_id, sd_weight, sd_flags = 0;
   1547	struct cpumask *sd_span;
   1548
   1549#ifdef CONFIG_NUMA
   1550	/*
   1551	 * Ugly hack to pass state to sd_numa_mask()...
   1552	 */
   1553	sched_domains_curr_level = tl->numa_level;
   1554#endif
   1555
   1556	sd_weight = cpumask_weight(tl->mask(cpu));
   1557
   1558	if (tl->sd_flags)
   1559		sd_flags = (*tl->sd_flags)();
   1560	if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
   1561			"wrong sd_flags in topology description\n"))
   1562		sd_flags &= TOPOLOGY_SD_FLAGS;
   1563
   1564	*sd = (struct sched_domain){
   1565		.min_interval		= sd_weight,
   1566		.max_interval		= 2*sd_weight,
   1567		.busy_factor		= 16,
   1568		.imbalance_pct		= 117,
   1569
   1570		.cache_nice_tries	= 0,
   1571
   1572		.flags			= 1*SD_BALANCE_NEWIDLE
   1573					| 1*SD_BALANCE_EXEC
   1574					| 1*SD_BALANCE_FORK
   1575					| 0*SD_BALANCE_WAKE
   1576					| 1*SD_WAKE_AFFINE
   1577					| 0*SD_SHARE_CPUCAPACITY
   1578					| 0*SD_SHARE_PKG_RESOURCES
   1579					| 0*SD_SERIALIZE
   1580					| 1*SD_PREFER_SIBLING
   1581					| 0*SD_NUMA
   1582					| sd_flags
   1583					,
   1584
   1585		.last_balance		= jiffies,
   1586		.balance_interval	= sd_weight,
   1587		.max_newidle_lb_cost	= 0,
   1588		.last_decay_max_lb_cost	= jiffies,
   1589		.child			= child,
   1590#ifdef CONFIG_SCHED_DEBUG
   1591		.name			= tl->name,
   1592#endif
   1593	};
   1594
   1595	sd_span = sched_domain_span(sd);
   1596	cpumask_and(sd_span, cpu_map, tl->mask(cpu));
   1597	sd_id = cpumask_first(sd_span);
   1598
   1599	sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map);
   1600
   1601	WARN_ONCE((sd->flags & (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY)) ==
   1602		  (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY),
   1603		  "CPU capacity asymmetry not supported on SMT\n");
   1604
   1605	/*
   1606	 * Convert topological properties into behaviour.
   1607	 */
   1608	/* Don't attempt to spread across CPUs of different capacities. */
   1609	if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child)
   1610		sd->child->flags &= ~SD_PREFER_SIBLING;
   1611
   1612	if (sd->flags & SD_SHARE_CPUCAPACITY) {
   1613		sd->imbalance_pct = 110;
   1614
   1615	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
   1616		sd->imbalance_pct = 117;
   1617		sd->cache_nice_tries = 1;
   1618
   1619#ifdef CONFIG_NUMA
   1620	} else if (sd->flags & SD_NUMA) {
   1621		sd->cache_nice_tries = 2;
   1622
   1623		sd->flags &= ~SD_PREFER_SIBLING;
   1624		sd->flags |= SD_SERIALIZE;
   1625		if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
   1626			sd->flags &= ~(SD_BALANCE_EXEC |
   1627				       SD_BALANCE_FORK |
   1628				       SD_WAKE_AFFINE);
   1629		}
   1630
   1631#endif
   1632	} else {
   1633		sd->cache_nice_tries = 1;
   1634	}
   1635
   1636	/*
   1637	 * For all levels sharing cache, connect a sched_domain_shared
   1638	 * instance.
   1639	 */
   1640	if (sd->flags & SD_SHARE_PKG_RESOURCES) {
   1641		sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
   1642		atomic_inc(&sd->shared->ref);
   1643		atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
   1644	}
   1645
   1646	sd->private = sdd;
   1647
   1648	return sd;
   1649}
   1650
   1651/*
   1652 * Topology list, bottom-up.
   1653 */
   1654static struct sched_domain_topology_level default_topology[] = {
   1655#ifdef CONFIG_SCHED_SMT
   1656	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
   1657#endif
   1658
   1659#ifdef CONFIG_SCHED_CLUSTER
   1660	{ cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) },
   1661#endif
   1662
   1663#ifdef CONFIG_SCHED_MC
   1664	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
   1665#endif
   1666	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
   1667	{ NULL, },
   1668};
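       /*
        * Architectures with non-standard topologies replace this table via
        * set_sched_topology() below. A minimal, hypothetical override with
        * only the MC and DIE levels might look like:
        *
        *	static struct sched_domain_topology_level my_topology[] = {
        *		{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
        *		{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
        *		{ NULL, },
        *	};
        *
        *	set_sched_topology(my_topology);
        *
        * and this must run before SMP init (see the sched_smp_initialized
        * check in set_sched_topology()).
        */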
   1669
   1670static struct sched_domain_topology_level *sched_domain_topology =
   1671	default_topology;
   1672static struct sched_domain_topology_level *sched_domain_topology_saved;
   1673
   1674#define for_each_sd_topology(tl)			\
   1675	for (tl = sched_domain_topology; tl->mask; tl++)
   1676
   1677void set_sched_topology(struct sched_domain_topology_level *tl)
   1678{
   1679	if (WARN_ON_ONCE(sched_smp_initialized))
   1680		return;
   1681
   1682	sched_domain_topology = tl;
   1683	sched_domain_topology_saved = NULL;
   1684}
   1685
   1686#ifdef CONFIG_NUMA
   1687
   1688static const struct cpumask *sd_numa_mask(int cpu)
   1689{
   1690	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
   1691}
   1692
   1693static void sched_numa_warn(const char *str)
   1694{
   1695	static int done = false;
   1696	int i,j;
   1697
   1698	if (done)
   1699		return;
   1700
   1701	done = true;
   1702
   1703	printk(KERN_WARNING "ERROR: %s\n\n", str);
   1704
   1705	for (i = 0; i < nr_node_ids; i++) {
   1706		printk(KERN_WARNING "  ");
   1707		for (j = 0; j < nr_node_ids; j++) {
   1708			if (!node_state(i, N_CPU) || !node_state(j, N_CPU))
   1709				printk(KERN_CONT "(%02d) ", node_distance(i,j));
   1710			else
   1711				printk(KERN_CONT " %02d  ", node_distance(i,j));
   1712		}
   1713		printk(KERN_CONT "\n");
   1714	}
   1715	printk(KERN_WARNING "\n");
   1716}
   1717
   1718bool find_numa_distance(int distance)
   1719{
   1720	bool found = false;
   1721	int i, *distances;
   1722
   1723	if (distance == node_distance(0, 0))
   1724		return true;
   1725
   1726	rcu_read_lock();
   1727	distances = rcu_dereference(sched_domains_numa_distance);
   1728	if (!distances)
   1729		goto unlock;
   1730	for (i = 0; i < sched_domains_numa_levels; i++) {
   1731		if (distances[i] == distance) {
   1732			found = true;
   1733			break;
   1734		}
   1735	}
   1736unlock:
   1737	rcu_read_unlock();
   1738
   1739	return found;
   1740}
   1741
   1742#define for_each_cpu_node_but(n, nbut)		\
   1743	for_each_node_state(n, N_CPU)		\
   1744		if (n == nbut)			\
   1745			continue;		\
   1746		else
   1747
   1748/*
   1749 * A system can have three types of NUMA topology:
   1750 * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
   1751 * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
   1752 * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
   1753 *
   1754 * The difference between a glueless mesh topology and a backplane
   1755 * topology lies in whether communication between not directly
   1756 * connected nodes goes through intermediary nodes (where programs
   1757 * could run), or through backplane controllers. This affects
   1758 * placement of programs.
   1759 *
   1760 * The type of topology can be discerned with the following tests:
   1761 * - If the maximum distance between any nodes is 1 hop, the system
   1762 *   is directly connected.
   1763 * - If for two nodes A and B, located N > 1 hops away from each other,
   1764 *   there is an intermediary node C, which is < N hops away from both
   1765 *   nodes A and B, the system is a glueless mesh.
   1766 */
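/*
 * Worked example (illustrative only): with a SLIT distance table of
 *
 *	node   0   1   2   3
 *	  0:  10  20  20  20
 *	  1:  20  10  20  20
 *	  2:  20  20  10  20
 *	  3:  20  20  20  10
 *
 * there are only two distinct distances (10, 20), so every remote node is
 * one hop away and the type is NUMA_DIRECT. If the 0<->3 distance were 30
 * instead, node 1 (distance 20 from both 0 and 3) would act as an
 * intermediary and the type would be NUMA_GLUELESS_MESH; with no such
 * closer node, the type would be NUMA_BACKPLANE.
 */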
   1767static void init_numa_topology_type(int offline_node)
   1768{
   1769	int a, b, c, n;
   1770
   1771	n = sched_max_numa_distance;
   1772
   1773	if (sched_domains_numa_levels <= 2) {
   1774		sched_numa_topology_type = NUMA_DIRECT;
   1775		return;
   1776	}
   1777
   1778	for_each_cpu_node_but(a, offline_node) {
   1779		for_each_cpu_node_but(b, offline_node) {
   1780			/* Find two nodes furthest removed from each other. */
   1781			if (node_distance(a, b) < n)
   1782				continue;
   1783
   1784			/* Is there an intermediary node between a and b? */
   1785			for_each_cpu_node_but(c, offline_node) {
   1786				if (node_distance(a, c) < n &&
   1787				    node_distance(b, c) < n) {
   1788					sched_numa_topology_type =
   1789							NUMA_GLUELESS_MESH;
   1790					return;
   1791				}
   1792			}
   1793
   1794			sched_numa_topology_type = NUMA_BACKPLANE;
   1795			return;
   1796		}
   1797	}
   1798
   1799	pr_err("Failed to find a NUMA topology type, defaulting to DIRECT\n");
   1800	sched_numa_topology_type = NUMA_DIRECT;
   1801}
   1802
   1803
   1804#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
   1805
   1806void sched_init_numa(int offline_node)
   1807{
   1808	struct sched_domain_topology_level *tl;
   1809	unsigned long *distance_map;
   1810	int nr_levels = 0;
   1811	int i, j;
   1812	int *distances;
   1813	struct cpumask ***masks;
   1814
   1815	/*
   1816	 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
   1817	 * unique distances in the node_distance() table.
   1818	 */
   1819	distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
   1820	if (!distance_map)
   1821		return;
   1822
   1823	bitmap_zero(distance_map, NR_DISTANCE_VALUES);
   1824	for_each_cpu_node_but(i, offline_node) {
   1825		for_each_cpu_node_but(j, offline_node) {
   1826			int distance = node_distance(i, j);
   1827
   1828			if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
   1829				sched_numa_warn("Invalid distance value range");
   1830				bitmap_free(distance_map);
   1831				return;
   1832			}
   1833
   1834			bitmap_set(distance_map, distance, 1);
   1835		}
   1836	}
   1837	/*
   1838	 * We can now figure out how many unique distance values there are and
   1839	 * allocate memory accordingly.
   1840	 */
   1841	nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
   1842
   1843	distances = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
   1844	if (!distances) {
   1845		bitmap_free(distance_map);
   1846		return;
   1847	}
   1848
   1849	for (i = 0, j = 0; i < nr_levels; i++, j++) {
   1850		j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
   1851		distances[i] = j;
   1852	}
   1853	rcu_assign_pointer(sched_domains_numa_distance, distances);
   1854
   1855	bitmap_free(distance_map);
   1856
   1857	/*
   1858	 * 'nr_levels' contains the number of unique distances
   1859	 *
   1860	 * The sched_domains_numa_distance[] array includes the actual distance
   1861	 * numbers.
   1862	 */
   1863
   1864	/*
    1865	 * Temporarily reset sched_domains_numa_levels to 0. If allocating
    1866	 * memory for the sched_domains_numa_masks[][] array fails, that array
    1867	 * will contain fewer than 'nr_levels' members. This would be dangerous
    1868	 * when other functions use sched_domains_numa_levels to iterate over
    1869	 * sched_domains_numa_masks[][].
   1870	 *
   1871	 * We reset it to 'nr_levels' at the end of this function.
   1872	 */
   1873	sched_domains_numa_levels = 0;
   1874
   1875	masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
   1876	if (!masks)
   1877		return;
   1878
   1879	/*
   1880	 * Now for each level, construct a mask per node which contains all
   1881	 * CPUs of nodes that are that many hops away from us.
   1882	 */
   1883	for (i = 0; i < nr_levels; i++) {
   1884		masks[i] = kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
   1885		if (!masks[i])
   1886			return;
   1887
   1888		for_each_cpu_node_but(j, offline_node) {
   1889			struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
   1890			int k;
   1891
   1892			if (!mask)
   1893				return;
   1894
   1895			masks[i][j] = mask;
   1896
   1897			for_each_cpu_node_but(k, offline_node) {
   1898				if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
   1899					sched_numa_warn("Node-distance not symmetric");
   1900
   1901				if (node_distance(j, k) > sched_domains_numa_distance[i])
   1902					continue;
   1903
   1904				cpumask_or(mask, mask, cpumask_of_node(k));
   1905			}
   1906		}
   1907	}
   1908	rcu_assign_pointer(sched_domains_numa_masks, masks);
   1909
   1910	/* Compute default topology size */
   1911	for (i = 0; sched_domain_topology[i].mask; i++);
   1912
   1913	tl = kzalloc((i + nr_levels + 1) *
   1914			sizeof(struct sched_domain_topology_level), GFP_KERNEL);
   1915	if (!tl)
   1916		return;
   1917
   1918	/*
   1919	 * Copy the default topology bits..
   1920	 */
   1921	for (i = 0; sched_domain_topology[i].mask; i++)
   1922		tl[i] = sched_domain_topology[i];
   1923
   1924	/*
   1925	 * Add the NUMA identity distance, aka single NODE.
   1926	 */
   1927	tl[i++] = (struct sched_domain_topology_level){
   1928		.mask = sd_numa_mask,
   1929		.numa_level = 0,
   1930		SD_INIT_NAME(NODE)
   1931	};
   1932
   1933	/*
   1934	 * .. and append 'j' levels of NUMA goodness.
   1935	 */
   1936	for (j = 1; j < nr_levels; i++, j++) {
   1937		tl[i] = (struct sched_domain_topology_level){
   1938			.mask = sd_numa_mask,
   1939			.sd_flags = cpu_numa_flags,
   1940			.flags = SDTL_OVERLAP,
   1941			.numa_level = j,
   1942			SD_INIT_NAME(NUMA)
   1943		};
   1944	}
   1945
   1946	sched_domain_topology_saved = sched_domain_topology;
   1947	sched_domain_topology = tl;
   1948
   1949	sched_domains_numa_levels = nr_levels;
   1950	WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]);
   1951
   1952	init_numa_topology_type(offline_node);
   1953}
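/*
 * Continuing the illustrative example above: with two unique distances
 * (10, 20), sched_init_numa() records distances[] = { 10, 20 }, builds
 * masks[0][n] = CPUs of node n itself and masks[1][n] = all CPUs, and
 * extends the topology list to roughly
 *
 *	SMT -> CLS -> MC -> DIE -> NODE -> NUMA (level 1, SDTL_OVERLAP)
 *
 * with whichever of the lower levels are actually configured in.
 */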
   1954
   1955
   1956static void sched_reset_numa(void)
   1957{
   1958	int nr_levels, *distances;
   1959	struct cpumask ***masks;
   1960
   1961	nr_levels = sched_domains_numa_levels;
   1962	sched_domains_numa_levels = 0;
   1963	sched_max_numa_distance = 0;
   1964	sched_numa_topology_type = NUMA_DIRECT;
   1965	distances = sched_domains_numa_distance;
   1966	rcu_assign_pointer(sched_domains_numa_distance, NULL);
   1967	masks = sched_domains_numa_masks;
   1968	rcu_assign_pointer(sched_domains_numa_masks, NULL);
   1969	if (distances || masks) {
   1970		int i, j;
   1971
   1972		synchronize_rcu();
   1973		kfree(distances);
   1974		for (i = 0; i < nr_levels && masks; i++) {
   1975			if (!masks[i])
   1976				continue;
   1977			for_each_node(j)
   1978				kfree(masks[i][j]);
   1979			kfree(masks[i]);
   1980		}
   1981		kfree(masks);
   1982	}
   1983	if (sched_domain_topology_saved) {
   1984		kfree(sched_domain_topology);
   1985		sched_domain_topology = sched_domain_topology_saved;
   1986		sched_domain_topology_saved = NULL;
   1987	}
   1988}
   1989
   1990/*
   1991 * Call with hotplug lock held
   1992 */
   1993void sched_update_numa(int cpu, bool online)
   1994{
   1995	int node;
   1996
   1997	node = cpu_to_node(cpu);
   1998	/*
   1999	 * Scheduler NUMA topology is updated when the first CPU of a
   2000	 * node is onlined or the last CPU of a node is offlined.
   2001	 */
   2002	if (cpumask_weight(cpumask_of_node(node)) != 1)
   2003		return;
   2004
   2005	sched_reset_numa();
   2006	sched_init_numa(online ? NUMA_NO_NODE : node);
   2007}
   2008
   2009void sched_domains_numa_masks_set(unsigned int cpu)
   2010{
   2011	int node = cpu_to_node(cpu);
   2012	int i, j;
   2013
   2014	for (i = 0; i < sched_domains_numa_levels; i++) {
   2015		for (j = 0; j < nr_node_ids; j++) {
   2016			if (!node_state(j, N_CPU))
   2017				continue;
   2018
   2019			/* Set ourselves in the remote node's masks */
   2020			if (node_distance(j, node) <= sched_domains_numa_distance[i])
   2021				cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
   2022		}
   2023	}
   2024}
   2025
   2026void sched_domains_numa_masks_clear(unsigned int cpu)
   2027{
   2028	int i, j;
   2029
   2030	for (i = 0; i < sched_domains_numa_levels; i++) {
   2031		for (j = 0; j < nr_node_ids; j++) {
   2032			if (sched_domains_numa_masks[i][j])
   2033				cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
   2034		}
   2035	}
   2036}
   2037
   2038/*
   2039 * sched_numa_find_closest() - given the NUMA topology, find the cpu
    2040 *                             closest to @cpu from @cpus.
    2041 * @cpus: cpumask to find a cpu from
    2042 * @cpu: cpu to be close to
    2043 *
    2044 * Return: cpu, or nr_cpu_ids when nothing found.
   2045 */
   2046int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
   2047{
   2048	int i, j = cpu_to_node(cpu), found = nr_cpu_ids;
   2049	struct cpumask ***masks;
   2050
   2051	rcu_read_lock();
   2052	masks = rcu_dereference(sched_domains_numa_masks);
   2053	if (!masks)
   2054		goto unlock;
   2055	for (i = 0; i < sched_domains_numa_levels; i++) {
   2056		if (!masks[i][j])
   2057			break;
   2058		cpu = cpumask_any_and(cpus, masks[i][j]);
   2059		if (cpu < nr_cpu_ids) {
   2060			found = cpu;
   2061			break;
   2062		}
   2063	}
   2064unlock:
   2065	rcu_read_unlock();
   2066
   2067	return found;
   2068}
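/*
 * Usage sketch (illustrative only): pick a CPU that stays NUMA-close to
 * @cpu while honouring a caller-supplied affinity mask, falling back to
 * @cpu itself when nothing in the mask is reachable. pick_close_cpu() is
 * a hypothetical helper, not a kernel API.
 */
#if 0
static int pick_close_cpu(const struct cpumask *mask, int cpu)
{
	int target = sched_numa_find_closest(mask, cpu);

	return target < nr_cpu_ids ? target : cpu;
}
#endif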
   2069
   2070#endif /* CONFIG_NUMA */
   2071
   2072static int __sdt_alloc(const struct cpumask *cpu_map)
   2073{
   2074	struct sched_domain_topology_level *tl;
   2075	int j;
   2076
   2077	for_each_sd_topology(tl) {
   2078		struct sd_data *sdd = &tl->data;
   2079
   2080		sdd->sd = alloc_percpu(struct sched_domain *);
   2081		if (!sdd->sd)
   2082			return -ENOMEM;
   2083
   2084		sdd->sds = alloc_percpu(struct sched_domain_shared *);
   2085		if (!sdd->sds)
   2086			return -ENOMEM;
   2087
   2088		sdd->sg = alloc_percpu(struct sched_group *);
   2089		if (!sdd->sg)
   2090			return -ENOMEM;
   2091
   2092		sdd->sgc = alloc_percpu(struct sched_group_capacity *);
   2093		if (!sdd->sgc)
   2094			return -ENOMEM;
   2095
   2096		for_each_cpu(j, cpu_map) {
   2097			struct sched_domain *sd;
   2098			struct sched_domain_shared *sds;
   2099			struct sched_group *sg;
   2100			struct sched_group_capacity *sgc;
   2101
   2102			sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
   2103					GFP_KERNEL, cpu_to_node(j));
   2104			if (!sd)
   2105				return -ENOMEM;
   2106
   2107			*per_cpu_ptr(sdd->sd, j) = sd;
   2108
   2109			sds = kzalloc_node(sizeof(struct sched_domain_shared),
   2110					GFP_KERNEL, cpu_to_node(j));
   2111			if (!sds)
   2112				return -ENOMEM;
   2113
   2114			*per_cpu_ptr(sdd->sds, j) = sds;
   2115
   2116			sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
   2117					GFP_KERNEL, cpu_to_node(j));
   2118			if (!sg)
   2119				return -ENOMEM;
   2120
   2121			sg->next = sg;
   2122
   2123			*per_cpu_ptr(sdd->sg, j) = sg;
   2124
   2125			sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
   2126					GFP_KERNEL, cpu_to_node(j));
   2127			if (!sgc)
   2128				return -ENOMEM;
   2129
   2130#ifdef CONFIG_SCHED_DEBUG
   2131			sgc->id = j;
   2132#endif
   2133
   2134			*per_cpu_ptr(sdd->sgc, j) = sgc;
   2135		}
   2136	}
   2137
   2138	return 0;
   2139}
   2140
   2141static void __sdt_free(const struct cpumask *cpu_map)
   2142{
   2143	struct sched_domain_topology_level *tl;
   2144	int j;
   2145
   2146	for_each_sd_topology(tl) {
   2147		struct sd_data *sdd = &tl->data;
   2148
   2149		for_each_cpu(j, cpu_map) {
   2150			struct sched_domain *sd;
   2151
   2152			if (sdd->sd) {
   2153				sd = *per_cpu_ptr(sdd->sd, j);
   2154				if (sd && (sd->flags & SD_OVERLAP))
   2155					free_sched_groups(sd->groups, 0);
   2156				kfree(*per_cpu_ptr(sdd->sd, j));
   2157			}
   2158
   2159			if (sdd->sds)
   2160				kfree(*per_cpu_ptr(sdd->sds, j));
   2161			if (sdd->sg)
   2162				kfree(*per_cpu_ptr(sdd->sg, j));
   2163			if (sdd->sgc)
   2164				kfree(*per_cpu_ptr(sdd->sgc, j));
   2165		}
   2166		free_percpu(sdd->sd);
   2167		sdd->sd = NULL;
   2168		free_percpu(sdd->sds);
   2169		sdd->sds = NULL;
   2170		free_percpu(sdd->sg);
   2171		sdd->sg = NULL;
   2172		free_percpu(sdd->sgc);
   2173		sdd->sgc = NULL;
   2174	}
   2175}
   2176
   2177static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
   2178		const struct cpumask *cpu_map, struct sched_domain_attr *attr,
   2179		struct sched_domain *child, int cpu)
   2180{
   2181	struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
   2182
   2183	if (child) {
   2184		sd->level = child->level + 1;
   2185		sched_domain_level_max = max(sched_domain_level_max, sd->level);
   2186		child->parent = sd;
   2187
   2188		if (!cpumask_subset(sched_domain_span(child),
   2189				    sched_domain_span(sd))) {
   2190			pr_err("BUG: arch topology borken\n");
   2191#ifdef CONFIG_SCHED_DEBUG
   2192			pr_err("     the %s domain not a subset of the %s domain\n",
   2193					child->name, sd->name);
   2194#endif
   2195			/* Fixup, ensure @sd has at least @child CPUs. */
   2196			cpumask_or(sched_domain_span(sd),
   2197				   sched_domain_span(sd),
   2198				   sched_domain_span(child));
   2199		}
   2200
   2201	}
   2202	set_domain_attribute(sd, attr);
   2203
   2204	return sd;
   2205}
   2206
   2207/*
   2208 * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
   2209 * any two given CPUs at this (non-NUMA) topology level.
   2210 */
   2211static bool topology_span_sane(struct sched_domain_topology_level *tl,
   2212			      const struct cpumask *cpu_map, int cpu)
   2213{
   2214	int i;
   2215
   2216	/* NUMA levels are allowed to overlap */
   2217	if (tl->flags & SDTL_OVERLAP)
   2218		return true;
   2219
   2220	/*
   2221	 * Non-NUMA levels cannot partially overlap - they must be either
   2222	 * completely equal or completely disjoint. Otherwise we can end up
   2223	 * breaking the sched_group lists - i.e. a later get_group() pass
   2224	 * breaks the linking done for an earlier span.
   2225	 */
   2226	for_each_cpu(i, cpu_map) {
   2227		if (i == cpu)
   2228			continue;
   2229		/*
   2230		 * We should 'and' all those masks with 'cpu_map' to exactly
   2231		 * match the topology we're about to build, but that can only
   2232		 * remove CPUs, which only lessens our ability to detect
   2233		 * overlaps
   2234		 */
   2235		if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
   2236		    cpumask_intersects(tl->mask(cpu), tl->mask(i)))
   2237			return false;
   2238	}
   2239
   2240	return true;
   2241}
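/*
 * Example of the invariant checked above (illustrative): if tl->mask(0)
 * were { 0, 1 } while tl->mask(2) were { 1, 2 }, the two spans partially
 * overlap and topology_span_sane() rejects the level; { 0, 1 } vs { 0, 1 }
 * (equal) or { 0, 1 } vs { 2, 3 } (disjoint) are both fine.
 */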
   2242
   2243/*
   2244 * Build sched domains for a given set of CPUs and attach the sched domains
   2245 * to the individual CPUs
   2246 */
   2247static int
   2248build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
   2249{
   2250	enum s_alloc alloc_state = sa_none;
   2251	struct sched_domain *sd;
   2252	struct s_data d;
   2253	struct rq *rq = NULL;
   2254	int i, ret = -ENOMEM;
   2255	bool has_asym = false;
   2256
   2257	if (WARN_ON(cpumask_empty(cpu_map)))
   2258		goto error;
   2259
   2260	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
   2261	if (alloc_state != sa_rootdomain)
   2262		goto error;
   2263
   2264	/* Set up domains for CPUs specified by the cpu_map: */
   2265	for_each_cpu(i, cpu_map) {
   2266		struct sched_domain_topology_level *tl;
   2267
   2268		sd = NULL;
   2269		for_each_sd_topology(tl) {
   2270
   2271			if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
   2272				goto error;
   2273
   2274			sd = build_sched_domain(tl, cpu_map, attr, sd, i);
   2275
   2276			has_asym |= sd->flags & SD_ASYM_CPUCAPACITY;
   2277
   2278			if (tl == sched_domain_topology)
   2279				*per_cpu_ptr(d.sd, i) = sd;
   2280			if (tl->flags & SDTL_OVERLAP)
   2281				sd->flags |= SD_OVERLAP;
   2282			if (cpumask_equal(cpu_map, sched_domain_span(sd)))
   2283				break;
   2284		}
   2285	}
   2286
   2287	/* Build the groups for the domains */
   2288	for_each_cpu(i, cpu_map) {
   2289		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
   2290			sd->span_weight = cpumask_weight(sched_domain_span(sd));
   2291			if (sd->flags & SD_OVERLAP) {
   2292				if (build_overlap_sched_groups(sd, i))
   2293					goto error;
   2294			} else {
   2295				if (build_sched_groups(sd, i))
   2296					goto error;
   2297			}
   2298		}
   2299	}
   2300
   2301	/*
   2302	 * Calculate an allowed NUMA imbalance such that LLCs do not get
   2303	 * imbalanced.
   2304	 */
   2305	for_each_cpu(i, cpu_map) {
   2306		unsigned int imb = 0;
   2307		unsigned int imb_span = 1;
   2308
   2309		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
   2310			struct sched_domain *child = sd->child;
   2311
   2312			if (!(sd->flags & SD_SHARE_PKG_RESOURCES) && child &&
   2313			    (child->flags & SD_SHARE_PKG_RESOURCES)) {
   2314				struct sched_domain __rcu *top_p;
   2315				unsigned int nr_llcs;
   2316
   2317				/*
   2318				 * For a single LLC per node, allow an
   2319				 * imbalance up to 25% of the node. This is an
   2320				 * arbitrary cutoff based on SMT-2 to balance
   2321				 * between memory bandwidth and avoiding
    2322				 * premature sharing of HT resources; SMT-4
    2323				 * or SMT-8 *may* benefit from a different
   2324				 * cutoff.
   2325				 *
   2326				 * For multiple LLCs, allow an imbalance
   2327				 * until multiple tasks would share an LLC
   2328				 * on one node while LLCs on another node
   2329				 * remain idle.
   2330				 */
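				/*
				 * Worked example (illustrative): a node
				 * spanning 64 CPUs with a single LLC gets
				 * imb = 64 >> 2 = 16, while a node with four
				 * 16-CPU LLCs gets imb = nr_llcs = 4.
				 */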
   2331				nr_llcs = sd->span_weight / child->span_weight;
   2332				if (nr_llcs == 1)
   2333					imb = sd->span_weight >> 2;
   2334				else
   2335					imb = nr_llcs;
   2336				sd->imb_numa_nr = imb;
   2337
   2338				/* Set span based on the first NUMA domain. */
   2339				top_p = sd->parent;
   2340				while (top_p && !(top_p->flags & SD_NUMA)) {
   2341					top_p = top_p->parent;
   2342				}
   2343				imb_span = top_p ? top_p->span_weight : sd->span_weight;
   2344			} else {
   2345				int factor = max(1U, (sd->span_weight / imb_span));
   2346
   2347				sd->imb_numa_nr = imb * factor;
   2348			}
   2349		}
   2350	}
   2351
   2352	/* Calculate CPU capacity for physical packages and nodes */
    2353	for (i = nr_cpumask_bits - 1; i >= 0; i--) {
   2354		if (!cpumask_test_cpu(i, cpu_map))
   2355			continue;
   2356
   2357		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
   2358			claim_allocations(i, sd);
   2359			init_sched_groups_capacity(i, sd);
   2360		}
   2361	}
   2362
   2363	/* Attach the domains */
   2364	rcu_read_lock();
   2365	for_each_cpu(i, cpu_map) {
   2366		rq = cpu_rq(i);
   2367		sd = *per_cpu_ptr(d.sd, i);
   2368
   2369		/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
   2370		if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
   2371			WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
   2372
   2373		cpu_attach_domain(sd, d.rd, i);
   2374	}
   2375	rcu_read_unlock();
   2376
   2377	if (has_asym)
   2378		static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
   2379
   2380	if (rq && sched_debug_verbose) {
   2381		pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
   2382			cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
   2383	}
   2384
   2385	ret = 0;
   2386error:
   2387	__free_domain_allocs(&d, alloc_state, cpu_map);
   2388
   2389	return ret;
   2390}
   2391
   2392/* Current sched domains: */
   2393static cpumask_var_t			*doms_cur;
   2394
   2395/* Number of sched domains in 'doms_cur': */
   2396static int				ndoms_cur;
   2397
   2398/* Attributes of custom domains in 'doms_cur' */
   2399static struct sched_domain_attr		*dattr_cur;
   2400
   2401/*
   2402 * Special case: If a kmalloc() of a doms_cur partition (array of
    2403 * cpumask) fails, then fall back to a single sched domain,
   2404 * as determined by the single cpumask fallback_doms.
   2405 */
   2406static cpumask_var_t			fallback_doms;
   2407
   2408/*
   2409 * arch_update_cpu_topology lets virtualized architectures update the
   2410 * CPU core maps. It is supposed to return 1 if the topology changed
   2411 * or 0 if it stayed the same.
   2412 */
   2413int __weak arch_update_cpu_topology(void)
   2414{
   2415	return 0;
   2416}
   2417
   2418cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
   2419{
   2420	int i;
   2421	cpumask_var_t *doms;
   2422
   2423	doms = kmalloc_array(ndoms, sizeof(*doms), GFP_KERNEL);
   2424	if (!doms)
   2425		return NULL;
   2426	for (i = 0; i < ndoms; i++) {
   2427		if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
   2428			free_sched_domains(doms, i);
   2429			return NULL;
   2430		}
   2431	}
   2432	return doms;
   2433}
   2434
   2435void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
   2436{
   2437	unsigned int i;
   2438	for (i = 0; i < ndoms; i++)
   2439		free_cpumask_var(doms[i]);
   2440	kfree(doms);
   2441}
   2442
   2443/*
   2444 * Set up scheduler domains and groups.  For now this just excludes isolated
   2445 * CPUs, but could be used to exclude other special cases in the future.
   2446 */
   2447int sched_init_domains(const struct cpumask *cpu_map)
   2448{
   2449	int err;
   2450
   2451	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
   2452	zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
   2453	zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
   2454
   2455	arch_update_cpu_topology();
   2456	asym_cpu_capacity_scan();
   2457	ndoms_cur = 1;
   2458	doms_cur = alloc_sched_domains(ndoms_cur);
   2459	if (!doms_cur)
   2460		doms_cur = &fallback_doms;
   2461	cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_TYPE_DOMAIN));
   2462	err = build_sched_domains(doms_cur[0], NULL);
   2463
   2464	return err;
   2465}
   2466
   2467/*
    2468 * Detach sched domains from a group of CPUs specified in cpu_map.
    2469 * These CPUs will now be attached to the NULL domain.
   2470 */
   2471static void detach_destroy_domains(const struct cpumask *cpu_map)
   2472{
   2473	unsigned int cpu = cpumask_any(cpu_map);
   2474	int i;
   2475
   2476	if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu)))
   2477		static_branch_dec_cpuslocked(&sched_asym_cpucapacity);
   2478
   2479	rcu_read_lock();
   2480	for_each_cpu(i, cpu_map)
   2481		cpu_attach_domain(NULL, &def_root_domain, i);
   2482	rcu_read_unlock();
   2483}
   2484
   2485/* handle null as "default" */
   2486static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
   2487			struct sched_domain_attr *new, int idx_new)
   2488{
   2489	struct sched_domain_attr tmp;
   2490
   2491	/* Fast path: */
   2492	if (!new && !cur)
   2493		return 1;
   2494
   2495	tmp = SD_ATTR_INIT;
   2496
   2497	return !memcmp(cur ? (cur + idx_cur) : &tmp,
   2498			new ? (new + idx_new) : &tmp,
   2499			sizeof(struct sched_domain_attr));
   2500}
   2501
   2502/*
   2503 * Partition sched domains as specified by the 'ndoms_new'
   2504 * cpumasks in the array doms_new[] of cpumasks. This compares
   2505 * doms_new[] to the current sched domain partitioning, doms_cur[].
   2506 * It destroys each deleted domain and builds each new domain.
   2507 *
   2508 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
    2509 * The masks don't intersect (don't overlap). We should set up one
   2510 * sched domain for each mask. CPUs not in any of the cpumasks will
   2511 * not be load balanced. If the same cpumask appears both in the
   2512 * current 'doms_cur' domains and in the new 'doms_new', we can leave
   2513 * it as it is.
   2514 *
   2515 * The passed in 'doms_new' should be allocated using
   2516 * alloc_sched_domains.  This routine takes ownership of it and will
   2517 * free_sched_domains it when done with it. If the caller failed the
   2518 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
    2519 * and partition_sched_domains() will fall back to the single partition
    2520 * 'fallback_doms'; this also forces the domains to be rebuilt.
   2521 *
   2522 * If doms_new == NULL it will be replaced with cpu_online_mask.
   2523 * ndoms_new == 0 is a special case for destroying existing domains,
   2524 * and it will not create the default domain.
   2525 *
   2526 * Call with hotplug lock and sched_domains_mutex held
   2527 */
   2528void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
   2529				    struct sched_domain_attr *dattr_new)
   2530{
   2531	bool __maybe_unused has_eas = false;
   2532	int i, j, n;
   2533	int new_topology;
   2534
   2535	lockdep_assert_held(&sched_domains_mutex);
   2536
   2537	/* Let the architecture update CPU core mappings: */
   2538	new_topology = arch_update_cpu_topology();
   2539	/* Trigger rebuilding CPU capacity asymmetry data */
   2540	if (new_topology)
   2541		asym_cpu_capacity_scan();
   2542
   2543	if (!doms_new) {
   2544		WARN_ON_ONCE(dattr_new);
   2545		n = 0;
   2546		doms_new = alloc_sched_domains(1);
   2547		if (doms_new) {
   2548			n = 1;
   2549			cpumask_and(doms_new[0], cpu_active_mask,
   2550				    housekeeping_cpumask(HK_TYPE_DOMAIN));
   2551		}
   2552	} else {
   2553		n = ndoms_new;
   2554	}
   2555
   2556	/* Destroy deleted domains: */
   2557	for (i = 0; i < ndoms_cur; i++) {
   2558		for (j = 0; j < n && !new_topology; j++) {
   2559			if (cpumask_equal(doms_cur[i], doms_new[j]) &&
   2560			    dattrs_equal(dattr_cur, i, dattr_new, j)) {
   2561				struct root_domain *rd;
   2562
   2563				/*
   2564				 * This domain won't be destroyed and as such
   2565				 * its dl_bw->total_bw needs to be cleared.  It
   2566				 * will be recomputed in function
   2567				 * update_tasks_root_domain().
   2568				 */
   2569				rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
   2570				dl_clear_root_domain(rd);
   2571				goto match1;
   2572			}
   2573		}
   2574		/* No match - a current sched domain not in new doms_new[] */
   2575		detach_destroy_domains(doms_cur[i]);
   2576match1:
   2577		;
   2578	}
   2579
   2580	n = ndoms_cur;
   2581	if (!doms_new) {
   2582		n = 0;
   2583		doms_new = &fallback_doms;
   2584		cpumask_and(doms_new[0], cpu_active_mask,
   2585			    housekeeping_cpumask(HK_TYPE_DOMAIN));
   2586	}
   2587
   2588	/* Build new domains: */
   2589	for (i = 0; i < ndoms_new; i++) {
   2590		for (j = 0; j < n && !new_topology; j++) {
   2591			if (cpumask_equal(doms_new[i], doms_cur[j]) &&
   2592			    dattrs_equal(dattr_new, i, dattr_cur, j))
   2593				goto match2;
   2594		}
   2595		/* No match - add a new doms_new */
   2596		build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
   2597match2:
   2598		;
   2599	}
   2600
   2601#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
   2602	/* Build perf. domains: */
   2603	for (i = 0; i < ndoms_new; i++) {
   2604		for (j = 0; j < n && !sched_energy_update; j++) {
   2605			if (cpumask_equal(doms_new[i], doms_cur[j]) &&
   2606			    cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) {
   2607				has_eas = true;
   2608				goto match3;
   2609			}
   2610		}
   2611		/* No match - add perf. domains for a new rd */
   2612		has_eas |= build_perf_domains(doms_new[i]);
   2613match3:
   2614		;
   2615	}
   2616	sched_energy_set(has_eas);
   2617#endif
   2618
   2619	/* Remember the new sched domains: */
   2620	if (doms_cur != &fallback_doms)
   2621		free_sched_domains(doms_cur, ndoms_cur);
   2622
   2623	kfree(dattr_cur);
   2624	doms_cur = doms_new;
   2625	dattr_cur = dattr_new;
   2626	ndoms_cur = ndoms_new;
   2627
   2628	update_sched_domain_debugfs();
   2629}
   2630
   2631/*
   2632 * Call with hotplug lock held
   2633 */
   2634void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
   2635			     struct sched_domain_attr *dattr_new)
   2636{
   2637	mutex_lock(&sched_domains_mutex);
   2638	partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
   2639	mutex_unlock(&sched_domains_mutex);
   2640}
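/*
 * Usage sketch (illustrative only): rebuild the domains as two disjoint
 * partitions, one per node. Real callers (e.g. the cpuset code) derive the
 * masks and attributes from their own configuration; the node-based masks
 * and the repartition_per_node() name here are assumptions made for the
 * example.
 */
#if 0
static void repartition_per_node(void)
{
	cpumask_var_t *doms = alloc_sched_domains(2);

	if (!doms)
		return;

	cpumask_copy(doms[0], cpumask_of_node(0));
	cpumask_copy(doms[1], cpumask_of_node(1));
	/* Takes ownership of 'doms' and frees the previous partitioning. */
	partition_sched_domains(2, doms, NULL);
}
#endif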