cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

smpboot.c (46810B)


      1// SPDX-License-Identifier: GPL-2.0-or-later
      2 /*
      3 *	x86 SMP booting functions
      4 *
      5 *	(c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
      6 *	(c) 1998, 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
      7 *	Copyright 2001 Andi Kleen, SuSE Labs.
      8 *
      9 *	Much of the core SMP work is based on previous work by Thomas Radke, to
     10 *	whom a great many thanks are extended.
     11 *
     12 *	Thanks to Intel for making available several different Pentium,
     13 *	Pentium Pro and Pentium-II/Xeon MP machines.
     14 *	Original development of Linux SMP code supported by Caldera.
     15 *
     16 *	Fixes
     17 *		Felix Koop	:	NR_CPUS used properly
     18 *		Jose Renau	:	Handle single CPU case.
     19 *		Alan Cox	:	By repeated request 8) - Total BogoMIPS report.
     20 *		Greg Wright	:	Fix for kernel stacks panic.
     21 *		Erich Boleyn	:	MP v1.4 and additional changes.
     22 *	Matthias Sattler	:	Changes for 2.1 kernel map.
     23 *	Michel Lespinasse	:	Changes for 2.1 kernel map.
     24 *	Michael Chastain	:	Change trampoline.S to gnu as.
     25 *		Alan Cox	:	Dumb bug: 'B' step PPro's are fine
     26 *		Ingo Molnar	:	Added APIC timers, based on code
     27 *					from Jose Renau
     28 *		Ingo Molnar	:	various cleanups and rewrites
     29 *		Tigran Aivazian	:	fixed "0.00 in /proc/uptime on SMP" bug.
     30 *	Maciej W. Rozycki	:	Bits for genuine 82489DX APICs
     31 *	Andi Kleen		:	Changed for SMP boot into long mode.
     32 *		Martin J. Bligh	: 	Added support for multi-quad systems
     33 *		Dave Jones	:	Report invalid combinations of Athlon CPUs.
     34 *		Rusty Russell	:	Hacked into shape for new "hotplug" boot process.
      35 *		Andi Kleen	:	Converted to new state machine.
     36 *	Ashok Raj		: 	CPU hotplug support
     37 *	Glauber Costa		:	i386 and x86_64 integration
     38 */
     39
     40#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
     41
     42#include <linux/init.h>
     43#include <linux/smp.h>
     44#include <linux/export.h>
     45#include <linux/sched.h>
     46#include <linux/sched/topology.h>
     47#include <linux/sched/hotplug.h>
     48#include <linux/sched/task_stack.h>
     49#include <linux/percpu.h>
     50#include <linux/memblock.h>
     51#include <linux/err.h>
     52#include <linux/nmi.h>
     53#include <linux/tboot.h>
     54#include <linux/gfp.h>
     55#include <linux/cpuidle.h>
     56#include <linux/numa.h>
     57#include <linux/pgtable.h>
     58#include <linux/overflow.h>
     59
     60#include <asm/acpi.h>
     61#include <asm/desc.h>
     62#include <asm/nmi.h>
     63#include <asm/irq.h>
     64#include <asm/realmode.h>
     65#include <asm/cpu.h>
     66#include <asm/numa.h>
     67#include <asm/tlbflush.h>
     68#include <asm/mtrr.h>
     69#include <asm/mwait.h>
     70#include <asm/apic.h>
     71#include <asm/io_apic.h>
     72#include <asm/fpu/api.h>
     73#include <asm/setup.h>
     74#include <asm/uv/uv.h>
     75#include <linux/mc146818rtc.h>
     76#include <asm/i8259.h>
     77#include <asm/misc.h>
     78#include <asm/qspinlock.h>
     79#include <asm/intel-family.h>
     80#include <asm/cpu_device_id.h>
     81#include <asm/spec-ctrl.h>
     82#include <asm/hw_irq.h>
     83#include <asm/stackprotector.h>
     84#include <asm/sev.h>
     85
     86/* representing HT siblings of each logical CPU */
     87DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
     88EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
     89
     90/* representing HT and core siblings of each logical CPU */
     91DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
     92EXPORT_PER_CPU_SYMBOL(cpu_core_map);
     93
     94/* representing HT, core, and die siblings of each logical CPU */
     95DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map);
     96EXPORT_PER_CPU_SYMBOL(cpu_die_map);
     97
     98DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
     99
    100DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map);
    101
    102/* Per CPU bogomips and other parameters */
    103DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
    104EXPORT_PER_CPU_SYMBOL(cpu_info);
    105
    106/* Logical package management. We might want to allocate that dynamically */
    107unsigned int __max_logical_packages __read_mostly;
    108EXPORT_SYMBOL(__max_logical_packages);
    109static unsigned int logical_packages __read_mostly;
    110static unsigned int logical_die __read_mostly;
    111
    112/* Maximum number of SMT threads on any online core */
    113int __read_mostly __max_smt_threads = 1;
    114
    115/* Flag to indicate if a complete sched domain rebuild is required */
    116bool x86_topology_update;
    117
    118int arch_update_cpu_topology(void)
    119{
    120	int retval = x86_topology_update;
    121
    122	x86_topology_update = false;
    123	return retval;
    124}
    125
    126static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
    127{
    128	unsigned long flags;
    129
    130	spin_lock_irqsave(&rtc_lock, flags);
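        	/*
        	 * CMOS register 0x0F is the shutdown status byte: 0x0A requests a
        	 * warm reset that jumps through the vector at 0x40:0x67, which the
        	 * stores below point at the SMP trampoline.
        	 */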
    131	CMOS_WRITE(0xa, 0xf);
    132	spin_unlock_irqrestore(&rtc_lock, flags);
    133	*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
    134							start_eip >> 4;
    135	*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
    136							start_eip & 0xf;
    137}
    138
    139static inline void smpboot_restore_warm_reset_vector(void)
    140{
    141	unsigned long flags;
    142
    143	/*
    144	 * Paranoid:  Set warm reset code and vector here back
    145	 * to default values.
    146	 */
    147	spin_lock_irqsave(&rtc_lock, flags);
    148	CMOS_WRITE(0, 0xf);
    149	spin_unlock_irqrestore(&rtc_lock, flags);
    150
    151	*((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
    152}
    153
    154/*
    155 * Report back to the Boot Processor during boot time or to the caller processor
    156 * during CPU online.
    157 */
    158static void smp_callin(void)
    159{
    160	int cpuid;
    161
    162	/*
     163	 * If woken up by an INIT in an 82489DX configuration,
    164	 * cpu_callout_mask guarantees we don't get here before
    165	 * an INIT_deassert IPI reaches our local APIC, so it is
    166	 * now safe to touch our local APIC.
    167	 */
    168	cpuid = smp_processor_id();
    169
    170	/*
     171	 * The boot CPU has finished the init stage and is spinning
     172	 * on cpu_callin_mask until we finish. We are free to set up this
    173	 * CPU, first the APIC. (this is probably redundant on most
    174	 * boards)
    175	 */
    176	apic_ap_setup();
    177
    178	/*
    179	 * Save our processor parameters. Note: this information
    180	 * is needed for clock calibration.
    181	 */
    182	smp_store_cpu_info(cpuid);
    183
    184	/*
    185	 * The topology information must be up to date before
    186	 * calibrate_delay() and notify_cpu_starting().
    187	 */
    188	set_cpu_sibling_map(raw_smp_processor_id());
    189
    190	ap_init_aperfmperf();
    191
    192	/*
    193	 * Get our bogomips.
    194	 * Update loops_per_jiffy in cpu_data. Previous call to
    195	 * smp_store_cpu_info() stored a value that is close but not as
    196	 * accurate as the value just calculated.
    197	 */
    198	calibrate_delay();
    199	cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy;
    200	pr_debug("Stack at about %p\n", &cpuid);
    201
    202	wmb();
    203
    204	notify_cpu_starting(cpuid);
    205
    206	/*
    207	 * Allow the master to continue.
    208	 */
    209	cpumask_set_cpu(cpuid, cpu_callin_mask);
    210}
    211
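        /*
         * State for waking a soft-offlined CPU0: instead of INIT/SIPI (which
         * would run the BIOS boot-strap code), CPU0 is woken by an NMI, see
         * wakeup_cpu_via_init_nmi() and wakeup_cpu0_nmi() below.
         */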
    212static int cpu0_logical_apicid;
    213static int enable_start_cpu0;
    214/*
    215 * Activate a secondary processor.
    216 */
    217static void notrace start_secondary(void *unused)
    218{
    219	/*
    220	 * Don't put *anything* except direct CPU state initialization
     221	 * before cpu_init(); SMP booting is fragile enough that we want to
     222	 * limit the things done here to the bare minimum.
    223	 */
    224	cr4_init();
    225
    226#ifdef CONFIG_X86_32
    227	/* switch away from the initial page table */
    228	load_cr3(swapper_pg_dir);
    229	__flush_tlb_all();
    230#endif
    231	cpu_init_secondary();
    232	rcu_cpu_starting(raw_smp_processor_id());
    233	x86_cpuinit.early_percpu_clock_init();
    234	smp_callin();
    235
    236	enable_start_cpu0 = 0;
    237
    238	/* otherwise gcc will move up smp_processor_id before the cpu_init */
    239	barrier();
    240	/*
    241	 * Check TSC synchronization with the boot CPU:
    242	 */
    243	check_tsc_sync_target();
    244
    245	speculative_store_bypass_ht_init();
    246
    247	/*
    248	 * Lock vector_lock, set CPU online and bring the vector
    249	 * allocator online. Online must be set with vector_lock held
    250	 * to prevent a concurrent irq setup/teardown from seeing a
    251	 * half valid vector space.
    252	 */
    253	lock_vector_lock();
    254	set_cpu_online(smp_processor_id(), true);
    255	lapic_online();
    256	unlock_vector_lock();
    257	cpu_set_state_online(smp_processor_id());
    258	x86_platform.nmi_init();
    259
    260	/* enable local interrupts */
    261	local_irq_enable();
    262
    263	x86_cpuinit.setup_percpu_clockev();
    264
    265	wmb();
    266	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
    267}
    268
    269/**
    270 * topology_is_primary_thread - Check whether CPU is the primary SMT thread
    271 * @cpu:	CPU to check
    272 */
    273bool topology_is_primary_thread(unsigned int cpu)
    274{
    275	return apic_id_is_primary_thread(per_cpu(x86_cpu_to_apicid, cpu));
    276}
    277
    278/**
    279 * topology_smt_supported - Check whether SMT is supported by the CPUs
    280 */
    281bool topology_smt_supported(void)
    282{
    283	return smp_num_siblings > 1;
    284}
    285
    286/**
     287 * topology_phys_to_logical_pkg - Map a physical package id to a logical package id
    288 *
    289 * Returns logical package id or -1 if not found
    290 */
    291int topology_phys_to_logical_pkg(unsigned int phys_pkg)
    292{
    293	int cpu;
    294
    295	for_each_possible_cpu(cpu) {
    296		struct cpuinfo_x86 *c = &cpu_data(cpu);
    297
    298		if (c->initialized && c->phys_proc_id == phys_pkg)
    299			return c->logical_proc_id;
    300	}
    301	return -1;
    302}
    303EXPORT_SYMBOL(topology_phys_to_logical_pkg);
    304/**
     305 * topology_phys_to_logical_die - Map a physical die id to a logical die id
    306 *
    307 * Returns logical die id or -1 if not found
    308 */
    309int topology_phys_to_logical_die(unsigned int die_id, unsigned int cur_cpu)
    310{
    311	int cpu;
    312	int proc_id = cpu_data(cur_cpu).phys_proc_id;
    313
    314	for_each_possible_cpu(cpu) {
    315		struct cpuinfo_x86 *c = &cpu_data(cpu);
    316
    317		if (c->initialized && c->cpu_die_id == die_id &&
    318		    c->phys_proc_id == proc_id)
    319			return c->logical_die_id;
    320	}
    321	return -1;
    322}
    323EXPORT_SYMBOL(topology_phys_to_logical_die);
    324
    325/**
    326 * topology_update_package_map - Update the physical to logical package map
    327 * @pkg:	The physical package id as retrieved via CPUID
    328 * @cpu:	The cpu for which this is updated
    329 */
    330int topology_update_package_map(unsigned int pkg, unsigned int cpu)
    331{
    332	int new;
    333
    334	/* Already available somewhere? */
    335	new = topology_phys_to_logical_pkg(pkg);
    336	if (new >= 0)
    337		goto found;
    338
    339	new = logical_packages++;
    340	if (new != pkg) {
    341		pr_info("CPU %u Converting physical %u to logical package %u\n",
    342			cpu, pkg, new);
    343	}
    344found:
    345	cpu_data(cpu).logical_proc_id = new;
    346	return 0;
    347}
    348/**
    349 * topology_update_die_map - Update the physical to logical die map
    350 * @die:	The die id as retrieved via CPUID
    351 * @cpu:	The cpu for which this is updated
    352 */
    353int topology_update_die_map(unsigned int die, unsigned int cpu)
    354{
    355	int new;
    356
    357	/* Already available somewhere? */
    358	new = topology_phys_to_logical_die(die, cpu);
    359	if (new >= 0)
    360		goto found;
    361
    362	new = logical_die++;
    363	if (new != die) {
    364		pr_info("CPU %u Converting physical %u to logical die %u\n",
    365			cpu, die, new);
    366	}
    367found:
    368	cpu_data(cpu).logical_die_id = new;
    369	return 0;
    370}
    371
    372void __init smp_store_boot_cpu_info(void)
    373{
    374	int id = 0; /* CPU 0 */
    375	struct cpuinfo_x86 *c = &cpu_data(id);
    376
    377	*c = boot_cpu_data;
    378	c->cpu_index = id;
    379	topology_update_package_map(c->phys_proc_id, id);
    380	topology_update_die_map(c->cpu_die_id, id);
    381	c->initialized = true;
    382}
    383
    384/*
    385 * The bootstrap kernel entry code has set these up. Save them for
    386 * a given CPU
    387 */
    388void smp_store_cpu_info(int id)
    389{
    390	struct cpuinfo_x86 *c = &cpu_data(id);
    391
    392	/* Copy boot_cpu_data only on the first bringup */
    393	if (!c->initialized)
    394		*c = boot_cpu_data;
    395	c->cpu_index = id;
    396	/*
    397	 * During boot time, CPU0 has this setup already. Save the info when
    398	 * bringing up AP or offlined CPU0.
    399	 */
    400	identify_secondary_cpu(c);
    401	c->initialized = true;
    402}
    403
    404static bool
    405topology_same_node(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
    406{
    407	int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
    408
    409	return (cpu_to_node(cpu1) == cpu_to_node(cpu2));
    410}
    411
    412static bool
    413topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
    414{
    415	int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
    416
    417	return !WARN_ONCE(!topology_same_node(c, o),
    418		"sched: CPU #%d's %s-sibling CPU #%d is not on the same node! "
    419		"[node: %d != %d]. Ignoring dependency.\n",
    420		cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
    421}
    422
    423#define link_mask(mfunc, c1, c2)					\
    424do {									\
    425	cpumask_set_cpu((c1), mfunc(c2));				\
    426	cpumask_set_cpu((c2), mfunc(c1));				\
    427} while (0)
    428
    429static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
    430{
    431	if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
    432		int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
    433
    434		if (c->phys_proc_id == o->phys_proc_id &&
    435		    c->cpu_die_id == o->cpu_die_id &&
    436		    per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) {
    437			if (c->cpu_core_id == o->cpu_core_id)
    438				return topology_sane(c, o, "smt");
    439
    440			if ((c->cu_id != 0xff) &&
    441			    (o->cu_id != 0xff) &&
    442			    (c->cu_id == o->cu_id))
    443				return topology_sane(c, o, "smt");
    444		}
    445
    446	} else if (c->phys_proc_id == o->phys_proc_id &&
    447		   c->cpu_die_id == o->cpu_die_id &&
    448		   c->cpu_core_id == o->cpu_core_id) {
    449		return topology_sane(c, o, "smt");
    450	}
    451
    452	return false;
    453}
    454
    455static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
    456{
    457	if (c->phys_proc_id == o->phys_proc_id &&
    458	    c->cpu_die_id == o->cpu_die_id)
    459		return true;
    460	return false;
    461}
    462
    463static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
    464{
    465	int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
    466
    467	/* If the arch didn't set up l2c_id, fall back to SMT */
    468	if (per_cpu(cpu_l2c_id, cpu1) == BAD_APICID)
    469		return match_smt(c, o);
    470
    471	/* Do not match if L2 cache id does not match: */
    472	if (per_cpu(cpu_l2c_id, cpu1) != per_cpu(cpu_l2c_id, cpu2))
    473		return false;
    474
    475	return topology_sane(c, o, "l2c");
    476}
    477
    478/*
    479 * Unlike the other levels, we do not enforce keeping a
    480 * multicore group inside a NUMA node.  If this happens, we will
    481 * discard the MC level of the topology later.
    482 */
    483static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
    484{
    485	if (c->phys_proc_id == o->phys_proc_id)
    486		return true;
    487	return false;
    488}
    489
    490/*
    491 * Define intel_cod_cpu[] for Intel COD (Cluster-on-Die) CPUs.
    492 *
    493 * Any Intel CPU that has multiple nodes per package and does not
    494 * match intel_cod_cpu[] has the SNC (Sub-NUMA Cluster) topology.
    495 *
    496 * When in SNC mode, these CPUs enumerate an LLC that is shared
    497 * by multiple NUMA nodes. The LLC is shared for off-package data
    498 * access but private to the NUMA node (half of the package) for
    499 * on-package access. CPUID (the source of the information about
    500 * the LLC) can only enumerate the cache as shared or unshared,
    501 * but not this particular configuration.
    502 */
    503
    504static const struct x86_cpu_id intel_cod_cpu[] = {
    505	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, 0),	/* COD */
    506	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, 0),	/* COD */
    507	X86_MATCH_INTEL_FAM6_MODEL(ANY, 1),		/* SNC */
    508	{}
    509};
    510
    511static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
    512{
    513	const struct x86_cpu_id *id = x86_match_cpu(intel_cod_cpu);
    514	int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
    515	bool intel_snc = id && id->driver_data;
    516
    517	/* Do not match if we do not have a valid APICID for cpu: */
    518	if (per_cpu(cpu_llc_id, cpu1) == BAD_APICID)
    519		return false;
    520
    521	/* Do not match if LLC id does not match: */
    522	if (per_cpu(cpu_llc_id, cpu1) != per_cpu(cpu_llc_id, cpu2))
    523		return false;
    524
    525	/*
    526	 * Allow the SNC topology without warning. Return of false
    527	 * means 'c' does not share the LLC of 'o'. This will be
    528	 * reflected to userspace.
    529	 */
    530	if (match_pkg(c, o) && !topology_same_node(c, o) && intel_snc)
    531		return false;
    532
    533	return topology_sane(c, o, "llc");
    534}
    535
    536
    537#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_CLUSTER) || defined(CONFIG_SCHED_MC)
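        /*
         * ITMT (Intel Turbo Boost Max Technology 3.0) exposes per-core priorities;
         * when enabled via sysctl, add SD_ASYM_PACKING so the scheduler packs
         * work onto the higher-priority cores.
         */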
    538static inline int x86_sched_itmt_flags(void)
    539{
    540	return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0;
    541}
    542
    543#ifdef CONFIG_SCHED_MC
    544static int x86_core_flags(void)
    545{
    546	return cpu_core_flags() | x86_sched_itmt_flags();
    547}
    548#endif
    549#ifdef CONFIG_SCHED_SMT
    550static int x86_smt_flags(void)
    551{
    552	return cpu_smt_flags() | x86_sched_itmt_flags();
    553}
    554#endif
    555#ifdef CONFIG_SCHED_CLUSTER
    556static int x86_cluster_flags(void)
    557{
    558	return cpu_cluster_flags() | x86_sched_itmt_flags();
    559}
    560#endif
    561#endif
    562
    563static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
    564#ifdef CONFIG_SCHED_SMT
    565	{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
    566#endif
    567#ifdef CONFIG_SCHED_CLUSTER
    568	{ cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) },
    569#endif
    570#ifdef CONFIG_SCHED_MC
    571	{ cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
    572#endif
    573	{ NULL, },
    574};
    575
    576static struct sched_domain_topology_level x86_hybrid_topology[] = {
    577#ifdef CONFIG_SCHED_SMT
    578	{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
    579#endif
    580#ifdef CONFIG_SCHED_MC
    581	{ cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
    582#endif
    583	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
    584	{ NULL, },
    585};
    586
    587static struct sched_domain_topology_level x86_topology[] = {
    588#ifdef CONFIG_SCHED_SMT
    589	{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
    590#endif
    591#ifdef CONFIG_SCHED_CLUSTER
    592	{ cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) },
    593#endif
    594#ifdef CONFIG_SCHED_MC
    595	{ cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
    596#endif
    597	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
    598	{ NULL, },
    599};
    600
    601/*
    602 * Set if a package/die has multiple NUMA nodes inside.
    603 * AMD Magny-Cours, Intel Cluster-on-Die, and Intel
    604 * Sub-NUMA Clustering have this.
    605 */
    606static bool x86_has_numa_in_package;
    607
    608void set_cpu_sibling_map(int cpu)
    609{
    610	bool has_smt = smp_num_siblings > 1;
    611	bool has_mp = has_smt || boot_cpu_data.x86_max_cores > 1;
    612	struct cpuinfo_x86 *c = &cpu_data(cpu);
    613	struct cpuinfo_x86 *o;
    614	int i, threads;
    615
    616	cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
    617
    618	if (!has_mp) {
    619		cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu));
    620		cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
    621		cpumask_set_cpu(cpu, cpu_l2c_shared_mask(cpu));
    622		cpumask_set_cpu(cpu, topology_core_cpumask(cpu));
    623		cpumask_set_cpu(cpu, topology_die_cpumask(cpu));
    624		c->booted_cores = 1;
    625		return;
    626	}
    627
    628	for_each_cpu(i, cpu_sibling_setup_mask) {
    629		o = &cpu_data(i);
    630
    631		if (match_pkg(c, o) && !topology_same_node(c, o))
    632			x86_has_numa_in_package = true;
    633
    634		if ((i == cpu) || (has_smt && match_smt(c, o)))
    635			link_mask(topology_sibling_cpumask, cpu, i);
    636
    637		if ((i == cpu) || (has_mp && match_llc(c, o)))
    638			link_mask(cpu_llc_shared_mask, cpu, i);
    639
    640		if ((i == cpu) || (has_mp && match_l2c(c, o)))
    641			link_mask(cpu_l2c_shared_mask, cpu, i);
    642
    643		if ((i == cpu) || (has_mp && match_die(c, o)))
    644			link_mask(topology_die_cpumask, cpu, i);
    645	}
    646
    647	threads = cpumask_weight(topology_sibling_cpumask(cpu));
    648	if (threads > __max_smt_threads)
    649		__max_smt_threads = threads;
    650
    651	for_each_cpu(i, topology_sibling_cpumask(cpu))
    652		cpu_data(i).smt_active = threads > 1;
    653
    654	/*
    655	 * This needs a separate iteration over the cpus because we rely on all
    656	 * topology_sibling_cpumask links to be set-up.
    657	 */
    658	for_each_cpu(i, cpu_sibling_setup_mask) {
    659		o = &cpu_data(i);
    660
    661		if ((i == cpu) || (has_mp && match_pkg(c, o))) {
    662			link_mask(topology_core_cpumask, cpu, i);
    663
    664			/*
     665			 *  Does this new cpu bring up a new core?
    666			 */
    667			if (threads == 1) {
    668				/*
    669				 * for each core in package, increment
    670				 * the booted_cores for this new cpu
    671				 */
    672				if (cpumask_first(
    673				    topology_sibling_cpumask(i)) == i)
    674					c->booted_cores++;
    675				/*
    676				 * increment the core count for all
    677				 * the other cpus in this package
    678				 */
    679				if (i != cpu)
    680					cpu_data(i).booted_cores++;
    681			} else if (i != cpu && !c->booted_cores)
    682				c->booted_cores = cpu_data(i).booted_cores;
    683		}
    684	}
    685}
    686
    687/* maps the cpu to the sched domain representing multi-core */
    688const struct cpumask *cpu_coregroup_mask(int cpu)
    689{
    690	return cpu_llc_shared_mask(cpu);
    691}
    692
    693const struct cpumask *cpu_clustergroup_mask(int cpu)
    694{
    695	return cpu_l2c_shared_mask(cpu);
    696}
    697
    698static void impress_friends(void)
    699{
    700	int cpu;
    701	unsigned long bogosum = 0;
    702	/*
    703	 * Allow the user to impress friends.
    704	 */
    705	pr_debug("Before bogomips\n");
    706	for_each_possible_cpu(cpu)
    707		if (cpumask_test_cpu(cpu, cpu_callout_mask))
    708			bogosum += cpu_data(cpu).loops_per_jiffy;
    709	pr_info("Total of %d processors activated (%lu.%02lu BogoMIPS)\n",
    710		num_online_cpus(),
    711		bogosum/(500000/HZ),
    712		(bogosum/(5000/HZ))%100);
    713
    714	pr_debug("Before bogocount - setting activated=1\n");
    715}
    716
    717void __inquire_remote_apic(int apicid)
    718{
    719	unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
    720	const char * const names[] = { "ID", "VERSION", "SPIV" };
    721	int timeout;
    722	u32 status;
    723
    724	pr_info("Inquiring remote APIC 0x%x...\n", apicid);
    725
    726	for (i = 0; i < ARRAY_SIZE(regs); i++) {
    727		pr_info("... APIC 0x%x %s: ", apicid, names[i]);
    728
    729		/*
    730		 * Wait for idle.
    731		 */
    732		status = safe_apic_wait_icr_idle();
    733		if (status)
    734			pr_cont("a previous APIC delivery may have failed\n");
    735
    736		apic_icr_write(APIC_DM_REMRD | regs[i], apicid);
    737
    738		timeout = 0;
    739		do {
    740			udelay(100);
    741			status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
    742		} while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
    743
    744		switch (status) {
    745		case APIC_ICR_RR_VALID:
    746			status = apic_read(APIC_RRR);
    747			pr_cont("%08x\n", status);
    748			break;
    749		default:
    750			pr_cont("failed\n");
    751		}
    752	}
    753}
    754
    755/*
    756 * The Multiprocessor Specification 1.4 (1997) example code suggests
    757 * that there should be a 10ms delay between the BSP asserting INIT
    758 * and de-asserting INIT, when starting a remote processor.
    759 * But that slows boot and resume on modern processors, which include
    760 * many cores and don't require that delay.
    761 *
     762 * Cmdline "cpu_init_udelay=" is available to override this delay.
    763 * Modern processor families are quirked to remove the delay entirely.
    764 */
    765#define UDELAY_10MS_DEFAULT 10000
    766
    767static unsigned int init_udelay = UINT_MAX;
    768
    769static int __init cpu_init_udelay(char *str)
    770{
    771	get_option(&str, &init_udelay);
    772
    773	return 0;
    774}
    775early_param("cpu_init_udelay", cpu_init_udelay);
    776
    777static void __init smp_quirk_init_udelay(void)
    778{
    779	/* if cmdline changed it from default, leave it alone */
    780	if (init_udelay != UINT_MAX)
    781		return;
    782
    783	/* if modern processor, use no delay */
    784	if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) ||
    785	    ((boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) && (boot_cpu_data.x86 >= 0x18)) ||
    786	    ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) {
    787		init_udelay = 0;
    788		return;
    789	}
    790	/* else, use legacy delay */
    791	init_udelay = UDELAY_10MS_DEFAULT;
    792}
    793
    794/*
    795 * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
    796 * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
    797 * won't ... remember to clear down the APIC, etc later.
    798 */
    799int
    800wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip)
    801{
    802	u32 dm = apic->dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL;
    803	unsigned long send_status, accept_status = 0;
    804	int maxlvt;
    805
    806	/* Target chip */
    807	/* Boot on the stack */
    808	/* Kick the second */
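        	/*
        	 * start_eip is unused for NMI wakeup: the soft-offlined CPU0 resumes
        	 * in its NMI handler and re-enters via start_cpu0(), see
        	 * cond_wakeup_cpu0().
        	 */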
    809	apic_icr_write(APIC_DM_NMI | dm, apicid);
    810
    811	pr_debug("Waiting for send to finish...\n");
    812	send_status = safe_apic_wait_icr_idle();
    813
    814	/*
    815	 * Give the other CPU some time to accept the IPI.
    816	 */
    817	udelay(200);
    818	if (APIC_INTEGRATED(boot_cpu_apic_version)) {
    819		maxlvt = lapic_get_maxlvt();
    820		if (maxlvt > 3)			/* Due to the Pentium erratum 3AP.  */
    821			apic_write(APIC_ESR, 0);
    822		accept_status = (apic_read(APIC_ESR) & 0xEF);
    823	}
    824	pr_debug("NMI sent\n");
    825
    826	if (send_status)
    827		pr_err("APIC never delivered???\n");
    828	if (accept_status)
    829		pr_err("APIC delivery error (%lx)\n", accept_status);
    830
    831	return (send_status | accept_status);
    832}
    833
    834static int
    835wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
    836{
    837	unsigned long send_status = 0, accept_status = 0;
    838	int maxlvt, num_starts, j;
    839
    840	maxlvt = lapic_get_maxlvt();
    841
    842	/*
    843	 * Be paranoid about clearing APIC errors.
    844	 */
    845	if (APIC_INTEGRATED(boot_cpu_apic_version)) {
    846		if (maxlvt > 3)		/* Due to the Pentium erratum 3AP.  */
    847			apic_write(APIC_ESR, 0);
    848		apic_read(APIC_ESR);
    849	}
    850
    851	pr_debug("Asserting INIT\n");
    852
    853	/*
    854	 * Turn INIT on target chip
    855	 */
    856	/*
    857	 * Send IPI
    858	 */
    859	apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT,
    860		       phys_apicid);
    861
    862	pr_debug("Waiting for send to finish...\n");
    863	send_status = safe_apic_wait_icr_idle();
    864
    865	udelay(init_udelay);
    866
    867	pr_debug("Deasserting INIT\n");
    868
    869	/* Target chip */
    870	/* Send IPI */
    871	apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);
    872
    873	pr_debug("Waiting for send to finish...\n");
    874	send_status = safe_apic_wait_icr_idle();
    875
    876	mb();
    877
    878	/*
    879	 * Should we send STARTUP IPIs ?
    880	 *
    881	 * Determine this based on the APIC version.
    882	 * If we don't have an integrated APIC, don't send the STARTUP IPIs.
    883	 */
    884	if (APIC_INTEGRATED(boot_cpu_apic_version))
    885		num_starts = 2;
    886	else
    887		num_starts = 0;
    888
    889	/*
    890	 * Run STARTUP IPI loop.
    891	 */
    892	pr_debug("#startup loops: %d\n", num_starts);
    893
    894	for (j = 1; j <= num_starts; j++) {
    895		pr_debug("Sending STARTUP #%d\n", j);
    896		if (maxlvt > 3)		/* Due to the Pentium erratum 3AP.  */
    897			apic_write(APIC_ESR, 0);
    898		apic_read(APIC_ESR);
    899		pr_debug("After apic_write\n");
    900
    901		/*
    902		 * STARTUP IPI
    903		 */
    904
    905		/* Target chip */
    906		/* Boot on the stack */
    907		/* Kick the second */
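        		/*
        		 * The STARTUP IPI vector field is the physical page number of the
        		 * trampoline, so start_eip must be 4K-aligned and below 1MB.
        		 */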
    908		apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12),
    909			       phys_apicid);
    910
    911		/*
    912		 * Give the other CPU some time to accept the IPI.
    913		 */
    914		if (init_udelay == 0)
    915			udelay(10);
    916		else
    917			udelay(300);
    918
    919		pr_debug("Startup point 1\n");
    920
    921		pr_debug("Waiting for send to finish...\n");
    922		send_status = safe_apic_wait_icr_idle();
    923
    924		/*
    925		 * Give the other CPU some time to accept the IPI.
    926		 */
    927		if (init_udelay == 0)
    928			udelay(10);
    929		else
    930			udelay(200);
    931
    932		if (maxlvt > 3)		/* Due to the Pentium erratum 3AP.  */
    933			apic_write(APIC_ESR, 0);
    934		accept_status = (apic_read(APIC_ESR) & 0xEF);
    935		if (send_status || accept_status)
    936			break;
    937	}
    938	pr_debug("After Startup\n");
    939
    940	if (send_status)
    941		pr_err("APIC never delivered???\n");
    942	if (accept_status)
    943		pr_err("APIC delivery error (%lx)\n", accept_status);
    944
    945	return (send_status | accept_status);
    946}
    947
    948/* reduce the number of lines printed when booting a large cpu count system */
    949static void announce_cpu(int cpu, int apicid)
    950{
    951	static int current_node = NUMA_NO_NODE;
    952	int node = early_cpu_to_node(cpu);
    953	static int width, node_width;
    954
    955	if (!width)
    956		width = num_digits(num_possible_cpus()) + 1; /* + '#' sign */
    957
    958	if (!node_width)
    959		node_width = num_digits(num_possible_nodes()) + 1; /* + '#' */
    960
    961	if (cpu == 1)
    962		printk(KERN_INFO "x86: Booting SMP configuration:\n");
    963
    964	if (system_state < SYSTEM_RUNNING) {
    965		if (node != current_node) {
    966			if (current_node > (-1))
    967				pr_cont("\n");
    968			current_node = node;
    969
    970			printk(KERN_INFO ".... node %*s#%d, CPUs:  ",
    971			       node_width - num_digits(node), " ", node);
    972		}
    973
    974		/* Add padding for the BSP */
    975		if (cpu == 1)
    976			pr_cont("%*s", width + 1, " ");
    977
    978		pr_cont("%*s#%d", width - num_digits(cpu), " ", cpu);
    979
    980	} else
    981		pr_info("Booting Node %d Processor %d APIC 0x%x\n",
    982			node, cpu, apicid);
    983}
    984
    985static int wakeup_cpu0_nmi(unsigned int cmd, struct pt_regs *regs)
    986{
    987	int cpu;
    988
    989	cpu = smp_processor_id();
    990	if (cpu == 0 && !cpu_online(cpu) && enable_start_cpu0)
    991		return NMI_HANDLED;
    992
    993	return NMI_DONE;
    994}
    995
    996/*
    997 * Wake up AP by INIT, INIT, STARTUP sequence.
    998 *
    999 * Instead of waiting for STARTUP after INITs, BSP will execute the BIOS
    1000 * boot-strap code, which is not the desired behavior when waking up the BSP. To
    1001 * avoid the boot-strap code, wake up CPU0 by NMI instead.
   1002 *
   1003 * This works to wake up soft offlined CPU0 only. If CPU0 is hard offlined
   1004 * (i.e. physically hot removed and then hot added), NMI won't wake it up.
   1005 * We'll change this code in the future to wake up hard offlined CPU0 if
   1006 * real platform and request are available.
   1007 */
   1008static int
   1009wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid,
   1010	       int *cpu0_nmi_registered)
   1011{
   1012	int id;
   1013	int boot_error;
   1014
   1015	preempt_disable();
   1016
   1017	/*
   1018	 * Wake up AP by INIT, INIT, STARTUP sequence.
   1019	 */
   1020	if (cpu) {
   1021		boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip);
   1022		goto out;
   1023	}
   1024
   1025	/*
   1026	 * Wake up BSP by nmi.
   1027	 *
   1028	 * Register a NMI handler to help wake up CPU0.
   1029	 */
   1030	boot_error = register_nmi_handler(NMI_LOCAL,
   1031					  wakeup_cpu0_nmi, 0, "wake_cpu0");
   1032
   1033	if (!boot_error) {
   1034		enable_start_cpu0 = 1;
   1035		*cpu0_nmi_registered = 1;
   1036		id = apic->dest_mode_logical ? cpu0_logical_apicid : apicid;
   1037		boot_error = wakeup_secondary_cpu_via_nmi(id, start_ip);
   1038	}
   1039
   1040out:
   1041	preempt_enable();
   1042
   1043	return boot_error;
   1044}
   1045
   1046int common_cpu_up(unsigned int cpu, struct task_struct *idle)
   1047{
   1048	int ret;
   1049
   1050	/* Just in case we booted with a single CPU. */
   1051	alternatives_enable_smp();
   1052
   1053	per_cpu(current_task, cpu) = idle;
   1054	cpu_init_stack_canary(cpu, idle);
   1055
   1056	/* Initialize the interrupt stack(s) */
   1057	ret = irq_init_percpu_irqstack(cpu);
   1058	if (ret)
   1059		return ret;
   1060
   1061#ifdef CONFIG_X86_32
   1062	/* Stack for startup_32 can be just as for start_secondary onwards */
   1063	per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle);
   1064#else
   1065	initial_gs = per_cpu_offset(cpu);
   1066#endif
   1067	return 0;
   1068}
   1069
   1070/*
   1071 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
    1072 * (i.e. clustered apic addressing mode), this is a LOGICAL apic ID.
   1073 * Returns zero if CPU booted OK, else error code from
   1074 * ->wakeup_secondary_cpu.
   1075 */
   1076static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
   1077		       int *cpu0_nmi_registered)
   1078{
   1079	/* start_ip had better be page-aligned! */
   1080	unsigned long start_ip = real_mode_header->trampoline_start;
   1081
   1082	unsigned long boot_error = 0;
   1083	unsigned long timeout;
   1084
   1085#ifdef CONFIG_X86_64
   1086	/* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */
   1087	if (apic->wakeup_secondary_cpu_64)
   1088		start_ip = real_mode_header->trampoline_start64;
   1089#endif
   1090	idle->thread.sp = (unsigned long)task_pt_regs(idle);
   1091	early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
   1092	initial_code = (unsigned long)start_secondary;
   1093	initial_stack  = idle->thread.sp;
   1094
   1095	/* Enable the espfix hack for this CPU */
   1096	init_espfix_ap(cpu);
   1097
   1098	/* So we see what's up */
   1099	announce_cpu(cpu, apicid);
   1100
   1101	/*
   1102	 * This grunge runs the startup process for
   1103	 * the targeted processor.
   1104	 */
   1105
   1106	if (x86_platform.legacy.warm_reset) {
   1107
   1108		pr_debug("Setting warm reset code and vector.\n");
   1109
   1110		smpboot_setup_warm_reset_vector(start_ip);
   1111		/*
   1112		 * Be paranoid about clearing APIC errors.
   1113		*/
   1114		if (APIC_INTEGRATED(boot_cpu_apic_version)) {
   1115			apic_write(APIC_ESR, 0);
   1116			apic_read(APIC_ESR);
   1117		}
   1118	}
   1119
   1120	/*
   1121	 * AP might wait on cpu_callout_mask in cpu_init() with
   1122	 * cpu_initialized_mask set if previous attempt to online
   1123	 * it timed-out. Clear cpu_initialized_mask so that after
   1124	 * INIT/SIPI it could start with a clean state.
   1125	 */
   1126	cpumask_clear_cpu(cpu, cpu_initialized_mask);
   1127	smp_mb();
   1128
   1129	/*
    1130	 * Wake up a CPU in different cases:
    1131	 * - Use a method from the APIC driver if one is defined, with wakeup
   1132	 *   straight to 64-bit mode preferred over wakeup to RM.
   1133	 * Otherwise,
   1134	 * - Use an INIT boot APIC message for APs or NMI for BSP.
   1135	 */
   1136	if (apic->wakeup_secondary_cpu_64)
   1137		boot_error = apic->wakeup_secondary_cpu_64(apicid, start_ip);
   1138	else if (apic->wakeup_secondary_cpu)
   1139		boot_error = apic->wakeup_secondary_cpu(apicid, start_ip);
   1140	else
   1141		boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid,
   1142						     cpu0_nmi_registered);
   1143
   1144	if (!boot_error) {
   1145		/*
   1146		 * Wait 10s total for first sign of life from AP
   1147		 */
   1148		boot_error = -1;
   1149		timeout = jiffies + 10*HZ;
   1150		while (time_before(jiffies, timeout)) {
   1151			if (cpumask_test_cpu(cpu, cpu_initialized_mask)) {
   1152				/*
   1153				 * Tell AP to proceed with initialization
   1154				 */
   1155				cpumask_set_cpu(cpu, cpu_callout_mask);
   1156				boot_error = 0;
   1157				break;
   1158			}
   1159			schedule();
   1160		}
   1161	}
   1162
   1163	if (!boot_error) {
   1164		/*
   1165		 * Wait till AP completes initial initialization
   1166		 */
   1167		while (!cpumask_test_cpu(cpu, cpu_callin_mask)) {
   1168			/*
   1169			 * Allow other tasks to run while we wait for the
   1170			 * AP to come online. This also gives a chance
    1171			 * for the MTRR work (triggered by the AP coming online)
   1172			 * to be completed in the stop machine context.
   1173			 */
   1174			schedule();
   1175		}
   1176	}
   1177
   1178	if (x86_platform.legacy.warm_reset) {
   1179		/*
   1180		 * Cleanup possible dangling ends...
   1181		 */
   1182		smpboot_restore_warm_reset_vector();
   1183	}
   1184
   1185	return boot_error;
   1186}
   1187
   1188int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
   1189{
   1190	int apicid = apic->cpu_present_to_apicid(cpu);
   1191	int cpu0_nmi_registered = 0;
   1192	unsigned long flags;
   1193	int err, ret = 0;
   1194
   1195	lockdep_assert_irqs_enabled();
   1196
   1197	pr_debug("++++++++++++++++++++=_---CPU UP  %u\n", cpu);
   1198
   1199	if (apicid == BAD_APICID ||
   1200	    !physid_isset(apicid, phys_cpu_present_map) ||
   1201	    !apic->apic_id_valid(apicid)) {
   1202		pr_err("%s: bad cpu %d\n", __func__, cpu);
   1203		return -EINVAL;
   1204	}
   1205
   1206	/*
   1207	 * Already booted CPU?
   1208	 */
   1209	if (cpumask_test_cpu(cpu, cpu_callin_mask)) {
   1210		pr_debug("do_boot_cpu %d Already started\n", cpu);
   1211		return -ENOSYS;
   1212	}
   1213
   1214	/*
   1215	 * Save current MTRR state in case it was changed since early boot
   1216	 * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
   1217	 */
   1218	mtrr_save_state();
   1219
   1220	/* x86 CPUs take themselves offline, so delayed offline is OK. */
   1221	err = cpu_check_up_prepare(cpu);
   1222	if (err && err != -EBUSY)
   1223		return err;
   1224
   1225	/* the FPU context is blank, nobody can own it */
   1226	per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL;
   1227
   1228	err = common_cpu_up(cpu, tidle);
   1229	if (err)
   1230		return err;
   1231
   1232	err = do_boot_cpu(apicid, cpu, tidle, &cpu0_nmi_registered);
   1233	if (err) {
   1234		pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
   1235		ret = -EIO;
   1236		goto unreg_nmi;
   1237	}
   1238
   1239	/*
   1240	 * Check TSC synchronization with the AP (keep irqs disabled
   1241	 * while doing so):
   1242	 */
   1243	local_irq_save(flags);
   1244	check_tsc_sync_source(cpu);
   1245	local_irq_restore(flags);
   1246
   1247	while (!cpu_online(cpu)) {
   1248		cpu_relax();
   1249		touch_nmi_watchdog();
   1250	}
   1251
   1252unreg_nmi:
   1253	/*
   1254	 * Clean up the nmi handler. Do this after the callin and callout sync
   1255	 * to avoid impact of possible long unregister time.
   1256	 */
   1257	if (cpu0_nmi_registered)
   1258		unregister_nmi_handler(NMI_LOCAL, "wake_cpu0");
   1259
   1260	return ret;
   1261}
   1262
   1263/**
   1264 * arch_disable_smp_support() - disables SMP support for x86 at runtime
   1265 */
   1266void arch_disable_smp_support(void)
   1267{
   1268	disable_ioapic_support();
   1269}
   1270
   1271/*
   1272 * Fall back to non SMP mode after errors.
   1273 *
   1274 * RED-PEN audit/test this more. I bet there is more state messed up here.
   1275 */
   1276static __init void disable_smp(void)
   1277{
   1278	pr_info("SMP disabled\n");
   1279
   1280	disable_ioapic_support();
   1281
   1282	init_cpu_present(cpumask_of(0));
   1283	init_cpu_possible(cpumask_of(0));
   1284
   1285	if (smp_found_config)
   1286		physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
   1287	else
   1288		physid_set_mask_of_physid(0, &phys_cpu_present_map);
   1289	cpumask_set_cpu(0, topology_sibling_cpumask(0));
   1290	cpumask_set_cpu(0, topology_core_cpumask(0));
   1291	cpumask_set_cpu(0, topology_die_cpumask(0));
   1292}
   1293
   1294/*
   1295 * Various sanity checks.
   1296 */
   1297static void __init smp_sanity_check(void)
   1298{
   1299	preempt_disable();
   1300
   1301#if !defined(CONFIG_X86_BIGSMP) && defined(CONFIG_X86_32)
   1302	if (def_to_bigsmp && nr_cpu_ids > 8) {
   1303		unsigned int cpu;
   1304		unsigned nr;
   1305
   1306		pr_warn("More than 8 CPUs detected - skipping them\n"
   1307			"Use CONFIG_X86_BIGSMP\n");
   1308
   1309		nr = 0;
   1310		for_each_present_cpu(cpu) {
   1311			if (nr >= 8)
   1312				set_cpu_present(cpu, false);
   1313			nr++;
   1314		}
   1315
   1316		nr = 0;
   1317		for_each_possible_cpu(cpu) {
   1318			if (nr >= 8)
   1319				set_cpu_possible(cpu, false);
   1320			nr++;
   1321		}
   1322
   1323		nr_cpu_ids = 8;
   1324	}
   1325#endif
   1326
   1327	if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
   1328		pr_warn("weird, boot CPU (#%d) not listed by the BIOS\n",
   1329			hard_smp_processor_id());
   1330
   1331		physid_set(hard_smp_processor_id(), phys_cpu_present_map);
   1332	}
   1333
   1334	/*
   1335	 * Should not be necessary because the MP table should list the boot
   1336	 * CPU too, but we do it for the sake of robustness anyway.
   1337	 */
   1338	if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) {
   1339		pr_notice("weird, boot CPU (#%d) not listed by the BIOS\n",
   1340			  boot_cpu_physical_apicid);
   1341		physid_set(hard_smp_processor_id(), phys_cpu_present_map);
   1342	}
   1343	preempt_enable();
   1344}
   1345
   1346static void __init smp_cpu_index_default(void)
   1347{
   1348	int i;
   1349	struct cpuinfo_x86 *c;
   1350
   1351	for_each_possible_cpu(i) {
   1352		c = &cpu_data(i);
   1353		/* mark all to hotplug */
   1354		c->cpu_index = nr_cpu_ids;
   1355	}
   1356}
   1357
   1358static void __init smp_get_logical_apicid(void)
   1359{
   1360	if (x2apic_mode)
   1361		cpu0_logical_apicid = apic_read(APIC_LDR);
   1362	else
   1363		cpu0_logical_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
   1364}
   1365
   1366void __init smp_prepare_cpus_common(void)
   1367{
   1368	unsigned int i;
   1369
   1370	smp_cpu_index_default();
   1371
   1372	/*
   1373	 * Setup boot CPU information
   1374	 */
   1375	smp_store_boot_cpu_info(); /* Final full version of the data */
   1376	cpumask_copy(cpu_callin_mask, cpumask_of(0));
   1377	mb();
   1378
   1379	for_each_possible_cpu(i) {
   1380		zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
   1381		zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
   1382		zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL);
   1383		zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
   1384		zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL);
   1385	}
   1386
   1387	/*
   1388	 * Set 'default' x86 topology, this matches default_topology() in that
   1389	 * it has NUMA nodes as a topology level. See also
   1390	 * native_smp_cpus_done().
   1391	 *
    1392	 * Must be done before set_cpu_sibling_map() is run.
   1393	 */
   1394	set_sched_topology(x86_topology);
   1395
   1396	set_cpu_sibling_map(0);
   1397}
   1398
   1399/*
   1400 * Prepare for SMP bootup.
    1401 * @max_cpus: configured maximum number of CPUs. It is a legacy parameter
   1402 *            for common interface support.
   1403 */
   1404void __init native_smp_prepare_cpus(unsigned int max_cpus)
   1405{
   1406	smp_prepare_cpus_common();
   1407
   1408	smp_sanity_check();
   1409
   1410	switch (apic_intr_mode) {
   1411	case APIC_PIC:
   1412	case APIC_VIRTUAL_WIRE_NO_CONFIG:
   1413		disable_smp();
   1414		return;
   1415	case APIC_SYMMETRIC_IO_NO_ROUTING:
   1416		disable_smp();
   1417		/* Setup local timer */
   1418		x86_init.timers.setup_percpu_clockev();
   1419		return;
   1420	case APIC_VIRTUAL_WIRE:
   1421	case APIC_SYMMETRIC_IO:
   1422		break;
   1423	}
   1424
   1425	/* Setup local timer */
   1426	x86_init.timers.setup_percpu_clockev();
   1427
   1428	smp_get_logical_apicid();
   1429
   1430	pr_info("CPU0: ");
   1431	print_cpu_info(&cpu_data(0));
   1432
   1433	uv_system_init();
   1434
   1435	set_mtrr_aps_delayed_init();
   1436
   1437	smp_quirk_init_udelay();
   1438
   1439	speculative_store_bypass_ht_init();
   1440
   1441	snp_set_wakeup_secondary_cpu();
   1442}
   1443
   1444void arch_thaw_secondary_cpus_begin(void)
   1445{
   1446	set_mtrr_aps_delayed_init();
   1447}
   1448
   1449void arch_thaw_secondary_cpus_end(void)
   1450{
   1451	mtrr_aps_init();
   1452}
   1453
   1454/*
   1455 * Early setup to make printk work.
   1456 */
   1457void __init native_smp_prepare_boot_cpu(void)
   1458{
   1459	int me = smp_processor_id();
   1460	switch_to_new_gdt(me);
   1461	/* already set me in cpu_online_mask in boot_cpu_init() */
   1462	cpumask_set_cpu(me, cpu_callout_mask);
   1463	cpu_set_state_online(me);
   1464	native_pv_lock_init();
   1465}
   1466
   1467void __init calculate_max_logical_packages(void)
   1468{
   1469	int ncpus;
   1470
   1471	/*
   1472	 * Today neither Intel nor AMD support heterogeneous systems so
   1473	 * extrapolate the boot cpu's data to all packages.
   1474	 */
   1475	ncpus = cpu_data(0).booted_cores * topology_max_smt_threads();
   1476	__max_logical_packages = DIV_ROUND_UP(total_cpus, ncpus);
   1477	pr_info("Max logical packages: %u\n", __max_logical_packages);
   1478}
   1479
   1480void __init native_smp_cpus_done(unsigned int max_cpus)
   1481{
   1482	pr_debug("Boot done\n");
   1483
   1484	calculate_max_logical_packages();
   1485
   1486	/* XXX for now assume numa-in-package and hybrid don't overlap */
   1487	if (x86_has_numa_in_package)
   1488		set_sched_topology(x86_numa_in_package_topology);
   1489	if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
   1490		set_sched_topology(x86_hybrid_topology);
   1491
   1492	nmi_selftest();
   1493	impress_friends();
   1494	mtrr_aps_init();
   1495}
   1496
   1497static int __initdata setup_possible_cpus = -1;
   1498static int __init _setup_possible_cpus(char *str)
   1499{
   1500	get_option(&str, &setup_possible_cpus);
   1501	return 0;
   1502}
   1503early_param("possible_cpus", _setup_possible_cpus);
   1504
   1505
   1506/*
    1507 * cpu_possible_mask should be static: it cannot change as CPUs
    1508 * are onlined or offlined. The reason is that per-cpu data structures
    1509 * are allocated by some modules at init time, and they don't expect to
    1510 * handle this dynamically on cpu arrival/departure.
   1511 * cpu_present_mask on the other hand can change dynamically.
    1512 * If CPU hotplug is not compiled in, we fall back to the current
    1513 * behaviour, which is cpu_possible == cpu_present.
   1514 * - Ashok Raj
   1515 *
   1516 * Three ways to find out the number of additional hotplug CPUs:
   1517 * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
   1518 * - The user can overwrite it with possible_cpus=NUM
   1519 * - Otherwise don't reserve additional CPUs.
   1520 * We do this because additional CPUs waste a lot of memory.
   1521 * -AK
   1522 */
   1523__init void prefill_possible_map(void)
   1524{
   1525	int i, possible;
   1526
   1527	/* No boot processor was found in mptable or ACPI MADT */
   1528	if (!num_processors) {
   1529		if (boot_cpu_has(X86_FEATURE_APIC)) {
   1530			int apicid = boot_cpu_physical_apicid;
   1531			int cpu = hard_smp_processor_id();
   1532
   1533			pr_warn("Boot CPU (id %d) not listed by BIOS\n", cpu);
   1534
   1535			/* Make sure boot cpu is enumerated */
   1536			if (apic->cpu_present_to_apicid(0) == BAD_APICID &&
   1537			    apic->apic_id_valid(apicid))
   1538				generic_processor_info(apicid, boot_cpu_apic_version);
   1539		}
   1540
   1541		if (!num_processors)
   1542			num_processors = 1;
   1543	}
   1544
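        	/* Limit derived from "maxcpus="; treat maxcpus=0 as a single CPU */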
   1545	i = setup_max_cpus ?: 1;
   1546	if (setup_possible_cpus == -1) {
   1547		possible = num_processors;
   1548#ifdef CONFIG_HOTPLUG_CPU
   1549		if (setup_max_cpus)
   1550			possible += disabled_cpus;
   1551#else
   1552		if (possible > i)
   1553			possible = i;
   1554#endif
   1555	} else
   1556		possible = setup_possible_cpus;
   1557
   1558	total_cpus = max_t(int, possible, num_processors + disabled_cpus);
   1559
   1560	/* nr_cpu_ids could be reduced via nr_cpus= */
   1561	if (possible > nr_cpu_ids) {
   1562		pr_warn("%d Processors exceeds NR_CPUS limit of %u\n",
   1563			possible, nr_cpu_ids);
   1564		possible = nr_cpu_ids;
   1565	}
   1566
   1567#ifdef CONFIG_HOTPLUG_CPU
   1568	if (!setup_max_cpus)
   1569#endif
   1570	if (possible > i) {
   1571		pr_warn("%d Processors exceeds max_cpus limit of %u\n",
   1572			possible, setup_max_cpus);
   1573		possible = i;
   1574	}
   1575
   1576	nr_cpu_ids = possible;
   1577
   1578	pr_info("Allowing %d CPUs, %d hotplug CPUs\n",
   1579		possible, max_t(int, possible - num_processors, 0));
   1580
   1581	reset_cpu_possible_mask();
   1582
   1583	for (i = 0; i < possible; i++)
   1584		set_cpu_possible(i, true);
   1585}
   1586
   1587#ifdef CONFIG_HOTPLUG_CPU
   1588
    1589/* Recompute SMT state for all CPUs when one goes offline */
   1590static void recompute_smt_state(void)
   1591{
   1592	int max_threads, cpu;
   1593
   1594	max_threads = 0;
   1595	for_each_online_cpu (cpu) {
   1596		int threads = cpumask_weight(topology_sibling_cpumask(cpu));
   1597
   1598		if (threads > max_threads)
   1599			max_threads = threads;
   1600	}
   1601	__max_smt_threads = max_threads;
   1602}
   1603
   1604static void remove_siblinginfo(int cpu)
   1605{
   1606	int sibling;
   1607	struct cpuinfo_x86 *c = &cpu_data(cpu);
   1608
   1609	for_each_cpu(sibling, topology_core_cpumask(cpu)) {
   1610		cpumask_clear_cpu(cpu, topology_core_cpumask(sibling));
    1611		/*
   1612		 * last thread sibling in this cpu core going down
   1613		 */
   1614		if (cpumask_weight(topology_sibling_cpumask(cpu)) == 1)
   1615			cpu_data(sibling).booted_cores--;
   1616	}
   1617
   1618	for_each_cpu(sibling, topology_die_cpumask(cpu))
   1619		cpumask_clear_cpu(cpu, topology_die_cpumask(sibling));
   1620
   1621	for_each_cpu(sibling, topology_sibling_cpumask(cpu)) {
   1622		cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling));
   1623		if (cpumask_weight(topology_sibling_cpumask(sibling)) == 1)
   1624			cpu_data(sibling).smt_active = false;
   1625	}
   1626
   1627	for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
   1628		cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling));
   1629	for_each_cpu(sibling, cpu_l2c_shared_mask(cpu))
   1630		cpumask_clear_cpu(cpu, cpu_l2c_shared_mask(sibling));
   1631	cpumask_clear(cpu_llc_shared_mask(cpu));
   1632	cpumask_clear(cpu_l2c_shared_mask(cpu));
   1633	cpumask_clear(topology_sibling_cpumask(cpu));
   1634	cpumask_clear(topology_core_cpumask(cpu));
   1635	cpumask_clear(topology_die_cpumask(cpu));
   1636	c->cpu_core_id = 0;
   1637	c->booted_cores = 0;
   1638	cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
   1639	recompute_smt_state();
   1640}
   1641
   1642static void remove_cpu_from_maps(int cpu)
   1643{
   1644	set_cpu_online(cpu, false);
   1645	cpumask_clear_cpu(cpu, cpu_callout_mask);
   1646	cpumask_clear_cpu(cpu, cpu_callin_mask);
   1647	/* was set by cpu_init() */
   1648	cpumask_clear_cpu(cpu, cpu_initialized_mask);
   1649	numa_remove_cpu(cpu);
   1650}
   1651
   1652void cpu_disable_common(void)
   1653{
   1654	int cpu = smp_processor_id();
   1655
   1656	remove_siblinginfo(cpu);
   1657
   1658	/* It's now safe to remove this processor from the online map */
   1659	lock_vector_lock();
   1660	remove_cpu_from_maps(cpu);
   1661	unlock_vector_lock();
   1662	fixup_irqs();
   1663	lapic_offline();
   1664}
   1665
   1666int native_cpu_disable(void)
   1667{
   1668	int ret;
   1669
   1670	ret = lapic_can_unplug_cpu();
   1671	if (ret)
   1672		return ret;
   1673
   1674	cpu_disable_common();
   1675
   1676        /*
   1677         * Disable the local APIC. Otherwise IPI broadcasts will reach
   1678         * it. It still responds normally to INIT, NMI, SMI, and SIPI
   1679         * messages.
   1680         *
   1681         * Disabling the APIC must happen after cpu_disable_common()
   1682         * which invokes fixup_irqs().
   1683         *
   1684         * Disabling the APIC preserves already set bits in IRR, but
   1685         * an interrupt arriving after disabling the local APIC does not
   1686         * set the corresponding IRR bit.
   1687         *
   1688         * fixup_irqs() scans IRR for set bits so it can raise a not
   1689         * yet handled interrupt on the new destination CPU via an IPI
   1690         * but obviously it can't do so for IRR bits which are not set.
   1691         * IOW, interrupts arriving after disabling the local APIC will
   1692         * be lost.
   1693         */
   1694	apic_soft_disable();
   1695
   1696	return 0;
   1697}
   1698
   1699int common_cpu_die(unsigned int cpu)
   1700{
   1701	int ret = 0;
   1702
   1703	/* We don't do anything here: idle task is faking death itself. */
   1704
   1705	/* They ack this in play_dead() by setting CPU_DEAD */
   1706	if (cpu_wait_death(cpu, 5)) {
   1707		if (system_state == SYSTEM_RUNNING)
   1708			pr_info("CPU %u is now offline\n", cpu);
   1709	} else {
   1710		pr_err("CPU %u didn't die...\n", cpu);
   1711		ret = -1;
   1712	}
   1713
   1714	return ret;
   1715}
   1716
   1717void native_cpu_die(unsigned int cpu)
   1718{
   1719	common_cpu_die(cpu);
   1720}
   1721
   1722void play_dead_common(void)
   1723{
   1724	idle_task_exit();
   1725
   1726	/* Ack it */
   1727	(void)cpu_report_death();
   1728
   1729	/*
   1730	 * With physical CPU hotplug, we should halt the cpu
   1731	 */
   1732	local_irq_disable();
   1733}
   1734
   1735/**
   1736 * cond_wakeup_cpu0 - Wake up CPU0 if needed.
   1737 *
   1738 * If NMI wants to wake up CPU0, start CPU0.
   1739 */
   1740void cond_wakeup_cpu0(void)
   1741{
   1742	if (smp_processor_id() == 0 && enable_start_cpu0)
   1743		start_cpu0();
   1744}
   1745EXPORT_SYMBOL_GPL(cond_wakeup_cpu0);
   1746
   1747/*
   1748 * We need to flush the caches before going to sleep, lest we have
   1749 * dirty data in our caches when we come back up.
   1750 */
   1751static inline void mwait_play_dead(void)
   1752{
   1753	unsigned int eax, ebx, ecx, edx;
   1754	unsigned int highest_cstate = 0;
   1755	unsigned int highest_subcstate = 0;
   1756	void *mwait_ptr;
   1757	int i;
   1758
   1759	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
   1760	    boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
   1761		return;
   1762	if (!this_cpu_has(X86_FEATURE_MWAIT))
   1763		return;
   1764	if (!this_cpu_has(X86_FEATURE_CLFLUSH))
   1765		return;
   1766	if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
   1767		return;
   1768
   1769	eax = CPUID_MWAIT_LEAF;
   1770	ecx = 0;
   1771	native_cpuid(&eax, &ebx, &ecx, &edx);
   1772
   1773	/*
   1774	 * eax will be 0 if EDX enumeration is not valid.
   1775	 * Initialized below to cstate, sub_cstate value when EDX is valid.
   1776	 */
   1777	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) {
   1778		eax = 0;
   1779	} else {
   1780		edx >>= MWAIT_SUBSTATE_SIZE;
   1781		for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
   1782			if (edx & MWAIT_SUBSTATE_MASK) {
   1783				highest_cstate = i;
   1784				highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
   1785			}
   1786		}
   1787		eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
   1788			(highest_subcstate - 1);
   1789	}
   1790
   1791	/*
   1792	 * This should be a memory location in a cache line which is
   1793	 * unlikely to be touched by other processors.  The actual
   1794	 * content is immaterial as it is not actually modified in any way.
   1795	 */
   1796	mwait_ptr = &current_thread_info()->flags;
   1797
   1798	wbinvd();
   1799
   1800	while (1) {
   1801		/*
   1802		 * The CLFLUSH is a workaround for erratum AAI65 for
   1803		 * the Xeon 7400 series.  It's not clear it is actually
   1804		 * needed, but it should be harmless in either case.
   1805		 * The WBINVD is insufficient due to the spurious-wakeup
   1806		 * case where we return around the loop.
   1807		 */
   1808		mb();
   1809		clflush(mwait_ptr);
   1810		mb();
   1811		__monitor(mwait_ptr, 0, 0);
   1812		mb();
   1813		__mwait(eax, 0);
   1814
   1815		cond_wakeup_cpu0();
   1816	}
   1817}
   1818
   1819void hlt_play_dead(void)
   1820{
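        	/* WBINVD first appeared on the 486; skip the cache flush on older CPUs */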
   1821	if (__this_cpu_read(cpu_info.x86) >= 4)
   1822		wbinvd();
   1823
   1824	while (1) {
   1825		native_halt();
   1826
   1827		cond_wakeup_cpu0();
   1828	}
   1829}
   1830
   1831void native_play_dead(void)
   1832{
   1833	play_dead_common();
   1834	tboot_shutdown(TB_SHUTDOWN_WFS);
   1835
   1836	mwait_play_dead();	/* Only returns on failure */
   1837	if (cpuidle_play_dead())
   1838		hlt_play_dead();
   1839}
   1840
   1841#else /* ... !CONFIG_HOTPLUG_CPU */
   1842int native_cpu_disable(void)
   1843{
   1844	return -ENOSYS;
   1845}
   1846
   1847void native_cpu_die(unsigned int cpu)
   1848{
   1849	/* We said "no" in __cpu_disable */
   1850	BUG();
   1851}
   1852
   1853void native_play_dead(void)
   1854{
   1855	BUG();
   1856}
   1857
   1858#endif