smp_64.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
smp_64.c (37101B)
      1// SPDX-License-Identifier: GPL-2.0
      2/* smp.c: Sparc64 SMP support.
      3 *
      4 * Copyright (C) 1997, 2007, 2008 David S. Miller (davem@davemloft.net)
      5 */
      6
      7#include <linux/export.h>
      8#include <linux/kernel.h>
      9#include <linux/sched/mm.h>
     10#include <linux/sched/hotplug.h>
     11#include <linux/mm.h>
     12#include <linux/pagemap.h>
     13#include <linux/threads.h>
     14#include <linux/smp.h>
     15#include <linux/interrupt.h>
     16#include <linux/kernel_stat.h>
     17#include <linux/delay.h>
     18#include <linux/init.h>
     19#include <linux/spinlock.h>
     20#include <linux/fs.h>
     21#include <linux/seq_file.h>
     22#include <linux/cache.h>
     23#include <linux/jiffies.h>
     24#include <linux/profile.h>
     25#include <linux/memblock.h>
     26#include <linux/vmalloc.h>
     27#include <linux/ftrace.h>
     28#include <linux/cpu.h>
     29#include <linux/slab.h>
     30#include <linux/kgdb.h>
     31
     32#include <asm/head.h>
     33#include <asm/ptrace.h>
     34#include <linux/atomic.h>
     35#include <asm/tlbflush.h>
     36#include <asm/mmu_context.h>
     37#include <asm/cpudata.h>
     38#include <asm/hvtramp.h>
     39#include <asm/io.h>
     40#include <asm/timer.h>
     41#include <asm/setup.h>
     42
     43#include <asm/irq.h>
     44#include <asm/irq_regs.h>
     45#include <asm/page.h>
     46#include <asm/oplib.h>
     47#include <linux/uaccess.h>
     48#include <asm/starfire.h>
     49#include <asm/tlb.h>
     50#include <asm/pgalloc.h>
     51#include <asm/sections.h>
     52#include <asm/prom.h>
     53#include <asm/mdesc.h>
     54#include <asm/ldc.h>
     55#include <asm/hypervisor.h>
     56#include <asm/pcr.h>
     57
     58#include "cpumap.h"
     59#include "kernel.h"
     60
/* Per-cpu sibling mask; starts out empty (CPU_MASK_NONE). */
DEFINE_PER_CPU(cpumask_t, cpu_sibling_map) = CPU_MASK_NONE;
/* NR_CPUS-sized mask tables below: every entry starts out empty. */
cpumask_t cpu_core_map[NR_CPUS] __read_mostly =
	{ [0 ... NR_CPUS-1] = CPU_MASK_NONE };

cpumask_t cpu_core_sib_map[NR_CPUS] __read_mostly = {
	[0 ... NR_CPUS-1] = CPU_MASK_NONE };

cpumask_t cpu_core_sib_cache_map[NR_CPUS] __read_mostly = {
	[0 ... NR_CPUS - 1] = CPU_MASK_NONE };

/* The four masks above are exported symbols. */
EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
EXPORT_SYMBOL(cpu_core_map);
EXPORT_SYMBOL(cpu_core_sib_map);
EXPORT_SYMBOL(cpu_core_sib_cache_map);

/* smp_callin() spins until the starting cpu's bit shows up in this mask
 * before it marks itself online.
 */
static cpumask_t smp_commenced_mask;

/* NOTE(review): users of these two are not in the part of the file
 * reviewed here - confirm their meaning against the cpu-poke code.
 */
static DEFINE_PER_CPU(bool, poke);
static bool cpu_poke;
     80
     81void smp_info(struct seq_file *m)
     82{
     83	int i;
     84	
     85	seq_printf(m, "State:\n");
     86	for_each_online_cpu(i)
     87		seq_printf(m, "CPU%d:\t\tonline\n", i);
     88}
     89
     90void smp_bogo(struct seq_file *m)
     91{
     92	int i;
     93	
     94	for_each_online_cpu(i)
     95		seq_printf(m,
     96			   "Cpu%dClkTck\t: %016lx\n",
     97			   i, cpu_data(i).clock_tick);
     98}
     99
/* Defined outside this file; called by smp_callin() on the starting cpu. */
extern void setup_sparc64_timer(void);

/* Boot handshake flag: smp_boot_one_cpu() clears it and then polls it,
 * smp_callin() sets it to 1 on the cpu being started.
 */
static volatile unsigned long callin_flag = 0;
    103
/* Bring-up path executed on a cpu that is being started.
 *
 * Performs local setup, signals the booting cpu through callin_flag,
 * waits to be released via smp_commenced_mask, marks itself online and
 * then hands control to cpu_startup_entry(), from which it is not
 * expected to return (see cpu_panic()).
 */
void smp_callin(void)
{
	int cpuid = hard_smp_processor_id();

	/* Point this cpu at its own per-cpu area. */
	__local_per_cpu_offset = __per_cpu_offset(cpuid);

	if (tlb_type == hypervisor)
		sun4v_ktsb_register();

	__flush_tlb_all();

	setup_sparc64_timer();

	if (cheetah_pcache_forced_on)
		cheetah_enable_pcache();

	/* Let smp_boot_one_cpu(), which is polling callin_flag, know that
	 * this cpu got this far.
	 */
	callin_flag = 1;
	__asm__ __volatile__("membar #Sync\n\t"
			     "flush  %%g6" : : : "memory");

	/* Clear this or we will die instantly when we
	 * schedule back to this idler...
	 */
	current_thread_info()->new_child = 0;

	/* Attach to the address space of init_task. */
	mmgrab(&init_mm);
	current->active_mm = &init_mm;

	/* inform the notifiers about the new cpu */
	notify_cpu_starting(cpuid);

	/* Spin until our bit is set in smp_commenced_mask. */
	while (!cpumask_test_cpu(cpuid, &smp_commenced_mask))
		rmb();

	set_cpu_online(cpuid, true);

	local_irq_enable();

	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}
    145
    146void cpu_panic(void)
    147{
    148	printk("CPU[%d]: Returns from cpu_idle!\n", smp_processor_id());
    149	panic("SMP bolixed\n");
    150}
    151
/* This tick register synchronization scheme is taken entirely from
 * the ia64 port, see arch/ia64/kernel/smpboot.c for details and credit.
 *
 * The only change I've made is to rework it so that the master
 * initiates the synchronization instead of the slave. -DaveM
 */

/* Indices into go[]; the two slots are SMP_CACHE_BYTES bytes apart. */
#define MASTER	0
#define SLAVE	(SMP_CACHE_BYTES/sizeof(unsigned long))

#define NUM_ROUNDS	64	/* magic value */
#define NUM_ITERS	5	/* likewise */

/* Taken by smp_synchronize_one_tick() around its exchange loop. */
static DEFINE_RAW_SPINLOCK(itc_sync_lock);
/* Mailbox used by the tick-sync handshake: go[MASTER] and go[SLAVE]. */
static unsigned long go[SLAVE + 1];

/* Set to 1 to record and print per-round statistics in
 * smp_synchronize_tick_client().
 */
#define DEBUG_TICK_SYNC	0
    169
    170static inline long get_delta (long *rt, long *master)
    171{
    172	unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0;
    173	unsigned long tcenter, t0, t1, tm;
    174	unsigned long i;
    175
    176	for (i = 0; i < NUM_ITERS; i++) {
    177		t0 = tick_ops->get_tick();
    178		go[MASTER] = 1;
    179		membar_safe("#StoreLoad");
    180		while (!(tm = go[SLAVE]))
    181			rmb();
    182		go[SLAVE] = 0;
    183		wmb();
    184		t1 = tick_ops->get_tick();
    185
    186		if (t1 - t0 < best_t1 - best_t0)
    187			best_t0 = t0, best_t1 = t1, best_tm = tm;
    188	}
    189
    190	*rt = best_t1 - best_t0;
    191	*master = best_tm - best_t0;
    192
    193	/* average best_t0 and best_t1 without overflow: */
    194	tcenter = (best_t0/2 + best_t1/2);
    195	if (best_t0 % 2 + best_t1 % 2 == 2)
    196		tcenter++;
    197	return tcenter - best_tm;
    198}
    199
    200void smp_synchronize_tick_client(void)
    201{
    202	long i, delta, adj, adjust_latency = 0, done = 0;
    203	unsigned long flags, rt, master_time_stamp;
    204#if DEBUG_TICK_SYNC
    205	struct {
    206		long rt;	/* roundtrip time */
    207		long master;	/* master's timestamp */
    208		long diff;	/* difference between midpoint and master's timestamp */
    209		long lat;	/* estimate of itc adjustment latency */
    210	} t[NUM_ROUNDS];
    211#endif
    212
    213	go[MASTER] = 1;
    214
    215	while (go[MASTER])
    216		rmb();
    217
    218	local_irq_save(flags);
    219	{
    220		for (i = 0; i < NUM_ROUNDS; i++) {
    221			delta = get_delta(&rt, &master_time_stamp);
    222			if (delta == 0)
    223				done = 1;	/* let's lock on to this... */
    224
    225			if (!done) {
    226				if (i > 0) {
    227					adjust_latency += -delta;
    228					adj = -delta + adjust_latency/4;
    229				} else
    230					adj = -delta;
    231
    232				tick_ops->add_tick(adj);
    233			}
    234#if DEBUG_TICK_SYNC
    235			t[i].rt = rt;
    236			t[i].master = master_time_stamp;
    237			t[i].diff = delta;
    238			t[i].lat = adjust_latency/4;
    239#endif
    240		}
    241	}
    242	local_irq_restore(flags);
    243
    244#if DEBUG_TICK_SYNC
    245	for (i = 0; i < NUM_ROUNDS; i++)
    246		printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
    247		       t[i].rt, t[i].master, t[i].diff, t[i].lat);
    248#endif
    249
    250	printk(KERN_INFO "CPU %d: synchronized TICK with master CPU "
    251	       "(last diff %ld cycles, maxerr %lu cycles)\n",
    252	       smp_processor_id(), delta, rt);
    253}
    254
    255static void smp_start_sync_tick_client(int cpu);
    256
    257static void smp_synchronize_one_tick(int cpu)
    258{
    259	unsigned long flags, i;
    260
    261	go[MASTER] = 0;
    262
    263	smp_start_sync_tick_client(cpu);
    264
    265	/* wait for client to be ready */
    266	while (!go[MASTER])
    267		rmb();
    268
    269	/* now let the client proceed into his loop */
    270	go[MASTER] = 0;
    271	membar_safe("#StoreLoad");
    272
    273	raw_spin_lock_irqsave(&itc_sync_lock, flags);
    274	{
    275		for (i = 0; i < NUM_ROUNDS*NUM_ITERS; i++) {
    276			while (!go[MASTER])
    277				rmb();
    278			go[MASTER] = 0;
    279			wmb();
    280			go[SLAVE] = tick_ops->get_tick();
    281			membar_safe("#StoreLoad");
    282		}
    283	}
    284	raw_spin_unlock_irqrestore(&itc_sync_lock, flags);
    285}
    286
    287#if defined(CONFIG_SUN_LDOMS) && defined(CONFIG_HOTPLUG_CPU)
    288static void ldom_startcpu_cpuid(unsigned int cpu, unsigned long thread_reg,
    289				void **descrp)
    290{
    291	extern unsigned long sparc64_ttable_tl0;
    292	extern unsigned long kern_locked_tte_data;
    293	struct hvtramp_descr *hdesc;
    294	unsigned long trampoline_ra;
    295	struct trap_per_cpu *tb;
    296	u64 tte_vaddr, tte_data;
    297	unsigned long hv_err;
    298	int i;
    299
    300	hdesc = kzalloc(sizeof(*hdesc) +
    301			(sizeof(struct hvtramp_mapping) *
    302			 num_kernel_image_mappings - 1),
    303			GFP_KERNEL);
    304	if (!hdesc) {
    305		printk(KERN_ERR "ldom_startcpu_cpuid: Cannot allocate "
    306		       "hvtramp_descr.\n");
    307		return;
    308	}
    309	*descrp = hdesc;
    310
    311	hdesc->cpu = cpu;
    312	hdesc->num_mappings = num_kernel_image_mappings;
    313
    314	tb = &trap_block[cpu];
    315
    316	hdesc->fault_info_va = (unsigned long) &tb->fault_info;
    317	hdesc->fault_info_pa = kimage_addr_to_ra(&tb->fault_info);
    318
    319	hdesc->thread_reg = thread_reg;
    320
    321	tte_vaddr = (unsigned long) KERNBASE;
    322	tte_data = kern_locked_tte_data;
    323
    324	for (i = 0; i < hdesc->num_mappings; i++) {
    325		hdesc->maps[i].vaddr = tte_vaddr;
    326		hdesc->maps[i].tte   = tte_data;
    327		tte_vaddr += 0x400000;
    328		tte_data  += 0x400000;
    329	}
    330
    331	trampoline_ra = kimage_addr_to_ra(hv_cpu_startup);
    332
    333	hv_err = sun4v_cpu_start(cpu, trampoline_ra,
    334				 kimage_addr_to_ra(&sparc64_ttable_tl0),
    335				 __pa(hdesc));
    336	if (hv_err)
    337		printk(KERN_ERR "ldom_startcpu_cpuid: sun4v_cpu_start() "
    338		       "gives error %lu\n", hv_err);
    339}
    340#endif
    341
    342extern unsigned long sparc64_cpu_startup;
    343
    344/* The OBP cpu startup callback truncates the 3rd arg cookie to
    345 * 32-bits (I think) so to be safe we have it read the pointer
    346 * contained here so we work on >4GB machines. -DaveM
    347 */
    348static struct thread_info *cpu_new_thread = NULL;
    349
    350static int smp_boot_one_cpu(unsigned int cpu, struct task_struct *idle)
    351{
    352	unsigned long entry =
    353		(unsigned long)(&sparc64_cpu_startup);
    354	unsigned long cookie =
    355		(unsigned long)(&cpu_new_thread);
    356	void *descr = NULL;
    357	int timeout, ret;
    358
    359	callin_flag = 0;
    360	cpu_new_thread = task_thread_info(idle);
    361
    362	if (tlb_type == hypervisor) {
    363#if defined(CONFIG_SUN_LDOMS) && defined(CONFIG_HOTPLUG_CPU)
    364		if (ldom_domaining_enabled)
    365			ldom_startcpu_cpuid(cpu,
    366					    (unsigned long) cpu_new_thread,
    367					    &descr);
    368		else
    369#endif
    370			prom_startcpu_cpuid(cpu, entry, cookie);
    371	} else {
    372		struct device_node *dp = of_find_node_by_cpuid(cpu);
    373
    374		prom_startcpu(dp->phandle, entry, cookie);
    375	}
    376
    377	for (timeout = 0; timeout < 50000; timeout++) {
    378		if (callin_flag)
    379			break;
    380		udelay(100);
    381	}
    382
    383	if (callin_flag) {
    384		ret = 0;
    385	} else {
    386		printk("Processor %d is stuck.\n", cpu);
    387		ret = -ENODEV;
    388	}
    389	cpu_new_thread = NULL;
    390
    391	kfree(descr);
    392
    393	return ret;
    394}
    395
/* Dispatch one three-word mondo to a single cpu and wait for the
 * dispatch status to clear.
 *
 * @data0, @data1, @data2: the three words to send.
 * @pstate: %pstate value sampled by the caller; written back before
 *          returning or retrying.
 * @cpu:    target cpu id (remapped first when this_is_starfire is set).
 *
 * The status register is polled up to 100000 times while its bit 0 is
 * set.  A zero status means success.  If the poll budget runs out a
 * message is printed and the function gives up; any other non-zero
 * status leads to a 2us delay and a fresh dispatch.
 */
static void spitfire_xcall_helper(u64 data0, u64 data1, u64 data2, u64 pstate, unsigned long cpu)
{
	u64 result, target;
	int stuck, tmp;

	if (this_is_starfire) {
		/* map to real upaid */
		cpu = (((cpu & 0x3c) << 1) |
			((cpu & 0x40) >> 4) |
			(cpu & 0x3));
	}

	/* Dispatch address: cpu id shifted up by 14, ORed with 0x70. */
	target = (cpu << 14) | 0x70;
again:
	/* Ok, this is the real Spitfire Errata #54.
	 * One must read back from a UDB internal register
	 * after writes to the UDB interrupt dispatch, but
	 * before the membar Sync for that write.
	 * So we use the high UDB control register (ASI 0x7f,
	 * ADDR 0x20) for the dummy read. -DaveM
	 */
	tmp = 0x40;
	/* tmp (%0) starts at 0x40 and is advanced by 0x10 inside the asm,
	 * so the three data words are stored at 0x40, 0x50 and 0x60.
	 */
	__asm__ __volatile__(
	"wrpr	%1, %2, %%pstate\n\t"
	"stxa	%4, [%0] %3\n\t"
	"stxa	%5, [%0+%8] %3\n\t"
	"add	%0, %8, %0\n\t"
	"stxa	%6, [%0+%8] %3\n\t"
	"membar	#Sync\n\t"
	"stxa	%%g0, [%7] %3\n\t"
	"membar	#Sync\n\t"
	"mov	0x20, %%g1\n\t"
	"ldxa	[%%g1] 0x7f, %%g0\n\t"
	"membar	#Sync"
	: "=r" (tmp)
	: "r" (pstate), "i" (PSTATE_IE), "i" (ASI_INTR_W),
	  "r" (data0), "r" (data1), "r" (data2), "r" (target),
	  "r" (0x10), "0" (tmp)
        : "g1");

	/* NOTE: PSTATE_IE is still clear. */
	stuck = 100000;
	do {
		__asm__ __volatile__("ldxa [%%g0] %1, %0"
			: "=r" (result)
			: "i" (ASI_INTR_DISPATCH_STAT));
		if (result == 0) {
			/* Delivered: put the caller's %pstate back. */
			__asm__ __volatile__("wrpr %0, 0x0, %%pstate"
					     : : "r" (pstate));
			return;
		}
		stuck -= 1;
		if (stuck == 0)
			break;
	} while (result & 0x1);
	/* Restore the caller's %pstate before reporting or retrying. */
	__asm__ __volatile__("wrpr %0, 0x0, %%pstate"
			     : : "r" (pstate));
	if (stuck == 0) {
		printk("CPU[%d]: mondo stuckage result[%016llx]\n",
		       smp_processor_id(), result);
	} else {
		udelay(2);
		goto again;
	}
}
    461
    462static void spitfire_xcall_deliver(struct trap_per_cpu *tb, int cnt)
    463{
    464	u64 *mondo, data0, data1, data2;
    465	u16 *cpu_list;
    466	u64 pstate;
    467	int i;
    468
    469	__asm__ __volatile__("rdpr %%pstate, %0" : "=r" (pstate));
    470	cpu_list = __va(tb->cpu_list_pa);
    471	mondo = __va(tb->cpu_mondo_block_pa);
    472	data0 = mondo[0];
    473	data1 = mondo[1];
    474	data2 = mondo[2];
    475	for (i = 0; i < cnt; i++)
    476		spitfire_xcall_helper(data0, data1, data2, pstate, cpu_list[i]);
    477}
    478
/* Cheetah now allows to send the whole 64-bytes of data in the interrupt
 * packet, but we have no use for that.  However we do take advantage of
 * the new pipelining feature (ie. dispatch to multiple cpus simultaneously).
 *
 * @tb:  trap block whose mondo block holds the three data words and whose
 *       cpu list holds the targets.
 * @cnt: number of cpu list entries to consider.
 *
 * Entries of the cpu list are overwritten with 0xffff as their targets
 * are considered done; such entries are skipped on later passes.  At
 * most 32 targets are dispatched per pass.
 */
static void cheetah_xcall_deliver(struct trap_per_cpu *tb, int cnt)
{
	int nack_busy_id, is_jbus, need_more;
	u64 *mondo, pstate, ver, busy_mask;
	u16 *cpu_list;

	cpu_list = __va(tb->cpu_list_pa);
	mondo = __va(tb->cpu_mondo_block_pa);

	/* Unfortunately, someone at Sun had the brilliant idea to make the
	 * busy/nack fields hard-coded by ITID number for this Ultra-III
	 * derivative processor.
	 */
	__asm__ ("rdpr %%ver, %0" : "=r" (ver));
	is_jbus = ((ver >> 32) == __JALAPENO_ID ||
		   (ver >> 32) == __SERRANO_ID);

	/* Remember %pstate so it can be written back after each pass. */
	__asm__ __volatile__("rdpr %%pstate, %0" : "=r" (pstate));

retry:
	need_more = 0;
	__asm__ __volatile__("wrpr %0, %1, %%pstate\n\t"
			     : : "r" (pstate), "i" (PSTATE_IE));

	/* Setup the dispatch data registers. */
	__asm__ __volatile__("stxa	%0, [%3] %6\n\t"
			     "stxa	%1, [%4] %6\n\t"
			     "stxa	%2, [%5] %6\n\t"
			     "membar	#Sync\n\t"
			     : /* no outputs */
			     : "r" (mondo[0]), "r" (mondo[1]), "r" (mondo[2]),
			       "r" (0x40), "r" (0x50), "r" (0x60),
			       "i" (ASI_INTR_W));

	nack_busy_id = 0;
	busy_mask = 0;
	{
		int i;

		/* Dispatch to every live entry, building busy_mask as we
		 * go: the bit position comes from the cpu number on jbus
		 * parts and from the dispatch slot otherwise.
		 */
		for (i = 0; i < cnt; i++) {
			u64 target, nr;

			nr = cpu_list[i];
			if (nr == 0xffff)
				continue;

			target = (nr << 14) | 0x70;
			if (is_jbus) {
				busy_mask |= (0x1UL << (nr * 2));
			} else {
				target |= (nack_busy_id << 24);
				busy_mask |= (0x1UL <<
					      (nack_busy_id * 2));
			}
			__asm__ __volatile__(
				"stxa	%%g0, [%0] %1\n\t"
				"membar	#Sync\n\t"
				: /* no outputs */
				: "r" (target), "i" (ASI_INTR_W));
			nack_busy_id++;
			/* 32 dispatches per pass is the limit; the rest of
			 * the list is handled by another pass.
			 */
			if (nack_busy_id == 32) {
				need_more = 1;
				break;
			}
		}
	}

	/* Now, poll for completion. */
	{
		u64 dispatch_stat, nack_mask;
		long stuck;

		/* Poll budget scales with the number of dispatches made. */
		stuck = 100000 * nack_busy_id;
		nack_mask = busy_mask << 1;
		do {
			__asm__ __volatile__("ldxa	[%%g0] %1, %0"
					     : "=r" (dispatch_stat)
					     : "i" (ASI_INTR_DISPATCH_STAT));
			if (!(dispatch_stat & (busy_mask | nack_mask))) {
				/* No busy or nack bit left for this pass. */
				__asm__ __volatile__("wrpr %0, 0x0, %%pstate"
						     : : "r" (pstate));
				if (unlikely(need_more)) {
					int i, this_cnt = 0;
					/* Retire the (up to) 32 entries just
					 * delivered, then go around again
					 * for the remainder.
					 */
					for (i = 0; i < cnt; i++) {
						if (cpu_list[i] == 0xffff)
							continue;
						cpu_list[i] = 0xffff;
						this_cnt++;
						if (this_cnt == 32)
							break;
					}
					goto retry;
				}
				return;
			}
			if (!--stuck)
				break;
		} while (dispatch_stat & busy_mask);

		__asm__ __volatile__("wrpr %0, 0x0, %%pstate"
				     : : "r" (pstate));

		if (dispatch_stat & busy_mask) {
			/* Busy bits will not clear, continue instead
			 * of freezing up on this cpu.
			 */
			printk("CPU[%d]: mondo stuckage result[%016llx]\n",
			       smp_processor_id(), dispatch_stat);
		} else {
			int i, this_busy_nack = 0;

			/* Delay some random time with interrupts enabled
			 * to prevent deadlock.
			 */
			udelay(2 * nack_busy_id);

			/* Clear out the mask bits for cpus which did not
			 * NACK us.
			 */
			for (i = 0; i < cnt; i++) {
				u64 check_mask, nr;

				nr = cpu_list[i];
				if (nr == 0xffff)
					continue;

				if (is_jbus)
					check_mask = (0x2UL << (2*nr));
				else
					check_mask = (0x2UL <<
						      this_busy_nack);
				if ((dispatch_stat & check_mask) == 0)
					cpu_list[i] = 0xffff;
				this_busy_nack += 2;
				/* Only the 32 slots used in this pass
				 * have status bits to inspect.
				 */
				if (this_busy_nack == 64)
					break;
			}

			goto retry;
		}
	}
}
    625
/* Per-cpu mondo counter lookup; cpu_mondo_counter is defined elsewhere. */
#define	CPU_MONDO_COUNTER(cpuid)	(cpu_mondo_counter[cpuid])
/* Bounds, in microseconds, for the delay between send attempts. */
#define	MONDO_USEC_WAIT_MIN		2
#define	MONDO_USEC_WAIT_MAX		100
/* Consecutive no-progress retries tolerated before panicking. */
#define	MONDO_RETRY_LIMIT		500000

/* Multi-cpu list version.
 *
 * Deliver xcalls to 'cnt' number of cpus in 'cpu_list'.
 * Sometimes not all cpus receive the mondo, requiring us to re-send
 * the mondo until all cpus have received it, or until cpus are truly
 * stuck, unable to receive the mondo, and we time out.
 * Occasionally a target cpu strand is borrowed briefly by the hypervisor
 * to perform guest service, such as PCIe error handling. Considering the
 * service time, a 1 second overall wait is reasonable for 1 cpu.
 * Two wait times between mondo checks are defined here: 2 usec for
 * a quick single-cpu turnaround and up to 100 usec for a large cpu count.
 * Delivering a mondo to a large number of cpus can take longer, so we
 * reset the retry count as long as target cpus are making forward
 * progress.
 *
 * @tb:  trap block supplying the physical addresses of the cpu list and
 *       the mondo block.
 * @cnt: number of entries in the cpu list.
 *
 * The cpu list is rewritten in place between attempts.  Panics on an
 * unexpected hypervisor status or when the retry limit is exceeded.
 */
static void hypervisor_xcall_deliver(struct trap_per_cpu *tb, int cnt)
{
	int this_cpu, tot_cpus, prev_sent, i, rem;
	int usec_wait, retries, tot_retries;
	u16 first_cpu = 0xffff;
	unsigned long xc_rcvd = 0;
	unsigned long status;
	/* Both ids are stored as cpu + 1 so that 0 means "none seen". */
	int ecpuerror_id = 0;
	int enocpu_id = 0;
	u16 *cpu_list;
	u16 cpu;

	this_cpu = smp_processor_id();
	cpu_list = __va(tb->cpu_list_pa);
	/* Delay grows with the target count, capped at the maximum. */
	usec_wait = cnt * MONDO_USEC_WAIT_MIN;
	if (usec_wait > MONDO_USEC_WAIT_MAX)
		usec_wait = MONDO_USEC_WAIT_MAX;
	retries = tot_retries = 0;
	tot_cpus = cnt;
	prev_sent = 0;

	do {
		int n_sent, mondo_delivered, target_cpu_busy;

		status = sun4v_cpu_mondo_send(cnt,
					      tb->cpu_list_pa,
					      tb->cpu_mondo_block_pa);

		/* HV_EOK means all cpus received the xcall, we're done.  */
		if (likely(status == HV_EOK))
			goto xcall_done;

		/* If not these non-fatal errors, panic */
		if (unlikely((status != HV_EWOULDBLOCK) &&
			(status != HV_ECPUERROR) &&
			(status != HV_ENOCPU)))
			goto fatal_errors;

		/* First, see if we made any forward progress.
		 *
		 * Go through the cpu_list, count the target cpus that have
		 * received our mondo (n_sent), and those that did not (rem).
		 * Re-pack cpu_list with the cpus remain to be retried in the
		 * front - this simplifies tracking the truly stalled cpus.
		 *
		 * The hypervisor indicates successful sends by setting
		 * cpu list entries to the value 0xffff.
		 *
		 * EWOULDBLOCK means some target cpus did not receive the
		 * mondo and retry usually helps.
		 *
		 * ECPUERROR means at least one target cpu is in error state,
		 * it's usually safe to skip the faulty cpu and retry.
		 *
		 * ENOCPU means one of the target cpu doesn't belong to the
		 * domain, perhaps offlined which is unexpected, but not
		 * fatal and it's okay to skip the offlined cpu.
		 */
		rem = 0;
		n_sent = 0;
		for (i = 0; i < cnt; i++) {
			cpu = cpu_list[i];
			if (likely(cpu == 0xffff)) {
				n_sent++;
			} else if ((status == HV_ECPUERROR) &&
				(sun4v_cpu_state(cpu) == HV_CPU_STATE_ERROR)) {
				/* Dropped from the retry list; reported
				 * once delivery finishes.
				 */
				ecpuerror_id = cpu + 1;
			} else if (status == HV_ENOCPU && !cpu_online(cpu)) {
				enocpu_id = cpu + 1;
			} else {
				cpu_list[rem++] = cpu;
			}
		}

		/* No cpu remained, we're done. */
		if (rem == 0)
			break;

		/* Otherwise, update the cpu count for retry. */
		cnt = rem;

		/* Record the overall number of mondos received by the
		 * first of the remaining cpus.
		 */
		if (first_cpu != cpu_list[0]) {
			first_cpu = cpu_list[0];
			xc_rcvd = CPU_MONDO_COUNTER(first_cpu);
		}

		/* Was any mondo delivered successfully? */
		mondo_delivered = (n_sent > prev_sent);
		prev_sent = n_sent;

		/* or, was any target cpu busy processing other mondos? */
		target_cpu_busy = (xc_rcvd < CPU_MONDO_COUNTER(first_cpu));
		xc_rcvd = CPU_MONDO_COUNTER(first_cpu);

		/* Retry count is for no progress. If we're making progress,
		 * reset the retry count.
		 */
		if (likely(mondo_delivered || target_cpu_busy)) {
			tot_retries += retries;
			retries = 0;
		} else if (unlikely(retries > MONDO_RETRY_LIMIT)) {
			goto fatal_mondo_timeout;
		}

		/* Delay a little bit to let other cpus catch up on
		 * their cpu mondo queue work.
		 */
		if (!mondo_delivered)
			udelay(usec_wait);

		retries++;
	} while (1);

xcall_done:
	/* Report at most one skipped cpu; an error-state cpu takes
	 * precedence over an out-of-domain one.
	 */
	if (unlikely(ecpuerror_id > 0)) {
		pr_crit("CPU[%d]: SUN4V mondo cpu error, target cpu(%d) was in error state\n",
		       this_cpu, ecpuerror_id - 1);
	} else if (unlikely(enocpu_id > 0)) {
		pr_crit("CPU[%d]: SUN4V mondo cpu error, target cpu(%d) does not belong to the domain\n",
		       this_cpu, enocpu_id - 1);
	}
	return;

fatal_errors:
	/* fatal errors include bad alignment, etc */
	pr_crit("CPU[%d]: Args were cnt(%d) cpulist_pa(%lx) mondo_block_pa(%lx)\n",
	       this_cpu, tot_cpus, tb->cpu_list_pa, tb->cpu_mondo_block_pa);
	panic("Unexpected SUN4V mondo error %lu\n", status);

fatal_mondo_timeout:
	/* some cpus being non-responsive to the cpu mondo */
	pr_crit("CPU[%d]: SUN4V mondo timeout, cpu(%d) made no forward progress after %d retries. Total target cpus(%d).\n",
	       this_cpu, first_cpu, (tot_retries + retries), tot_cpus);
	panic("SUN4V mondo timeout panic\n");
}
    783
    784static void (*xcall_deliver_impl)(struct trap_per_cpu *, int);
    785
    786static void xcall_deliver(u64 data0, u64 data1, u64 data2, const cpumask_t *mask)
    787{
    788	struct trap_per_cpu *tb;
    789	int this_cpu, i, cnt;
    790	unsigned long flags;
    791	u16 *cpu_list;
    792	u64 *mondo;
    793
    794	/* We have to do this whole thing with interrupts fully disabled.
    795	 * Otherwise if we send an xcall from interrupt context it will
    796	 * corrupt both our mondo block and cpu list state.
    797	 *
    798	 * One consequence of this is that we cannot use timeout mechanisms
    799	 * that depend upon interrupts being delivered locally.  So, for
    800	 * example, we cannot sample jiffies and expect it to advance.
    801	 *
    802	 * Fortunately, udelay() uses %stick/%tick so we can use that.
    803	 */
    804	local_irq_save(flags);
    805
    806	this_cpu = smp_processor_id();
    807	tb = &trap_block[this_cpu];
    808
    809	mondo = __va(tb->cpu_mondo_block_pa);
    810	mondo[0] = data0;
    811	mondo[1] = data1;
    812	mondo[2] = data2;
    813	wmb();
    814
    815	cpu_list = __va(tb->cpu_list_pa);
    816
    817	/* Setup the initial cpu list.  */
    818	cnt = 0;
    819	for_each_cpu(i, mask) {
    820		if (i == this_cpu || !cpu_online(i))
    821			continue;
    822		cpu_list[cnt++] = i;
    823	}
    824
    825	if (cnt)
    826		xcall_deliver_impl(tb, cnt);
    827
    828	local_irq_restore(flags);
    829}
    830
    831/* Send cross call to all processors mentioned in MASK_P
    832 * except self.  Really, there are only two cases currently,
    833 * "cpu_online_mask" and "mm_cpumask(mm)".
    834 */
    835static void smp_cross_call_masked(unsigned long *func, u32 ctx, u64 data1, u64 data2, const cpumask_t *mask)
    836{
    837	u64 data0 = (((u64)ctx)<<32 | (((u64)func) & 0xffffffff));
    838
    839	xcall_deliver(data0, data1, data2, mask);
    840}
    841
    842/* Send cross call to all processors except self. */
    843static void smp_cross_call(unsigned long *func, u32 ctx, u64 data1, u64 data2)
    844{
    845	smp_cross_call_masked(func, ctx, data1, data2, cpu_online_mask);
    846}
    847
    848extern unsigned long xcall_sync_tick;
    849
    850static void smp_start_sync_tick_client(int cpu)
    851{
    852	xcall_deliver((u64) &xcall_sync_tick, 0, 0,
    853		      cpumask_of(cpu));
    854}
    855
    856extern unsigned long xcall_call_function;
    857
    858void arch_send_call_function_ipi_mask(const struct cpumask *mask)
    859{
    860	xcall_deliver((u64) &xcall_call_function, 0, 0, mask);
    861}
    862
    863extern unsigned long xcall_call_function_single;
    864
    865void arch_send_call_function_single_ipi(int cpu)
    866{
    867	xcall_deliver((u64) &xcall_call_function_single, 0, 0,
    868		      cpumask_of(cpu));
    869}
    870
    871void __irq_entry smp_call_function_client(int irq, struct pt_regs *regs)
    872{
    873	clear_softint(1 << irq);
    874	irq_enter();
    875	generic_smp_call_function_interrupt();
    876	irq_exit();
    877}
    878
    879void __irq_entry smp_call_function_single_client(int irq, struct pt_regs *regs)
    880{
    881	clear_softint(1 << irq);
    882	irq_enter();
    883	generic_smp_call_function_single_interrupt();
    884	irq_exit();
    885}
    886
    887static void tsb_sync(void *info)
    888{
    889	struct trap_per_cpu *tp = &trap_block[raw_smp_processor_id()];
    890	struct mm_struct *mm = info;
    891
    892	/* It is not valid to test "current->active_mm == mm" here.
    893	 *
    894	 * The value of "current" is not changed atomically with
    895	 * switch_mm().  But that's OK, we just need to check the
    896	 * current cpu's trap block PGD physical address.
    897	 */
    898	if (tp->pgd_paddr == __pa(mm->pgd))
    899		tsb_context_switch(mm);
    900}
    901
    902void smp_tsb_sync(struct mm_struct *mm)
    903{
    904	smp_call_function_many(mm_cpumask(mm), tsb_sync, mm, 1);
    905}
    906
/* Cross-call handler symbols defined outside this file.  Only their
 * addresses are used here: they are passed as the handler word of a
 * mondo.
 */
extern unsigned long xcall_flush_tlb_mm;
extern unsigned long xcall_flush_tlb_page;
extern unsigned long xcall_flush_tlb_kernel_range;
extern unsigned long xcall_fetch_glob_regs;
extern unsigned long xcall_fetch_glob_pmu;
extern unsigned long xcall_fetch_glob_pmu_n4;
extern unsigned long xcall_receive_signal;
extern unsigned long xcall_new_mmu_context_version;
#ifdef CONFIG_KGDB
extern unsigned long xcall_kgdb_capture;
#endif

/* The cheetah D-cache flush handler is only declared when D-cache
 * aliasing is possible.
 */
#ifdef DCACHE_ALIASING_POSSIBLE
extern unsigned long xcall_flush_dcache_page_cheetah;
#endif
extern unsigned long xcall_flush_dcache_page_spitfire;
    923
    924static inline void __local_flush_dcache_page(struct page *page)
    925{
    926#ifdef DCACHE_ALIASING_POSSIBLE
    927	__flush_dcache_page(page_address(page),
    928			    ((tlb_type == spitfire) &&
    929			     page_mapping_file(page) != NULL));
    930#else
    931	if (page_mapping_file(page) != NULL &&
    932	    tlb_type == spitfire)
    933		__flush_icache_page(__pa(page_address(page)));
    934#endif
    935}
    936
    937void smp_flush_dcache_page_impl(struct page *page, int cpu)
    938{
    939	int this_cpu;
    940
    941	if (tlb_type == hypervisor)
    942		return;
    943
    944#ifdef CONFIG_DEBUG_DCFLUSH
    945	atomic_inc(&dcpage_flushes);
    946#endif
    947
    948	this_cpu = get_cpu();
    949
    950	if (cpu == this_cpu) {
    951		__local_flush_dcache_page(page);
    952	} else if (cpu_online(cpu)) {
    953		void *pg_addr = page_address(page);
    954		u64 data0 = 0;
    955
    956		if (tlb_type == spitfire) {
    957			data0 = ((u64)&xcall_flush_dcache_page_spitfire);
    958			if (page_mapping_file(page) != NULL)
    959				data0 |= ((u64)1 << 32);
    960		} else if (tlb_type == cheetah || tlb_type == cheetah_plus) {
    961#ifdef DCACHE_ALIASING_POSSIBLE
    962			data0 =	((u64)&xcall_flush_dcache_page_cheetah);
    963#endif
    964		}
    965		if (data0) {
    966			xcall_deliver(data0, __pa(pg_addr),
    967				      (u64) pg_addr, cpumask_of(cpu));
    968#ifdef CONFIG_DEBUG_DCFLUSH
    969			atomic_inc(&dcpage_flushes_xcall);
    970#endif
    971		}
    972	}
    973
    974	put_cpu();
    975}
    976
    977void flush_dcache_page_all(struct mm_struct *mm, struct page *page)
    978{
    979	void *pg_addr;
    980	u64 data0;
    981
    982	if (tlb_type == hypervisor)
    983		return;
    984
    985	preempt_disable();
    986
    987#ifdef CONFIG_DEBUG_DCFLUSH
    988	atomic_inc(&dcpage_flushes);
    989#endif
    990	data0 = 0;
    991	pg_addr = page_address(page);
    992	if (tlb_type == spitfire) {
    993		data0 = ((u64)&xcall_flush_dcache_page_spitfire);
    994		if (page_mapping_file(page) != NULL)
    995			data0 |= ((u64)1 << 32);
    996	} else if (tlb_type == cheetah || tlb_type == cheetah_plus) {
    997#ifdef DCACHE_ALIASING_POSSIBLE
    998		data0 = ((u64)&xcall_flush_dcache_page_cheetah);
    999#endif
   1000	}
   1001	if (data0) {
   1002		xcall_deliver(data0, __pa(pg_addr),
   1003			      (u64) pg_addr, cpu_online_mask);
   1004#ifdef CONFIG_DEBUG_DCFLUSH
   1005		atomic_inc(&dcpage_flushes_xcall);
   1006#endif
   1007	}
   1008	__local_flush_dcache_page(page);
   1009
   1010	preempt_enable();
   1011}
   1012
   1013#ifdef CONFIG_KGDB
   1014void kgdb_roundup_cpus(void)
   1015{
   1016	smp_cross_call(&xcall_kgdb_capture, 0, 0, 0);
   1017}
   1018#endif
   1019
   1020void smp_fetch_global_regs(void)
   1021{
   1022	smp_cross_call(&xcall_fetch_glob_regs, 0, 0, 0);
   1023}
   1024
   1025void smp_fetch_global_pmu(void)
   1026{
   1027	if (tlb_type == hypervisor &&
   1028	    sun4v_chip_type >= SUN4V_CHIP_NIAGARA4)
   1029		smp_cross_call(&xcall_fetch_glob_pmu_n4, 0, 0, 0);
   1030	else
   1031		smp_cross_call(&xcall_fetch_glob_pmu, 0, 0, 0);
   1032}
   1033
   1034/* We know that the window frames of the user have been flushed
   1035 * to the stack before we get here because all callers of us
   1036 * are flush_tlb_*() routines, and these run after flush_cache_*()
   1037 * which performs the flushw.
   1038 *
 * mm_cpumask(mm) is a bit mask of the cpus on which an address
 * space has (potentially) executed; this is the heuristic
 * we use to limit cross calls.
   1042 */
   1043
   1044/* This currently is only used by the hugetlb arch pre-fault
   1045 * hook on UltraSPARC-III+ and later when changing the pagesize
   1046 * bits of the context register for an address space.
   1047 */
   1048void smp_flush_tlb_mm(struct mm_struct *mm)
   1049{
   1050	u32 ctx = CTX_HWBITS(mm->context);
   1051
   1052	get_cpu();
   1053
   1054	smp_cross_call_masked(&xcall_flush_tlb_mm,
   1055			      ctx, 0, 0,
   1056			      mm_cpumask(mm));
   1057
   1058	__flush_tlb_mm(ctx, SECONDARY_CONTEXT);
   1059
   1060	put_cpu();
   1061}
   1062
   1063struct tlb_pending_info {
   1064	unsigned long ctx;
   1065	unsigned long nr;
   1066	unsigned long *vaddrs;
   1067};
   1068
   1069static void tlb_pending_func(void *info)
   1070{
   1071	struct tlb_pending_info *t = info;
   1072
   1073	__flush_tlb_pending(t->ctx, t->nr, t->vaddrs);
   1074}
   1075
   1076void smp_flush_tlb_pending(struct mm_struct *mm, unsigned long nr, unsigned long *vaddrs)
   1077{
   1078	u32 ctx = CTX_HWBITS(mm->context);
   1079	struct tlb_pending_info info;
   1080
   1081	get_cpu();
   1082
   1083	info.ctx = ctx;
   1084	info.nr = nr;
   1085	info.vaddrs = vaddrs;
   1086
   1087	smp_call_function_many(mm_cpumask(mm), tlb_pending_func,
   1088			       &info, 1);
   1089
   1090	__flush_tlb_pending(ctx, nr, vaddrs);
   1091
   1092	put_cpu();
   1093}
   1094
   1095void smp_flush_tlb_page(struct mm_struct *mm, unsigned long vaddr)
   1096{
   1097	unsigned long context = CTX_HWBITS(mm->context);
   1098
   1099	get_cpu();
   1100
   1101	smp_cross_call_masked(&xcall_flush_tlb_page,
   1102			      context, vaddr, 0,
   1103			      mm_cpumask(mm));
   1104
   1105	__flush_tlb_page(context, vaddr);
   1106
   1107	put_cpu();
   1108}
   1109
   1110void smp_flush_tlb_kernel_range(unsigned long start, unsigned long end)
   1111{
   1112	start &= PAGE_MASK;
   1113	end    = PAGE_ALIGN(end);
   1114	if (start != end) {
   1115		smp_cross_call(&xcall_flush_tlb_kernel_range,
   1116			       0, start, end);
   1117
   1118		__flush_tlb_kernel_range(start, end);
   1119	}
   1120}
   1121
   1122/* CPU capture. */
   1123/* #define CAPTURE_DEBUG */
   1124extern unsigned long xcall_capture;
   1125
   1126static atomic_t smp_capture_depth = ATOMIC_INIT(0);
   1127static atomic_t smp_capture_registry = ATOMIC_INIT(0);
   1128static unsigned long penguins_are_doing_time;
   1129
   1130void smp_capture(void)
   1131{
   1132	int result = atomic_add_return(1, &smp_capture_depth);
   1133
   1134	if (result == 1) {
   1135		int ncpus = num_online_cpus();
   1136
   1137#ifdef CAPTURE_DEBUG
   1138		printk("CPU[%d]: Sending penguins to jail...",
   1139		       smp_processor_id());
   1140#endif
   1141		penguins_are_doing_time = 1;
   1142		atomic_inc(&smp_capture_registry);
   1143		smp_cross_call(&xcall_capture, 0, 0, 0);
   1144		while (atomic_read(&smp_capture_registry) != ncpus)
   1145			rmb();
   1146#ifdef CAPTURE_DEBUG
   1147		printk("done\n");
   1148#endif
   1149	}
   1150}
   1151
   1152void smp_release(void)
   1153{
   1154	if (atomic_dec_and_test(&smp_capture_depth)) {
   1155#ifdef CAPTURE_DEBUG
   1156		printk("CPU[%d]: Giving pardon to "
   1157		       "imprisoned penguins\n",
   1158		       smp_processor_id());
   1159#endif
   1160		penguins_are_doing_time = 0;
   1161		membar_safe("#StoreLoad");
   1162		atomic_dec(&smp_capture_registry);
   1163	}
   1164}
   1165
   1166/* Imprisoned penguins run with %pil == PIL_NORMAL_MAX, but PSTATE_IE
   1167 * set, so they can service tlb flush xcalls...
   1168 */
   1169extern void prom_world(int);
   1170
   1171void __irq_entry smp_penguin_jailcell(int irq, struct pt_regs *regs)
   1172{
   1173	clear_softint(1 << irq);
   1174
   1175	preempt_disable();
   1176
   1177	__asm__ __volatile__("flushw");
   1178	prom_world(1);
   1179	atomic_inc(&smp_capture_registry);
   1180	membar_safe("#StoreLoad");
   1181	while (penguins_are_doing_time)
   1182		rmb();
   1183	atomic_dec(&smp_capture_registry);
   1184	prom_world(0);
   1185
   1186	preempt_enable();
   1187}
   1188
   1189/* /proc/profile writes can call this, don't __init it please. */
   1190int setup_profiling_timer(unsigned int multiplier)
   1191{
   1192	return -EINVAL;
   1193}
   1194
   1195void __init smp_prepare_cpus(unsigned int max_cpus)
   1196{
   1197}
   1198
   1199void smp_prepare_boot_cpu(void)
   1200{
   1201}
   1202
   1203void __init smp_setup_processor_id(void)
   1204{
   1205	if (tlb_type == spitfire)
   1206		xcall_deliver_impl = spitfire_xcall_deliver;
   1207	else if (tlb_type == cheetah || tlb_type == cheetah_plus)
   1208		xcall_deliver_impl = cheetah_xcall_deliver;
   1209	else
   1210		xcall_deliver_impl = hypervisor_xcall_deliver;
   1211}
   1212
   1213void __init smp_fill_in_cpu_possible_map(void)
   1214{
   1215	int possible_cpus = num_possible_cpus();
   1216	int i;
   1217
   1218	if (possible_cpus > nr_cpu_ids)
   1219		possible_cpus = nr_cpu_ids;
   1220
   1221	for (i = 0; i < possible_cpus; i++)
   1222		set_cpu_possible(i, true);
   1223	for (; i < NR_CPUS; i++)
   1224		set_cpu_possible(i, false);
   1225}
   1226
   1227void smp_fill_in_sib_core_maps(void)
   1228{
   1229	unsigned int i;
   1230
   1231	for_each_present_cpu(i) {
   1232		unsigned int j;
   1233
   1234		cpumask_clear(&cpu_core_map[i]);
   1235		if (cpu_data(i).core_id == 0) {
   1236			cpumask_set_cpu(i, &cpu_core_map[i]);
   1237			continue;
   1238		}
   1239
   1240		for_each_present_cpu(j) {
   1241			if (cpu_data(i).core_id ==
   1242			    cpu_data(j).core_id)
   1243				cpumask_set_cpu(j, &cpu_core_map[i]);
   1244		}
   1245	}
   1246
   1247	for_each_present_cpu(i)  {
   1248		unsigned int j;
   1249
   1250		for_each_present_cpu(j)  {
   1251			if (cpu_data(i).max_cache_id ==
   1252			    cpu_data(j).max_cache_id)
   1253				cpumask_set_cpu(j, &cpu_core_sib_cache_map[i]);
   1254
   1255			if (cpu_data(i).sock_id == cpu_data(j).sock_id)
   1256				cpumask_set_cpu(j, &cpu_core_sib_map[i]);
   1257		}
   1258	}
   1259
   1260	for_each_present_cpu(i) {
   1261		unsigned int j;
   1262
   1263		cpumask_clear(&per_cpu(cpu_sibling_map, i));
   1264		if (cpu_data(i).proc_id == -1) {
   1265			cpumask_set_cpu(i, &per_cpu(cpu_sibling_map, i));
   1266			continue;
   1267		}
   1268
   1269		for_each_present_cpu(j) {
   1270			if (cpu_data(i).proc_id ==
   1271			    cpu_data(j).proc_id)
   1272				cpumask_set_cpu(j, &per_cpu(cpu_sibling_map, i));
   1273		}
   1274	}
   1275}
   1276
   1277int __cpu_up(unsigned int cpu, struct task_struct *tidle)
   1278{
   1279	int ret = smp_boot_one_cpu(cpu, tidle);
   1280
   1281	if (!ret) {
   1282		cpumask_set_cpu(cpu, &smp_commenced_mask);
   1283		while (!cpu_online(cpu))
   1284			mb();
   1285		if (!cpu_online(cpu)) {
   1286			ret = -ENODEV;
   1287		} else {
   1288			/* On SUN4V, writes to %tick and %stick are
   1289			 * not allowed.
   1290			 */
   1291			if (tlb_type != hypervisor)
   1292				smp_synchronize_one_tick(cpu);
   1293		}
   1294	}
   1295	return ret;
   1296}
   1297
   1298#ifdef CONFIG_HOTPLUG_CPU
   1299void cpu_play_dead(void)
   1300{
   1301	int cpu = smp_processor_id();
   1302	unsigned long pstate;
   1303
   1304	idle_task_exit();
   1305
   1306	if (tlb_type == hypervisor) {
   1307		struct trap_per_cpu *tb = &trap_block[cpu];
   1308
   1309		sun4v_cpu_qconf(HV_CPU_QUEUE_CPU_MONDO,
   1310				tb->cpu_mondo_pa, 0);
   1311		sun4v_cpu_qconf(HV_CPU_QUEUE_DEVICE_MONDO,
   1312				tb->dev_mondo_pa, 0);
   1313		sun4v_cpu_qconf(HV_CPU_QUEUE_RES_ERROR,
   1314				tb->resum_mondo_pa, 0);
   1315		sun4v_cpu_qconf(HV_CPU_QUEUE_NONRES_ERROR,
   1316				tb->nonresum_mondo_pa, 0);
   1317	}
   1318
   1319	cpumask_clear_cpu(cpu, &smp_commenced_mask);
   1320	membar_safe("#Sync");
   1321
   1322	local_irq_disable();
   1323
   1324	__asm__ __volatile__(
   1325		"rdpr	%%pstate, %0\n\t"
   1326		"wrpr	%0, %1, %%pstate"
   1327		: "=r" (pstate)
   1328		: "i" (PSTATE_IE));
   1329
   1330	while (1)
   1331		barrier();
   1332}
   1333
   1334int __cpu_disable(void)
   1335{
   1336	int cpu = smp_processor_id();
   1337	cpuinfo_sparc *c;
   1338	int i;
   1339
   1340	for_each_cpu(i, &cpu_core_map[cpu])
   1341		cpumask_clear_cpu(cpu, &cpu_core_map[i]);
   1342	cpumask_clear(&cpu_core_map[cpu]);
   1343
   1344	for_each_cpu(i, &per_cpu(cpu_sibling_map, cpu))
   1345		cpumask_clear_cpu(cpu, &per_cpu(cpu_sibling_map, i));
   1346	cpumask_clear(&per_cpu(cpu_sibling_map, cpu));
   1347
   1348	c = &cpu_data(cpu);
   1349
   1350	c->core_id = 0;
   1351	c->proc_id = -1;
   1352
   1353	smp_wmb();
   1354
   1355	/* Make sure no interrupts point to this cpu.  */
   1356	fixup_irqs();
   1357
   1358	local_irq_enable();
   1359	mdelay(1);
   1360	local_irq_disable();
   1361
   1362	set_cpu_online(cpu, false);
   1363
   1364	cpu_map_rebuild();
   1365
   1366	return 0;
   1367}
   1368
   1369void __cpu_die(unsigned int cpu)
   1370{
   1371	int i;
   1372
   1373	for (i = 0; i < 100; i++) {
   1374		smp_rmb();
   1375		if (!cpumask_test_cpu(cpu, &smp_commenced_mask))
   1376			break;
   1377		msleep(100);
   1378	}
   1379	if (cpumask_test_cpu(cpu, &smp_commenced_mask)) {
   1380		printk(KERN_ERR "CPU %u didn't die...\n", cpu);
   1381	} else {
   1382#if defined(CONFIG_SUN_LDOMS)
   1383		unsigned long hv_err;
   1384		int limit = 100;
   1385
   1386		do {
   1387			hv_err = sun4v_cpu_stop(cpu);
   1388			if (hv_err == HV_EOK) {
   1389				set_cpu_present(cpu, false);
   1390				break;
   1391			}
   1392		} while (--limit > 0);
   1393		if (limit <= 0) {
   1394			printk(KERN_ERR "sun4v_cpu_stop() fails err=%lu\n",
   1395			       hv_err);
   1396		}
   1397#endif
   1398	}
   1399}
   1400#endif
   1401
   1402void __init smp_cpus_done(unsigned int max_cpus)
   1403{
   1404}
   1405
   1406static void send_cpu_ipi(int cpu)
   1407{
   1408	xcall_deliver((u64) &xcall_receive_signal,
   1409			0, 0, cpumask_of(cpu));
   1410}
   1411
   1412void scheduler_poke(void)
   1413{
   1414	if (!cpu_poke)
   1415		return;
   1416
   1417	if (!__this_cpu_read(poke))
   1418		return;
   1419
   1420	__this_cpu_write(poke, false);
   1421	set_softint(1 << PIL_SMP_RECEIVE_SIGNAL);
   1422}
   1423
   1424static unsigned long send_cpu_poke(int cpu)
   1425{
   1426	unsigned long hv_err;
   1427
   1428	per_cpu(poke, cpu) = true;
   1429	hv_err = sun4v_cpu_poke(cpu);
   1430	if (hv_err != HV_EOK) {
   1431		per_cpu(poke, cpu) = false;
   1432		pr_err_ratelimited("%s: sun4v_cpu_poke() fails err=%lu\n",
   1433				    __func__, hv_err);
   1434	}
   1435
   1436	return hv_err;
   1437}
   1438
   1439void smp_send_reschedule(int cpu)
   1440{
   1441	if (cpu == smp_processor_id()) {
   1442		WARN_ON_ONCE(preemptible());
   1443		set_softint(1 << PIL_SMP_RECEIVE_SIGNAL);
   1444		return;
   1445	}
   1446
   1447	/* Use cpu poke to resume idle cpu if supported. */
   1448	if (cpu_poke && idle_cpu(cpu)) {
   1449		unsigned long ret;
   1450
   1451		ret = send_cpu_poke(cpu);
   1452		if (ret == HV_EOK)
   1453			return;
   1454	}
   1455
   1456	/* Use IPI in following cases:
   1457	 * - cpu poke not supported
   1458	 * - cpu not idle
   1459	 * - send_cpu_poke() returns with error
   1460	 */
   1461	send_cpu_ipi(cpu);
   1462}
   1463
   1464void smp_init_cpu_poke(void)
   1465{
   1466	unsigned long major;
   1467	unsigned long minor;
   1468	int ret;
   1469
   1470	if (tlb_type != hypervisor)
   1471		return;
   1472
   1473	ret = sun4v_hvapi_get(HV_GRP_CORE, &major, &minor);
   1474	if (ret) {
   1475		pr_debug("HV_GRP_CORE is not registered\n");
   1476		return;
   1477	}
   1478
   1479	if (major == 1 && minor >= 6) {
   1480		/* CPU POKE is registered. */
   1481		cpu_poke = true;
   1482		return;
   1483	}
   1484
   1485	pr_debug("CPU_POKE not supported\n");
   1486}
   1487
   1488void __irq_entry smp_receive_signal_client(int irq, struct pt_regs *regs)
   1489{
   1490	clear_softint(1 << irq);
   1491	scheduler_ipi();
   1492}
   1493
   1494static void stop_this_cpu(void *dummy)
   1495{
   1496	set_cpu_online(smp_processor_id(), false);
   1497	prom_stopself();
   1498}
   1499
   1500void smp_send_stop(void)
   1501{
   1502	int cpu;
   1503
   1504	if (tlb_type == hypervisor) {
   1505		int this_cpu = smp_processor_id();
   1506#ifdef CONFIG_SERIAL_SUNHV
   1507		sunhv_migrate_hvcons_irq(this_cpu);
   1508#endif
   1509		for_each_online_cpu(cpu) {
   1510			if (cpu == this_cpu)
   1511				continue;
   1512
   1513			set_cpu_online(cpu, false);
   1514#ifdef CONFIG_SUN_LDOMS
   1515			if (ldom_domaining_enabled) {
   1516				unsigned long hv_err;
   1517				hv_err = sun4v_cpu_stop(cpu);
   1518				if (hv_err)
   1519					printk(KERN_ERR "sun4v_cpu_stop() "
   1520					       "failed err=%lu\n", hv_err);
   1521			} else
   1522#endif
   1523				prom_stopcpu_cpuid(cpu);
   1524		}
   1525	} else
   1526		smp_call_function(stop_this_cpu, NULL, 0);
   1527}
   1528
   1529static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
   1530{
   1531	if (cpu_to_node(from) == cpu_to_node(to))
   1532		return LOCAL_DISTANCE;
   1533	else
   1534		return REMOTE_DISTANCE;
   1535}
   1536
   1537static int __init pcpu_cpu_to_node(int cpu)
   1538{
   1539	return cpu_to_node(cpu);
   1540}
   1541
   1542void __init setup_per_cpu_areas(void)
   1543{
   1544	unsigned long delta;
   1545	unsigned int cpu;
   1546	int rc = -EINVAL;
   1547
   1548	if (pcpu_chosen_fc != PCPU_FC_PAGE) {
   1549		rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
   1550					    PERCPU_DYNAMIC_RESERVE, 4 << 20,
   1551					    pcpu_cpu_distance,
   1552					    pcpu_cpu_to_node);
   1553		if (rc)
   1554			pr_warn("PERCPU: %s allocator failed (%d), "
   1555				"falling back to page size\n",
   1556				pcpu_fc_names[pcpu_chosen_fc], rc);
   1557	}
   1558	if (rc < 0)
   1559		rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE,
   1560					   pcpu_cpu_to_node);
   1561	if (rc < 0)
   1562		panic("cannot initialize percpu area (err=%d)", rc);
   1563
   1564	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
   1565	for_each_possible_cpu(cpu)
   1566		__per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
   1567
   1568	/* Setup %g5 for the boot cpu.  */
   1569	__local_per_cpu_offset = __per_cpu_offset(smp_processor_id());
   1570
   1571	of_fill_in_cpu_data();
   1572	if (tlb_type == hypervisor)
   1573		mdesc_fill_in_cpu_data(cpu_all_mask);
   1574}