cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

opal.c (30426B)


      1// SPDX-License-Identifier: GPL-2.0-or-later
      2/*
      3 * PowerNV OPAL high level interfaces
      4 *
      5 * Copyright 2011 IBM Corp.
      6 */
      7
      8#define pr_fmt(fmt)	"opal: " fmt
      9
     10#include <linux/printk.h>
     11#include <linux/types.h>
     12#include <linux/of.h>
     13#include <linux/of_fdt.h>
     14#include <linux/of_platform.h>
     15#include <linux/of_address.h>
     16#include <linux/interrupt.h>
     17#include <linux/notifier.h>
     18#include <linux/slab.h>
     19#include <linux/sched.h>
     20#include <linux/kobject.h>
     21#include <linux/delay.h>
     22#include <linux/memblock.h>
     23#include <linux/kthread.h>
     24#include <linux/freezer.h>
     25#include <linux/kmsg_dump.h>
     26#include <linux/console.h>
     27#include <linux/sched/debug.h>
     28
     29#include <asm/machdep.h>
     30#include <asm/opal.h>
     31#include <asm/firmware.h>
     32#include <asm/mce.h>
     33#include <asm/imc-pmu.h>
     34#include <asm/bug.h>
     35
     36#include "powernv.h"
     37
/* Upper bound on the number of messages kept on msg_list for later replay. */
#define OPAL_MSG_QUEUE_MAX 16

/* One OPAL message held back until a notifier registers for its type. */
struct opal_msg_node {
	struct list_head	list;	/* linkage on msg_list */
	struct opal_msg		msg;	/* copy of the queued message */
};

/* msg_list_lock is held while msg_list is appended to or replayed. */
static DEFINE_SPINLOCK(msg_list_lock);
static LIST_HEAD(msg_list);

/* /sys/firmware/opal */
struct kobject *opal_kobj;

/*
 * Firmware location, filled in by early_init_dt_scan_opal() from the
 * "opal-base-address", "opal-entry-address" and "opal-runtime-size"
 * properties of the ibm,opal node.
 */
struct opal {
	u64 base;
	u64 entry;
	u64 size;
} opal;

/*
 * One machine-check recoverable range: a fault with nip in
 * [start_addr, end_addr) is redirected to recover_addr
 * (see find_recovery_address()).
 */
struct mcheck_recoverable_range {
	u64 start_addr;
	u64 end_addr;
	u64 recover_addr;
};

/* Number of entries currently on msg_list. */
static int msg_list_size;

/* Table built from "mcheck-recoverable-ranges" and its entry count. */
static struct mcheck_recoverable_range *mc_recoverable_range;
static int mc_recoverable_range_len;

/* The /ibm,opal device-tree node, looked up in opal_init(). */
struct device_node *opal_node;
/* Taken by __opal_put_chars() for atomic console writes. */
static DEFINE_SPINLOCK(opal_write_lock);
/* One notifier chain per OPAL message type. */
static struct atomic_notifier_head opal_msg_notifier_head[OPAL_MSG_TYPE_MAX];
/* Value of "ibm,heartbeat-ms" in milliseconds; 0 when the property is absent. */
static uint32_t opal_heartbeat;
/* The kopald polling thread, if one was started. */
static struct task_struct *kopald_tsk;
/* Receive buffer handed to opal_get_msg(), and its size in bytes. */
static struct opal_msg *opal_msg;
static u32 opal_msg_size __ro_after_init;
     75
     76void __init opal_configure_cores(void)
     77{
     78	u64 reinit_flags = 0;
     79
     80	/* Do the actual re-init, This will clobber all FPRs, VRs, etc...
     81	 *
     82	 * It will preserve non volatile GPRs and HSPRG0/1. It will
     83	 * also restore HIDs and other SPRs to their original value
     84	 * but it might clobber a bunch.
     85	 */
     86#ifdef __BIG_ENDIAN__
     87	reinit_flags |= OPAL_REINIT_CPUS_HILE_BE;
     88#else
     89	reinit_flags |= OPAL_REINIT_CPUS_HILE_LE;
     90#endif
     91
     92	/*
     93	 * POWER9 always support running hash:
     94	 *  ie. Host hash  supports  hash guests
     95	 *      Host radix supports  hash/radix guests
     96	 */
     97	if (early_cpu_has_feature(CPU_FTR_ARCH_300)) {
     98		reinit_flags |= OPAL_REINIT_CPUS_MMU_HASH;
     99		if (early_radix_enabled())
    100			reinit_flags |= OPAL_REINIT_CPUS_MMU_RADIX;
    101	}
    102
    103	opal_reinit_cpus(reinit_flags);
    104
    105	/* Restore some bits */
    106	if (cur_cpu_spec->cpu_restore)
    107		cur_cpu_spec->cpu_restore();
    108}
    109
    110int __init early_init_dt_scan_opal(unsigned long node,
    111				   const char *uname, int depth, void *data)
    112{
    113	const void *basep, *entryp, *sizep;
    114	int basesz, entrysz, runtimesz;
    115
    116	if (depth != 1 || strcmp(uname, "ibm,opal") != 0)
    117		return 0;
    118
    119	basep  = of_get_flat_dt_prop(node, "opal-base-address", &basesz);
    120	entryp = of_get_flat_dt_prop(node, "opal-entry-address", &entrysz);
    121	sizep = of_get_flat_dt_prop(node, "opal-runtime-size", &runtimesz);
    122
    123	if (!basep || !entryp || !sizep)
    124		return 1;
    125
    126	opal.base = of_read_number(basep, basesz/4);
    127	opal.entry = of_read_number(entryp, entrysz/4);
    128	opal.size = of_read_number(sizep, runtimesz/4);
    129
    130	pr_debug("OPAL Base  = 0x%llx (basep=%p basesz=%d)\n",
    131		 opal.base, basep, basesz);
    132	pr_debug("OPAL Entry = 0x%llx (entryp=%p basesz=%d)\n",
    133		 opal.entry, entryp, entrysz);
    134	pr_debug("OPAL Entry = 0x%llx (sizep=%p runtimesz=%d)\n",
    135		 opal.size, sizep, runtimesz);
    136
    137	if (of_flat_dt_is_compatible(node, "ibm,opal-v3")) {
    138		powerpc_firmware_features |= FW_FEATURE_OPAL;
    139		pr_debug("OPAL detected !\n");
    140	} else {
    141		panic("OPAL != V3 detected, no longer supported.\n");
    142	}
    143
    144	return 1;
    145}
    146
    147int __init early_init_dt_scan_recoverable_ranges(unsigned long node,
    148				   const char *uname, int depth, void *data)
    149{
    150	int i, psize, size;
    151	const __be32 *prop;
    152
    153	if (depth != 1 || strcmp(uname, "ibm,opal") != 0)
    154		return 0;
    155
    156	prop = of_get_flat_dt_prop(node, "mcheck-recoverable-ranges", &psize);
    157
    158	if (!prop)
    159		return 1;
    160
    161	pr_debug("Found machine check recoverable ranges.\n");
    162
    163	/*
    164	 * Calculate number of available entries.
    165	 *
    166	 * Each recoverable address range entry is (start address, len,
    167	 * recovery address), 2 cells each for start and recovery address,
    168	 * 1 cell for len, totalling 5 cells per entry.
    169	 */
    170	mc_recoverable_range_len = psize / (sizeof(*prop) * 5);
    171
    172	/* Sanity check */
    173	if (!mc_recoverable_range_len)
    174		return 1;
    175
    176	/* Size required to hold all the entries. */
    177	size = mc_recoverable_range_len *
    178			sizeof(struct mcheck_recoverable_range);
    179
    180	/*
    181	 * Allocate a buffer to hold the MC recoverable ranges.
    182	 */
    183	mc_recoverable_range = memblock_alloc(size, __alignof__(u64));
    184	if (!mc_recoverable_range)
    185		panic("%s: Failed to allocate %u bytes align=0x%lx\n",
    186		      __func__, size, __alignof__(u64));
    187
    188	for (i = 0; i < mc_recoverable_range_len; i++) {
    189		mc_recoverable_range[i].start_addr =
    190					of_read_number(prop + (i * 5) + 0, 2);
    191		mc_recoverable_range[i].end_addr =
    192					mc_recoverable_range[i].start_addr +
    193					of_read_number(prop + (i * 5) + 2, 1);
    194		mc_recoverable_range[i].recover_addr =
    195					of_read_number(prop + (i * 5) + 3, 2);
    196
    197		pr_debug("Machine check recoverable range: %llx..%llx: %llx\n",
    198				mc_recoverable_range[i].start_addr,
    199				mc_recoverable_range[i].end_addr,
    200				mc_recoverable_range[i].recover_addr);
    201	}
    202	return 1;
    203}
    204
/*
 * Early initcall that registers exception handlers with OPAL.
 *
 * The whole body is compiled only for big-endian kernels; on
 * little-endian builds the function just returns 0. Returns -ENODEV
 * when the OPAL firmware feature has not been detected.
 * NOTE(review): the return values of opal_register_exception_handler()
 * are ignored here.
 */
static int __init opal_register_exception_handlers(void)
{
#ifdef __BIG_ENDIAN__
	u64 glue;

	if (!(powerpc_firmware_features & FW_FEATURE_OPAL))
		return -ENODEV;

	/* Hookup some exception handlers except machine check. We use the
	 * fwnmi area at 0x7000 to provide the glue space to OPAL
	 */
	glue = 0x7000;

	/*
	 * Only ancient OPAL firmware requires this.
	 * Specifically, firmware from FW810.00 (released June 2014)
	 * through FW810.20 (Released October 2014).
	 *
	 * Check if we are running on newer (post Oct 2014) firmware that
	 * exports the OPAL_HANDLE_HMI token. If yes, then don't ask OPAL to
	 * patch the HMI interrupt and we catch it directly in Linux.
	 *
	 * For older firmware (i.e < FW810.20), we fallback to old behavior and
	 * let OPAL patch the HMI vector and handle it inside OPAL firmware.
	 *
	 * For newer firmware we catch/handle the HMI directly in Linux.
	 */
	if (!opal_check_token(OPAL_HANDLE_HMI)) {
		pr_info("Old firmware detected, OPAL handles HMIs.\n");
		opal_register_exception_handler(
				OPAL_HYPERVISOR_MAINTENANCE_HANDLER,
				0, glue);
		/* Advance the glue address for the next handler. */
		glue += 128;
	}

	/*
	 * Only applicable to ancient firmware, all modern
	 * (post March 2015/skiboot 5.0) firmware will just return
	 * OPAL_UNSUPPORTED.
	 */
	opal_register_exception_handler(OPAL_SOFTPATCH_HANDLER, 0, glue);
#endif

	return 0;
}
machine_early_initcall(powernv, opal_register_exception_handlers);
    251
    252static void queue_replay_msg(void *msg)
    253{
    254	struct opal_msg_node *msg_node;
    255
    256	if (msg_list_size < OPAL_MSG_QUEUE_MAX) {
    257		msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC);
    258		if (msg_node) {
    259			INIT_LIST_HEAD(&msg_node->list);
    260			memcpy(&msg_node->msg, msg, sizeof(struct opal_msg));
    261			list_add_tail(&msg_node->list, &msg_list);
    262			msg_list_size++;
    263		} else
    264			pr_warn_once("message queue no memory\n");
    265
    266		if (msg_list_size >= OPAL_MSG_QUEUE_MAX)
    267			pr_warn_once("message queue full\n");
    268	}
    269}
    270
    271static void dequeue_replay_msg(enum opal_msg_type msg_type)
    272{
    273	struct opal_msg_node *msg_node, *tmp;
    274
    275	list_for_each_entry_safe(msg_node, tmp, &msg_list, list) {
    276		if (be32_to_cpu(msg_node->msg.msg_type) != msg_type)
    277			continue;
    278
    279		atomic_notifier_call_chain(&opal_msg_notifier_head[msg_type],
    280					msg_type,
    281					&msg_node->msg);
    282
    283		list_del(&msg_node->list);
    284		kfree(msg_node);
    285		msg_list_size--;
    286	}
    287}
    288
    289/*
    290 * Opal message notifier based on message type. Allow subscribers to get
    291 * notified for specific messgae type.
    292 */
    293int opal_message_notifier_register(enum opal_msg_type msg_type,
    294					struct notifier_block *nb)
    295{
    296	int ret;
    297	unsigned long flags;
    298
    299	if (!nb || msg_type >= OPAL_MSG_TYPE_MAX) {
    300		pr_warn("%s: Invalid arguments, msg_type:%d\n",
    301			__func__, msg_type);
    302		return -EINVAL;
    303	}
    304
    305	spin_lock_irqsave(&msg_list_lock, flags);
    306	ret = atomic_notifier_chain_register(
    307		&opal_msg_notifier_head[msg_type], nb);
    308
    309	/*
    310	 * If the registration succeeded, replay any queued messages that came
    311	 * in prior to the notifier chain registration. msg_list_lock held here
    312	 * to ensure they're delivered prior to any subsequent messages.
    313	 */
    314	if (ret == 0)
    315		dequeue_replay_msg(msg_type);
    316
    317	spin_unlock_irqrestore(&msg_list_lock, flags);
    318
    319	return ret;
    320}
    321EXPORT_SYMBOL_GPL(opal_message_notifier_register);
    322
    323int opal_message_notifier_unregister(enum opal_msg_type msg_type,
    324				     struct notifier_block *nb)
    325{
    326	return atomic_notifier_chain_unregister(
    327			&opal_msg_notifier_head[msg_type], nb);
    328}
    329EXPORT_SYMBOL_GPL(opal_message_notifier_unregister);
    330
    331static void opal_message_do_notify(uint32_t msg_type, void *msg)
    332{
    333	unsigned long flags;
    334	bool queued = false;
    335
    336	spin_lock_irqsave(&msg_list_lock, flags);
    337	if (opal_msg_notifier_head[msg_type].head == NULL) {
    338		/*
    339		 * Queue up the msg since no notifiers have registered
    340		 * yet for this msg_type.
    341		 */
    342		queue_replay_msg(msg);
    343		queued = true;
    344	}
    345	spin_unlock_irqrestore(&msg_list_lock, flags);
    346
    347	if (queued)
    348		return;
    349
    350	/* notify subscribers */
    351	atomic_notifier_call_chain(&opal_msg_notifier_head[msg_type],
    352					msg_type, msg);
    353}
    354
    355static void opal_handle_message(void)
    356{
    357	s64 ret;
    358	u32 type;
    359
    360	ret = opal_get_msg(__pa(opal_msg), opal_msg_size);
    361	/* No opal message pending. */
    362	if (ret == OPAL_RESOURCE)
    363		return;
    364
    365	/* check for errors. */
    366	if (ret) {
    367		pr_warn("%s: Failed to retrieve opal message, err=%lld\n",
    368			__func__, ret);
    369		return;
    370	}
    371
    372	type = be32_to_cpu(opal_msg->msg_type);
    373
    374	/* Sanity check */
    375	if (type >= OPAL_MSG_TYPE_MAX) {
    376		pr_warn_once("%s: Unknown message type: %u\n", __func__, type);
    377		return;
    378	}
    379	opal_message_do_notify(type, (void *)opal_msg);
    380}
    381
/*
 * Interrupt handler installed by opal_message_init() for the
 * "opal-msg" irq: processes one pending OPAL message and always
 * reports the interrupt as handled. @irq and @data are unused.
 */
static irqreturn_t opal_message_notify(int irq, void *data)
{
	opal_handle_message();
	return IRQ_HANDLED;
}
    387
    388static int __init opal_message_init(struct device_node *opal_node)
    389{
    390	int ret, i, irq;
    391
    392	ret = of_property_read_u32(opal_node, "opal-msg-size", &opal_msg_size);
    393	if (ret) {
    394		pr_notice("Failed to read opal-msg-size property\n");
    395		opal_msg_size = sizeof(struct opal_msg);
    396	}
    397
    398	opal_msg = kmalloc(opal_msg_size, GFP_KERNEL);
    399	if (!opal_msg) {
    400		opal_msg_size = sizeof(struct opal_msg);
    401		/* Try to allocate fixed message size */
    402		opal_msg = kmalloc(opal_msg_size, GFP_KERNEL);
    403		BUG_ON(opal_msg == NULL);
    404	}
    405
    406	for (i = 0; i < OPAL_MSG_TYPE_MAX; i++)
    407		ATOMIC_INIT_NOTIFIER_HEAD(&opal_msg_notifier_head[i]);
    408
    409	irq = opal_event_request(ilog2(OPAL_EVENT_MSG_PENDING));
    410	if (!irq) {
    411		pr_err("%s: Can't register OPAL event irq (%d)\n",
    412		       __func__, irq);
    413		return irq;
    414	}
    415
    416	ret = request_irq(irq, opal_message_notify,
    417			IRQ_TYPE_LEVEL_HIGH, "opal-msg", NULL);
    418	if (ret) {
    419		pr_err("%s: Can't request OPAL event irq (%d)\n",
    420		       __func__, ret);
    421		return ret;
    422	}
    423
    424	return 0;
    425}
    426
    427int opal_get_chars(uint32_t vtermno, char *buf, int count)
    428{
    429	s64 rc;
    430	__be64 evt, len;
    431
    432	if (!opal.entry)
    433		return -ENODEV;
    434	opal_poll_events(&evt);
    435	if ((be64_to_cpu(evt) & OPAL_EVENT_CONSOLE_INPUT) == 0)
    436		return 0;
    437	len = cpu_to_be64(count);
    438	rc = opal_console_read(vtermno, &len, buf);
    439	if (rc == OPAL_SUCCESS)
    440		return be64_to_cpu(len);
    441	return 0;
    442}
    443
/*
 * Write @total_len bytes from @data to OPAL console @vtermno.
 *
 * When @atomic is true the whole operation runs under opal_write_lock
 * with interrupts disabled.
 *
 * Returns:
 *   -ENODEV   OPAL has no entry point;
 *   total_len the buffer-space query failed, so the characters are
 *             dropped and reported as written;
 *   -EAGAIN   not enough buffer space, firmware busy, or zero bytes
 *             were written;
 *   the value of opal_error_code() for any other firmware error;
 *   otherwise the number of bytes written (may be short).
 */
static int __opal_put_chars(uint32_t vtermno, const char *data, int total_len, bool atomic)
{
	unsigned long flags = 0 /* shut up gcc */;
	int written;
	__be64 olen;
	s64 rc;

	if (!opal.entry)
		return -ENODEV;

	if (atomic)
		spin_lock_irqsave(&opal_write_lock, flags);
	/* Ask how much room the console has before attempting the write. */
	rc = opal_console_write_buffer_space(vtermno, &olen);
	if (rc || be64_to_cpu(olen) < total_len) {
		/* Closed -> drop characters */
		if (rc)
			written = total_len;
		else
			written = -EAGAIN;
		goto out;
	}

	/* Should not get a partial write here because space is available. */
	olen = cpu_to_be64(total_len);
	rc = opal_console_write(vtermno, &olen, data);
	if (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
		/* Poll events on BUSY_EVENT before asking the caller to retry. */
		if (rc == OPAL_BUSY_EVENT)
			opal_poll_events(NULL);
		written = -EAGAIN;
		goto out;
	}

	/* Closed or other error drop */
	if (rc != OPAL_SUCCESS) {
		written = opal_error_code(rc);
		goto out;
	}

	/* olen now holds the count firmware actually consumed. */
	written = be64_to_cpu(olen);
	if (written < total_len) {
		if (atomic) {
			/* Should not happen */
			pr_warn("atomic console write returned partial "
				"len=%d written=%d\n", total_len, written);
		}
		if (!written)
			written = -EAGAIN;
	}

out:
	if (atomic)
		spin_unlock_irqrestore(&opal_write_lock, flags);

	return written;
}
    499
/*
 * Write to the OPAL console without taking opal_write_lock.
 * Return values are those of __opal_put_chars().
 */
int opal_put_chars(uint32_t vtermno, const char *data, int total_len)
{
	return __opal_put_chars(vtermno, data, total_len, false);
}
    504
/*
 * opal_put_chars_atomic will not perform partial-writes. Data will be
 * atomically written to the terminal or not at all. This is not strictly
 * true at the moment because console space can race with OPAL's console
 * writes.
 *
 * The write is performed with opal_write_lock held and interrupts
 * disabled; return values are those of __opal_put_chars().
 */
int opal_put_chars_atomic(uint32_t vtermno, const char *data, int total_len)
{
	return __opal_put_chars(vtermno, data, total_len, true);
}
    515
    516static s64 __opal_flush_console(uint32_t vtermno)
    517{
    518	s64 rc;
    519
    520	if (!opal_check_token(OPAL_CONSOLE_FLUSH)) {
    521		__be64 evt;
    522
    523		/*
    524		 * If OPAL_CONSOLE_FLUSH is not implemented in the firmware,
    525		 * the console can still be flushed by calling the polling
    526		 * function while it has OPAL_EVENT_CONSOLE_OUTPUT events.
    527		 */
    528		WARN_ONCE(1, "opal: OPAL_CONSOLE_FLUSH missing.\n");
    529
    530		opal_poll_events(&evt);
    531		if (!(be64_to_cpu(evt) & OPAL_EVENT_CONSOLE_OUTPUT))
    532			return OPAL_SUCCESS;
    533		return OPAL_BUSY;
    534
    535	} else {
    536		rc = opal_console_flush(vtermno);
    537		if (rc == OPAL_BUSY_EVENT) {
    538			opal_poll_events(NULL);
    539			rc = OPAL_BUSY;
    540		}
    541		return rc;
    542	}
    543
    544}
    545
    546/*
    547 * opal_flush_console spins until the console is flushed
    548 */
    549int opal_flush_console(uint32_t vtermno)
    550{
    551	for (;;) {
    552		s64 rc = __opal_flush_console(vtermno);
    553
    554		if (rc == OPAL_BUSY || rc == OPAL_PARTIAL) {
    555			mdelay(1);
    556			continue;
    557		}
    558
    559		return opal_error_code(rc);
    560	}
    561}
    562
    563/*
    564 * opal_flush_chars is an hvc interface that sleeps until the console is
    565 * flushed if wait, otherwise it will return -EBUSY if the console has data,
    566 * -EAGAIN if it has data and some of it was flushed.
    567 */
    568int opal_flush_chars(uint32_t vtermno, bool wait)
    569{
    570	for (;;) {
    571		s64 rc = __opal_flush_console(vtermno);
    572
    573		if (rc == OPAL_BUSY || rc == OPAL_PARTIAL) {
    574			if (wait) {
    575				msleep(OPAL_BUSY_DELAY_MS);
    576				continue;
    577			}
    578			if (rc == OPAL_PARTIAL)
    579				return -EAGAIN;
    580		}
    581
    582		return opal_error_code(rc);
    583	}
    584}
    585
    586static int opal_recover_mce(struct pt_regs *regs,
    587					struct machine_check_event *evt)
    588{
    589	int recovered = 0;
    590
    591	if (regs_is_unrecoverable(regs)) {
    592		/* If MSR_RI isn't set, we cannot recover */
    593		pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n");
    594		recovered = 0;
    595	} else if (evt->disposition == MCE_DISPOSITION_RECOVERED) {
    596		/* Platform corrected itself */
    597		recovered = 1;
    598	} else if (evt->severity == MCE_SEV_FATAL) {
    599		/* Fatal machine check */
    600		pr_err("Machine check interrupt is fatal\n");
    601		recovered = 0;
    602	}
    603
    604	if (!recovered && evt->sync_error) {
    605		/*
    606		 * Try to kill processes if we get a synchronous machine check
    607		 * (e.g., one caused by execution of this instruction). This
    608		 * will devolve into a panic if we try to kill init or are in
    609		 * an interrupt etc.
    610		 *
    611		 * TODO: Queue up this address for hwpoisioning later.
    612		 * TODO: This is not quite right for d-side machine
    613		 *       checks ->nip is not necessarily the important
    614		 *       address.
    615		 */
    616		if ((user_mode(regs))) {
    617			_exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
    618			recovered = 1;
    619		} else if (die_will_crash()) {
    620			/*
    621			 * die() would kill the kernel, so better to go via
    622			 * the platform reboot code that will log the
    623			 * machine check.
    624			 */
    625			recovered = 0;
    626		} else {
    627			die_mce("Machine check", regs, SIGBUS);
    628			recovered = 1;
    629		}
    630	}
    631
    632	return recovered;
    633}
    634
    635void __noreturn pnv_platform_error_reboot(struct pt_regs *regs, const char *msg)
    636{
    637	panic_flush_kmsg_start();
    638
    639	pr_emerg("Hardware platform error: %s\n", msg);
    640	if (regs)
    641		show_regs(regs);
    642	smp_send_stop();
    643
    644	panic_flush_kmsg_end();
    645
    646	/*
    647	 * Don't bother to shut things down because this will
    648	 * xstop the system.
    649	 */
    650	if (opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR, msg)
    651						== OPAL_UNSUPPORTED) {
    652		pr_emerg("Reboot type %d not supported for %s\n",
    653				OPAL_REBOOT_PLATFORM_ERROR, msg);
    654	}
    655
    656	/*
    657	 * We reached here. There can be three possibilities:
    658	 * 1. We are running on a firmware level that do not support
    659	 *    opal_cec_reboot2()
    660	 * 2. We are running on a firmware level that do not support
    661	 *    OPAL_REBOOT_PLATFORM_ERROR reboot type.
    662	 * 3. We are running on FSP based system that does not need
    663	 *    opal to trigger checkstop explicitly for error analysis.
    664	 *    The FSP PRD component would have already got notified
    665	 *    about this error through other channels.
    666	 * 4. We are running on a newer skiboot that by default does
    667	 *    not cause a checkstop, drops us back to the kernel to
    668	 *    extract context and state at the time of the error.
    669	 */
    670
    671	panic(msg);
    672}
    673
/*
 * Machine check handler for OPAL platforms.
 *
 * Returns 0 when there is no queued MCE event or its version is not
 * MCE_V1, and 1 when opal_recover_mce() reports recovery. Otherwise
 * control passes to pnv_platform_error_reboot(), which is declared
 * __noreturn, so the function never falls off its end.
 */
int opal_machine_check(struct pt_regs *regs)
{
	struct machine_check_event evt;

	if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
		return 0;

	/* Print things out */
	if (evt.version != MCE_V1) {
		pr_err("Machine Check Exception, Unknown event version %d !\n",
		       evt.version);
		return 0;
	}
	machine_check_print_event_info(&evt, user_mode(regs), false);

	if (opal_recover_mce(regs, &evt))
		return 1;

	/* Not recoverable: log it and go down via the platform reboot path. */
	pnv_platform_error_reboot(regs, "Unrecoverable Machine Check exception");
}
    694
    695/* Early hmi handler called in real mode. */
    696int opal_hmi_exception_early(struct pt_regs *regs)
    697{
    698	s64 rc;
    699
    700	/*
    701	 * call opal hmi handler. Pass paca address as token.
    702	 * The return value OPAL_SUCCESS is an indication that there is
    703	 * an HMI event generated waiting to pull by Linux.
    704	 */
    705	rc = opal_handle_hmi();
    706	if (rc == OPAL_SUCCESS) {
    707		local_paca->hmi_event_available = 1;
    708		return 1;
    709	}
    710	return 0;
    711}
    712
    713int opal_hmi_exception_early2(struct pt_regs *regs)
    714{
    715	s64 rc;
    716	__be64 out_flags;
    717
    718	/*
    719	 * call opal hmi handler.
    720	 * Check 64-bit flag mask to find out if an event was generated,
    721	 * and whether TB is still valid or not etc.
    722	 */
    723	rc = opal_handle_hmi2(&out_flags);
    724	if (rc != OPAL_SUCCESS)
    725		return 0;
    726
    727	if (be64_to_cpu(out_flags) & OPAL_HMI_FLAGS_NEW_EVENT)
    728		local_paca->hmi_event_available = 1;
    729	if (be64_to_cpu(out_flags) & OPAL_HMI_FLAGS_TOD_TB_FAIL)
    730		tb_invalid = true;
    731	return 1;
    732}
    733
    734/* HMI exception handler called in virtual mode when irqs are next enabled. */
    735int opal_handle_hmi_exception(struct pt_regs *regs)
    736{
    737	/*
    738	 * Check if HMI event is available.
    739	 * if Yes, then wake kopald to process them.
    740	 */
    741	if (!local_paca->hmi_event_available)
    742		return 0;
    743
    744	local_paca->hmi_event_available = 0;
    745	opal_wake_poller();
    746
    747	return 1;
    748}
    749
    750static uint64_t find_recovery_address(uint64_t nip)
    751{
    752	int i;
    753
    754	for (i = 0; i < mc_recoverable_range_len; i++)
    755		if ((nip >= mc_recoverable_range[i].start_addr) &&
    756		    (nip < mc_recoverable_range[i].end_addr))
    757		    return mc_recoverable_range[i].recover_addr;
    758	return 0;
    759}
    760
    761bool opal_mce_check_early_recovery(struct pt_regs *regs)
    762{
    763	uint64_t recover_addr = 0;
    764
    765	if (!opal.base || !opal.size)
    766		goto out;
    767
    768	if ((regs->nip >= opal.base) &&
    769			(regs->nip < (opal.base + opal.size)))
    770		recover_addr = find_recovery_address(regs->nip);
    771
    772	/*
    773	 * Setup regs->nip to rfi into fixup address.
    774	 */
    775	if (recover_addr)
    776		regs_set_return_ip(regs, recover_addr);
    777
    778out:
    779	return !!recover_addr;
    780}
    781
    782static int __init opal_sysfs_init(void)
    783{
    784	opal_kobj = kobject_create_and_add("opal", firmware_kobj);
    785	if (!opal_kobj) {
    786		pr_warn("kobject_create_and_add opal failed\n");
    787		return -ENOMEM;
    788	}
    789
    790	return 0;
    791}
    792
/*
 * Read handler installed by opal_add_one_export() on its bin
 * attributes: serves up to @count bytes starting at @off from the
 * buffer in bin_attr->private, bounded by bin_attr->size, and returns
 * the result of memory_read_from_buffer().
 */
static ssize_t export_attr_read(struct file *fp, struct kobject *kobj,
				struct bin_attribute *bin_attr, char *buf,
				loff_t off, size_t count)
{
	return memory_read_from_buffer(buf, count, &off, bin_attr->private,
				       bin_attr->size);
}
    800
    801static int opal_add_one_export(struct kobject *parent, const char *export_name,
    802			       struct device_node *np, const char *prop_name)
    803{
    804	struct bin_attribute *attr = NULL;
    805	const char *name = NULL;
    806	u64 vals[2];
    807	int rc;
    808
    809	rc = of_property_read_u64_array(np, prop_name, &vals[0], 2);
    810	if (rc)
    811		goto out;
    812
    813	attr = kzalloc(sizeof(*attr), GFP_KERNEL);
    814	if (!attr) {
    815		rc = -ENOMEM;
    816		goto out;
    817	}
    818	name = kstrdup(export_name, GFP_KERNEL);
    819	if (!name) {
    820		rc = -ENOMEM;
    821		goto out;
    822	}
    823
    824	sysfs_bin_attr_init(attr);
    825	attr->attr.name = name;
    826	attr->attr.mode = 0400;
    827	attr->read = export_attr_read;
    828	attr->private = __va(vals[0]);
    829	attr->size = vals[1];
    830
    831	rc = sysfs_create_bin_file(parent, attr);
    832out:
    833	if (rc) {
    834		kfree(name);
    835		kfree(attr);
    836	}
    837
    838	return rc;
    839}
    840
    841static void opal_add_exported_attrs(struct device_node *np,
    842				    struct kobject *kobj)
    843{
    844	struct device_node *child;
    845	struct property *prop;
    846
    847	for_each_property_of_node(np, prop) {
    848		int rc;
    849
    850		if (!strcmp(prop->name, "name") ||
    851		    !strcmp(prop->name, "phandle"))
    852			continue;
    853
    854		rc = opal_add_one_export(kobj, prop->name, np, prop->name);
    855		if (rc) {
    856			pr_warn("Unable to add export %pOF/%s, rc = %d!\n",
    857				np, prop->name, rc);
    858		}
    859	}
    860
    861	for_each_child_of_node(np, child) {
    862		struct kobject *child_kobj;
    863
    864		child_kobj = kobject_create_and_add(child->name, kobj);
    865		if (!child_kobj) {
    866			pr_err("Unable to create export dir for %pOF\n", child);
    867			continue;
    868		}
    869
    870		opal_add_exported_attrs(child, child_kobj);
    871	}
    872}
    873
    874/*
    875 * opal_export_attrs: creates a sysfs node for each property listed in
    876 * the device-tree under /ibm,opal/firmware/exports/
    877 * All new sysfs nodes are created under /opal/exports/.
    878 * This allows for reserved memory regions (e.g. HDAT) to be read.
    879 * The new sysfs nodes are only readable by root.
    880 */
    881static void opal_export_attrs(void)
    882{
    883	struct device_node *np;
    884	struct kobject *kobj;
    885	int rc;
    886
    887	np = of_find_node_by_path("/ibm,opal/firmware/exports");
    888	if (!np)
    889		return;
    890
    891	/* Create new 'exports' directory - /sys/firmware/opal/exports */
    892	kobj = kobject_create_and_add("exports", opal_kobj);
    893	if (!kobj) {
    894		pr_warn("kobject_create_and_add() of exports failed\n");
    895		return;
    896	}
    897
    898	opal_add_exported_attrs(np, kobj);
    899
    900	/*
    901	 * NB: symbol_map existed before the generic export interface so it
    902	 * lives under the top level opal_kobj.
    903	 */
    904	rc = opal_add_one_export(opal_kobj, "symbol_map",
    905				 np->parent, "symbol-map");
    906	if (rc)
    907		pr_warn("Error %d creating OPAL symbols file\n", rc);
    908
    909	of_node_put(np);
    910}
    911
    912static void __init opal_dump_region_init(void)
    913{
    914	void *addr;
    915	uint64_t size;
    916	int rc;
    917
    918	if (!opal_check_token(OPAL_REGISTER_DUMP_REGION))
    919		return;
    920
    921	/* Register kernel log buffer */
    922	addr = log_buf_addr_get();
    923	if (addr == NULL)
    924		return;
    925
    926	size = log_buf_len_get();
    927	if (size == 0)
    928		return;
    929
    930	rc = opal_register_dump_region(OPAL_DUMP_REGION_LOG_BUF,
    931				       __pa(addr), size);
    932	/* Don't warn if this is just an older OPAL that doesn't
    933	 * know about that call
    934	 */
    935	if (rc && rc != OPAL_UNSUPPORTED)
    936		pr_warn("DUMP: Failed to register kernel log buffer. "
    937			"rc = %d\n", rc);
    938}
    939
/*
 * Create a platform device for every device-tree node that is
 * compatible with @compatible.
 * NOTE(review): the result of of_platform_device_create() is not
 * checked, so creation failures go unreported here.
 */
static void __init opal_pdev_init(const char *compatible)
{
	struct device_node *np;

	for_each_compatible_node(np, NULL, compatible)
		of_platform_device_create(np, NULL, NULL);
}
    947
    948static void __init opal_imc_init_dev(void)
    949{
    950	struct device_node *np;
    951
    952	np = of_find_compatible_node(NULL, NULL, IMC_DTB_COMPAT);
    953	if (np)
    954		of_platform_device_create(np, NULL, NULL);
    955}
    956
/*
 * Body of the "kopald" kernel thread: handle OPAL events, then sleep
 * for the heartbeat interval (plus one jiffy) unless more events are
 * already pending. Runs until kthread_should_stop(); always returns 0.
 */
static int kopald(void *unused)
{
	unsigned long timeout = msecs_to_jiffies(opal_heartbeat) + 1;

	set_freezable();
	do {
		try_to_freeze();

		opal_handle_events();

		/*
		 * The task state is set before the pending check --
		 * presumably so that a wakeup arriving between the check
		 * and schedule_timeout() is not lost; NOTE(review): confirm.
		 */
		set_current_state(TASK_INTERRUPTIBLE);
		if (opal_have_pending_events())
			__set_current_state(TASK_RUNNING);
		else
			schedule_timeout(timeout);

	} while (!kthread_should_stop());

	return 0;
}
    977
/* Wake the kopald thread, if one was started, so it polls OPAL now. */
void opal_wake_poller(void)
{
	if (kopald_tsk)
		wake_up_process(kopald_tsk);
}
    983
    984static void __init opal_init_heartbeat(void)
    985{
    986	/* Old firwmware, we assume the HVC heartbeat is sufficient */
    987	if (of_property_read_u32(opal_node, "ibm,heartbeat-ms",
    988				 &opal_heartbeat) != 0)
    989		opal_heartbeat = 0;
    990
    991	if (opal_heartbeat)
    992		kopald_tsk = kthread_run(kopald, NULL, "kopald");
    993}
    994
    995static int __init opal_init(void)
    996{
    997	struct device_node *np, *consoles, *leds;
    998	int rc;
    999
   1000	opal_node = of_find_node_by_path("/ibm,opal");
   1001	if (!opal_node) {
   1002		pr_warn("Device node not found\n");
   1003		return -ENODEV;
   1004	}
   1005
   1006	/* Register OPAL consoles if any ports */
   1007	consoles = of_find_node_by_path("/ibm,opal/consoles");
   1008	if (consoles) {
   1009		for_each_child_of_node(consoles, np) {
   1010			if (!of_node_name_eq(np, "serial"))
   1011				continue;
   1012			of_platform_device_create(np, NULL, NULL);
   1013		}
   1014		of_node_put(consoles);
   1015	}
   1016
   1017	/* Initialise OPAL messaging system */
   1018	opal_message_init(opal_node);
   1019
   1020	/* Initialise OPAL asynchronous completion interface */
   1021	opal_async_comp_init();
   1022
   1023	/* Initialise OPAL sensor interface */
   1024	opal_sensor_init();
   1025
   1026	/* Initialise OPAL hypervisor maintainence interrupt handling */
   1027	opal_hmi_handler_init();
   1028
   1029	/* Create i2c platform devices */
   1030	opal_pdev_init("ibm,opal-i2c");
   1031
   1032	/* Handle non-volatile memory devices */
   1033	opal_pdev_init("pmem-region");
   1034
   1035	/* Setup a heatbeat thread if requested by OPAL */
   1036	opal_init_heartbeat();
   1037
   1038	/* Detect In-Memory Collection counters and create devices*/
   1039	opal_imc_init_dev();
   1040
   1041	/* Create leds platform devices */
   1042	leds = of_find_node_by_path("/ibm,opal/leds");
   1043	if (leds) {
   1044		of_platform_device_create(leds, "opal_leds", NULL);
   1045		of_node_put(leds);
   1046	}
   1047
   1048	/* Initialise OPAL message log interface */
   1049	opal_msglog_init();
   1050
   1051	/* Create "opal" kobject under /sys/firmware */
   1052	rc = opal_sysfs_init();
   1053	if (rc == 0) {
   1054		/* Setup dump region interface */
   1055		opal_dump_region_init();
   1056		/* Setup error log interface */
   1057		rc = opal_elog_init();
   1058		/* Setup code update interface */
   1059		opal_flash_update_init();
   1060		/* Setup platform dump extract interface */
   1061		opal_platform_dump_init();
   1062		/* Setup system parameters interface */
   1063		opal_sys_param_init();
   1064		/* Setup message log sysfs interface. */
   1065		opal_msglog_sysfs_init();
   1066		/* Add all export properties*/
   1067		opal_export_attrs();
   1068	}
   1069
   1070	/* Initialize platform devices: IPMI backend, PRD & flash interface */
   1071	opal_pdev_init("ibm,opal-ipmi");
   1072	opal_pdev_init("ibm,opal-flash");
   1073	opal_pdev_init("ibm,opal-prd");
   1074
   1075	/* Initialise platform device: oppanel interface */
   1076	opal_pdev_init("ibm,opal-oppanel");
   1077
   1078	/* Initialise OPAL kmsg dumper for flushing console on panic */
   1079	opal_kmsg_init();
   1080
   1081	/* Initialise OPAL powercap interface */
   1082	opal_powercap_init();
   1083
   1084	/* Initialise OPAL Power-Shifting-Ratio interface */
   1085	opal_psr_init();
   1086
   1087	/* Initialise OPAL sensor groups */
   1088	opal_sensor_groups_init();
   1089
   1090	/* Initialise OPAL Power control interface */
   1091	opal_power_control_init();
   1092
   1093	/* Initialize OPAL secure variables */
   1094	opal_pdev_init("ibm,secvar-backend");
   1095
   1096	return 0;
   1097}
   1098machine_subsys_initcall(powernv, opal_init);
   1099
   1100void opal_shutdown(void)
   1101{
   1102	long rc = OPAL_BUSY;
   1103
   1104	opal_event_shutdown();
   1105
   1106	/*
   1107	 * Then sync with OPAL which ensure anything that can
   1108	 * potentially write to our memory has completed such
   1109	 * as an ongoing dump retrieval
   1110	 */
   1111	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
   1112		rc = opal_sync_host_reboot();
   1113		if (rc == OPAL_BUSY)
   1114			opal_poll_events(NULL);
   1115		else
   1116			mdelay(10);
   1117	}
   1118
   1119	/* Unregister memory dump region */
   1120	if (opal_check_token(OPAL_UNREGISTER_DUMP_REGION))
   1121		opal_unregister_dump_region(OPAL_DUMP_REGION_LOG_BUF);
   1122}
   1123
   1124/* Export this so that test modules can use it */
   1125EXPORT_SYMBOL_GPL(opal_invalid_call);
   1126EXPORT_SYMBOL_GPL(opal_xscom_read);
   1127EXPORT_SYMBOL_GPL(opal_xscom_write);
   1128EXPORT_SYMBOL_GPL(opal_ipmi_send);
   1129EXPORT_SYMBOL_GPL(opal_ipmi_recv);
   1130EXPORT_SYMBOL_GPL(opal_flash_read);
   1131EXPORT_SYMBOL_GPL(opal_flash_write);
   1132EXPORT_SYMBOL_GPL(opal_flash_erase);
   1133EXPORT_SYMBOL_GPL(opal_prd_msg);
   1134EXPORT_SYMBOL_GPL(opal_check_token);
   1135
   1136/* Convert a region of vmalloc memory to an opal sg list */
   1137struct opal_sg_list *opal_vmalloc_to_sg_list(void *vmalloc_addr,
   1138					     unsigned long vmalloc_size)
   1139{
   1140	struct opal_sg_list *sg, *first = NULL;
   1141	unsigned long i = 0;
   1142
   1143	sg = kzalloc(PAGE_SIZE, GFP_KERNEL);
   1144	if (!sg)
   1145		goto nomem;
   1146
   1147	first = sg;
   1148
   1149	while (vmalloc_size > 0) {
   1150		uint64_t data = vmalloc_to_pfn(vmalloc_addr) << PAGE_SHIFT;
   1151		uint64_t length = min(vmalloc_size, PAGE_SIZE);
   1152
   1153		sg->entry[i].data = cpu_to_be64(data);
   1154		sg->entry[i].length = cpu_to_be64(length);
   1155		i++;
   1156
   1157		if (i >= SG_ENTRIES_PER_NODE) {
   1158			struct opal_sg_list *next;
   1159
   1160			next = kzalloc(PAGE_SIZE, GFP_KERNEL);
   1161			if (!next)
   1162				goto nomem;
   1163
   1164			sg->length = cpu_to_be64(
   1165					i * sizeof(struct opal_sg_entry) + 16);
   1166			i = 0;
   1167			sg->next = cpu_to_be64(__pa(next));
   1168			sg = next;
   1169		}
   1170
   1171		vmalloc_addr += length;
   1172		vmalloc_size -= length;
   1173	}
   1174
   1175	sg->length = cpu_to_be64(i * sizeof(struct opal_sg_entry) + 16);
   1176
   1177	return first;
   1178
   1179nomem:
   1180	pr_err("%s : Failed to allocate memory\n", __func__);
   1181	opal_free_sg_list(first);
   1182	return NULL;
   1183}
   1184
   1185void opal_free_sg_list(struct opal_sg_list *sg)
   1186{
   1187	while (sg) {
   1188		uint64_t next = be64_to_cpu(sg->next);
   1189
   1190		kfree(sg);
   1191
   1192		if (next)
   1193			sg = __va(next);
   1194		else
   1195			sg = NULL;
   1196	}
   1197}
   1198
   1199int opal_error_code(int rc)
   1200{
   1201	switch (rc) {
   1202	case OPAL_SUCCESS:		return 0;
   1203
   1204	case OPAL_PARAMETER:		return -EINVAL;
   1205	case OPAL_ASYNC_COMPLETION:	return -EINPROGRESS;
   1206	case OPAL_BUSY:
   1207	case OPAL_BUSY_EVENT:		return -EBUSY;
   1208	case OPAL_NO_MEM:		return -ENOMEM;
   1209	case OPAL_PERMISSION:		return -EPERM;
   1210
   1211	case OPAL_UNSUPPORTED:		return -EIO;
   1212	case OPAL_HARDWARE:		return -EIO;
   1213	case OPAL_INTERNAL_ERROR:	return -EIO;
   1214	case OPAL_TIMEOUT:		return -ETIMEDOUT;
   1215	default:
   1216		pr_err("%s: unexpected OPAL error %d\n", __func__, rc);
   1217		return -EIO;
   1218	}
   1219}
   1220
   1221void powernv_set_nmmu_ptcr(unsigned long ptcr)
   1222{
   1223	int rc;
   1224
   1225	if (firmware_has_feature(FW_FEATURE_OPAL)) {
   1226		rc = opal_nmmu_set_ptcr(-1UL, ptcr);
   1227		if (rc != OPAL_SUCCESS && rc != OPAL_UNSUPPORTED)
   1228			pr_warn("%s: Unable to set nest mmu ptcr\n", __func__);
   1229	}
   1230}
   1231
   1232EXPORT_SYMBOL_GPL(opal_poll_events);
   1233EXPORT_SYMBOL_GPL(opal_rtc_read);
   1234EXPORT_SYMBOL_GPL(opal_rtc_write);
   1235EXPORT_SYMBOL_GPL(opal_tpo_read);
   1236EXPORT_SYMBOL_GPL(opal_tpo_write);
   1237EXPORT_SYMBOL_GPL(opal_i2c_request);
   1238/* Export these symbols for PowerNV LED class driver */
   1239EXPORT_SYMBOL_GPL(opal_leds_get_ind);
   1240EXPORT_SYMBOL_GPL(opal_leds_set_ind);
   1241/* Export this symbol for PowerNV Operator Panel class driver */
   1242EXPORT_SYMBOL_GPL(opal_write_oppanel_async);
   1243/* Export this for KVM */
   1244EXPORT_SYMBOL_GPL(opal_int_set_mfrr);
   1245EXPORT_SYMBOL_GPL(opal_int_eoi);
   1246EXPORT_SYMBOL_GPL(opal_error_code);
   1247/* Export the below symbol for NX compression */
   1248EXPORT_SYMBOL(opal_nx_coproc_init);