cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

mobility.c (16722B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * Support for Partition Mobility/Migration
      4 *
      5 * Copyright (C) 2010 Nathan Fontenot
      6 * Copyright (C) 2010 IBM Corporation
      7 */
      8
      9
     10#define pr_fmt(fmt) "mobility: " fmt
     11
     12#include <linux/cpu.h>
     13#include <linux/kernel.h>
     14#include <linux/kobject.h>
     15#include <linux/nmi.h>
     16#include <linux/sched.h>
     17#include <linux/smp.h>
     18#include <linux/stat.h>
     19#include <linux/stop_machine.h>
     20#include <linux/completion.h>
     21#include <linux/device.h>
     22#include <linux/delay.h>
     23#include <linux/slab.h>
     24#include <linux/stringify.h>
     25
     26#include <asm/machdep.h>
     27#include <asm/rtas.h>
     28#include "pseries.h"
     29#include "vas.h"	/* vas_migration_handler() */
     30#include "../../kernel/cacheinfo.h"
     31
     32static struct kobject *mobility_kobj;
     33
     34struct update_props_workarea {
     35	__be32 phandle;
     36	__be32 state;
     37	__be64 reserved;
     38	__be32 nprops;
     39} __packed;
     40
     41#define NODE_ACTION_MASK	0xff000000
     42#define NODE_COUNT_MASK		0x00ffffff
     43
     44#define DELETE_DT_NODE	0x01000000
     45#define UPDATE_DT_NODE	0x02000000
     46#define ADD_DT_NODE	0x03000000
     47
     48#define MIGRATION_SCOPE	(1)
     49#define PRRN_SCOPE -2
     50
     51static int mobility_rtas_call(int token, char *buf, s32 scope)
     52{
     53	int rc;
     54
     55	spin_lock(&rtas_data_buf_lock);
     56
     57	memcpy(rtas_data_buf, buf, RTAS_DATA_BUF_SIZE);
     58	rc = rtas_call(token, 2, 1, NULL, rtas_data_buf, scope);
     59	memcpy(buf, rtas_data_buf, RTAS_DATA_BUF_SIZE);
     60
     61	spin_unlock(&rtas_data_buf_lock);
     62	return rc;
     63}
     64
     65static int delete_dt_node(struct device_node *dn)
     66{
     67	struct device_node *pdn;
     68	bool is_platfac;
     69
     70	pdn = of_get_parent(dn);
     71	is_platfac = of_node_is_type(dn, "ibm,platform-facilities") ||
     72		     of_node_is_type(pdn, "ibm,platform-facilities");
     73	of_node_put(pdn);
     74
     75	/*
     76	 * The drivers that bind to nodes in the platform-facilities
     77	 * hierarchy don't support node removal, and the removal directive
     78	 * from firmware is always followed by an add of an equivalent
     79	 * node. The capability (e.g. RNG, encryption, compression)
     80	 * represented by the node is never interrupted by the migration.
     81	 * So ignore changes to this part of the tree.
     82	 */
     83	if (is_platfac) {
     84		pr_notice("ignoring remove operation for %pOFfp\n", dn);
     85		return 0;
     86	}
     87
     88	pr_debug("removing node %pOFfp\n", dn);
     89	dlpar_detach_node(dn);
     90	return 0;
     91}
     92
     93static int update_dt_property(struct device_node *dn, struct property **prop,
     94			      const char *name, u32 vd, char *value)
     95{
     96	struct property *new_prop = *prop;
     97	int more = 0;
     98
     99	/* A negative 'vd' value indicates that only part of the new property
    100	 * value is contained in the buffer and we need to call
    101	 * ibm,update-properties again to get the rest of the value.
    102	 *
     103	 * A negative value is the two's complement of the actual chunk length.
    104	 */
    105	if (vd & 0x80000000) {
    106		vd = ~vd + 1;
    107		more = 1;
    108	}
    109
    110	if (new_prop) {
    111		/* partial property fixup */
    112		char *new_data = kzalloc(new_prop->length + vd, GFP_KERNEL);
    113		if (!new_data)
    114			return -ENOMEM;
    115
    116		memcpy(new_data, new_prop->value, new_prop->length);
    117		memcpy(new_data + new_prop->length, value, vd);
    118
    119		kfree(new_prop->value);
    120		new_prop->value = new_data;
    121		new_prop->length += vd;
    122	} else {
    123		new_prop = kzalloc(sizeof(*new_prop), GFP_KERNEL);
    124		if (!new_prop)
    125			return -ENOMEM;
    126
    127		new_prop->name = kstrdup(name, GFP_KERNEL);
    128		if (!new_prop->name) {
    129			kfree(new_prop);
    130			return -ENOMEM;
    131		}
    132
    133		new_prop->length = vd;
    134		new_prop->value = kzalloc(new_prop->length, GFP_KERNEL);
    135		if (!new_prop->value) {
    136			kfree(new_prop->name);
    137			kfree(new_prop);
    138			return -ENOMEM;
    139		}
    140
    141		memcpy(new_prop->value, value, vd);
    142		*prop = new_prop;
    143	}
    144
    145	if (!more) {
    146		pr_debug("updating node %pOF property %s\n", dn, name);
    147		of_update_property(dn, new_prop);
    148		*prop = NULL;
    149	}
    150
    151	return 0;
    152}
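
A minimal sketch, not part of the file, of how update_dt_property() above treats the value descriptor; the helper name is hypothetical.

/*
 * Sketch: decode a property value descriptor. A descriptor such as
 * 0xfffff000 has the high bit set, so the chunk length is its two's
 * complement (0x1000 bytes) and more data follows on the next
 * ibm,update-properties call; 0x00000040 is a complete 64-byte value.
 */
static inline u32 vd_chunk_length(u32 vd, bool *more)
{
	*more = !!(vd & 0x80000000);

	return *more ? ~vd + 1 : vd;
}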
    153
    154static int update_dt_node(struct device_node *dn, s32 scope)
    155{
    156	struct update_props_workarea *upwa;
    157	struct property *prop = NULL;
    158	int i, rc, rtas_rc;
    159	char *prop_data;
    160	char *rtas_buf;
    161	int update_properties_token;
    162	u32 nprops;
    163	u32 vd;
    164
    165	update_properties_token = rtas_token("ibm,update-properties");
    166	if (update_properties_token == RTAS_UNKNOWN_SERVICE)
    167		return -EINVAL;
    168
    169	rtas_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
    170	if (!rtas_buf)
    171		return -ENOMEM;
    172
    173	upwa = (struct update_props_workarea *)&rtas_buf[0];
    174	upwa->phandle = cpu_to_be32(dn->phandle);
    175
    176	do {
    177		rtas_rc = mobility_rtas_call(update_properties_token, rtas_buf,
    178					scope);
    179		if (rtas_rc < 0)
    180			break;
    181
    182		prop_data = rtas_buf + sizeof(*upwa);
    183		nprops = be32_to_cpu(upwa->nprops);
    184
     185		/* On the first call to ibm,update-properties for a node,
     186		 * the first property value descriptor contains an empty
    187		 * property name, the property value length encoded as u32,
    188		 * and the property value is the node path being updated.
    189		 */
    190		if (*prop_data == 0) {
    191			prop_data++;
    192			vd = be32_to_cpu(*(__be32 *)prop_data);
    193			prop_data += vd + sizeof(vd);
    194			nprops--;
    195		}
    196
    197		for (i = 0; i < nprops; i++) {
    198			char *prop_name;
    199
    200			prop_name = prop_data;
    201			prop_data += strlen(prop_name) + 1;
    202			vd = be32_to_cpu(*(__be32 *)prop_data);
    203			prop_data += sizeof(vd);
    204
    205			switch (vd) {
    206			case 0x00000000:
    207				/* name only property, nothing to do */
    208				break;
    209
    210			case 0x80000000:
    211				of_remove_property(dn, of_find_property(dn,
    212							prop_name, NULL));
    213				prop = NULL;
    214				break;
    215
    216			default:
    217				rc = update_dt_property(dn, &prop, prop_name,
    218							vd, prop_data);
    219				if (rc) {
    220					pr_err("updating %s property failed: %d\n",
    221					       prop_name, rc);
    222				}
    223
    224				prop_data += vd;
    225				break;
    226			}
    227
    228			cond_resched();
    229		}
    230
    231		cond_resched();
    232	} while (rtas_rc == 1);
    233
    234	kfree(rtas_buf);
    235	return 0;
    236}
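
For orientation, a sketch of the rtas work-area layout that the parsing loop in update_dt_node() assumes, reconstructed from the code rather than quoted from PAPR.

/*
 * ibm,update-properties work area, as consumed above:
 *
 *   struct update_props_workarea header (phandle, state, reserved, nprops)
 *   [first call only] a descriptor with an empty name, a __be32 length
 *                     and the path of the node being updated
 *   then, nprops times:
 *     NUL-terminated property name
 *     __be32 value descriptor: 0 = name-only, 0x80000000 = delete,
 *                              high bit set = partial value, else length
 *     that many value bytes
 */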
    237
    238static int add_dt_node(struct device_node *parent_dn, __be32 drc_index)
    239{
    240	struct device_node *dn;
    241	int rc;
    242
    243	dn = dlpar_configure_connector(drc_index, parent_dn);
    244	if (!dn)
    245		return -ENOENT;
    246
    247	/*
    248	 * Since delete_dt_node() ignores this node type, this is the
    249	 * necessary counterpart. We also know that a platform-facilities
    250	 * node returned from dlpar_configure_connector() has children
    251	 * attached, and dlpar_attach_node() only adds the parent, leaking
    252	 * the children. So ignore these on the add side for now.
    253	 */
    254	if (of_node_is_type(dn, "ibm,platform-facilities")) {
    255		pr_notice("ignoring add operation for %pOF\n", dn);
    256		dlpar_free_cc_nodes(dn);
    257		return 0;
    258	}
    259
    260	rc = dlpar_attach_node(dn, parent_dn);
    261	if (rc)
    262		dlpar_free_cc_nodes(dn);
    263
    264	pr_debug("added node %pOFfp\n", dn);
    265
    266	return rc;
    267}
    268
    269static int pseries_devicetree_update(s32 scope)
    270{
    271	char *rtas_buf;
    272	__be32 *data;
    273	int update_nodes_token;
    274	int rc;
    275
    276	update_nodes_token = rtas_token("ibm,update-nodes");
    277	if (update_nodes_token == RTAS_UNKNOWN_SERVICE)
    278		return 0;
    279
    280	rtas_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
    281	if (!rtas_buf)
    282		return -ENOMEM;
    283
    284	do {
    285		rc = mobility_rtas_call(update_nodes_token, rtas_buf, scope);
    286		if (rc && rc != 1)
    287			break;
    288
    289		data = (__be32 *)rtas_buf + 4;
    290		while (be32_to_cpu(*data) & NODE_ACTION_MASK) {
    291			int i;
    292			u32 action = be32_to_cpu(*data) & NODE_ACTION_MASK;
    293			u32 node_count = be32_to_cpu(*data) & NODE_COUNT_MASK;
    294
    295			data++;
    296
    297			for (i = 0; i < node_count; i++) {
    298				struct device_node *np;
    299				__be32 phandle = *data++;
    300				__be32 drc_index;
    301
    302				np = of_find_node_by_phandle(be32_to_cpu(phandle));
    303				if (!np) {
    304					pr_warn("Failed lookup: phandle 0x%x for action 0x%x\n",
    305						be32_to_cpu(phandle), action);
    306					continue;
    307				}
    308
    309				switch (action) {
    310				case DELETE_DT_NODE:
    311					delete_dt_node(np);
    312					break;
    313				case UPDATE_DT_NODE:
    314					update_dt_node(np, scope);
    315					break;
    316				case ADD_DT_NODE:
    317					drc_index = *data++;
    318					add_dt_node(np, drc_index);
    319					break;
    320				}
    321
    322				of_node_put(np);
    323				cond_resched();
    324			}
    325		}
    326
    327		cond_resched();
    328	} while (rc == 1);
    329
    330	kfree(rtas_buf);
    331	return rc;
    332}
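
A small sketch of how each entry word returned by ibm,update-nodes is split with the masks defined near the top of the file, exactly as the loop above does; the helper itself is hypothetical.

/* Sketch: split one ibm,update-nodes entry word. */
static inline void split_node_word(__be32 word, u32 *action, u32 *count)
{
	u32 v = be32_to_cpu(word);

	*action = v & NODE_ACTION_MASK;	/* DELETE_DT_NODE, UPDATE_DT_NODE or ADD_DT_NODE */
	*count = v & NODE_COUNT_MASK;	/* number of phandles that follow */
}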
    333
    334void post_mobility_fixup(void)
    335{
    336	int rc;
    337
    338	rtas_activate_firmware();
    339
    340	/*
    341	 * We don't want CPUs to go online/offline while the device
    342	 * tree is being updated.
    343	 */
    344	cpus_read_lock();
    345
    346	/*
    347	 * It's common for the destination firmware to replace cache
    348	 * nodes.  Release all of the cacheinfo hierarchy's references
    349	 * before updating the device tree.
    350	 */
    351	cacheinfo_teardown();
    352
    353	rc = pseries_devicetree_update(MIGRATION_SCOPE);
    354	if (rc)
    355		pr_err("device tree update failed: %d\n", rc);
    356
    357	cacheinfo_rebuild();
    358
    359	cpus_read_unlock();
    360
    361	/* Possibly switch to a new L1 flush type */
    362	pseries_setup_security_mitigations();
    363
    364	/* Reinitialise system information for hv-24x7 */
    365	read_24x7_sys_info();
    366
    367	return;
    368}
    369
    370static int poll_vasi_state(u64 handle, unsigned long *res)
    371{
    372	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
    373	long hvrc;
    374	int ret;
    375
    376	hvrc = plpar_hcall(H_VASI_STATE, retbuf, handle);
    377	switch (hvrc) {
    378	case H_SUCCESS:
    379		ret = 0;
    380		*res = retbuf[0];
    381		break;
    382	case H_PARAMETER:
    383		ret = -EINVAL;
    384		break;
    385	case H_FUNCTION:
    386		ret = -EOPNOTSUPP;
    387		break;
    388	case H_HARDWARE:
    389	default:
    390		pr_err("unexpected H_VASI_STATE result %ld\n", hvrc);
    391		ret = -EIO;
    392		break;
    393	}
    394	return ret;
    395}
    396
    397static int wait_for_vasi_session_suspending(u64 handle)
    398{
    399	unsigned long state;
    400	int ret;
    401
    402	/*
    403	 * Wait for transition from H_VASI_ENABLED to
    404	 * H_VASI_SUSPENDING. Treat anything else as an error.
    405	 */
    406	while (true) {
    407		ret = poll_vasi_state(handle, &state);
    408
    409		if (ret != 0 || state == H_VASI_SUSPENDING) {
    410			break;
    411		} else if (state == H_VASI_ENABLED) {
    412			ssleep(1);
    413		} else {
    414			pr_err("unexpected H_VASI_STATE result %lu\n", state);
    415			ret = -EIO;
    416			break;
    417		}
    418	}
    419
    420	/*
    421	 * Proceed even if H_VASI_STATE is unavailable. If H_JOIN or
    422	 * ibm,suspend-me are also unimplemented, we'll recover then.
    423	 */
    424	if (ret == -EOPNOTSUPP)
    425		ret = 0;
    426
    427	return ret;
    428}
    429
    430static void prod_single(unsigned int target_cpu)
    431{
    432	long hvrc;
    433	int hwid;
    434
    435	hwid = get_hard_smp_processor_id(target_cpu);
    436	hvrc = plpar_hcall_norets(H_PROD, hwid);
    437	if (hvrc == H_SUCCESS)
    438		return;
    439	pr_err_ratelimited("H_PROD of CPU %u (hwid %d) error: %ld\n",
    440			   target_cpu, hwid, hvrc);
    441}
    442
    443static void prod_others(void)
    444{
    445	unsigned int cpu;
    446
    447	for_each_online_cpu(cpu) {
    448		if (cpu != smp_processor_id())
    449			prod_single(cpu);
    450	}
    451}
    452
    453static u16 clamp_slb_size(void)
    454{
    455#ifdef CONFIG_PPC_64S_HASH_MMU
    456	u16 prev = mmu_slb_size;
    457
    458	slb_set_size(SLB_MIN_SIZE);
    459
    460	return prev;
    461#else
    462	return 0;
    463#endif
    464}
    465
    466static int do_suspend(void)
    467{
    468	u16 saved_slb_size;
    469	int status;
    470	int ret;
    471
    472	pr_info("calling ibm,suspend-me on CPU %i\n", smp_processor_id());
    473
    474	/*
    475	 * The destination processor model may have fewer SLB entries
    476	 * than the source. We reduce mmu_slb_size to a safe minimum
    477	 * before suspending in order to minimize the possibility of
    478	 * programming non-existent entries on the destination. If
    479	 * suspend fails, we restore it before returning. On success
    480	 * the OF reconfig path will update it from the new device
    481	 * tree after resuming on the destination.
    482	 */
    483	saved_slb_size = clamp_slb_size();
    484
    485	ret = rtas_ibm_suspend_me(&status);
    486	if (ret != 0) {
    487		pr_err("ibm,suspend-me error: %d\n", status);
    488		slb_set_size(saved_slb_size);
    489	}
    490
    491	return ret;
    492}
    493
    494/**
    495 * struct pseries_suspend_info - State shared between CPUs for join/suspend.
    496 * @counter: Threads are to increment this upon resuming from suspend
    497 *           or if an error is received from H_JOIN. The thread which performs
    498 *           the first increment (i.e. sets it to 1) is responsible for
    499 *           waking the other threads.
    500 * @done: False if join/suspend is in progress. True if the operation is
    501 *        complete (successful or not).
    502 */
    503struct pseries_suspend_info {
    504	atomic_t counter;
    505	bool done;
    506};
    507
    508static int do_join(void *arg)
    509{
    510	struct pseries_suspend_info *info = arg;
    511	atomic_t *counter = &info->counter;
    512	long hvrc;
    513	int ret;
    514
    515retry:
    516	/* Must ensure MSR.EE off for H_JOIN. */
    517	hard_irq_disable();
    518	hvrc = plpar_hcall_norets(H_JOIN);
    519
    520	switch (hvrc) {
    521	case H_CONTINUE:
    522		/*
    523		 * All other CPUs are offline or in H_JOIN. This CPU
    524		 * attempts the suspend.
    525		 */
    526		ret = do_suspend();
    527		break;
    528	case H_SUCCESS:
    529		/*
    530		 * The suspend is complete and this cpu has received a
    531		 * prod, or we've received a stray prod from unrelated
    532		 * code (e.g. paravirt spinlocks) and we need to join
    533		 * again.
    534		 *
    535		 * This barrier orders the return from H_JOIN above vs
    536		 * the load of info->done. It pairs with the barrier
    537		 * in the wakeup/prod path below.
    538		 */
    539		smp_mb();
    540		if (READ_ONCE(info->done) == false) {
     541			pr_info_ratelimited("premature return from H_JOIN on CPU %i, retrying\n",
    542					    smp_processor_id());
    543			goto retry;
    544		}
    545		ret = 0;
    546		break;
    547	case H_BAD_MODE:
    548	case H_HARDWARE:
    549	default:
    550		ret = -EIO;
    551		pr_err_ratelimited("H_JOIN error %ld on CPU %i\n",
    552				   hvrc, smp_processor_id());
    553		break;
    554	}
    555
    556	if (atomic_inc_return(counter) == 1) {
    557		pr_info("CPU %u waking all threads\n", smp_processor_id());
    558		WRITE_ONCE(info->done, true);
    559		/*
    560		 * This barrier orders the store to info->done vs subsequent
    561		 * H_PRODs to wake the other CPUs. It pairs with the barrier
    562		 * in the H_SUCCESS case above.
    563		 */
    564		smp_mb();
    565		prod_others();
    566	}
    567	/*
    568	 * Execution may have been suspended for several seconds, so
    569	 * reset the watchdog.
    570	 */
    571	touch_nmi_watchdog();
    572	return ret;
    573}
    574
    575/*
    576 * Abort reason code byte 0. We use only the 'Migrating partition' value.
    577 */
    578enum vasi_aborting_entity {
    579	ORCHESTRATOR        = 1,
    580	VSP_SOURCE          = 2,
    581	PARTITION_FIRMWARE  = 3,
    582	PLATFORM_FIRMWARE   = 4,
    583	VSP_TARGET          = 5,
    584	MIGRATING_PARTITION = 6,
    585};
    586
    587static void pseries_cancel_migration(u64 handle, int err)
    588{
    589	u32 reason_code;
    590	u32 detail;
    591	u8 entity;
    592	long hvrc;
    593
    594	entity = MIGRATING_PARTITION;
    595	detail = abs(err) & 0xffffff;
    596	reason_code = (entity << 24) | detail;
    597
    598	hvrc = plpar_hcall_norets(H_VASI_SIGNAL, handle,
    599				  H_VASI_SIGNAL_CANCEL, reason_code);
    600	if (hvrc)
    601		pr_err("H_VASI_SIGNAL error: %ld\n", hvrc);
    602}
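
Worked example of the reason code built above, with a hypothetical error: for err = -EIO, detail = abs(-5) & 0xffffff = 5 and entity = MIGRATING_PARTITION (6), so reason_code = (6 << 24) | 5 = 0x06000005.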
    603
    604static int pseries_suspend(u64 handle)
    605{
    606	const unsigned int max_attempts = 5;
    607	unsigned int retry_interval_ms = 1;
    608	unsigned int attempt = 1;
    609	int ret;
    610
    611	while (true) {
    612		struct pseries_suspend_info info;
    613		unsigned long vasi_state;
    614		int vasi_err;
    615
    616		info = (struct pseries_suspend_info) {
    617			.counter = ATOMIC_INIT(0),
    618			.done = false,
    619		};
    620
    621		ret = stop_machine(do_join, &info, cpu_online_mask);
    622		if (ret == 0)
    623			break;
    624		/*
    625		 * Encountered an error. If the VASI stream is still
    626		 * in Suspending state, it's likely a transient
    627		 * condition related to some device in the partition
    628		 * and we can retry in the hope that the cause has
    629		 * cleared after some delay.
    630		 *
    631		 * A better design would allow drivers etc to prepare
    632		 * for the suspend and avoid conditions which prevent
    633		 * the suspend from succeeding. For now, we have this
    634		 * mitigation.
    635		 */
    636		pr_notice("Partition suspend attempt %u of %u error: %d\n",
    637			  attempt, max_attempts, ret);
    638
    639		if (attempt == max_attempts)
    640			break;
    641
    642		vasi_err = poll_vasi_state(handle, &vasi_state);
    643		if (vasi_err == 0) {
    644			if (vasi_state != H_VASI_SUSPENDING) {
    645				pr_notice("VASI state %lu after failed suspend\n",
    646					  vasi_state);
    647				break;
    648			}
    649		} else if (vasi_err != -EOPNOTSUPP) {
     650			pr_err("VASI state poll error: %d\n", vasi_err);
    651			break;
    652		}
    653
    654		pr_notice("Will retry partition suspend after %u ms\n",
    655			  retry_interval_ms);
    656
    657		msleep(retry_interval_ms);
    658		retry_interval_ms *= 10;
    659		attempt++;
    660	}
    661
    662	return ret;
    663}
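
Worked out, the retry schedule above: with max_attempts = 5 and retry_interval_ms starting at 1 and multiplied by 10 after each failure, failed suspend attempts are retried after delays of 1 ms, 10 ms, 100 ms and 1000 ms before the loop gives up.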
    664
    665static int pseries_migrate_partition(u64 handle)
    666{
    667	int ret;
    668
    669	ret = wait_for_vasi_session_suspending(handle);
    670	if (ret)
    671		return ret;
    672
    673	vas_migration_handler(VAS_SUSPEND);
    674
    675	ret = pseries_suspend(handle);
    676	if (ret == 0)
    677		post_mobility_fixup();
    678	else
    679		pseries_cancel_migration(handle, ret);
    680
    681	vas_migration_handler(VAS_RESUME);
    682
    683	return ret;
    684}
    685
    686int rtas_syscall_dispatch_ibm_suspend_me(u64 handle)
    687{
    688	return pseries_migrate_partition(handle);
    689}
    690
    691static ssize_t migration_store(struct class *class,
    692			       struct class_attribute *attr, const char *buf,
    693			       size_t count)
    694{
    695	u64 streamid;
    696	int rc;
    697
    698	rc = kstrtou64(buf, 0, &streamid);
    699	if (rc)
    700		return rc;
    701
    702	rc = pseries_migrate_partition(streamid);
    703	if (rc)
    704		return rc;
    705
    706	return count;
    707}
    708
    709/*
    710 * Used by drmgr to determine the kernel behavior of the migration interface.
    711 *
    712 * Version 1: Performs all PAPR requirements for migration including
    713 *	firmware activation and device tree update.
    714 */
    715#define MIGRATION_API_VERSION	1
    716
    717static CLASS_ATTR_WO(migration);
    718static CLASS_ATTR_STRING(api_version, 0444, __stringify(MIGRATION_API_VERSION));
    719
    720static int __init mobility_sysfs_init(void)
    721{
    722	int rc;
    723
    724	mobility_kobj = kobject_create_and_add("mobility", kernel_kobj);
    725	if (!mobility_kobj)
    726		return -ENOMEM;
    727
    728	rc = sysfs_create_file(mobility_kobj, &class_attr_migration.attr);
    729	if (rc)
    730		pr_err("unable to create migration sysfs file (%d)\n", rc);
    731
    732	rc = sysfs_create_file(mobility_kobj, &class_attr_api_version.attr.attr);
    733	if (rc)
    734		pr_err("unable to create api_version sysfs file (%d)\n", rc);
    735
    736	return 0;
    737}
    738machine_device_initcall(pseries, mobility_sysfs_init);
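
The initcall above exposes the interface under /sys/kernel/mobility: writing a VASI stream id to "migration" invokes pseries_migrate_partition(), and reading "api_version" returns MIGRATION_API_VERSION. A hedged userspace sketch follows; the stream id is hypothetical and is normally supplied by drmgr.

/* Userspace sketch: trigger migration_store() via sysfs. */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *streamid = "0x12345678";	/* hypothetical VASI stream id */
	int fd = open("/sys/kernel/mobility/migration", O_WRONLY);

	if (fd < 0)
		return 1;
	/* write() fails with an errno unless a matching migration is pending */
	if (write(fd, streamid, strlen(streamid)) < 0) {
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}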